From a563694b68459f89a575b39fa7cbae173f5e332e Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Wed, 20 Feb 2019 17:51:14 -0600 Subject: [PATCH 01/52] listmaking WIP --- src/tests/test_risklist.py | 25 ++++++++ src/triage/component/results_schema/schema.py | 3 +- src/triage/component/risklist.py | 60 +++++++++++++++++++ 3 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 src/tests/test_risklist.py create mode 100644 src/triage/component/risklist.py diff --git a/src/tests/test_risklist.py b/src/tests/test_risklist.py new file mode 100644 index 000000000..9b6015bad --- /dev/null +++ b/src/tests/test_risklist.py @@ -0,0 +1,25 @@ +from triage.component.risklist import generate_risk_list +from tests.utils import sample_config, populate_source_data +from triage.experiments import SingleThreadedExperiment +from triage.validation_primitives import table_should_have_data + + +def test_risklist(db_engine, project_storage): + # given a model id and as-of-date <= today + # and the model id is trained and is linked to an experiment with feature and cohort config + # generate records in listpredictions + # the # of records should equal the size of the cohort for that date + populate_source_data(db_engine) + SingleThreadedExperiment( + sample_config(), + db_engine=db_engine, + project_path=project_storage.project_path + ).run() + + model_id = 1 + as_of_date = '2013-01-01' + generate_risk_list(db_engine, model_id, as_of_date) + table_should_have_data( + db_engine=db_engine, + table_name="production.list_predictions", + ) diff --git a/src/triage/component/results_schema/schema.py b/src/triage/component/results_schema/schema.py index 3dc4f7c99..8676227ff 100644 --- a/src/triage/component/results_schema/schema.py +++ b/src/triage/component/results_schema/schema.py @@ -31,6 +31,7 @@ "CREATE SCHEMA IF NOT EXISTS model_metadata;" " CREATE SCHEMA IF NOT EXISTS test_results;" " CREATE SCHEMA IF NOT EXISTS train_results;" + " CREATE SCHEMA IF NOT EXISTS production;" ) event.listen(Base.metadata, "before_create", DDL(schemas)) @@ -86,7 +87,7 @@ class ModelGroup(Base): class ListPrediction(Base): __tablename__ = "list_predictions" - __table_args__ = {"schema": "model_metadata"} + __table_args__ = {"schema": "production"} model_id = Column( Integer, ForeignKey("model_metadata.models.model_id"), primary_key=True diff --git a/src/triage/component/risklist.py b/src/triage/component/risklist.py new file mode 100644 index 000000000..6f26befa9 --- /dev/null +++ b/src/triage/component/risklist.py @@ -0,0 +1,60 @@ +from triage.component.results_schema import upgrade_db +from triage.component.architect.cohort_table_generators import CohortTableGenerator +from triage.component.architect.features import FeatureGenerator, FeatureGroupCreator, FeatureGroupMixer, FeatureDictionaryCreator +from triage.util.conf import dt_from_str +import json + +def generate_risk_list(db_engine, model_id, as_of_date): + upgrade_db(db_engine=db_engine) + # 1. 
get feature and cohort config from database
+    get_experiment_query = """
+        select experiments.config, matrices.matrix_metadata
+        from model_metadata.experiments
+        join model_metadata.experiment_matrices using (experiment_hash)
+        join model_metadata.matrices using (matrix_uuid)
+        join model_metadata.models on (models.train_matrix_uuid = matrices.matrix_uuid)
+        where model_id = %s
+    """
+    results = list(db_engine.execute(get_experiment_query, model_id))
+    experiment_config = results[0]['config']
+    matrix_metadata = json.loads(results[0]['matrix_metadata'])
+    feature_config = experiment_config['feature_aggregations']
+    cohort_config = experiment_config['cohort_config']
+    timechop_config = experiment_config['temporal_config']
+    feature_start_time = timechop_config['feature_start_time']
+    feature_group = matrix_metadata['feature_groups']
+    print(type(feature_group))
+    print(feature_group)
+    cohort_table_name = f"production.cohort_{cohort_config['name']}"
+    cohort_table_generator = CohortTableGenerator(
+        db_engine=db_engine,
+        query=cohort_config['query'],
+        cohort_table_name=cohort_table_name
+    )
+    feature_generator = FeatureGenerator(
+        db_engine=db_engine,
+        features_schema_name="production",
+        feature_start_time=feature_start_time,
+    )
+    feature_dictionary_creator = FeatureDictionaryCreator(
+        features_schema_name="production", db_engine=db_engine
+    )
+    feature_group_creator = FeatureGroupCreator(feature_group[0])
+
+    cohort_table_generator.generate_cohort_table([dt_from_str(as_of_date)])
+    collate_aggregations = feature_generator.aggregations(
+        feature_aggregation_config=feature_config,
+        feature_dates=[as_of_date],
+        state_table=cohort_table_name
+    )
+    feature_generator.process_table_tasks(feature_generator.generate_all_table_tasks(collate_aggregations, task_type='aggregation'))
+    imputation_table_tasks = feature_generator.generate_all_table_tasks(collate_aggregations, task_type='imputation')
+    feature_generator.process_table_tasks(imputation_table_tasks)
+    feature_dictionary = feature_dictionary_creator.feature_dictionary(
+        feature_table_names=imputation_table_tasks.keys(),
+        index_column_lookup=feature_generator.index_column_lookup(
+            collate_aggregations
+        ),
+    )
+    smaller_dict = feature_group_creator.subsets(feature_dictionary)
+    print(feature_dictionary)

From 9750c3ec9b2e366be5d83144c4639bc9199756e4 Mon Sep 17 00:00:00 2001
From: Tristan Crockett
Date: Wed, 20 Feb 2019 17:52:07 -0600
Subject: [PATCH 02/52] forgot migration

---
 .../1b990cbc04e4_production_schema.py         | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 src/triage/component/results_schema/alembic/versions/1b990cbc04e4_production_schema.py

diff --git a/src/triage/component/results_schema/alembic/versions/1b990cbc04e4_production_schema.py b/src/triage/component/results_schema/alembic/versions/1b990cbc04e4_production_schema.py
new file mode 100644
index 000000000..d30c24f2c
--- /dev/null
+++ b/src/triage/component/results_schema/alembic/versions/1b990cbc04e4_production_schema.py
@@ -0,0 +1,26 @@
+"""empty message
+
+Revision ID: 1b990cbc04e4
+Revises: 0bca1ba9706e
+Create Date: 2019-02-20 16:41:22.810452
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = '1b990cbc04e4'
+down_revision = '0bca1ba9706e'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    op.execute("CREATE SCHEMA IF NOT EXISTS production")
+    op.execute("ALTER TABLE model_metadata.list_predictions SET SCHEMA production;")
+
+
+def downgrade():
+    op.execute("ALTER TABLE production.list_predictions SET SCHEMA model_metadata;")
+    op.execute("DROP SCHEMA IF EXISTS production")
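
Context for the two patches above: generate_risk_list() calls upgrade_db() itself, so the migration just added is what guarantees the production schema (and the relocated production.list_predictions table) exists before anything is written. A minimal sketch of driving the function the way the PATCH 01 test does; the connection URL and model id here are illustrative, not taken from the patches:

    from sqlalchemy import create_engine
    from triage.component.risklist import generate_risk_list

    # any SQLAlchemy Postgres URL pointing at a triage results database
    db_engine = create_engine("postgresql://localhost/triage_test")

    # model 1 must already have been trained by an experiment; upgrade_db()
    # runs inside generate_risk_list and applies the migration above
    generate_risk_list(db_engine, model_id=1, as_of_date='2013-01-01')

The three-argument signature only holds at this point in the series; PATCH 03 below widens it to take storage engines.
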
From 360f8f94efc7419433546f47c0bc7c3771aaf82a Mon Sep 17 00:00:00 2001
From: tweddielin
Date: Thu, 21 Feb 2019 17:14:50 -0600
Subject: [PATCH 03/52] WIP

---
 src/tests/test_risklist.py                 |  7 +-
 src/triage/component/architect/builders.py | 57 +++++++------
 src/triage/component/catwalk/storage.py    | 21 ++++-
 src/triage/component/risklist.py           | 95 +++++++++++++++++++---
 4 files changed, 141 insertions(+), 39 deletions(-)

diff --git a/src/tests/test_risklist.py b/src/tests/test_risklist.py
index 9b6015bad..5786a2831 100644
--- a/src/tests/test_risklist.py
+++ b/src/tests/test_risklist.py
@@ -18,7 +18,12 @@ def test_risklist(db_engine, project_storage):
 
     model_id = 1
     as_of_date = '2013-01-01'
-    generate_risk_list(db_engine, model_id, as_of_date)
+    generate_risk_list(
+        db_engine=db_engine,
+        matrix_storage_engine=project_storage.matrix_storage_engine(),
+        model_storage_engine=project_storage.model_storage_engine(),
+        model_id=model_id,
+        as_of_date=as_of_date)
     table_should_have_data(
         db_engine=db_engine,
         table_name="production.list_predictions",
diff --git a/src/triage/component/architect/builders.py b/src/triage/component/architect/builders.py
index c1d75fb12..498e15a60 100644
--- a/src/triage/component/architect/builders.py
+++ b/src/triage/component/architect/builders.py
@@ -29,6 +29,7 @@ def __init__(
         self.replace = replace
         self.include_missing_labels_in_train_as = include_missing_labels_in_train_as
         self.run_id = run_id
+        self.includes_labels = 'labels_table_name' in self.db_config
 
     @property
     def sessionmaker(self):
@@ -134,7 +135,7 @@ def make_entity_date_table(
         """
         as_of_time_strings = [str(as_of_time) for as_of_time in as_of_times]
-        if matrix_type == "test" or self.include_missing_labels_in_train_as is not None:
+        if matrix_type == "test" or matrix_type == "production" or self.include_missing_labels_in_train_as is not None:
             indices_query = self._all_valid_entity_dates_query(
                 as_of_time_strings=as_of_time_strings, state=state
            )
@@ -253,17 +254,20 @@ def build_matrix(
             if self.run_id:
                 errored_matrix(self.run_id, self.db_engine)
             return
-        if not table_has_data(
-            "{}.{}".format(
-                self.db_config["labels_schema_name"],
-                self.db_config["labels_table_name"],
-            ),
-            self.db_engine,
-        ):
-            logging.warning("labels table is not populated, cannot build matrix")
-            if self.run_id:
-                errored_matrix(self.run_id, self.db_engine)
-            return
+
+        if self.includes_labels:
+            if not table_has_data(
+                "{}.{}".format(
+                    self.db_config["labels_schema_name"],
+                    self.db_config["labels_table_name"],
+                ),
+                self.db_engine,
+            ):
+                logging.warning("labels table is not populated, cannot build matrix")
+                if self.run_id:
+                    errored_matrix(self.run_id, self.db_engine)
+                return
 
         matrix_store = self.matrix_storage_engine.get_store(matrix_uuid)
         if not self.replace and matrix_store.exists:
@@ -287,7 +291,7 @@ def build_matrix(
                 matrix_metadata["state"],
                 matrix_type,
                 matrix_uuid,
-                matrix_metadata["label_timespan"],
+                matrix_metadata.get("label_timespan", None),
             )
         except ValueError as e:
             logging.warning(
@@ -305,20 +309,21 @@ def build_matrix(
             as_of_times, feature_dictionary, entity_date_table_name, matrix_uuid
         )
         logging.info(f"Feature data extracted for matrix {matrix_uuid}")
-        logging.info(
-            
"Extracting label data from database into file for " "matrix %s", - matrix_uuid, - ) - labels_df = self.load_labels_data( - label_name, - label_type, - entity_date_table_name, - matrix_uuid, - matrix_metadata["label_timespan"], - ) - dataframes.insert(0, labels_df) - logging.info(f"Label data extracted for matrix {matrix_uuid}") + if self.includes_labels: + logging.info( + "Extracting label data from database into file for " "matrix %s", + matrix_uuid, + ) + labels_df = self.load_labels_data( + label_name, + label_type, + entity_date_table_name, + matrix_uuid, + matrix_metadata["label_timespan"], + ) + dataframes.insert(0, labels_df) + logging.info(f"Label data extracted for matrix {matrix_uuid}") # stitch together the csvs logging.info("Merging feature files for matrix %s", matrix_uuid) output = self.merge_feature_csvs(dataframes, matrix_uuid) @@ -326,7 +330,10 @@ def build_matrix( matrix_store.metadata = matrix_metadata # store the matrix - labels = output.pop(matrix_store.label_column_name) + if self.includes_labels: + labels = output.pop(matrix_store.label_column_name) + else: + labels = None matrix_store.matrix_label_tuple = output, labels matrix_store.save() logging.info("Matrix %s saved", matrix_uuid) diff --git a/src/triage/component/catwalk/storage.py b/src/triage/component/catwalk/storage.py index d357ce90b..5b6ed882d 100644 --- a/src/triage/component/catwalk/storage.py +++ b/src/triage/component/catwalk/storage.py @@ -12,6 +12,7 @@ TrainEvaluation, TestPrediction, TrainPrediction, + ListPrediction ) from triage.util.pandas import downcast_matrix @@ -371,7 +372,10 @@ def _preprocess_and_split_matrix(self, matrix_with_labels): if matrix_with_labels.index.levels[index_of_date].dtype != "datetime64[ns]": raise ValueError(f"Woah is {matrix_with_labels.index.levels[index_of_date].dtype}") matrix_with_labels = downcast_matrix(matrix_with_labels) - labels = matrix_with_labels.pop(self.label_column_name) + if self.metadata['matrix_type'] != 'production': + labels = matrix_with_labels.pop(self.label_column_name) + else: + labels = None design_matrix = matrix_with_labels return design_matrix, labels @@ -435,7 +439,7 @@ def columns(self, include_label=False): if include_label: return columns else: - return [col for col in columns if col != self.metadata["label_name"]] + return [col for col in columns if col != self.metadata.get("label_name", None)] @property def label_column_name(self): @@ -479,6 +483,8 @@ def matrix_type(self): return TrainMatrixType elif self.metadata["matrix_type"] == "test": return TestMatrixType + elif self.metadata["matrix_type"] == "production": + return ProductionMatrixType else: raise Exception( """matrix metadata for matrix {} must contain 'matrix_type' @@ -525,7 +531,10 @@ def matrix_with_sorted_columns(self, columns): @property def full_matrix_for_saving(self): - return self.design_matrix.assign(**{self.label_column_name: self.labels}) + if self.labels is not None: + return self.design_matrix.assign(**{self.label_column_name: self.labels}) + else: + return self.design_matrix def load_metadata(self): """Load metadata from storage""" @@ -644,3 +653,9 @@ class TrainMatrixType(object): evaluation_obj = TrainEvaluation prediction_obj = TrainPrediction is_test = False + + +class ProductionMatrixType(object): + string_name = "production" + prediction_obj = ListPrediction + diff --git a/src/triage/component/risklist.py b/src/triage/component/risklist.py index 6f26befa9..00ba79750 100644 --- a/src/triage/component/risklist.py +++ b/src/triage/component/risklist.py @@ 
-1,14 +1,20 @@ from triage.component.results_schema import upgrade_db -from triage.component.architect.cohort_table_generators import CohortTableGenerator +from triage.component.architect.cohort_table_generators import CohortTableGenerator, DEFAULT_ACTIVE_STATE from triage.component.architect.features import FeatureGenerator, FeatureGroupCreator, FeatureGroupMixer, FeatureDictionaryCreator +from triage.component.architect.builders import MatrixBuilder +from triage.component.catwalk.predictors import Predictor +from triage.component import metta from triage.util.conf import dt_from_str + import json +import re + -def generate_risk_list(db_engine, model_id, as_of_date): +def generate_risk_list(db_engine, matrix_storage_engine, model_storage_engine, model_id, as_of_date): upgrade_db(db_engine=db_engine) # 1. get feature and cohort config from database get_experiment_query = """ - select experiments.config, matrices.matrix_metadata + select experiments.config, matrices.matrix_metadata, matrix_uuid from model_metadata.experiments join model_metadata.experiment_matrices using (experiment_hash) join model_metadata.matrices using (matrix_uuid) @@ -17,14 +23,21 @@ def generate_risk_list(db_engine, model_id, as_of_date): """ results = list(db_engine.execute(get_experiment_query, model_id)) experiment_config = results[0]['config'] + original_matrix_uuid = results[0]['matrix_uuid'] matrix_metadata = json.loads(results[0]['matrix_metadata']) feature_config = experiment_config['feature_aggregations'] cohort_config = experiment_config['cohort_config'] timechop_config = experiment_config['temporal_config'] feature_start_time = timechop_config['feature_start_time'] feature_group = matrix_metadata['feature_groups'] - print(type(feature_group)) print(feature_group) + # Convert feature_group (list of string) to dictionary + f_dict = {} + for fg in feature_group: + key, v = re.split(r'\W+', fg) + f_dict[key] = v + feature_group = f_dict + cohort_table_name = f"production.cohort_{cohort_config['name']}" cohort_table_generator = CohortTableGenerator( db_engine=db_engine, @@ -39,16 +52,23 @@ def generate_risk_list(db_engine, model_id, as_of_date): feature_dictionary_creator = FeatureDictionaryCreator( features_schema_name="production", db_engine=db_engine ) - feature_group_creator = FeatureGroupCreator(feature_group[0]) - + feature_group_creator = FeatureGroupCreator(feature_group) cohort_table_generator.generate_cohort_table([dt_from_str(as_of_date)]) collate_aggregations = feature_generator.aggregations( feature_aggregation_config=feature_config, feature_dates=[as_of_date], state_table=cohort_table_name ) - feature_generator.process_table_tasks(feature_generator.generate_all_table_tasks(collate_aggregations, task_type='aggregation')) - imputation_table_tasks = feature_generator.generate_all_table_tasks(collate_aggregations, task_type='imputation') + feature_generator.process_table_tasks( + feature_generator.generate_all_table_tasks( + collate_aggregations, + task_type='aggregation' + ) + ) + imputation_table_tasks = feature_generator.generate_all_table_tasks( + collate_aggregations, + task_type='imputation' + ) feature_generator.process_table_tasks(imputation_table_tasks) feature_dictionary = feature_dictionary_creator.feature_dictionary( feature_table_names=imputation_table_tasks.keys(), @@ -56,5 +76,60 @@ def generate_risk_list(db_engine, model_id, as_of_date): collate_aggregations ), ) - smaller_dict = feature_group_creator.subsets(feature_dictionary) - print(feature_dictionary) + + db_config = { + 
"features_schema_name": "production", + "labels_schema_name": "public", + "cohort_table_name": cohort_table_name, + } + + matrix_builder = MatrixBuilder( + db_config=db_config, + matrix_storage_engine=matrix_storage_engine, + engine=db_engine, + experiment_hash=None, + replace=True, + ) + + feature_groups = feature_group_creator.subsets(feature_dictionary) + print(feature_groups) + master_feature_dict = FeatureGroupMixer(["all"]).generate(feature_groups)[0] + print(master_feature_dict) + for f in master_feature_dict['zip_code_features_aggregation_imputed']: + print(f) + matrix_metadata = { + 'as_of_times': [as_of_date], + 'matrix_id': str(as_of_date) + '_prediction', + 'state': DEFAULT_ACTIVE_STATE, + 'test_duration': '1y', + 'matrix_type': 'production', + 'label_timespan': None, + 'indices': ["entity_id", "as_of_date"], + 'feature_start_time': feature_start_time, + } + + matrix_uuid = metta.generate_uuid(matrix_metadata) + + matrix_builder.build_matrix( + as_of_times=[as_of_date], + label_name=None, + label_type=None, + feature_dictionary=master_feature_dict, + matrix_metadata=matrix_metadata, + matrix_uuid=matrix_uuid, + matrix_type="production", + ) + + predictor = Predictor( + model_storage_engine=model_storage_engine, + db_engine=db_engine + ) + + + predictor.predict( + model_id=model_id, + matrix_store=matrix_storage_engine.get_store(matrix_uuid), + misc_db_parameters={}, + train_matrix_columns=matrix_storage_engine.get_store(original_matrix_uuid).columns() + ) + From 999a46f2ec4495b2d405d7ae79ac7d16da4b035f Mon Sep 17 00:00:00 2001 From: tweddielin Date: Tue, 26 Feb 2019 14:33:54 -0600 Subject: [PATCH 04/52] alembic add label_value to list_predictions table --- src/triage/component/architect/builders.py | 12 ++-- .../component/architect/feature_generators.py | 8 ++- src/triage/component/catwalk/storage.py | 5 +- ...e85_add_label_value_to_prodcution_table.py | 54 ++++++++++++++ src/triage/component/results_schema/schema.py | 1 + src/triage/component/risklist.py | 71 ++++++++++--------- 6 files changed, 107 insertions(+), 44 deletions(-) create mode 100644 src/triage/component/results_schema/alembic/versions/264786a9fe85_add_label_value_to_prodcution_table.py diff --git a/src/triage/component/architect/builders.py b/src/triage/component/architect/builders.py index 498e15a60..2bb304ca7 100644 --- a/src/triage/component/architect/builders.py +++ b/src/triage/component/architect/builders.py @@ -309,6 +309,8 @@ def build_matrix( ) logging.info(f"Feature data extracted for matrix {matrix_uuid}") + # dataframes add label_name + if self.includes_labels: logging.info( "Extracting label data from database into file for " "matrix %s", @@ -323,17 +325,17 @@ def build_matrix( ) dataframes.insert(0, labels_df) logging.info(f"Label data extracted for matrix {matrix_uuid}") + else: + labels_df = pandas.DataFrame(index=dataframes[0].index, columns=[label_name]) + dataframes.insert(0, labels_df) + # stitch together the csvs logging.info("Merging feature files for matrix %s", matrix_uuid) output = self.merge_feature_csvs(dataframes, matrix_uuid) logging.info(f"Features data merged for matrix {matrix_uuid}") - matrix_store.metadata = matrix_metadata # store the matrix - if self.includes_labels: - labels = output.pop(matrix_store.label_column_name) - else: - labels = None + labels = output.pop(matrix_store.label_column_name) matrix_store.matrix_label_tuple = output, labels matrix_store.save() logging.info("Matrix %s saved", matrix_uuid) diff --git a/src/triage/component/architect/feature_generators.py 
b/src/triage/component/architect/feature_generators.py index d7c386d42..2008e4eb2 100644 --- a/src/triage/component/architect/feature_generators.py +++ b/src/triage/component/architect/feature_generators.py @@ -635,7 +635,7 @@ def _generate_agg_table_tasks_for(self, aggregation): return table_tasks - def _generate_imp_table_tasks_for(self, aggregation, drop_preagg=True): + def _generate_imp_table_tasks_for(self, aggregation, impute_cols=None, nonimpute_cols=None, drop_preagg=True): """Generate SQL statements for preparing, populating, and finalizing imputations, for each feature group table in the given aggregation. @@ -685,8 +685,10 @@ def _generate_imp_table_tasks_for(self, aggregation, drop_preagg=True): with self.db_engine.begin() as conn: results = conn.execute(aggregation.find_nulls()) null_counts = results.first().items() - impute_cols = [col for (col, val) in null_counts if val > 0] - nonimpute_cols = [col for (col, val) in null_counts if val == 0] + if impute_cols is None: + impute_cols = [col for (col, val) in null_counts if val > 0] + if nonimpute_cols is None: + nonimpute_cols = [col for (col, val) in null_counts if val == 0] # table tasks for imputed aggregation table, most of the work is done here # by collate's get_impute_create() diff --git a/src/triage/component/catwalk/storage.py b/src/triage/component/catwalk/storage.py index 5b6ed882d..614fab2b9 100644 --- a/src/triage/component/catwalk/storage.py +++ b/src/triage/component/catwalk/storage.py @@ -372,10 +372,7 @@ def _preprocess_and_split_matrix(self, matrix_with_labels): if matrix_with_labels.index.levels[index_of_date].dtype != "datetime64[ns]": raise ValueError(f"Woah is {matrix_with_labels.index.levels[index_of_date].dtype}") matrix_with_labels = downcast_matrix(matrix_with_labels) - if self.metadata['matrix_type'] != 'production': - labels = matrix_with_labels.pop(self.label_column_name) - else: - labels = None + labels = matrix_with_labels.pop(self.label_column_name) design_matrix = matrix_with_labels return design_matrix, labels diff --git a/src/triage/component/results_schema/alembic/versions/264786a9fe85_add_label_value_to_prodcution_table.py b/src/triage/component/results_schema/alembic/versions/264786a9fe85_add_label_value_to_prodcution_table.py new file mode 100644 index 000000000..4ae43a899 --- /dev/null +++ b/src/triage/component/results_schema/alembic/versions/264786a9fe85_add_label_value_to_prodcution_table.py @@ -0,0 +1,54 @@ +"""add label_value to prodcution table + +Revision ID: 264786a9fe85 +Revises: 1b990cbc04e4 +Create Date: 2019-02-26 13:17:05.365654 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision = '264786a9fe85' +down_revision = '1b990cbc04e4' +branch_labels = None +depends_on = None + + +def upgrade(): + op.drop_table("list_predictions", schema="production") + op.create_table( + "list_predictions", + sa.Column("model_id", sa.Integer(), nullable=False), + sa.Column("entity_id", sa.BigInteger(), nullable=False), + sa.Column("as_of_date", sa.DateTime(), nullable=False), + sa.Column("score", sa.Numeric(), nullable=True), + sa.Column('label_value', sa.Integer, nullable=True), + sa.Column("rank_abs", sa.Integer(), nullable=True), + sa.Column("rank_pct", sa.Float(), nullable=True), + sa.Column("matrix_uuid", sa.Text(), nullable=True), + sa.Column("test_label_window", sa.Interval(), nullable=True), + sa.ForeignKeyConstraint(["model_id"], ["model_metadata.models.model_id"]), + sa.PrimaryKeyConstraint("model_id", "entity_id", "as_of_date"), + schema="production", + ) + + +def downgrade(): + op.drop_table("list_predictions", schema="production") + op.create_table( + "list_predictions", + sa.Column("model_id", sa.Integer(), nullable=False), + sa.Column("entity_id", sa.BigInteger(), nullable=False), + sa.Column("as_of_date", sa.DateTime(), nullable=False), + sa.Column("score", sa.Numeric(), nullable=True), + sa.Column("rank_abs", sa.Integer(), nullable=True), + sa.Column("rank_pct", sa.Float(), nullable=True), + sa.Column("matrix_uuid", sa.Text(), nullable=True), + sa.Column("test_label_window", sa.Interval(), nullable=True), + sa.ForeignKeyConstraint(["model_id"], ["results.models.model_id"]), + sa.PrimaryKeyConstraint("model_id", "entity_id", "as_of_date"), + schema="results", + ) + diff --git a/src/triage/component/results_schema/schema.py b/src/triage/component/results_schema/schema.py index 8676227ff..1b369af42 100644 --- a/src/triage/component/results_schema/schema.py +++ b/src/triage/component/results_schema/schema.py @@ -95,6 +95,7 @@ class ListPrediction(Base): entity_id = Column(BigInteger, primary_key=True) as_of_date = Column(DateTime, primary_key=True) score = Column(Numeric) + label_value = Column(Integer) rank_abs = Column(Integer) rank_pct = Column(Float) matrix_uuid = Column(Text) diff --git a/src/triage/component/risklist.py b/src/triage/component/risklist.py index 00ba79750..221067e4f 100644 --- a/src/triage/component/risklist.py +++ b/src/triage/component/risklist.py @@ -1,11 +1,12 @@ from triage.component.results_schema import upgrade_db from triage.component.architect.cohort_table_generators import CohortTableGenerator, DEFAULT_ACTIVE_STATE -from triage.component.architect.features import FeatureGenerator, FeatureGroupCreator, FeatureGroupMixer, FeatureDictionaryCreator +from triage.component.architect.features import FeatureGenerator from triage.component.architect.builders import MatrixBuilder from triage.component.catwalk.predictors import Predictor from triage.component import metta from triage.util.conf import dt_from_str +from collections import OrderedDict import json import re @@ -23,20 +24,14 @@ def generate_risk_list(db_engine, matrix_storage_engine, model_storage_engine, m """ results = list(db_engine.execute(get_experiment_query, model_id)) experiment_config = results[0]['config'] + label_config = experiment_config['label_config'] original_matrix_uuid = results[0]['matrix_uuid'] matrix_metadata = json.loads(results[0]['matrix_metadata']) + feature_names = matrix_metadata['feature_names'] feature_config = experiment_config['feature_aggregations'] cohort_config = experiment_config['cohort_config'] timechop_config = experiment_config['temporal_config'] 
feature_start_time = timechop_config['feature_start_time'] - feature_group = matrix_metadata['feature_groups'] - print(feature_group) - # Convert feature_group (list of string) to dictionary - f_dict = {} - for fg in feature_group: - key, v = re.split(r'\W+', fg) - f_dict[key] = v - feature_group = f_dict cohort_table_name = f"production.cohort_{cohort_config['name']}" cohort_table_generator = CohortTableGenerator( @@ -44,38 +39,56 @@ def generate_risk_list(db_engine, matrix_storage_engine, model_storage_engine, m query=cohort_config['query'], cohort_table_name=cohort_table_name ) + feature_generator = FeatureGenerator( db_engine=db_engine, features_schema_name="production", feature_start_time=feature_start_time, ) - feature_dictionary_creator = FeatureDictionaryCreator( - features_schema_name="production", db_engine=db_engine - ) - feature_group_creator = FeatureGroupCreator(feature_group) + cohort_table_generator.generate_cohort_table([dt_from_str(as_of_date)]) collate_aggregations = feature_generator.aggregations( feature_aggregation_config=feature_config, feature_dates=[as_of_date], state_table=cohort_table_name ) + feature_generator.process_table_tasks( feature_generator.generate_all_table_tasks( collate_aggregations, task_type='aggregation' ) ) - imputation_table_tasks = feature_generator.generate_all_table_tasks( - collate_aggregations, - task_type='imputation' - ) + + reconstructed_feature_dictionary = {} + imputation_table_tasks = OrderedDict() + with db_engine.begin() as conn: + for aggregation in collate_aggregations: + feature_prefix = aggregation.prefix + feature_group = aggregation.get_table_name(imputed=True).split('.')[1] + feature_group = feature_group.replace('"', '') + feature_names_in_group = [f for f in feature_names if re.match(f'\A{feature_prefix}', f)] + reconstructed_feature_dictionary[feature_group] = feature_names_in_group + + feature_names_in_group = set(feature_names_in_group) + features_imputed_in_train = set(f for f in feature_names_in_group if f + '_imp' in feature_names_in_group) + + results = conn.execute(aggregation.find_nulls()) + null_counts = results.first().items() + + features_imputed_in_production = set([col for (col, val) in null_counts if val > 0]) + + total_impute_cols = features_imputed_in_production | features_imputed_in_train + total_nonimpute_cols = set(f for f in feature_names_in_group if '_imp' not in f) - total_impute_cols + task_generator = feature_generator._generate_imp_table_tasks_for + imputation_table_tasks.update(task_generator( + aggregation, + impute_cols=list(total_impute_cols), + nonimpute_cols=list(total_nonimpute_cols) + ) + ) + feature_generator.process_table_tasks(imputation_table_tasks) - feature_dictionary = feature_dictionary_creator.feature_dictionary( - feature_table_names=imputation_table_tasks.keys(), - index_column_lookup=feature_generator.index_column_lookup( - collate_aggregations - ), - ) db_config = { "features_schema_name": "production", @@ -91,12 +104,6 @@ def generate_risk_list(db_engine, matrix_storage_engine, model_storage_engine, m replace=True, ) - feature_groups = feature_group_creator.subsets(feature_dictionary) - print(feature_groups) - master_feature_dict = FeatureGroupMixer(["all"]).generate(feature_groups)[0] - print(master_feature_dict) - for f in master_feature_dict['zip_code_features_aggregation_imputed']: - print(f) matrix_metadata = { 'as_of_times': [as_of_date], 'matrix_id': str(as_of_date) + '_prediction', @@ -104,6 +111,7 @@ def generate_risk_list(db_engine, matrix_storage_engine, 
model_storage_engine, m 'test_duration': '1y', 'matrix_type': 'production', 'label_timespan': None, + 'label_name': label_config['name'], 'indices': ["entity_id", "as_of_date"], 'feature_start_time': feature_start_time, } @@ -112,9 +120,9 @@ def generate_risk_list(db_engine, matrix_storage_engine, model_storage_engine, m matrix_builder.build_matrix( as_of_times=[as_of_date], - label_name=None, + label_name=label_config['name'], label_type=None, - feature_dictionary=master_feature_dict, + feature_dictionary=reconstructed_feature_dictionary, matrix_metadata=matrix_metadata, matrix_uuid=matrix_uuid, matrix_type="production", @@ -125,7 +133,6 @@ def generate_risk_list(db_engine, matrix_storage_engine, model_storage_engine, m db_engine=db_engine ) - predictor.predict( model_id=model_id, matrix_store=matrix_storage_engine.get_store(matrix_uuid), From 372d9c855686af66e373aa5a3af82f6028901516 Mon Sep 17 00:00:00 2001 From: tweddielin Date: Thu, 28 Feb 2019 11:52:35 -0600 Subject: [PATCH 05/52] add docstrings --- src/triage/component/risklist.py | 81 ++++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 26 deletions(-) diff --git a/src/triage/component/risklist.py b/src/triage/component/risklist.py index 221067e4f..0c408d3f8 100644 --- a/src/triage/component/risklist.py +++ b/src/triage/component/risklist.py @@ -11,9 +11,15 @@ import re -def generate_risk_list(db_engine, matrix_storage_engine, model_storage_engine, model_id, as_of_date): - upgrade_db(db_engine=db_engine) - # 1. get feature and cohort config from database +def get_required_info_from_config(db_engine, model_id): + """Get all information needed to make the risk list from model_id + Args: + db_engine (sqlalchemy.db.engine) + model_id (int) The id of a given model in the database + + Returns: (dict) a dictionary of all information needed for making the risk list + + """ get_experiment_query = """ select experiments.config, matrices.matrix_metadata, matrix_uuid from model_metadata.experiments @@ -33,26 +39,51 @@ def generate_risk_list(db_engine, matrix_storage_engine, model_storage_engine, m timechop_config = experiment_config['temporal_config'] feature_start_time = timechop_config['feature_start_time'] - cohort_table_name = f"production.cohort_{cohort_config['name']}" + model_info = {} + model_info['cohort_config'] = cohort_config + model_info['feature_config'] = feature_config + model_info['feature_names'] = feature_names + model_info['feature_start_time'] = feature_start_time + model_info['original_matrix_uuid'] = original_matrix_uuid + model_info['label_config'] = label_config + + return model_info + + +def generate_risk_list(db_engine, matrix_storage_engine, model_storage_engine, model_id, as_of_date): + """Generate the risk list based model_id and as_of_date + + Args: + db_engine (sqlalchemy.db.engine) + model_id (int) The id of a given model in the database + matrix_storage_engine (catwalk.storage.matrix_storage_engine) + model_storage_engine (catwalk.storage.model_storage_engine) + + """ + upgrade_db(db_engine=db_engine) + # 1. Get feature and cohort config from database + model_info = get_required_info_from_config(db_engine, model_id) + + # 2. Generate cohort + cohort_table_name = f"production.cohort_{model_info['cohort_config']['name']}" cohort_table_generator = CohortTableGenerator( db_engine=db_engine, - query=cohort_config['query'], + query=model_info['cohort_config']['query'], cohort_table_name=cohort_table_name ) + cohort_table_generator.generate_cohort_table([dt_from_str(as_of_date)]) + # 3. 
Generate feature aggregations
     feature_generator = FeatureGenerator(
         db_engine=db_engine,
         features_schema_name="production",
-        feature_start_time=feature_start_time,
+        feature_start_time=model_info['feature_start_time'],
     )
-
-    cohort_table_generator.generate_cohort_table([dt_from_str(as_of_date)])
     collate_aggregations = feature_generator.aggregations(
-        feature_aggregation_config=feature_config,
+        feature_aggregation_config=model_info['feature_config'],
         feature_dates=[as_of_date],
         state_table=cohort_table_name
     )
-
     feature_generator.process_table_tasks(
         feature_generator.generate_all_table_tasks(
             collate_aggregations,
             task_type='aggregation'
         )
     )
 
+    # 4. Reconstruct feature dictionary from feature_names and generate imputation
     reconstructed_feature_dictionary = {}
     imputation_table_tasks = OrderedDict()
     with db_engine.begin() as conn:
         for aggregation in collate_aggregations:
             feature_prefix = aggregation.prefix
-            feature_group = aggregation.get_table_name(imputed=True).split('.')[1]
-            feature_group = feature_group.replace('"', '')
-            feature_names_in_group = [f for f in feature_names if re.match(f'\A{feature_prefix}', f)]
+            feature_group = aggregation.get_table_name(imputed=True).split('.')[1].replace('"', '')
+            feature_names_in_group = [f for f in model_info['feature_names'] if re.match(rf'\A{feature_prefix}', f)]
             reconstructed_feature_dictionary[feature_group] = feature_names_in_group
 
-            feature_names_in_group = set(feature_names_in_group)
-            features_imputed_in_train = set(f for f in feature_names_in_group if f + '_imp' in feature_names_in_group)
-
+            # Make sure that the features imputed in training should also be imputed in production
+            features_imputed_in_train = [f for f in set(feature_names_in_group) if f + '_imp' in feature_names_in_group]
             results = conn.execute(aggregation.find_nulls())
             null_counts = results.first().items()
 
-            features_imputed_in_production = set([col for (col, val) in null_counts if val > 0])
-
-            total_impute_cols = features_imputed_in_production | features_imputed_in_train
-            total_nonimpute_cols = set(f for f in feature_names_in_group if '_imp' not in f) - total_impute_cols
+            features_imputed_in_production = [col for (col, val) in null_counts if val > 0]
+            total_impute_cols = set(features_imputed_in_production) | set(features_imputed_in_train)
+            total_nonimpute_cols = set(f for f in set(feature_names_in_group) if '_imp' not in f) - total_impute_cols
             task_generator = feature_generator._generate_imp_table_tasks_for
             imputation_table_tasks.update(task_generator(
                 aggregation,
                 impute_cols=list(total_impute_cols),
                 nonimpute_cols=list(total_nonimpute_cols)
             )
             )
     feature_generator.process_table_tasks(imputation_table_tasks)
 
+    # 5. 
Build matrix db_config = { "features_schema_name": "production", "labels_schema_name": "public", @@ -111,16 +140,16 @@ def generate_risk_list(db_engine, matrix_storage_engine, model_storage_engine, m 'test_duration': '1y', 'matrix_type': 'production', 'label_timespan': None, - 'label_name': label_config['name'], + 'label_name': model_info['label_config']['name'], 'indices': ["entity_id", "as_of_date"], - 'feature_start_time': feature_start_time, + 'feature_start_time': model_info['feature_start_time'], } matrix_uuid = metta.generate_uuid(matrix_metadata) matrix_builder.build_matrix( as_of_times=[as_of_date], - label_name=label_config['name'], + label_name=model_info['label_config']['name'], label_type=None, feature_dictionary=reconstructed_feature_dictionary, matrix_metadata=matrix_metadata, @@ -128,6 +157,7 @@ def generate_risk_list(db_engine, matrix_storage_engine, model_storage_engine, m matrix_type="production", ) + # 6. Predict the risk score for production predictor = Predictor( model_storage_engine=model_storage_engine, db_engine=db_engine @@ -137,6 +167,5 @@ def generate_risk_list(db_engine, matrix_storage_engine, model_storage_engine, m model_id=model_id, matrix_store=matrix_storage_engine.get_store(matrix_uuid), misc_db_parameters={}, - train_matrix_columns=matrix_storage_engine.get_store(original_matrix_uuid).columns() + train_matrix_columns=matrix_storage_engine.get_store(model_info['original_matrix_uuid']).columns() ) - From 16645bccc9f343c2aa285cd9f6b844eccaff829f Mon Sep 17 00:00:00 2001 From: tweddielin Date: Wed, 13 Mar 2019 13:39:12 -0500 Subject: [PATCH 06/52] move risklist a layer above --- src/tests/test_risklist.py | 2 +- src/triage/component/risklist.py | 171 ------------------------------- 2 files changed, 1 insertion(+), 172 deletions(-) delete mode 100644 src/triage/component/risklist.py diff --git a/src/tests/test_risklist.py b/src/tests/test_risklist.py index 5786a2831..1c53ab6f6 100644 --- a/src/tests/test_risklist.py +++ b/src/tests/test_risklist.py @@ -1,4 +1,4 @@ -from triage.component.risklist import generate_risk_list +from triage.risklist import generate_risk_list from tests.utils import sample_config, populate_source_data from triage.experiments import SingleThreadedExperiment from triage.validation_primitives import table_should_have_data diff --git a/src/triage/component/risklist.py b/src/triage/component/risklist.py deleted file mode 100644 index 0c408d3f8..000000000 --- a/src/triage/component/risklist.py +++ /dev/null @@ -1,171 +0,0 @@ -from triage.component.results_schema import upgrade_db -from triage.component.architect.cohort_table_generators import CohortTableGenerator, DEFAULT_ACTIVE_STATE -from triage.component.architect.features import FeatureGenerator -from triage.component.architect.builders import MatrixBuilder -from triage.component.catwalk.predictors import Predictor -from triage.component import metta -from triage.util.conf import dt_from_str - -from collections import OrderedDict -import json -import re - - -def get_required_info_from_config(db_engine, model_id): - """Get all information needed to make the risk list from model_id - Args: - db_engine (sqlalchemy.db.engine) - model_id (int) The id of a given model in the database - - Returns: (dict) a dictionary of all information needed for making the risk list - - """ - get_experiment_query = """ - select experiments.config, matrices.matrix_metadata, matrix_uuid - from model_metadata.experiments - join model_metadata.experiment_matrices using (experiment_hash) - join 
model_metadata.matrices using (matrix_uuid) - join model_metadata.models on (models.train_matrix_uuid = matrices.matrix_uuid) - where model_id = %s - """ - results = list(db_engine.execute(get_experiment_query, model_id)) - experiment_config = results[0]['config'] - label_config = experiment_config['label_config'] - original_matrix_uuid = results[0]['matrix_uuid'] - matrix_metadata = json.loads(results[0]['matrix_metadata']) - feature_names = matrix_metadata['feature_names'] - feature_config = experiment_config['feature_aggregations'] - cohort_config = experiment_config['cohort_config'] - timechop_config = experiment_config['temporal_config'] - feature_start_time = timechop_config['feature_start_time'] - - model_info = {} - model_info['cohort_config'] = cohort_config - model_info['feature_config'] = feature_config - model_info['feature_names'] = feature_names - model_info['feature_start_time'] = feature_start_time - model_info['original_matrix_uuid'] = original_matrix_uuid - model_info['label_config'] = label_config - - return model_info - - -def generate_risk_list(db_engine, matrix_storage_engine, model_storage_engine, model_id, as_of_date): - """Generate the risk list based model_id and as_of_date - - Args: - db_engine (sqlalchemy.db.engine) - model_id (int) The id of a given model in the database - matrix_storage_engine (catwalk.storage.matrix_storage_engine) - model_storage_engine (catwalk.storage.model_storage_engine) - - """ - upgrade_db(db_engine=db_engine) - # 1. Get feature and cohort config from database - model_info = get_required_info_from_config(db_engine, model_id) - - # 2. Generate cohort - cohort_table_name = f"production.cohort_{model_info['cohort_config']['name']}" - cohort_table_generator = CohortTableGenerator( - db_engine=db_engine, - query=model_info['cohort_config']['query'], - cohort_table_name=cohort_table_name - ) - cohort_table_generator.generate_cohort_table([dt_from_str(as_of_date)]) - - # 3. Generate feature aggregations - feature_generator = FeatureGenerator( - db_engine=db_engine, - features_schema_name="production", - feature_start_time=model_info['feature_start_time'], - ) - collate_aggregations = feature_generator.aggregations( - feature_aggregation_config=model_info['feature_config'], - feature_dates=[as_of_date], - state_table=cohort_table_name - ) - feature_generator.process_table_tasks( - feature_generator.generate_all_table_tasks( - collate_aggregations, - task_type='aggregation' - ) - ) - - # 4. 
Reconstruct feature dictionary from feature_names and generate imputation
-    reconstructed_feature_dictionary = {}
-    imputation_table_tasks = OrderedDict()
-    with db_engine.begin() as conn:
-        for aggregation in collate_aggregations:
-            feature_prefix = aggregation.prefix
-            feature_group = aggregation.get_table_name(imputed=True).split('.')[1].replace('"', '')
-            feature_names_in_group = [f for f in model_info['feature_names'] if re.match(rf'\A{feature_prefix}', f)]
-            reconstructed_feature_dictionary[feature_group] = feature_names_in_group
-
-            # Make sure that the features imputed in training should also be imputed in production
-            features_imputed_in_train = [f for f in set(feature_names_in_group) if f + '_imp' in feature_names_in_group]
-            results = conn.execute(aggregation.find_nulls())
-            null_counts = results.first().items()
-
-            features_imputed_in_production = [col for (col, val) in null_counts if val > 0]
-            total_impute_cols = set(features_imputed_in_production) | set(features_imputed_in_train)
-            total_nonimpute_cols = set(f for f in set(feature_names_in_group) if '_imp' not in f) - total_impute_cols
-            task_generator = feature_generator._generate_imp_table_tasks_for
-            imputation_table_tasks.update(task_generator(
-                aggregation,
-                impute_cols=list(total_impute_cols),
-                nonimpute_cols=list(total_nonimpute_cols)
-            )
-            )
-    feature_generator.process_table_tasks(imputation_table_tasks)
-
-    # 5. Build matrix
-    db_config = {
-        "features_schema_name": "production",
-        "labels_schema_name": "public",
-        "cohort_table_name": cohort_table_name,
-    }
-
-    matrix_builder = MatrixBuilder(
-        db_config=db_config,
-        matrix_storage_engine=matrix_storage_engine,
-        engine=db_engine,
-        experiment_hash=None,
-        replace=True,
-    )
-
-    matrix_metadata = {
-        'as_of_times': [as_of_date],
-        'matrix_id': str(as_of_date) + '_prediction',
-        'state': DEFAULT_ACTIVE_STATE,
-        'test_duration': '1y',
-        'matrix_type': 'production',
-        'label_timespan': None,
-        'label_name': model_info['label_config']['name'],
-        'indices': ["entity_id", "as_of_date"],
-        'feature_start_time': model_info['feature_start_time'],
-    }
-
-    matrix_uuid = metta.generate_uuid(matrix_metadata)
-
-    matrix_builder.build_matrix(
-        as_of_times=[as_of_date],
-        label_name=model_info['label_config']['name'],
-        label_type=None,
-        feature_dictionary=reconstructed_feature_dictionary,
-        matrix_metadata=matrix_metadata,
-        matrix_uuid=matrix_uuid,
-        matrix_type="production",
-    )
-
-    # 6. Predict the risk score for production
-    predictor = Predictor(
-        model_storage_engine=model_storage_engine,
-        db_engine=db_engine
-    )
-
-    predictor.predict(
-        model_id=model_id,
-        matrix_store=matrix_storage_engine.get_store(matrix_uuid),
-        misc_db_parameters={},
-        train_matrix_columns=matrix_storage_engine.get_store(model_info['original_matrix_uuid']).columns()
-    )
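
Taken together, this deletion and the file added in the next patch are a module move: the implementation is unchanged and only the import path shifts, which is also the whole of the test diff in this patch. The change any downstream caller would need, with the paths exactly as they appear in this series:

    # through PATCH 05, the function lives under triage.component:
    from triage.component.risklist import generate_risk_list

    # from PATCH 07 onward it is a top-level triage package:
    from triage.risklist import generate_risk_list
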
From 914ad760e670179ee5e7e52d5abe85609cf0ce7c Mon Sep 17 00:00:00 2001
From: tweddielin
Date: Wed, 13 Mar 2019 13:42:20 -0500
Subject: [PATCH 07/52] create risklist module

---
 src/triage/risklist/__init__.py | 171 ++++++++++++++++++++++++++++++++
 1 file changed, 171 insertions(+)
 create mode 100644 src/triage/risklist/__init__.py

diff --git a/src/triage/risklist/__init__.py b/src/triage/risklist/__init__.py
new file mode 100644
index 000000000..0c408d3f8
--- /dev/null
+++ b/src/triage/risklist/__init__.py
@@ -0,0 +1,171 @@
+from triage.component.results_schema import upgrade_db
+from triage.component.architect.cohort_table_generators import CohortTableGenerator, DEFAULT_ACTIVE_STATE
+from triage.component.architect.features import FeatureGenerator
+from triage.component.architect.builders import MatrixBuilder
+from triage.component.catwalk.predictors import Predictor
+from triage.component import metta
+from triage.util.conf import dt_from_str
+
+from collections import OrderedDict
+import json
+import re
+
+
+def get_required_info_from_config(db_engine, model_id):
+    """Get all information needed to make the risk list from model_id
+    Args:
+        db_engine (sqlalchemy.db.engine)
+        model_id (int) The id of a given model in the database
+
+    Returns: (dict) a dictionary of all information needed for making the risk list
+
+    """
+    get_experiment_query = """
+        select experiments.config, matrices.matrix_metadata, matrix_uuid
+        from model_metadata.experiments
+        join model_metadata.experiment_matrices using (experiment_hash)
+        join model_metadata.matrices using (matrix_uuid)
+        join model_metadata.models on (models.train_matrix_uuid = matrices.matrix_uuid)
+        where model_id = %s
+    """
+    results = list(db_engine.execute(get_experiment_query, model_id))
+    experiment_config = results[0]['config']
+    label_config = experiment_config['label_config']
+    original_matrix_uuid = results[0]['matrix_uuid']
+    matrix_metadata = json.loads(results[0]['matrix_metadata'])
+    feature_names = matrix_metadata['feature_names']
+    feature_config = experiment_config['feature_aggregations']
+    cohort_config = experiment_config['cohort_config']
+    timechop_config = experiment_config['temporal_config']
+    feature_start_time = timechop_config['feature_start_time']
+
+    model_info = {}
+    model_info['cohort_config'] = cohort_config
+    model_info['feature_config'] = feature_config
+    model_info['feature_names'] = feature_names
+    model_info['feature_start_time'] = feature_start_time
+    model_info['original_matrix_uuid'] = original_matrix_uuid
+    model_info['label_config'] = label_config
+
+    return model_info
+
+
+def generate_risk_list(db_engine, matrix_storage_engine, model_storage_engine, model_id, as_of_date):
+    """Generate the risk list based on model_id and as_of_date
+
+    Args:
+        db_engine (sqlalchemy.db.engine)
+        model_id (int) The id of a given model in the database
+        matrix_storage_engine (catwalk.storage.matrix_storage_engine)
+        model_storage_engine (catwalk.storage.model_storage_engine)
+
+    """
+    upgrade_db(db_engine=db_engine)
+    # 1. Get feature and cohort config from database
+    model_info = get_required_info_from_config(db_engine, model_id)
+
+    # 2. Generate cohort
+    cohort_table_name = f"production.cohort_{model_info['cohort_config']['name']}"
+    cohort_table_generator = CohortTableGenerator(
+        db_engine=db_engine,
+        query=model_info['cohort_config']['query'],
+        cohort_table_name=cohort_table_name
+    )
+    cohort_table_generator.generate_cohort_table([dt_from_str(as_of_date)])
+
+    # 3. Generate feature aggregations
+    feature_generator = FeatureGenerator(
+        db_engine=db_engine,
+        features_schema_name="production",
+        feature_start_time=model_info['feature_start_time'],
+    )
+    collate_aggregations = feature_generator.aggregations(
+        feature_aggregation_config=model_info['feature_config'],
+        feature_dates=[as_of_date],
+        state_table=cohort_table_name
+    )
+    feature_generator.process_table_tasks(
+        feature_generator.generate_all_table_tasks(
+            collate_aggregations,
+            task_type='aggregation'
+        )
+    )
+
+    # 4. Reconstruct feature dictionary from feature_names and generate imputation
+    reconstructed_feature_dictionary = {}
+    imputation_table_tasks = OrderedDict()
+    with db_engine.begin() as conn:
+        for aggregation in collate_aggregations:
+            feature_prefix = aggregation.prefix
+            feature_group = aggregation.get_table_name(imputed=True).split('.')[1].replace('"', '')
+            feature_names_in_group = [f for f in model_info['feature_names'] if re.match(rf'\A{feature_prefix}', f)]
+            reconstructed_feature_dictionary[feature_group] = feature_names_in_group
+
+            # Make sure that the features imputed in training should also be imputed in production
+            features_imputed_in_train = [f for f in set(feature_names_in_group) if f + '_imp' in feature_names_in_group]
+            results = conn.execute(aggregation.find_nulls())
+            null_counts = results.first().items()
+
+            features_imputed_in_production = [col for (col, val) in null_counts if val > 0]
+            total_impute_cols = set(features_imputed_in_production) | set(features_imputed_in_train)
+            total_nonimpute_cols = set(f for f in set(feature_names_in_group) if '_imp' not in f) - total_impute_cols
+            task_generator = feature_generator._generate_imp_table_tasks_for
+            imputation_table_tasks.update(task_generator(
+                aggregation,
+                impute_cols=list(total_impute_cols),
+                nonimpute_cols=list(total_nonimpute_cols)
+            )
+            )
+    feature_generator.process_table_tasks(imputation_table_tasks)
+
+    # 5. Build matrix
+    db_config = {
+        "features_schema_name": "production",
+        "labels_schema_name": "public",
+        "cohort_table_name": cohort_table_name,
+    }
+
+    matrix_builder = MatrixBuilder(
+        db_config=db_config,
+        matrix_storage_engine=matrix_storage_engine,
+        engine=db_engine,
+        experiment_hash=None,
+        replace=True,
+    )
+
+    matrix_metadata = {
+        'as_of_times': [as_of_date],
+        'matrix_id': str(as_of_date) + '_prediction',
+        'state': DEFAULT_ACTIVE_STATE,
+        'test_duration': '1y',
+        'matrix_type': 'production',
+        'label_timespan': None,
+        'label_name': model_info['label_config']['name'],
+        'indices': ["entity_id", "as_of_date"],
+        'feature_start_time': model_info['feature_start_time'],
+    }
+
+    matrix_uuid = metta.generate_uuid(matrix_metadata)
+
+    matrix_builder.build_matrix(
+        as_of_times=[as_of_date],
+        label_name=model_info['label_config']['name'],
+        label_type=None,
+        feature_dictionary=reconstructed_feature_dictionary,
+        matrix_metadata=matrix_metadata,
+        matrix_uuid=matrix_uuid,
+        matrix_type="production",
+    )
+
+    # 6. Predict the risk score for production
+    predictor = Predictor(
+        model_storage_engine=model_storage_engine,
+        db_engine=db_engine
+    )
+
+    predictor.predict(
+        model_id=model_id,
+        matrix_store=matrix_storage_engine.get_store(matrix_uuid),
+        misc_db_parameters={},
+        train_matrix_columns=matrix_storage_engine.get_store(model_info['original_matrix_uuid']).columns()
+    )
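
A note on the trickiest part of the module above: it rebuilds the training-time feature dictionary from the feature_names stored in matrix metadata, by matching each stored name against every collate aggregation's prefix. A toy version of that regrouping, with hypothetical feature names and prefixes (none of these identifiers come from the patches):

    import re

    feature_names = ['zip_all_total_count', 'zip_all_total_count_imp', 'acs_1y_income_median']
    prefixes = ['zip', 'acs']  # stand-ins for aggregation.prefix values

    groups = {}
    for feature_prefix in prefixes:
        # same anchored-prefix test generate_risk_list applies per aggregation
        groups[feature_prefix] = [f for f in feature_names if re.match(rf'\A{feature_prefix}', f)]

    # groups == {'zip': ['zip_all_total_count', 'zip_all_total_count_imp'],
    #            'acs': ['acs_1y_income_median']}
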
From c92bd8b9ec81207a033a0529c1a66c725656a9c1 Mon Sep 17 00:00:00 2001
From: tweddielin
Date: Wed, 13 Mar 2019 14:36:17 -0500
Subject: [PATCH 08/52] __init__.py

---
 src/triage/risklist/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/triage/risklist/__init__.py b/src/triage/risklist/__init__.py
index 0c408d3f8..0abd5bb2e 100644
--- a/src/triage/risklist/__init__.py
+++ b/src/triage/risklist/__init__.py
@@ -58,7 +58,7 @@ def generate_risk_list(db_engine, matrix_storage_engine, model_storage_engine, m
         model_id (int) The id of a given model in the database
         matrix_storage_engine (catwalk.storage.matrix_storage_engine)
         model_storage_engine (catwalk.storage.model_storage_engine)
-
+        as_of_date (string) a date string like "YYYY-MM-DD"
     """
     upgrade_db(db_engine=db_engine)
     # 1. Get feature and cohort config from database

From d3c3ba97f413343819210d20832be3803a6b3772 Mon Sep 17 00:00:00 2001
From: tweddielin
Date: Wed, 13 Mar 2019 15:03:48 -0500
Subject: [PATCH 09/52] fix alembic revision and replace metta.generate_uuid
 with filename_friendly_hash

---
 .../versions/1b990cbc04e4_production_schema.py |  2 +-
 src/triage/risklist/__init__.py                | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/triage/component/results_schema/alembic/versions/1b990cbc04e4_production_schema.py b/src/triage/component/results_schema/alembic/versions/1b990cbc04e4_production_schema.py
index d30c24f2c..aab5c9881 100644
--- a/src/triage/component/results_schema/alembic/versions/1b990cbc04e4_production_schema.py
+++ b/src/triage/component/results_schema/alembic/versions/1b990cbc04e4_production_schema.py
@@ -11,7 +11,7 @@
 
 # revision identifiers, used by Alembic.
 revision = '1b990cbc04e4'
-down_revision = '0bca1ba9706e'
+down_revision = '50e1f1bc2cac'
 branch_labels = None
 depends_on = None
 
diff --git a/src/triage/risklist/__init__.py b/src/triage/risklist/__init__.py
index 0abd5bb2e..dade2ee35 100644
--- a/src/triage/risklist/__init__.py
+++ b/src/triage/risklist/__init__.py
@@ -1,9 +1,9 @@
 from triage.component.results_schema import upgrade_db
-from triage.component.architect.cohort_table_generators import CohortTableGenerator, DEFAULT_ACTIVE_STATE
+from triage.component.architect.entity_date_table_generators import EntityDateTableGenerator, DEFAULT_ACTIVE_STATE
 from triage.component.architect.features import FeatureGenerator
 from triage.component.architect.builders import MatrixBuilder
 from triage.component.catwalk.predictors import Predictor
-from triage.component import metta
+from triage.component.catwalk.utils import filename_friendly_hash
 from triage.util.conf import dt_from_str
 
 from collections import OrderedDict
@@ -66,12 +66,12 @@ def generate_risk_list(db_engine, matrix_storage_engine, model_storage_engine, m
 
     # 2. 
Generate cohort cohort_table_name = f"production.cohort_{model_info['cohort_config']['name']}" - cohort_table_generator = CohortTableGenerator( + cohort_table_generator = EntityDateTableGenerator( db_engine=db_engine, query=model_info['cohort_config']['query'], - cohort_table_name=cohort_table_name + entity_date_table_name=cohort_table_name ) - cohort_table_generator.generate_cohort_table([dt_from_str(as_of_date)]) + cohort_table_generator.generate_entity_date_table(as_of_dates=[dt_from_str(as_of_date)]) # 3. Generate feature aggregations feature_generator = FeatureGenerator( @@ -145,7 +145,7 @@ def generate_risk_list(db_engine, matrix_storage_engine, model_storage_engine, m 'feature_start_time': model_info['feature_start_time'], } - matrix_uuid = metta.generate_uuid(matrix_metadata) + matrix_uuid = filename_friendly_hash(matrix_metadata) matrix_builder.build_matrix( as_of_times=[as_of_date], From 0e92fb09640106d133e0da8f59b16afae06256cf Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Fri, 12 Apr 2019 11:06:04 -0500 Subject: [PATCH 10/52] Fix down revision of production schema migration --- .../alembic/versions/1b990cbc04e4_production_schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/triage/component/results_schema/alembic/versions/1b990cbc04e4_production_schema.py b/src/triage/component/results_schema/alembic/versions/1b990cbc04e4_production_schema.py index aab5c9881..2dedc9ad3 100644 --- a/src/triage/component/results_schema/alembic/versions/1b990cbc04e4_production_schema.py +++ b/src/triage/component/results_schema/alembic/versions/1b990cbc04e4_production_schema.py @@ -11,7 +11,7 @@ # revision identifiers, used by Alembic. revision = '1b990cbc04e4' -down_revision = '50e1f1bc2cac' +down_revision = 'cfd5c3386014' branch_labels = None depends_on = None From f7d49e5a15e850e608ab43f2995c39795d67c717 Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Tue, 5 Jan 2021 21:07:57 -0600 Subject: [PATCH 11/52] Enable github checks on this branch too --- .github/workflows/test.yaml | 45 +++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 .github/workflows/test.yaml diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml new file mode 100644 index 000000000..58bc12423 --- /dev/null +++ b/.github/workflows/test.yaml @@ -0,0 +1,45 @@ +name: Python package + +on: [push] + +jobs: + build: + + runs-on: ubuntu-latest + services: + # Label used to access the service container + postgres: + # Docker Hub image + image: postgres + # Provide the password for postgres + env: + POSTGRES_PASSWORD: postgres + # Set health checks to wait until postgres has started + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + strategy: + matrix: + python-version: [3.6, 3.7] + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install libblas-dev liblapack-dev libatlas-base-dev gfortran + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest + pip install -r requirement/include/build.txt + pip install -r requirement/include/test-management.txt + - name: Test with tox + run: | + tox From dee930ff679f2a0dc376dccb0a5ad95c2af3ebc7 Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Tue, 5 Jan 2021 23:01:25 -0600 Subject: [PATCH 
12/52] Closer to getting tests to run --- ...e85_add_label_value_to_prodcution_table.py | 4 +-- src/triage/risklist/__init__.py | 29 ++++++++++++++----- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/src/triage/component/results_schema/alembic/versions/264786a9fe85_add_label_value_to_prodcution_table.py b/src/triage/component/results_schema/alembic/versions/264786a9fe85_add_label_value_to_prodcution_table.py index cf46034bb..fbeb48e6d 100644 --- a/src/triage/component/results_schema/alembic/versions/264786a9fe85_add_label_value_to_prodcution_table.py +++ b/src/triage/component/results_schema/alembic/versions/264786a9fe85_add_label_value_to_prodcution_table.py @@ -28,7 +28,7 @@ def upgrade(): sa.Column("rank_abs", sa.Integer(), nullable=True), sa.Column("rank_pct", sa.Float(), nullable=True), sa.Column("matrix_uuid", sa.Text(), nullable=True), - sa.Column("test_label_window", sa.Interval(), nullable=True), + sa.Column("test_label_timespan", sa.Interval(), nullable=True), sa.ForeignKeyConstraint(["model_id"], ["triage_metadata.models.model_id"]), sa.PrimaryKeyConstraint("model_id", "entity_id", "as_of_date"), schema="production", @@ -46,7 +46,7 @@ def downgrade(): sa.Column("rank_abs", sa.Integer(), nullable=True), sa.Column("rank_pct", sa.Float(), nullable=True), sa.Column("matrix_uuid", sa.Text(), nullable=True), - sa.Column("test_label_window", sa.Interval(), nullable=True), + sa.Column("test_label_timespan", sa.Interval(), nullable=True), sa.ForeignKeyConstraint(["model_id"], ["triage_metadata.models.model_id"]), sa.PrimaryKeyConstraint("model_id", "entity_id", "as_of_date"), schema="results", diff --git a/src/triage/risklist/__init__.py b/src/triage/risklist/__init__.py index dade2ee35..78e5ad7a3 100644 --- a/src/triage/risklist/__init__.py +++ b/src/triage/risklist/__init__.py @@ -10,6 +10,9 @@ import json import re +import verboselogs, logging +logger = verboselogs.VerboseLogger(__name__) + def get_required_info_from_config(db_engine, model_id): """Get all information needed to make the risk list from model_id @@ -22,17 +25,17 @@ def get_required_info_from_config(db_engine, model_id): """ get_experiment_query = """ select experiments.config, matrices.matrix_metadata, matrix_uuid - from model_metadata.experiments - join model_metadata.experiment_matrices using (experiment_hash) - join model_metadata.matrices using (matrix_uuid) - join model_metadata.models on (models.train_matrix_uuid = matrices.matrix_uuid) + from triage_metadata.experiments + join triage_metadata.experiment_matrices using (experiment_hash) + join triage_metadata.matrices using (matrix_uuid) + join triage_metadata.models on (models.train_matrix_uuid = matrices.matrix_uuid) where model_id = %s """ results = list(db_engine.execute(get_experiment_query, model_id)) experiment_config = results[0]['config'] label_config = experiment_config['label_config'] original_matrix_uuid = results[0]['matrix_uuid'] - matrix_metadata = json.loads(results[0]['matrix_metadata']) + matrix_metadata = results[0]['matrix_metadata'] feature_names = matrix_metadata['feature_names'] feature_config = experiment_config['feature_aggregations'] cohort_config = experiment_config['cohort_config'] @@ -60,6 +63,7 @@ def generate_risk_list(db_engine, matrix_storage_engine, model_storage_engine, m model_storage_engine (catwalk.storage.model_storage_engine) as_of_date (string) a date string like "YYYY-MM-DD" """ + logger.spam("In RISK LIST................") upgrade_db(db_engine=db_engine) # 1. 
Get feature and cohort config from database model_info = get_required_info_from_config(db_engine, model_id) @@ -97,12 +101,22 @@ def generate_risk_list(db_engine, matrix_storage_engine, model_storage_engine, m with db_engine.begin() as conn: for aggregation in collate_aggregations: feature_prefix = aggregation.prefix + logger.spam("Feature prefix = %s", feature_prefix) feature_group = aggregation.get_table_name(imputed=True).split('.')[1].replace('"', '') + logger.spam("Feature group = %s", feature_group) feature_names_in_group = [f for f in model_info['feature_names'] if re.match(f'\A{feature_prefix}', f)] + logger.spam("Feature names in group = %s", feature_names_in_group) reconstructed_feature_dictionary[feature_group] = feature_names_in_group # Make sure that the features imputed in training should also be imputed in production - features_imputed_in_train = [f for f in set(feature_names_in_group) if f + '_imp' in feature_names_in_group] + #import pdb + #pdb.set_trace() + features_imputed_in_train = [ + f for f in set(feature_names_in_group) + if not f.endswith('_imp') + and'_'.join(f.split('_')[0:-1]) + '_imp' in feature_names_in_group + ] + logger.spam("Features imputed in train = %s", features_imputed_in_train) results = conn.execute(aggregation.find_nulls()) null_counts = results.first().items() @@ -160,7 +174,8 @@ def generate_risk_list(db_engine, matrix_storage_engine, model_storage_engine, m # 6. Predict the risk score for production predictor = Predictor( model_storage_engine=model_storage_engine, - db_engine=db_engine + db_engine=db_engine, + rank_order='best' ) predictor.predict( From 1769b008debe87d3820f900c710cf4c813c54b57 Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Thu, 7 Jan 2021 21:36:40 -0600 Subject: [PATCH 13/52] Add CLI for risklist --- src/tests/test_cli.py | 11 +++++++++++ src/tests/test_risklist.py | 3 +-- src/triage/cli.py | 32 ++++++++++++++++++++++++++++++++ src/triage/risklist/__init__.py | 8 ++++---- 4 files changed, 48 insertions(+), 6 deletions(-) diff --git a/src/tests/test_cli.py b/src/tests/test_cli.py index 497059381..4577b4b8b 100644 --- a/src/tests/test_cli.py +++ b/src/tests/test_cli.py @@ -2,6 +2,7 @@ import triage.cli as cli from unittest.mock import Mock, patch import os +import datetime # we do not need a real database URL but one SQLalchemy thinks looks like a real one @@ -56,3 +57,13 @@ def test_featuretest(): try_command('featuretest', 'example/config/experiment.yaml', '2017-06-06') featuremock.assert_called_once() cohortmock.assert_called_once() + + +def test_cli_risklist(): + with patch('triage.cli.generate_risk_list', autospec=True) as mock: + try_command('risklist', '40', '2019-06-04') + mock.assert_called_once() + assert mock.call_args[0][0].url + assert mock.call_args[0][1].project_path + assert mock.call_args[0][2] == 40 + assert mock.call_args[0][3] == datetime.datetime(2019, 6, 4) diff --git a/src/tests/test_risklist.py b/src/tests/test_risklist.py index 1c53ab6f6..27f41bb79 100644 --- a/src/tests/test_risklist.py +++ b/src/tests/test_risklist.py @@ -20,8 +20,7 @@ def test_risklist(db_engine, project_storage): as_of_date = '2013-01-01' generate_risk_list( db_engine=db_engine, - matrix_storage_engine=project_storage.matrix_storage_engine(), - model_storage_engine=project_storage.model_storage_engine(), + project_storage=project_storage, model_id=model_id, as_of_date=as_of_date) table_should_have_data( diff --git a/src/triage/cli.py b/src/triage/cli.py index dcd4cad40..b53b1c4b4 100755 --- a/src/triage/cli.py +++ 
b/src/triage/cli.py @@ -20,6 +20,7 @@ MultiCoreExperiment, SingleThreadedExperiment, ) +from triage.risklist import generate_risk_list from triage.component.postmodeling.crosstabs import CrosstabsConfigLoader, run_crosstabs from triage.util.db import create_engine @@ -399,6 +400,37 @@ def __call__(self, args): run_crosstabs(db_engine, config) +@Triage.register +class Risklist(Command): + """Generate a list of risk scores from an already-trained model and new data""" + + def __init__(self, parser): + parser.add_argument( + "model_id", + type=natural_number, + help="The model_id of an existing trained model in the models table", + ) + parser.add_argument( + "as_of_date", + type=valid_date, + help="The date as of which to run features. Format YYYY-MM-DD", + ) + parser.add_argument( + "--project-path", + default=os.getcwd(), + help="path to store matrices and trained models", + ) + + def __call__(self, args): + db_engine = create_engine(self.root.db_url) + + generate_risk_list( + db_engine, + ProjectStorage(args.project_path), + args.model_id, + args.as_of_date + ) + @Triage.register class Db(Command): """Manage experiment database""" diff --git a/src/triage/risklist/__init__.py b/src/triage/risklist/__init__.py index 78e5ad7a3..7892fb827 100644 --- a/src/triage/risklist/__init__.py +++ b/src/triage/risklist/__init__.py @@ -53,18 +53,18 @@ def get_required_info_from_config(db_engine, model_id): return model_info -def generate_risk_list(db_engine, matrix_storage_engine, model_storage_engine, model_id, as_of_date): +def generate_risk_list(db_engine, project_storage, model_id, as_of_date): """Generate the risk list based model_id and as_of_date Args: db_engine (sqlalchemy.db.engine) + project_storage (catwalk.storage.ProjectStorage) model_id (int) The id of a given model in the database - matrix_storage_engine (catwalk.storage.matrix_storage_engine) - model_storage_engine (catwalk.storage.model_storage_engine) as_of_date (string) a date string like "YYYY-MM-DD" """ logger.spam("In RISK LIST................") upgrade_db(db_engine=db_engine) + matrix_storage_engine = project_storage.matrix_storage_engine() # 1. Get feature and cohort config from database model_info = get_required_info_from_config(db_engine, model_id) @@ -173,7 +173,7 @@ def generate_risk_list(db_engine, matrix_storage_engine, model_storage_engine, m # 6. 
Predict the risk score for production predictor = Predictor( - model_storage_engine=model_storage_engine, + model_storage_engine=project_storage.model_storage_engine(), db_engine=db_engine, rank_order='best' ) From 52c9ff02e76ed2a7b0f3d0606e8e5a51390cc27c Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Thu, 7 Jan 2021 21:45:36 -0600 Subject: [PATCH 14/52] Risklist docs stub --- docs/mkdocs.yml | 1 + docs/sources/risklist/index.md | 3 +++ 2 files changed, 4 insertions(+) create mode 100644 docs/sources/risklist/index.md diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 85be24e26..f3437e6ef 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -120,6 +120,7 @@ nav: - Using Postmodeling: postmodeling/index.md - Postmodeling & Crosstabs Configuration: postmodeling/postmodeling-config.md - Model governance: dirtyduck/ml_governance.md + - Risklist: risklist/index.md - Scaling up: dirtyduck/aws_batch.md - API Reference: - Audition: diff --git a/docs/sources/risklist/index.md b/docs/sources/risklist/index.md new file mode 100644 index 000000000..3d244420c --- /dev/null +++ b/docs/sources/risklist/index.md @@ -0,0 +1,3 @@ +# Risklist + +Here is a stub of the risklist page From 173167aa7e15efb956822acc187f1e65e44c300b Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Fri, 8 Jan 2021 21:02:56 -0600 Subject: [PATCH 15/52] Break up data gathering into experiment and matrix, use pytest fixtures to speed up subsequent tests --- src/tests/test_risklist.py | 32 ++++++++------ src/triage/risklist/__init__.py | 77 ++++++++++++++++----------------- 2 files changed, 56 insertions(+), 53 deletions(-) diff --git a/src/tests/test_risklist.py b/src/tests/test_risklist.py index 27f41bb79..7fbf16ca1 100644 --- a/src/tests/test_risklist.py +++ b/src/tests/test_risklist.py @@ -1,29 +1,33 @@ -from triage.risklist import generate_risk_list -from tests.utils import sample_config, populate_source_data -from triage.experiments import SingleThreadedExperiment +from triage.risklist import generate_risk_list, train_matrix_info_from_model_id, experiment_config_from_model_id from triage.validation_primitives import table_should_have_data -def test_risklist(db_engine, project_storage): +def test_risklist(finished_experiment): # given a model id and as-of-date <= today # and the model id is trained and is linked to an experiment with feature and cohort config # generate records in listpredictions # the # of records should equal the size of the cohort for that date - populate_source_data(db_engine) - SingleThreadedExperiment( - sample_config(), - db_engine=db_engine, - project_path=project_storage.project_path - ).run() - model_id = 1 as_of_date = '2013-01-01' generate_risk_list( - db_engine=db_engine, - project_storage=project_storage, + db_engine=finished_experiment.db_engine, + project_storage=finished_experiment.project_storage, model_id=model_id, as_of_date=as_of_date) table_should_have_data( - db_engine=db_engine, + db_engine=finished_experiment.db_engine, table_name="production.list_predictions", ) + + +def test_experiment_config_from_model_id(finished_experiment): + model_id = 1 + experiment_config = experiment_config_from_model_id(finished_experiment.db_engine, model_id) + assert experiment_config == finished_experiment.config + + +def test_train_matrix_info_from_model_id(finished_experiment): + model_id = 1 + (train_matrix_uuid, matrix_metadata) = train_matrix_info_from_model_id(finished_experiment.db_engine, model_id) + assert train_matrix_uuid + assert matrix_metadata diff --git 
a/src/triage/risklist/__init__.py b/src/triage/risklist/__init__.py index 7892fb827..c02d36cba 100644 --- a/src/triage/risklist/__init__.py +++ b/src/triage/risklist/__init__.py @@ -1,10 +1,12 @@ -from triage.component.results_schema import upgrade_db +from triage.component.results_schema import upgrade_db, Experiment, ExperimentModel from triage.component.architect.entity_date_table_generators import EntityDateTableGenerator, DEFAULT_ACTIVE_STATE from triage.component.architect.features import FeatureGenerator from triage.component.architect.builders import MatrixBuilder from triage.component.catwalk.predictors import Predictor from triage.component.catwalk.utils import filename_friendly_hash from triage.util.conf import dt_from_str +from triage.util.db import scoped_session +from sqlalchemy import select from collections import OrderedDict import json @@ -14,43 +16,39 @@ logger = verboselogs.VerboseLogger(__name__) -def get_required_info_from_config(db_engine, model_id): - """Get all information needed to make the risk list from model_id +def experiment_config_from_model_id(db_engine, model_id): + """Get original experiment config from model_id Args: db_engine (sqlalchemy.db.engine) model_id (int) The id of a given model in the database - Returns: (dict) a dictionary of all information needed for making the risk list + Returns: (dict) experiment config + """ + get_experiment_query = '''select experiments.config + from triage_metadata.experiments + join triage_metadata.experiment_models using (experiment_hash) + join triage_metadata.models using (model_hash) + where model_id = %s + ''' + (config,) = db_engine.execute(get_experiment_query, model_id).first() + return config + + +def train_matrix_info_from_model_id(db_engine, model_id): + """Get original train matrix information from model_id + Args: + db_engine (sqlalchemy.db.engine) + model_id (int) The id of a given model in the database + Returns: (str, dict) matrix uuid and matrix metadata """ - get_experiment_query = """ - select experiments.config, matrices.matrix_metadata, matrix_uuid - from triage_metadata.experiments - join triage_metadata.experiment_matrices using (experiment_hash) - join triage_metadata.matrices using (matrix_uuid) + get_train_matrix_query = """ + select matrix_uuid, matrices.matrix_metadata + from triage_metadata.matrices join triage_metadata.models on (models.train_matrix_uuid = matrices.matrix_uuid) where model_id = %s """ - results = list(db_engine.execute(get_experiment_query, model_id)) - experiment_config = results[0]['config'] - label_config = experiment_config['label_config'] - original_matrix_uuid = results[0]['matrix_uuid'] - matrix_metadata = results[0]['matrix_metadata'] - feature_names = matrix_metadata['feature_names'] - feature_config = experiment_config['feature_aggregations'] - cohort_config = experiment_config['cohort_config'] - timechop_config = experiment_config['temporal_config'] - feature_start_time = timechop_config['feature_start_time'] - - model_info = {} - model_info['cohort_config'] = cohort_config - model_info['feature_config'] = feature_config - model_info['feature_names'] = feature_names - model_info['feature_start_time'] = feature_start_time - model_info['original_matrix_uuid'] = original_matrix_uuid - model_info['label_config'] = label_config - - return model_info + return db_engine.execute(get_train_matrix_query, model_id).first() def generate_risk_list(db_engine, project_storage, model_id, as_of_date): @@ -66,13 +64,14 @@ def generate_risk_list(db_engine, project_storage, 
model_id, as_of_date): upgrade_db(db_engine=db_engine) matrix_storage_engine = project_storage.matrix_storage_engine() # 1. Get feature and cohort config from database - model_info = get_required_info_from_config(db_engine, model_id) + (train_matrix_uuid, matrix_metadata) = train_matrix_info_from_model_id(db_engine, model_id) + experiment_config = experiment_config_from_model_id(db_engine, model_id) # 2. Generate cohort - cohort_table_name = f"production.cohort_{model_info['cohort_config']['name']}" + cohort_table_name = f"production.cohort_{experiment_config['cohort_config']['name']}" cohort_table_generator = EntityDateTableGenerator( db_engine=db_engine, - query=model_info['cohort_config']['query'], + query=experiment_config['cohort_config']['query'], entity_date_table_name=cohort_table_name ) cohort_table_generator.generate_entity_date_table(as_of_dates=[dt_from_str(as_of_date)]) @@ -81,10 +80,10 @@ def generate_risk_list(db_engine, project_storage, model_id, as_of_date): feature_generator = FeatureGenerator( db_engine=db_engine, features_schema_name="production", - feature_start_time=model_info['feature_start_time'], + feature_start_time=experiment_config['temporal_config']['feature_start_time'], ) collate_aggregations = feature_generator.aggregations( - feature_aggregation_config=model_info['feature_config'], + feature_aggregation_config=experiment_config['feature_aggregations'], feature_dates=[as_of_date], state_table=cohort_table_name ) @@ -104,7 +103,7 @@ def generate_risk_list(db_engine, project_storage, model_id, as_of_date): logger.spam("Feature prefix = %s", feature_prefix) feature_group = aggregation.get_table_name(imputed=True).split('.')[1].replace('"', '') logger.spam("Feature group = %s", feature_group) - feature_names_in_group = [f for f in model_info['feature_names'] if re.match(f'\A{feature_prefix}', f)] + feature_names_in_group = [f for f in matrix_metadata['feature_names'] if re.match(f'\A{feature_prefix}', f)] logger.spam("Feature names in group = %s", feature_names_in_group) reconstructed_feature_dictionary[feature_group] = feature_names_in_group @@ -154,16 +153,16 @@ def generate_risk_list(db_engine, project_storage, model_id, as_of_date): 'test_duration': '1y', 'matrix_type': 'production', 'label_timespan': None, - 'label_name': model_info['label_config']['name'], + 'label_name': experiment_config['label_config']['name'], 'indices': ["entity_id", "as_of_date"], - 'feature_start_time': model_info['feature_start_time'], + 'feature_start_time': experiment_config['temporal_config']['feature_start_time'], } matrix_uuid = filename_friendly_hash(matrix_metadata) matrix_builder.build_matrix( as_of_times=[as_of_date], - label_name=model_info['label_config']['name'], + label_name=experiment_config['label_config']['name'], label_type=None, feature_dictionary=reconstructed_feature_dictionary, matrix_metadata=matrix_metadata, @@ -182,5 +181,5 @@ def generate_risk_list(db_engine, project_storage, model_id, as_of_date): model_id=model_id, matrix_store=matrix_storage_engine.get_store(matrix_uuid), misc_db_parameters={}, - train_matrix_columns=matrix_storage_engine.get_store(model_info['original_matrix_uuid']).columns() + train_matrix_columns=matrix_storage_engine.get_store(train_matrix_uuid).columns() ) From f6b2d024b4dc64d00217abfe73aa5397a620f8f3 Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Fri, 8 Jan 2021 22:32:26 -0600 Subject: [PATCH 16/52] Modify schema for list prediction metadata --- src/triage/component/catwalk/storage.py | 2 + 
.../component/results_schema/__init__.py | 2 + ...4eb2_add_production_prediction_metadata.py | 38 +++++++++++++++++++ ...50ffa8e2_break_ties_in_list_predictions.py | 34 +++++++++++++++++ src/triage/component/results_schema/schema.py | 19 +++++++++- 5 files changed, 93 insertions(+), 2 deletions(-) create mode 100644 src/triage/component/results_schema/alembic/versions/670289044eb2_add_production_prediction_metadata.py create mode 100644 src/triage/component/results_schema/alembic/versions/ce5b50ffa8e2_break_ties_in_list_predictions.py diff --git a/src/triage/component/catwalk/storage.py b/src/triage/component/catwalk/storage.py index 51f8b0d3b..bb9b03c2d 100644 --- a/src/triage/component/catwalk/storage.py +++ b/src/triage/component/catwalk/storage.py @@ -26,6 +26,7 @@ ListPrediction, TestPredictionMetadata, TrainPredictionMetadata, + ListPredictionMetadata, TestAequitas, TrainAequitas ) @@ -621,4 +622,5 @@ class TrainMatrixType: class ProductionMatrixType(object): string_name = "production" prediction_obj = ListPrediction + prediction_metadata_obj = ListPredictionMetadata diff --git a/src/triage/component/results_schema/__init__.py b/src/triage/component/results_schema/__init__.py index 40bf26007..97a8b27a9 100644 --- a/src/triage/component/results_schema/__init__.py +++ b/src/triage/component/results_schema/__init__.py @@ -30,6 +30,7 @@ TrainPrediction, TestPredictionMetadata, TrainPredictionMetadata, + ListPredictionMetadata, TrainAequitas, TestAequitas ) @@ -55,6 +56,7 @@ "TrainPrediction", "TestPredictionMetadata", "TrainPredictionMetadata", + "ListPredictionMetadata", "TestAequitas", "TrainAequitas", "mark_db_as_upgraded", diff --git a/src/triage/component/results_schema/alembic/versions/670289044eb2_add_production_prediction_metadata.py b/src/triage/component/results_schema/alembic/versions/670289044eb2_add_production_prediction_metadata.py new file mode 100644 index 000000000..7146142c2 --- /dev/null +++ b/src/triage/component/results_schema/alembic/versions/670289044eb2_add_production_prediction_metadata.py @@ -0,0 +1,38 @@ +"""Add production prediction metadata + +Revision ID: 670289044eb2 +Revises: ce5b50ffa8e2 +Create Date: 2021-01-08 22:27:23.433813 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = '670289044eb2' +down_revision = 'ce5b50ffa8e2' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('prediction_metadata', + sa.Column('model_id', sa.Integer(), nullable=False), + sa.Column('matrix_uuid', sa.Text(), nullable=False), + sa.Column('tiebreaker_ordering', sa.Text(), nullable=True), + sa.Column('random_seed', sa.Integer(), nullable=True), + sa.Column('predictions_saved', sa.Boolean(), nullable=True), + sa.ForeignKeyConstraint(['matrix_uuid'], ['triage_metadata.matrices.matrix_uuid'], ), + sa.ForeignKeyConstraint(['model_id'], ['triage_metadata.models.model_id'], ), + sa.PrimaryKeyConstraint('model_id', 'matrix_uuid'), + schema='production' + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_table('prediction_metadata', schema='production') + # ### end Alembic commands ### diff --git a/src/triage/component/results_schema/alembic/versions/ce5b50ffa8e2_break_ties_in_list_predictions.py b/src/triage/component/results_schema/alembic/versions/ce5b50ffa8e2_break_ties_in_list_predictions.py new file mode 100644 index 000000000..6870ff9b7 --- /dev/null +++ b/src/triage/component/results_schema/alembic/versions/ce5b50ffa8e2_break_ties_in_list_predictions.py @@ -0,0 +1,34 @@ +"""Break ties in list predictions + +Revision ID: ce5b50ffa8e2 +Revises: 264786a9fe85 +Create Date: 2021-01-08 21:59:13.403934 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = 'ce5b50ffa8e2' +down_revision = '264786a9fe85' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('list_predictions', sa.Column('rank_abs_with_ties', sa.Integer(), nullable=True), schema='production') + op.add_column('list_predictions', sa.Column('rank_pct_with_ties', sa.Float(), nullable=True), schema='production') + op.alter_column('list_predictions', 'rank_abs', new_column_name='rank_abs_no_ties', schema='production') + op.alter_column('list_predictions', 'rank_pct', new_column_name='rank_pct_no_ties', schema='production') + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.alter_column('list_predictions', 'rank_abs_no_ties', new_column_name='rank_abs', schema='production') + op.alter_column('list_predictions', 'rank_pct_no_ties', new_column_name='rank_pct', schema='production') + op.drop_column('list_predictions', 'rank_pct_with_ties', schema='production') + op.drop_column('list_predictions', 'rank_abs_with_ties', schema='production') + # ### end Alembic commands ### diff --git a/src/triage/component/results_schema/schema.py b/src/triage/component/results_schema/schema.py index f5665eaf4..d79647ea3 100644 --- a/src/triage/component/results_schema/schema.py +++ b/src/triage/component/results_schema/schema.py @@ -150,14 +150,29 @@ class ListPrediction(Base): as_of_date = Column(DateTime, primary_key=True) score = Column(Numeric) label_value = Column(Integer) - rank_abs = Column(Integer) - rank_pct = Column(Float) + rank_abs_no_ties = Column(Integer) + rank_abs_with_ties = Column(Integer) + rank_pct_no_ties = Column(Float) + rank_pct_with_ties = Column(Float) matrix_uuid = Column(Text) test_label_timespan = Column(Interval) model_rel = relationship("Model") +class ListPredictionMetadata(Base): + __tablename__ = "prediction_metadata" + __table_args__ = {"schema": "production"} + + model_id = Column( + Integer, ForeignKey("triage_metadata.models.model_id"), primary_key=True + ) + matrix_uuid = Column(Text, ForeignKey("triage_metadata.matrices.matrix_uuid"), primary_key=True) + tiebreaker_ordering = Column(Text) + random_seed = Column(Integer) + predictions_saved = Column(Boolean) + + class ExperimentMatrix(Base): __tablename__ = "experiment_matrices" __table_args__ = {"schema": "triage_metadata"} From acffa67a6b8aefd372f8e1ea0e08cba91554d344 Mon Sep 17 00:00:00 2001 From: tweddielin Date: Sat, 9 Jan 2021 14:18:29 -0600 Subject: [PATCH 17/52] fix conflicts and add helper functions for getting imputed features --- src/triage/risklist/__init__.py | 58 +++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 20 deletions(-) diff --git 
a/src/triage/risklist/__init__.py b/src/triage/risklist/__init__.py index c02d36cba..d5ce92e97 100644 --- a/src/triage/risklist/__init__.py +++ b/src/triage/risklist/__init__.py @@ -51,6 +51,35 @@ def train_matrix_info_from_model_id(db_engine, model_id): return db_engine.execute(get_train_matrix_query, model_id).first() +def get_feature_names(aggregation, matrix_metadata): + """Returns a feature group name and a list of feature names from a SpacetimeAggregation object""" + feature_prefix = aggregation.prefix + logger.spam("Feature prefix = %s", feature_prefix) + feature_group = aggregation.get_table_name(imputed=True).split('.')[1].replace('"', '') + logger.spam("Feature group = %s", feature_group) + feature_names_in_group = [f for f in matrix_metadata['feature_names'] if re.match(f'\\A{feature_prefix}', f)] + logger.spam("Feature names in group = %s", feature_names_in_group) + + return feature_group, feature_names_in_group + +def get_feature_needs_imputation_in_train(feature_names): + features_imputed_in_train = [ + f for f in set(feature_names) + if not f.endswith('_imp') + and '_'.join(f.split('_')[0:-1]) + '_imp' in feature_names + ] + logger.spam("Features imputed in train = %s", features_imputed_in_train) + return features_imputed_in_train + + +def get_feature_needs_imputation_in_production(aggregation, conn): + nulls_results = conn.execute(aggregation.find_nulls()) + null_counts = nulls_results.first().items() + features_imputed_in_production = [col for (col, val) in null_counts if val > 0] + + return features_imputed_in_production + + def generate_risk_list(db_engine, project_storage, model_id, as_of_date): """Generate the risk list based model_id and as_of_date @@ -99,30 +128,19 @@ def generate_risk_list(db_engine, project_storage, model_id, as_of_date): imputation_table_tasks = OrderedDict() with db_engine.begin() as conn: for aggregation in collate_aggregations: - feature_prefix = aggregation.prefix - logger.spam("Feature prefix = %s", feature_prefix) - feature_group = aggregation.get_table_name(imputed=True).split('.')[1].replace('"', '') - logger.spam("Feature group = %s", feature_group) - feature_names_in_group = [f for f in matrix_metadata['feature_names'] if re.match(f'\A{feature_prefix}', f)] - logger.spam("Feature names in group = %s", feature_names_in_group) - reconstructed_feature_dictionary[feature_group] = feature_names_in_group + feature_group, feature_names = get_feature_names(aggregation, matrix_metadata) + reconstructed_feature_dictionary[feature_group] = feature_names # Make sure that the features imputed in training should also be imputed in production - #import pdb - #pdb.set_trace() - features_imputed_in_train = [ - f for f in set(feature_names_in_group) - if not f.endswith('_imp') - and'_'.join(f.split('_')[0:-1]) + '_imp' in feature_names_in_group - ] - logger.spam("Features imputed in train = %s", features_imputed_in_train) - results = conn.execute(aggregation.find_nulls()) - null_counts = results.first().items() + + features_imputed_in_train = get_feature_needs_imputation_in_train(feature_names) + features_imputed_in_production = get_feature_needs_imputation_in_production(aggregation, conn) - features_imputed_in_production = [col for (col, val) in null_counts if val > 0] total_impute_cols = set(features_imputed_in_production) | set(features_imputed_in_train) - total_nonimpute_cols = set(f for f in set(feature_names_in_group) if '_imp' not in f) - total_impute_cols + total_nonimpute_cols = set(f for f in set(feature_names) if '_imp' not in f) - 
total_impute_cols + task_generator = feature_generator._generate_imp_table_tasks_for + imputation_table_tasks.update(task_generator( aggregation, impute_cols=list(total_impute_cols), @@ -141,7 +159,7 @@ def generate_risk_list(db_engine, project_storage, model_id, as_of_date): matrix_builder = MatrixBuilder( db_config=db_config, matrix_storage_engine=matrix_storage_engine, - engine=db_engine, + engine=db_engine, experiment_hash=None, replace=True, ) From 43c191972dc77ffa74caf027cecf94d6d0364103 Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Sat, 9 Jan 2021 23:11:10 -0600 Subject: [PATCH 18/52] Handle other imputation flag cases, fix tracking indentation error --- src/tests/collate_tests/test_collate.py | 52 ++++++++++++++++++++++ src/triage/component/architect/builders.py | 4 +- src/triage/component/collate/collate.py | 34 ++++++++++---- src/triage/risklist/__init__.py | 6 +-- 4 files changed, 83 insertions(+), 13 deletions(-) diff --git a/src/tests/collate_tests/test_collate.py b/src/tests/collate_tests/test_collate.py index a4585f20a..622b21582 100755 --- a/src/tests/collate_tests/test_collate.py +++ b/src/tests/collate_tests/test_collate.py @@ -4,6 +4,7 @@ Unit tests for `collate` module. """ +import pytest from triage.component.collate import Aggregate, Aggregation, Categorical def test_aggregate(): @@ -191,3 +192,54 @@ def test_distinct(): ), ) ) == ["count(distinct (x,y)) FILTER (WHERE date < '2012-01-01')"] + + +def test_Aggregation_colname_aggregate_lookup(): + n = Aggregate("x", "sum", {}) + d = Aggregate("1", "count", {}) + m = Aggregate("y", "avg", {}) + aggregation = Aggregation( + [n, d, m], + groups=['entity_id'], + from_obj="source", + prefix="mysource", + state_table="tbl" + ) + assert aggregation.colname_aggregate_lookup == { + 'mysource_entity_id_x_sum': 'sum', + 'mysource_entity_id_1_count': 'count', + 'mysource_entity_id_y_avg': 'avg' + } + +def test_Aggregation_colname_agg_function(): + n = Aggregate("x", "sum", {}) + d = Aggregate("1", "count", {}) + m = Aggregate("y", "stddev_samp", {}) + aggregation = Aggregation( + [n, d, m], + groups=['entity_id'], + from_obj="source", + prefix="mysource", + state_table="tbl" + ) + + assert aggregation.colname_agg_function('mysource_entity_id_x_sum') == 'sum' + assert aggregation.colname_agg_function('mysource_entity_id_y_stddev_samp') == 'stddev_samp' + + +def test_Aggregation_imputation_flag_base(): + n = Aggregate("x", ["sum", "count"], {}) + m = Aggregate("y", "stddev_samp", {}) + aggregation = Aggregation( + [n, m], + groups=['entity_id'], + from_obj="source", + prefix="mysource", + state_table="tbl" + ) + + assert aggregation.imputation_flag_base('mysource_entity_id_x_sum') == 'mysource_entity_id_x' + assert aggregation.imputation_flag_base('mysource_entity_id_x_count') == 'mysource_entity_id_x' + assert aggregation.imputation_flag_base('mysource_entity_id_y_stddev_samp') == 'mysource_entity_id_y_stddev_samp' + with pytest.raises(KeyError): + aggregation.imputation_flag_base('mysource_entity_id_x_stddev_samp') diff --git a/src/triage/component/architect/builders.py b/src/triage/component/architect/builders.py index 03d72adf8..dd77cf4f6 100644 --- a/src/triage/component/architect/builders.py +++ b/src/triage/component/architect/builders.py @@ -239,8 +239,8 @@ def build_matrix( self.db_engine, ): logger.warning("labels table is not populated, cannot build matrix") - if self.run_id: - errored_matrix(self.run_id, self.db_engine) + if self.run_id: + errored_matrix(self.run_id, self.db_engine) matrix_store = 
self.matrix_storage_engine.get_store(matrix_uuid) if not self.replace and matrix_store.exists: diff --git a/src/triage/component/collate/collate.py b/src/triage/component/collate/collate.py index 804642bc6..be925cb44 100644 --- a/src/triage/component/collate/collate.py +++ b/src/triage/component/collate/collate.py @@ -29,6 +29,10 @@ } +class NoAggregateFunctionError(ValueError): + pass + + def make_list(a): return [a] if not isinstance(a, list) else a @@ -497,6 +501,24 @@ def colname_aggregate_lookup(self): lookup[col.name] = agg return lookup + def colname_agg_function(self, colname): + if colname.endswith('_imp'): + raise ValueError('Imputation flag columns cannot have their aggregation function inferred') + + aggregate = self.colname_aggregate_lookup[colname] + if hasattr(aggregate, 'functions'): + used_function = next(funcname for funcname in aggregate.functions if colname.endswith(funcname)) + return used_function + else: + raise NoAggregateFunctionError() + + def imputation_flag_base(self, colname): + used_function = self.colname_agg_function(colname) + if used_function in AGGFUNCS_NEED_MULTIPLE_VALUES: + return colname + else: + return colname.rstrip('_' + used_function) + def _col_prefix(self, group): """ Helper for creating a column prefix for the group @@ -726,18 +748,14 @@ def _get_impute_select(self, impute_cols, nonimpute_cols, partitionby=None): # the function, and see its available functions. we expect exactly one of # these functions to end the column name and remove it if so # this is passed to the imputer - if hasattr(self.colname_aggregate_lookup[col], 'functions'): - agg_functions = self.colname_aggregate_lookup[col].functions - used_function = next(funcname for funcname in agg_functions if col.endswith(funcname)) - if used_function in AGGFUNCS_NEED_MULTIPLE_VALUES: - impflag_basecol = col - else: - impflag_basecol = col.rstrip('_' + used_function) - else: + try: + impflag_basecol = self.imputation_flag_base(col) + except NoAggregationFunctionError: logger.warning("Imputation flag merging is not implemented for " "AggregateExpression objects that don't define an aggregate " "function (e.g. 
composites)") impflag_basecol = col + impute_rule = imprules[col] try: diff --git a/src/triage/risklist/__init__.py b/src/triage/risklist/__init__.py index d5ce92e97..9e287cf2f 100644 --- a/src/triage/risklist/__init__.py +++ b/src/triage/risklist/__init__.py @@ -62,11 +62,11 @@ def get_feature_names(aggregation, matrix_metadata): return feature_group, feature_names_in_group -def get_feature_needs_imputation_in_train(feature_names): +def get_feature_needs_imputation_in_train(aggregation, feature_names): features_imputed_in_train = [ f for f in set(feature_names) if not f.endswith('_imp') - and '_'.join(f.split('_')[0:-1]) + '_imp' in feature_names + and aggregation.imputation_flag_base(f) + '_imp' in feature_names ] logger.spam("Features imputed in train = %s", features_imputed_in_train) return features_imputed_in_train @@ -133,7 +133,7 @@ def generate_risk_list(db_engine, project_storage, model_id, as_of_date): # Make sure that the features imputed in training should also be imputed in production - features_imputed_in_train = get_feature_needs_imputation_in_train(feature_names) + features_imputed_in_train = get_feature_needs_imputation_in_train(aggregation, feature_names) features_imputed_in_production = get_feature_needs_imputation_in_production(aggregation, conn) total_impute_cols = set(features_imputed_in_production) | set(features_imputed_in_train) From 7dfb7e196aae1891f1dbe74ad1eb492dcb7372c7 Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Sun, 10 Jan 2021 23:01:52 -0600 Subject: [PATCH 19/52] Add more tests, fill out doc page --- docs/sources/risklist/index.md | 41 ++++++++++++++++++++++++++++++++- src/tests/test_risklist.py | 39 ++++++++++++++++++++++++++++++- src/triage/risklist/__init__.py | 8 +++---- 3 files changed, 82 insertions(+), 6 deletions(-) diff --git a/docs/sources/risklist/index.md b/docs/sources/risklist/index.md index 3d244420c..7edc9be5b 100644 --- a/docs/sources/risklist/index.md +++ b/docs/sources/risklist/index.md @@ -1,3 +1,42 @@ # Risklist -Here is a stub of the risklist page +If you would like to generate a list of predictions on an already-trained Triage model with new data, you can use the 'Risklist' module. + +## Examples +Both examples assume you have already run a Triage Experiment in the past, and know these two pieces of information: +1. A `model_id` from a Triage model that you want to use to generate predictions +2. An `as_of_date` to generate your predictions on. + +### CLI +`triage risklist <model_id> <as_of_date>` + +Example: +`triage risklist 46 2019-05-06` + +The risklist will assume the current path to be the 'project path' to find models and write matrices, but this can be overridden by sending the `--project-path` option. + +### Python + +The `generate_risk_list` function from the `triage.risklist` module can be used similarly to the CLI, with the addition of the database engine and project storage as inputs.
+``` +from triage.risklist import generate_risk_list +from triage.component.catwalk.storage import ProjectStorage +from triage import create_engine + +generate_risk_list( + db_engine=create_engine(), + project_storage=ProjectStorage('/home/you/triage/project2'), + model_id=46, + as_of_date='2019-05-06' +) +``` + +## Output +The Risklist is stored similarly to the matrices created during an Experiment: +- Raw Matrix saved to the matrices directory in project storage +- Predictions saved in a table (production.list_predictions) +- Prediction metadata (tiebreaking, random seed) saved in a table (production.prediction_metadata) + +## Notes +- The cohort and features for the Risklist are all inferred from the Experiment that trained the given model_id (as defined by the experiment_models table). +- The feature list ensures that imputation flag columns are present for any columns that either needed to be imputed in the training process, or that needed to be imputed in the risklist dataset. diff --git a/src/tests/test_risklist.py b/src/tests/test_risklist.py index 7fbf16ca1..20effc092 100644 --- a/src/tests/test_risklist.py +++ b/src/tests/test_risklist.py @@ -2,7 +2,7 @@ from triage.validation_primitives import table_should_have_data -def test_risklist(finished_experiment):
feature_names + ] logger.spam("Features imputed in train = %s", features_imputed_in_train) return features_imputed_in_train From cc9fe4ae011b6092589a67f1def00fd95643faea Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Sun, 10 Jan 2021 23:08:33 -0600 Subject: [PATCH 20/52] Fix exception name typo --- src/triage/component/collate/collate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/triage/component/collate/collate.py b/src/triage/component/collate/collate.py index be925cb44..457d8d1a2 100644 --- a/src/triage/component/collate/collate.py +++ b/src/triage/component/collate/collate.py @@ -750,7 +750,7 @@ def _get_impute_select(self, impute_cols, nonimpute_cols, partitionby=None): # this is passed to the imputer try: impflag_basecol = self.imputation_flag_base(col) - except NoAggregationFunctionError: + except NoAggregateFunctionError: logger.warning("Imputation flag merging is not implemented for " "AggregateExpression objects that don't define an aggregate " "function (e.g. composites)") From 59515658f47f8962de4a8ea800efb68f1b47755a Mon Sep 17 00:00:00 2001 From: tweddielin Date: Fri, 15 Jan 2021 00:07:01 -0600 Subject: [PATCH 21/52] use timechop and planner to create matrix_metadata for production --- src/tests/test_risklist.py | 6 +- src/triage/component/architect/planner.py | 20 ++-- src/triage/experiments/validate.py | 2 +- src/triage/risklist/__init__.py | 115 ++++++++++++++-------- 4 files changed, 91 insertions(+), 52 deletions(-) diff --git a/src/tests/test_risklist.py b/src/tests/test_risklist.py index 20effc092..2ded72fbd 100644 --- a/src/tests/test_risklist.py +++ b/src/tests/test_risklist.py @@ -8,7 +8,7 @@ def test_risklist_should_write_predictions(finished_experiment): # generate records in listpredictions # the # of records should equal the size of the cohort for that date model_id = 1 - as_of_date = '2013-01-01' + as_of_date = '2014-01-01' generate_risk_list( db_engine=finished_experiment.db_engine, project_storage=finished_experiment.project_storage, @@ -22,7 +22,7 @@ def test_risklist_should_write_predictions(finished_experiment): def test_risklist_should_be_same_shape_as_cohort(finished_experiment): model_id = 1 - as_of_date = '2013-01-01' + as_of_date = '2014-01-01' generate_risk_list( db_engine=finished_experiment.db_engine, project_storage=finished_experiment.project_storage, @@ -44,7 +44,7 @@ def test_risklist_should_be_same_shape_as_cohort(finished_experiment): def test_risklist_matrix_record_is_populated(finished_experiment): model_id = 1 - as_of_date = '2013-01-01' + as_of_date = '2014-01-01' generate_risk_list( db_engine=finished_experiment.db_engine, project_storage=finished_experiment.project_storage, diff --git a/src/triage/component/architect/planner.py b/src/triage/component/architect/planner.py index 18ca8e65d..92ea2e955 100644 --- a/src/triage/component/architect/planner.py +++ b/src/triage/component/architect/planner.py @@ -37,15 +37,17 @@ def _generate_build_task( "matrix_metadata": matrix_metadata, "matrix_type": matrix_metadata["matrix_type"], } - - def _make_metadata( - self, + + @staticmethod + def make_metadata( matrix_definition, feature_dictionary, label_name, label_type, cohort_name, matrix_type, + feature_start_time, + user_metadata, ): """ Generate dictionary of matrix metadata. 
@@ -77,7 +79,7 @@ def _make_metadata( ) matrix_metadata = { # temporal information - "feature_start_time": self.feature_start_time, + "feature_start_time": feature_start_time, "end_time": matrix_definition["matrix_info_end_time"], "as_of_date_frequency": matrix_definition.get( "training_as_of_date_frequency", @@ -100,7 +102,7 @@ def _make_metadata( "matrix_type": matrix_type, } matrix_metadata.update(matrix_definition) - matrix_metadata.update(self.user_metadata) + matrix_metadata.update(user_metadata) return matrix_metadata @@ -138,13 +140,15 @@ def generate_plans(self, matrix_set_definitions, feature_dictionaries): ): matrix_set_clone = copy.deepcopy(matrix_set) # get a uuid - train_metadata = self._make_metadata( + train_metadata = self.make_metadata( train_matrix, feature_dictionary, label_name, label_type, cohort_name, "train", + self.feature_start_time, + self.user_metadata, ) train_uuid = filename_friendly_hash(train_metadata) logger.debug( @@ -168,13 +172,15 @@ def generate_plans(self, matrix_set_definitions, feature_dictionaries): test_uuids = [] for test_matrix in matrix_set_clone["test_matrices"]: - test_metadata = self._make_metadata( + test_metadata = self.make_metadata( test_matrix, feature_dictionary, label_name, label_type, cohort_name, "test", + self.feature_start_time, + self.user_metadata, ) test_uuid = filename_friendly_hash(test_metadata) logger.debug( diff --git a/src/triage/experiments/validate.py b/src/triage/experiments/validate.py index 736feaafe..1e583e249 100644 --- a/src/triage/experiments/validate.py +++ b/src/triage/experiments/validate.py @@ -695,7 +695,7 @@ def _run(self, model_group_keys, user_metadata): ) ) classifier_keys = ["class_path", "parameters"] - # planner_keys are defined in architect.Planner._make_metadata + # planner_keys are defined in architect.Planner.make_metadata planner_keys = [ "feature_start_time", "end_time", diff --git a/src/triage/risklist/__init__.py b/src/triage/risklist/__init__.py index 75362ff3b..fbbde6ade 100644 --- a/src/triage/risklist/__init__.py +++ b/src/triage/risklist/__init__.py @@ -1,10 +1,13 @@ from triage.component.results_schema import upgrade_db, Experiment, ExperimentModel from triage.component.architect.entity_date_table_generators import EntityDateTableGenerator, DEFAULT_ACTIVE_STATE from triage.component.architect.features import FeatureGenerator +from triage.component.architect.feature_group_creator import FeatureGroup from triage.component.architect.builders import MatrixBuilder +from triage.component.architect.planner import Planner +from triage.component.timechop import Timechop from triage.component.catwalk.predictors import Predictor from triage.component.catwalk.utils import filename_friendly_hash -from triage.util.conf import dt_from_str +from triage.util.conf import convert_str_to_relativedelta, dt_from_str from triage.util.db import scoped_session from sqlalchemy import select @@ -63,6 +66,11 @@ def get_feature_names(aggregation, matrix_metadata): return feature_group, feature_names_in_group def get_feature_needs_imputation_in_train(aggregation, feature_names): + """Returns features that needs imputation from training data + Args: + aggregation (SpacetimeAggregation) + feature_names (list) A list of feature names + """ features_imputed_in_train = [ f for f in set(feature_names) if not f.endswith('_imp') @@ -72,8 +80,15 @@ def get_feature_needs_imputation_in_train(aggregation, feature_names): return features_imputed_in_train -def get_feature_needs_imputation_in_production(aggregation, conn): - 
nulls_results = conn.execute(aggregation.find_nulls()) +def get_feature_needs_imputation_in_production(aggregation, db_engine): + """Returns features that needs imputation from production + Args: + aggregation (SpacetimeAggregation) + db_engine (sqlalchemy.db.engine) + """ + with db_engine.begin() as conn: + nulls_results = conn.execute(aggregation.find_nulls()) + null_counts = nulls_results.first().items() features_imputed_in_production = [col for (col, val) in null_counts if val > 0] @@ -95,7 +110,7 @@ def generate_risk_list(db_engine, project_storage, model_id, as_of_date): # 1. Get feature and cohort config from database (train_matrix_uuid, matrix_metadata) = train_matrix_info_from_model_id(db_engine, model_id) experiment_config = experiment_config_from_model_id(db_engine, model_id) - + # 2. Generate cohort cohort_table_name = f"production.cohort_{experiment_config['cohort_config']['name']}" cohort_table_generator = EntityDateTableGenerator( @@ -124,29 +139,31 @@ def generate_risk_list(db_engine, project_storage, model_id, as_of_date): ) # 4. Reconstruct feature disctionary from feature_names and generate imputation - reconstructed_feature_dictionary = {} + + reconstructed_feature_dict = FeatureGroup() imputation_table_tasks = OrderedDict() - with db_engine.begin() as conn: - for aggregation in collate_aggregations: - feature_group, feature_names = get_feature_names(aggregation, matrix_metadata) - reconstructed_feature_dictionary[feature_group] = feature_names - - # Make sure that the features imputed in training should also be imputed in production - - features_imputed_in_train = get_feature_needs_imputation_in_train(aggregation, feature_names) - features_imputed_in_production = get_feature_needs_imputation_in_production(aggregation, conn) - - total_impute_cols = set(features_imputed_in_production) | set(features_imputed_in_train) - total_nonimpute_cols = set(f for f in set(feature_names) if '_imp' not in f) - total_impute_cols - - task_generator = feature_generator._generate_imp_table_tasks_for - - imputation_table_tasks.update(task_generator( - aggregation, - impute_cols=list(total_impute_cols), - nonimpute_cols=list(total_nonimpute_cols) - ) + + for aggregation in collate_aggregations: + feature_group, feature_names = get_feature_names(aggregation, matrix_metadata) + reconstructed_feature_dict[feature_group] = feature_names + + # Make sure that the features imputed in training should also be imputed in production + + features_imputed_in_train = get_feature_needs_imputation_in_train(aggregation, feature_names) + + features_imputed_in_production = get_feature_needs_imputation_in_production(aggregation, db_engine) + + total_impute_cols = set(features_imputed_in_production) | set(features_imputed_in_train) + total_nonimpute_cols = set(f for f in set(feature_names) if '_imp' not in f) - total_impute_cols + + task_generator = feature_generator._generate_imp_table_tasks_for + + imputation_table_tasks.update(task_generator( + aggregation, + impute_cols=list(total_impute_cols), + nonimpute_cols=list(total_nonimpute_cols) ) + ) feature_generator.process_table_tasks(imputation_table_tasks) # 5. 
Build matrix @@ -163,26 +180,42 @@ def generate_risk_list(db_engine, project_storage, model_id, as_of_date): experiment_hash=None, replace=True, ) - - matrix_metadata = { - 'as_of_times': [as_of_date], - 'matrix_id': str(as_of_date) + '_prediction', - 'state': DEFAULT_ACTIVE_STATE, - 'test_duration': '1y', - 'matrix_type': 'production', - 'label_timespan': None, - 'label_name': experiment_config['label_config']['name'], - 'indices': ["entity_id", "as_of_date"], - 'feature_start_time': experiment_config['temporal_config']['feature_start_time'], - } + + feature_start_time = experiment_config['temporal_config']['feature_start_time'] + label_name = experiment_config['label_config']['name'] + label_type = 'binary' + cohort_name = experiment_config['cohort_config']['name'] + user_metadata = experiment_config['user_metadata'] + + # Use timechop to get the time definition for production + temporal_config = experiment_config["temporal_config"] + timechopper = Timechop(**temporal_config) + prod_definitions = timechopper.define_test_matrices( + dt_from_str(as_of_date), + temporal_config['test_durations'][0], + temporal_config['test_label_timespans'][0] + ) + + matrix_metadata = Planner.make_metadata( + prod_definitions[-1], + reconstructed_feature_dict, + label_name, + label_type, + cohort_name, + 'production', + feature_start_time, + user_metadata, + ) + + matrix_metadata['matrix_id'] = str(as_of_date) + f'_model_id_{model_id}' + '_risklist' matrix_uuid = filename_friendly_hash(matrix_metadata) - + matrix_builder.build_matrix( as_of_times=[as_of_date], - label_name=experiment_config['label_config']['name'], - label_type=None, - feature_dictionary=reconstructed_feature_dictionary, + label_name=label_name, + label_type=label_type, + feature_dictionary=reconstructed_feature_dict, matrix_metadata=matrix_metadata, matrix_uuid=matrix_uuid, matrix_type="production", From 537f6c81956de5aaaee957e31cb8afc054f3c0ff Mon Sep 17 00:00:00 2001 From: tweddielin Date: Thu, 15 Apr 2021 02:51:23 -0400 Subject: [PATCH 22/52] retrain and predict forward --- docs/sources/risklist/index.md | 73 ++- src/tests/test_cli.py | 17 +- src/tests/test_predictlist.py | 109 ++++ src/tests/test_risklist.py | 70 --- src/triage/cli.py | 41 +- .../component/catwalk/model_trainers.py | 71 ++- ..._production_schema_and_prediction_table.py | 30 + src/triage/component/results_schema/schema.py | 8 +- src/triage/predictlist/__init__.py | 593 ++++++++++++++++++ src/triage/risklist/__init__.py | 236 ------- 10 files changed, 891 insertions(+), 357 deletions(-) create mode 100644 src/tests/test_predictlist.py delete mode 100644 src/tests/test_risklist.py create mode 100644 src/triage/component/results_schema/alembic/versions/cdd0dc9d9870_rename_production_schema_and_prediction_table.py create mode 100644 src/triage/predictlist/__init__.py delete mode 100644 src/triage/risklist/__init__.py diff --git a/docs/sources/risklist/index.md b/docs/sources/risklist/index.md index 7edc9be5b..c48d9acf8 100644 --- a/docs/sources/risklist/index.md +++ b/docs/sources/risklist/index.md @@ -1,6 +1,8 @@ -# Risklist +# Predictlist +If you would like to generate a list of predictions on an already-trained Triage model with new data, you can use the 'Predictlist' module. -If you would like to generate a list of predictions on an already-trained Triage model with new data, you can use the 'Risklist' module. +# Predict Forward with an Existing Model +Use an existing model object to generate predictions on new data.
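+
+If you are not sure which `model_id` to use, you can look one up in the database before running the examples below. A minimal sketch (this assumes the `triage_metadata.models` table used throughout this branch and a `db_url` variable pointing at the same database; it is an illustration, not part of the predictlist API):
+
+```python
+from triage import create_engine
+
+db_engine = create_engine(db_url)  # db_url: your database URL (assumed to exist)
+# list the most recently trained models and the groups they belong to
+for row in db_engine.execute(
+    "select model_id, model_group_id, model_hash, train_end_time "
+    "from triage_metadata.models order by train_end_time desc limit 10"
+):
+    print(row)
+```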
## Examples Both examples assume you have already run a Triage Experiment in the past, and know these two pieces of information: 1. A `model_id` from a Triage model that you want to use to generate predictions 2. An `as_of_date` to generate your predictions on. ### CLI -`triage risklist <model_id> <as_of_date>` +`triage predictlist <model_id> <as_of_date>` Example: -`triage risklist 46 2019-05-06` +`triage predictlist 46 2019-05-06` -The risklist will assume the current path to be the 'project path' to find models and write matrices, but this can be overridden by sending the `--project-path` option. +The predictlist will assume the current path to be the 'project path' to find models and write matrices, but this can be overridden by sending the `--project-path` option. ### Python -The `generate_risk_list` function from the `triage.risklist` module can be used similarly to the CLI, with the addition of the database engine and project storage as inputs. +The `predict_forward_with_existed_model` function from the `triage.predictlist` module can be used similarly to the CLI, with the addition of the database engine and project path as inputs. ``` -from triage.risklist import generate_risk_list -from triage.component.catwalk.storage import ProjectStorage +from triage.predictlist import predict_forward_with_existed_model from triage import create_engine -generate_risk_list( +predict_forward_with_existed_model( db_engine=create_engine(), - project_storage=ProjectStorage('/home/you/triage/project2'), + project_path='/home/you/triage/project2', model_id=46, as_of_date='2019-05-06' ) ``` ## Output -The Risklist is stored similarly to the matrices created during an Experiment: +The Predictlist is stored similarly to the matrices created during an Experiment: - Raw Matrix saved to the matrices directory in project storage -- Predictions saved in a table (production.list_predictions) -- Prediction metadata (tiebreaking, random seed) saved in a table (production.prediction_metadata) +- Predictions saved in a table (triage_production.predictions) +- Prediction metadata (tiebreaking, random seed) saved in a table (triage_production.prediction_metadata) ## Notes -- The cohort and features for the Risklist are all inferred from the Experiment that trained the given model_id (as defined by the experiment_models table). -- The feature list ensures that imputation flag columns are present for any columns that either needed to be imputed in the training process, or that needed to be imputed in the risklist dataset. +- The cohort and features for the Predictlist are all inferred from the Experiment that trained the given model_id (as defined by the experiment_models table). +- The feature list ensures that imputation flag columns are present for any columns that either needed to be imputed in the training process, or that needed to be imputed in the predictlist dataset. + +# Retrain and Predict +Use an existing model group to retrain a new model on all the data up to the current date and then predict forward into the future. + +## Examples +Both examples assume you have already run a Triage Experiment in the past, and know these two pieces of information: +1. A `model_group_id` from a Triage model group that you want to use to retrain a model and generate predictions (see the lookup sketch below) +2. A `today` to generate your predictions on.
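+
+The `model_group_id` can be looked up the same way. A minimal sketch (same assumptions as the lookup sketch above; the hard-coded `46` is just an example `model_id`):
+
+```python
+from triage import create_engine
+
+db_engine = create_engine(db_url)  # db_url: your database URL (assumed to exist)
+# find the model group that an already-trained model belongs to
+(model_group_id,) = db_engine.execute(
+    "select model_group_id from triage_metadata.models where model_id = %s", 46
+).first()
+print(model_group_id)
+```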
+
+### CLI
+`triage retrainpredict <model_group_id> <today>`
+
+Example:
+`triage retrainpredict 30 2021-04-04`
+
+The `retrainpredict` will assume the current path to be the 'project path' to train models and write matrices, but this can be overridden by sending the `--project-path` option.
+
+### Python
+The `Retrainer` class from the `triage.predictlist` module can be used to retrain a model and predict forward.
+
+```python
+from triage.predictlist import Retrainer
+from triage import create_engine
+
+retrainer = Retrainer(
+    db_engine=create_engine(),
+    project_path='/home/you/triage/project2',
+    model_group_id=36,
+)
+retrainer.retrain(today='2021-04-04')
+retrainer.predict(today='2021-04-04')
+```
+
+## Output
+The retrained model is stored similarly to the matrices created during an Experiment:
+- Raw Matrix saved to the matrices directory in project storage
+- Raw Model saved to the trained_model directory in project storage
+- Retrained Model info saved in a table (triage_metadata.models) where model_comment = 'retrain_2021-04-04'
+- Predictions saved in a table (triage_production.predictions)
+- Prediction metadata (tiebreaking, random seed) saved in a table (triage_production.prediction_metadata)
+
+
diff --git a/src/tests/test_cli.py b/src/tests/test_cli.py
index 4577b4b8b..8df512414 100644
--- a/src/tests/test_cli.py
+++ b/src/tests/test_cli.py
@@ -59,11 +59,20 @@ def test_featuretest():
             cohortmock.assert_called_once()
 
 
-def test_cli_risklist():
-    with patch('triage.cli.generate_risk_list', autospec=True) as mock:
-        try_command('risklist', '40', '2019-06-04')
+def test_cli_predictlist():
+    with patch('triage.cli.predict_forward_with_existed_model', autospec=True) as mock:
+        try_command('predictlist', '40', '2019-06-04')
         mock.assert_called_once()
         assert mock.call_args[0][0].url
-        assert mock.call_args[0][1].project_path
+        assert mock.call_args[0][1]
         assert mock.call_args[0][2] == 40
         assert mock.call_args[0][3] == datetime.datetime(2019, 6, 4)
+
+
+def test_cli_retrain_predict():
+    with patch('triage.cli.Retrainer', autospec=True) as mock:
+        try_command('retrainpredict', '3', '2021-04-04')
+        mock.assert_called_once()
+        assert mock.call_args[0][0].url
+        assert mock.call_args[0][1]
+        assert mock.call_args[0][2] == 3
diff --git a/src/tests/test_predictlist.py b/src/tests/test_predictlist.py
new file mode 100644
index 000000000..a64d51e6a
--- /dev/null
+++ b/src/tests/test_predictlist.py
@@ -0,0 +1,109 @@
+from triage.predictlist import Retrainer, predict_forward_with_existed_model, train_matrix_info_from_model_id, experiment_config_from_model_id
+from triage.validation_primitives import table_should_have_data
+
+
+def test_predict_forward_with_existed_model_should_write_predictions(finished_experiment):
+    # given a model id and as-of-date <= today
+    # and the model id is trained and is linked to an experiment with feature and cohort config
+    # generate records in triage_production.predictions
+    # the # of records should equal the size of the cohort for that date
+    model_id = 1
+    as_of_date = '2014-01-01'
+    predict_forward_with_existed_model(
+        db_engine=finished_experiment.db_engine,
+        project_path=finished_experiment.project_storage.project_path,
+        model_id=model_id,
+        as_of_date=as_of_date
+    )
+    table_should_have_data(
+        db_engine=finished_experiment.db_engine,
+        table_name="triage_production.predictions",
+    )
+
+
+def test_predict_forward_with_existed_model_should_be_same_shape_as_cohort(finished_experiment):
+    model_id = 1
+    as_of_date = '2014-01-01'
+    predict_forward_with_existed_model(
+        
db_engine=finished_experiment.db_engine, + project_path=finished_experiment.project_storage.project_path, + model_id=model_id, + as_of_date=as_of_date) + + num_records_matching_cohort = finished_experiment.db_engine.execute( + f'''select count(*) + from triage_production.predictions + join triage_production.cohort_{finished_experiment.config['cohort_config']['name']} using (entity_id, as_of_date) + ''' + ).first()[0] + + num_records = finished_experiment.db_engine.execute( + 'select count(*) from triage_production.predictions' + ).first()[0] + assert num_records_matching_cohort == num_records + + +def test_predict_forward_with_existed_model_matrix_record_is_populated(finished_experiment): + model_id = 1 + as_of_date = '2014-01-01' + predict_forward_with_existed_model( + db_engine=finished_experiment.db_engine, + project_path=finished_experiment.project_storage.project_path, + model_id=model_id, + as_of_date=as_of_date) + + matrix_records = list(finished_experiment.db_engine.execute( + "select * from triage_metadata.matrices where matrix_type = 'production'" + )) + assert len(matrix_records) == 1 + + +def test_experiment_config_from_model_id(finished_experiment): + model_id = 1 + experiment_config = experiment_config_from_model_id(finished_experiment.db_engine, model_id) + assert experiment_config == finished_experiment.config + + +def test_train_matrix_info_from_model_id(finished_experiment): + model_id = 1 + (train_matrix_uuid, matrix_metadata) = train_matrix_info_from_model_id(finished_experiment.db_engine, model_id) + assert train_matrix_uuid + assert matrix_metadata + + +def test_retrain_should_write_model(finished_experiment): + # given a model id and today + # and the model id is trained and is linked to an experiment with feature and cohort config + # create matrix for retraining a model + # generate records in production models + # retrain_model_hash should be the same with model_hash in triage_metadata.models + model_group_id = 1 + today = '2014-03-01' + + retrainer = Retrainer( + db_engine=finished_experiment.db_engine, + project_path=finished_experiment.project_storage.project_path, + model_group_id=model_group_id, + ) + retrainer.retrain(today) + + records = [ + row + for row in finished_experiment.db_engine.execute( + "select model_hash from triage_metadata.models where model_comment = 'retrain_2014-03-01'" + ) + ] + assert len(records) == 1 + assert retrainer.retrained_model_hash == records[0][0] + + retrainer.predict(today) + + table_should_have_data( + db_engine=finished_experiment.db_engine, + table_name="triage_production.predictions", + ) + + matrix_records = list(finished_experiment.db_engine.execute( + f"select * from triage_metadata.matrices where matrix_uuid = '{retrainer.predict_matrix_uuid}'" + )) + assert len(matrix_records) == 1 diff --git a/src/tests/test_risklist.py b/src/tests/test_risklist.py deleted file mode 100644 index 2ded72fbd..000000000 --- a/src/tests/test_risklist.py +++ /dev/null @@ -1,70 +0,0 @@ -from triage.risklist import generate_risk_list, train_matrix_info_from_model_id, experiment_config_from_model_id -from triage.validation_primitives import table_should_have_data - - -def test_risklist_should_write_predictions(finished_experiment): - # given a model id and as-of-date <= today - # and the model id is trained and is linked to an experiment with feature and cohort config - # generate records in listpredictions - # the # of records should equal the size of the cohort for that date - model_id = 1 - as_of_date = '2014-01-01' - generate_risk_list( 
-        db_engine=finished_experiment.db_engine,
-        project_storage=finished_experiment.project_storage,
-        model_id=model_id,
-        as_of_date=as_of_date)
-    table_should_have_data(
-        db_engine=finished_experiment.db_engine,
-        table_name="production.list_predictions",
-    )
-
-
-def test_risklist_should_be_same_shape_as_cohort(finished_experiment):
-    model_id = 1
-    as_of_date = '2014-01-01'
-    generate_risk_list(
-        db_engine=finished_experiment.db_engine,
-        project_storage=finished_experiment.project_storage,
-        model_id=model_id,
-        as_of_date=as_of_date)
-
-    num_records_matching_cohort = finished_experiment.db_engine.execute(
-        f'''select count(*)
-        from production.list_predictions
-        join production.cohort_{finished_experiment.config['cohort_config']['name']} using (entity_id, as_of_date)
-        '''
-    ).first()[0]
-
-    num_records = finished_experiment.db_engine.execute(
-        'select count(*) from production.list_predictions'
-    ).first()[0]
-    assert num_records_matching_cohort == num_records
-
-
-def test_risklist_matrix_record_is_populated(finished_experiment):
-    model_id = 1
-    as_of_date = '2014-01-01'
-    generate_risk_list(
-        db_engine=finished_experiment.db_engine,
-        project_storage=finished_experiment.project_storage,
-        model_id=model_id,
-        as_of_date=as_of_date)
-
-    matrix_records = list(finished_experiment.db_engine.execute(
-        "select * from triage_metadata.matrices where matrix_type = 'production'"
-    ))
-    assert len(matrix_records) == 1
-
-
-def test_experiment_config_from_model_id(finished_experiment):
-    model_id = 1
-    experiment_config = experiment_config_from_model_id(finished_experiment.db_engine, model_id)
-    assert experiment_config == finished_experiment.config
-
-
-def test_train_matrix_info_from_model_id(finished_experiment):
-    model_id = 1
-    (train_matrix_uuid, matrix_metadata) = train_matrix_info_from_model_id(finished_experiment.db_engine, model_id)
-    assert train_matrix_uuid
-    assert matrix_metadata
diff --git a/src/triage/cli.py b/src/triage/cli.py
index b53b1c4b4..7c3e16c7a 100755
--- a/src/triage/cli.py
+++ b/src/triage/cli.py
@@ -20,7 +20,7 @@
     MultiCoreExperiment,
     SingleThreadedExperiment,
 )
-from triage.risklist import generate_risk_list
+from triage.predictlist import predict_forward_with_existed_model, Retrainer
 from triage.component.postmodeling.crosstabs import CrosstabsConfigLoader, run_crosstabs
 from triage.util.db import create_engine
 
@@ -399,9 +399,41 @@ def __call__(self, args):
             config = CrosstabsConfigLoader(config=yaml.full_load(fd))
         run_crosstabs(db_engine, config)
 
+@Triage.register
+class RetrainPredict(Command):
+    """Given a model_group_id, retrain a model and predict forward using all data up to the current date"""
+
+    def __init__(self, parser):
+        parser.add_argument(
+            "model_group_id",
+            type=natural_number,
+            help="The model_group_id to use for retrain and predict"
+        )
+
+        parser.add_argument(
+            "today",
+            type=valid_date,
+            help="The date as of which to run features. 
Format YYYY-MM-DD", + ) + parser.add_argument( + "--project-path", + default=os.getcwd(), + help="path to store matrices and trained models", + ) + + def __call__(self, args): + db_engine = create_engine(self.root.db_url) + retrainer = Retrainer( + db_engine, + args.project_path, + args.model_group_id, + ) + retrainer.retrain(args.today) + retrainer.predict(args.today) + @Triage.register -class Risklist(Command): +class Predictlist(Command): """Generate a list of risk scores from an already-trained model and new data""" def __init__(self, parser): @@ -423,10 +455,9 @@ def __init__(self, parser): def __call__(self, args): db_engine = create_engine(self.root.db_url) - - generate_risk_list( + predict_forward_with_existed_model( db_engine, - ProjectStorage(args.project_path), + args.project_path, args.model_id, args.as_of_date ) diff --git a/src/triage/component/catwalk/model_trainers.py b/src/triage/component/catwalk/model_trainers.py index ed1ea45ae..48fcbfc52 100644 --- a/src/triage/component/catwalk/model_trainers.py +++ b/src/triage/component/catwalk/model_trainers.py @@ -184,6 +184,7 @@ def _write_model_to_db( model_group_id, model_size, misc_db_parameters, + retrain, ): """Writes model and feature importance data to a database Will overwrite the data of any previous versions @@ -207,22 +208,33 @@ def _write_model_to_db( misc_db_parameters (dict) params to pass through to the database """ model_id = retrieve_model_id_from_hash(self.db_engine, model_hash) - if model_id and not self.replace: + if model_id and not self.replace and not retrain: logger.notice( f"Metadata for model {model_id} found in database. Reusing model metadata." ) return model_id else: - model = Model( - model_hash=model_hash, - model_type=class_path, - hyperparameters=parameters, - model_group_id=model_group_id, - built_by_experiment=self.experiment_hash, - built_in_experiment_run=self.run_id, - model_size=model_size, - **misc_db_parameters, - ) + if retrain: + model = Model( + model_group_id=model_group_id, + model_hash=model_hash, + model_type=class_path, + hyperparameters=parameters, + model_size=model_size, + **misc_db_parameters, + ) + + else: + model = Model( + model_hash=model_hash, + model_type=class_path, + hyperparameters=parameters, + model_group_id=model_group_id, + built_by_experiment=self.experiment_hash, + built_in_experiment_run=self.run_id, + model_size=model_size, + **misc_db_parameters, + ) session = self.sessionmaker() if model_id: logger.notice( @@ -237,7 +249,7 @@ def _write_model_to_db( model_id = model.model_id logger.notice(f"Model {model_id}, not found from previous runs. 
Adding the new model")
         session.close()
-
+
         logger.spam(f"Saving feature importances for model_id {model_id}")
         self._save_feature_importances(
             model_id, get_feature_importances(trained_model), feature_names
@@ -246,7 +258,7 @@ def _write_model_to_db(
         return model_id
 
     def _train_and_store_model(
-        self, matrix_store, class_path, parameters, model_hash, misc_db_parameters, random_seed
+        self, matrix_store, class_path, parameters, model_hash, misc_db_parameters, random_seed, retrain, model_group_id,
    ):
        """Train a model, cache it, and write metadata to a database

@@ -267,17 +279,29 @@ def _train_and_store_model(
 
         unique_parameters = self.unique_parameters(parameters)
 
-        model_group_id = self.model_grouper.get_model_group_id(
-            class_path, unique_parameters, matrix_store.metadata, self.db_engine
-        )
+
+        if model_hash is None and retrain and model_group_id:
+            model_hash = self._model_hash(
+                matrix_store.metadata,
+                class_path,
+                parameters,
+                random_seed,
+            )
+        else:
+            model_group_id = self.model_grouper.get_model_group_id(
+                class_path, unique_parameters, matrix_store.metadata, self.db_engine
+            )
+
+        # Writing the model to storage, then getting its size in kilobytes.
+        self.model_storage_engine.write(trained_model, model_hash)
+
         logger.debug(
             f"Trained model: hash {model_hash}, model group {model_group_id} "
         )
-        # Writing th model to storage, then getting its size in kilobytes.
-        self.model_storage_engine.write(trained_model, model_hash)
+
+        logger.spam(f"Cached model: {model_hash}")
+
         model_size = sys.getsizeof(trained_model) / (1024.0)
-        logger.spam(f"Cached model: {model_hash}")
+
         model_id = self._write_model_to_db(
             class_path,
             unique_parameters,
@@ -287,9 +311,10 @@ def _train_and_store_model(
             model_group_id,
             model_size,
             misc_db_parameters,
+            retrain,
         )
         logger.debug(f"Wrote model {model_id} [{model_hash}] to db")
-        return model_id
+        return model_id, model_hash
 
     @contextmanager
     def cache_models(self):
@@ -347,7 +372,7 @@ def train_models(self, grid_config, misc_db_parameters, matrix_store):
         ]
 
     def process_train_task(
-        self, matrix_store, class_path, parameters, model_hash, misc_db_parameters, random_seed=None
+        self, matrix_store, class_path, parameters, model_hash, misc_db_parameters, random_seed=None, retrain=False, model_group_id=None,
     ):
         """Trains and stores a model, or skips it and returns the existing id
 
@@ -384,8 +409,8 @@ def process_train_task(
             f"(reason to train: {reason})"
         )
         try:
-            model_id = self._train_and_store_model(
-                matrix_store, class_path, parameters, model_hash, misc_db_parameters, random_seed
+            model_id, _ = self._train_and_store_model(
+                matrix_store, class_path, parameters, model_hash, misc_db_parameters, random_seed, retrain, model_group_id
             )
         except BaselineFeatureNotInMatrix:
             logger.warning(
diff --git a/src/triage/component/results_schema/alembic/versions/cdd0dc9d9870_rename_production_schema_and_prediction_table.py b/src/triage/component/results_schema/alembic/versions/cdd0dc9d9870_rename_production_schema_and_prediction_table.py
new file mode 100644
index 000000000..173e3e117
--- /dev/null
+++ b/src/triage/component/results_schema/alembic/versions/cdd0dc9d9870_rename_production_schema_and_prediction_table.py
@@ -0,0 +1,30 @@
+"""rename production schema to triage_production and list_predictions table to predictions
+
+Revision ID: cdd0dc9d9870
+Revises: 670289044eb2
+Create Date: 2021-04-13 00:53:56.098572
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic. 
+revision = 'cdd0dc9d9870' +down_revision = '670289044eb2' +branch_labels = None +depends_on = None + + +def upgrade(): + op.execute("CREATE SCHEMA IF NOT EXISTS triage_production") + op.execute("ALTER TABLE production.list_predictions SET SCHEMA triage_production;") + op.execute("ALTER TABLE production.prediction_metadata SET SCHEMA triage_production") + op.execute("ALTER TABLE triage_production.list_predictions RENAME TO predictions") + + +def downgrade(): + op.execute("ALTER TABLE triage_production.predictions SET SCHEMA production;") + op.execute("ALTER TABLE triage_production.prediction_metadata SET SCHEMA production") + op.execute("ALTER TABLE production.predictions RENAME TO list_predictions") + op.execute("DROP SCHEMA IF EXISTS triage_production") diff --git a/src/triage/component/results_schema/schema.py b/src/triage/component/results_schema/schema.py index d79647ea3..233e1c73a 100644 --- a/src/triage/component/results_schema/schema.py +++ b/src/triage/component/results_schema/schema.py @@ -31,7 +31,7 @@ "CREATE SCHEMA IF NOT EXISTS triage_metadata;" " CREATE SCHEMA IF NOT EXISTS test_results;" " CREATE SCHEMA IF NOT EXISTS train_results;" - " CREATE SCHEMA IF NOT EXISTS production;" + " CREATE SCHEMA IF NOT EXISTS triage_production;" ) event.listen(Base.metadata, "before_create", DDL(schemas)) @@ -140,8 +140,8 @@ class ModelGroup(Base): class ListPrediction(Base): - __tablename__ = "list_predictions" - __table_args__ = {"schema": "production"} + __tablename__ = "predictions" + __table_args__ = {"schema": "triage_production"} model_id = Column( Integer, ForeignKey("triage_metadata.models.model_id"), primary_key=True @@ -162,7 +162,7 @@ class ListPrediction(Base): class ListPredictionMetadata(Base): __tablename__ = "prediction_metadata" - __table_args__ = {"schema": "production"} + __table_args__ = {"schema": "triage_production"} model_id = Column( Integer, ForeignKey("triage_metadata.models.model_id"), primary_key=True diff --git a/src/triage/predictlist/__init__.py b/src/triage/predictlist/__init__.py new file mode 100644 index 000000000..003a6d9f0 --- /dev/null +++ b/src/triage/predictlist/__init__.py @@ -0,0 +1,593 @@ +from triage.component.results_schema import upgrade_db +from triage.component.architect.entity_date_table_generators import EntityDateTableGenerator, DEFAULT_ACTIVE_STATE +from triage.component.architect.features import ( + FeatureGenerator, + FeatureDictionaryCreator, + FeatureGroupCreator, + FeatureGroupMixer, +) +from triage.component.architect.feature_group_creator import FeatureGroup +from triage.component.architect.builders import MatrixBuilder +from triage.component.architect.planner import Planner +from triage.component.architect.label_generators import LabelGenerator +from triage.component.timechop import Timechop +from triage.component.catwalk.storage import ModelStorageEngine, ProjectStorage +from triage.component.catwalk import ModelTrainer +from triage.component.catwalk.model_trainers import flatten_grid_config +from triage.component.catwalk.predictors import Predictor +from triage.component.catwalk.utils import filename_friendly_hash +from triage.util.conf import convert_str_to_relativedelta, dt_from_str +from triage.util.db import scoped_session + +from collections import OrderedDict +import json +import re +import random +from datetime import datetime + +import verboselogs, logging +logger = verboselogs.VerboseLogger(__name__) + + +def experiment_config_from_model_id(db_engine, model_id): + """Get original experiment config from model_id + Args: + 
db_engine (sqlalchemy.db.engine)
+        model_id (int) The id of a given model in the database
+
+    Returns: (dict) experiment config
+    """
+    get_experiment_query = '''select experiments.config
+    from triage_metadata.experiments
+    join triage_metadata.models on (experiments.experiment_hash = models.built_by_experiment)
+    where model_id = %s
+    '''
+    (config,) = db_engine.execute(get_experiment_query, model_id).first()
+    return config
+
+
+def experiment_config_from_model_group_id(db_engine, model_group_id):
+    """Get original experiment config from model_group_id
+    Args:
+        db_engine (sqlalchemy.db.engine)
+        model_group_id (int) The id of a given model group in the database
+
+    Returns: (dict) experiment config
+    """
+    get_experiment_query = '''select experiments.config
+    from triage_metadata.experiments
+    join triage_metadata.models on (experiments.experiment_hash = models.built_by_experiment)
+    where model_group_id = %s
+    '''
+    (config,) = db_engine.execute(get_experiment_query, model_group_id).first()
+    return config
+
+
+def get_model_group_info(db_engine, model_group_id):
+    query = """
+        SELECT m.model_group_id, m.model_type, m.hyperparameters
+        FROM triage_metadata.models m
+        JOIN triage_metadata.model_groups mg using (model_group_id)
+        WHERE model_group_id = %s
+    """
+    model_group_info = db_engine.execute(query, model_group_id).fetchone()
+    return dict(model_group_info)
+
+
+def train_matrix_info_from_model_id(db_engine, model_id):
+    """Get original train matrix information from model_id
+    Args:
+        db_engine (sqlalchemy.db.engine)
+        model_id (int) The id of a given model in the database
+
+    Returns: (str, dict) matrix uuid and matrix metadata
+    """
+    get_train_matrix_query = """
+        select matrix_uuid, matrices.matrix_metadata
+        from triage_metadata.matrices
+        join triage_metadata.models on (models.train_matrix_uuid = matrices.matrix_uuid)
+        where model_id = %s
+    """
+    return db_engine.execute(get_train_matrix_query, model_id).first()
+
+
+def get_feature_names(aggregation, matrix_metadata):
+    """Returns a feature group name and a list of feature names from a SpacetimeAggregation object"""
+    feature_prefix = aggregation.prefix
+    logger.spam("Feature prefix = %s", feature_prefix)
+    feature_group = aggregation.get_table_name(imputed=True).split('.')[1].replace('"', '')
+    logger.spam("Feature group = %s", feature_group)
+    feature_names_in_group = [f for f in matrix_metadata['feature_names'] if re.match(f'\\A{feature_prefix}_', f)]
+    logger.spam("Feature names in group = %s", feature_names_in_group)
+
+    return feature_group, feature_names_in_group
+
+
+def get_feature_needs_imputation_in_train(aggregation, feature_names):
+    """Returns features that need imputation from training data
+    Args:
+        aggregation (SpacetimeAggregation)
+        feature_names (list) A list of feature names
+    """
+    features_imputed_in_train = [
+        f for f in set(feature_names)
+        if not f.endswith('_imp')
+        and aggregation.imputation_flag_base(f) + '_imp' in feature_names
+    ]
+    logger.spam("Features imputed in train = %s", features_imputed_in_train)
+    return features_imputed_in_train
+
+
+def get_feature_needs_imputation_in_production(aggregation, db_engine):
+    """Returns features that need imputation from triage_production
+    Args:
+        aggregation (SpacetimeAggregation)
+        db_engine (sqlalchemy.db.engine)
+    """
+    with db_engine.begin() as conn:
+        nulls_results = conn.execute(aggregation.find_nulls())
+
+    null_counts = nulls_results.first().items()
+    features_imputed_in_production = [col for (col, val) in null_counts if val is not None and val > 0]
+
+    
return features_imputed_in_production
+
+
+def predict_forward_with_existed_model(db_engine, project_path, model_id, as_of_date):
+    """Predict forward given model_id and as_of_date and store the prediction in database
+
+    Args:
+            db_engine (sqlalchemy.db.engine)
+            project_path (string) path to store matrices and trained models
+            model_id (int) The id of a given model in the database
+            as_of_date (string) a date string like "YYYY-MM-DD"
+    """
+    logger.spam("In PREDICT FORWARD................")
+    upgrade_db(db_engine=db_engine)
+    project_storage = ProjectStorage(project_path)
+    matrix_storage_engine = project_storage.matrix_storage_engine()
+    # 1. Get feature and cohort config from database
+    (train_matrix_uuid, matrix_metadata) = train_matrix_info_from_model_id(db_engine, model_id)
+    experiment_config = experiment_config_from_model_id(db_engine, model_id)
+
+    # 2. Generate cohort
+    cohort_table_name = f"triage_production.cohort_{experiment_config['cohort_config']['name']}"
+    cohort_table_generator = EntityDateTableGenerator(
+        db_engine=db_engine,
+        query=experiment_config['cohort_config']['query'],
+        entity_date_table_name=cohort_table_name
+    )
+    cohort_table_generator.generate_entity_date_table(as_of_dates=[dt_from_str(as_of_date)])
+
+    # 3. Generate feature aggregations
+    feature_generator = FeatureGenerator(
+        db_engine=db_engine,
+        features_schema_name="triage_production",
+        feature_start_time=experiment_config['temporal_config']['feature_start_time'],
+    )
+    collate_aggregations = feature_generator.aggregations(
+        feature_aggregation_config=experiment_config['feature_aggregations'],
+        feature_dates=[as_of_date],
+        state_table=cohort_table_name
+    )
+    feature_generator.process_table_tasks(
+        feature_generator.generate_all_table_tasks(
+            collate_aggregations,
+            task_type='aggregation'
+        )
+    )
+
+    # 4. Reconstruct feature dictionary from feature_names and generate imputation
+
+    reconstructed_feature_dict = FeatureGroup()
+    imputation_table_tasks = OrderedDict()
+
+    for aggregation in collate_aggregations:
+        feature_group, feature_names = get_feature_names(aggregation, matrix_metadata)
+        reconstructed_feature_dict[feature_group] = feature_names
+
+        # Make sure that the features imputed in training are also imputed in production
+
+        features_imputed_in_train = get_feature_needs_imputation_in_train(aggregation, feature_names)
+
+        features_imputed_in_production = get_feature_needs_imputation_in_production(aggregation, db_engine)
+
+        total_impute_cols = set(features_imputed_in_production) | set(features_imputed_in_train)
+        total_nonimpute_cols = set(f for f in set(feature_names) if '_imp' not in f) - total_impute_cols
+
+        task_generator = feature_generator._generate_imp_table_tasks_for
+
+        imputation_table_tasks.update(task_generator(
+            aggregation,
+            impute_cols=list(total_impute_cols),
+            nonimpute_cols=list(total_nonimpute_cols)
+            )
+        )
+    feature_generator.process_table_tasks(imputation_table_tasks)
+
+    # 5. 
Build matrix + db_config = { + "features_schema_name": "triage_production", + "labels_schema_name": "public", + "cohort_table_name": cohort_table_name, + } + + matrix_builder = MatrixBuilder( + db_config=db_config, + matrix_storage_engine=matrix_storage_engine, + engine=db_engine, + experiment_hash=None, + replace=True, + ) + + feature_start_time = experiment_config['temporal_config']['feature_start_time'] + label_name = experiment_config['label_config']['name'] + label_type = 'binary' + cohort_name = experiment_config['cohort_config']['name'] + user_metadata = experiment_config['user_metadata'] + + # Use timechop to get the time definition for production + temporal_config = experiment_config["temporal_config"] + timechopper = Timechop(**temporal_config) + prod_definitions = timechopper.define_test_matrices( + train_test_split_time=dt_from_str(as_of_date), + test_duration=temporal_config['test_durations'][0], + test_label_timespan=temporal_config['test_label_timespans'][0] + ) + + matrix_metadata = Planner.make_metadata( + prod_definitions[-1], + reconstructed_feature_dict, + label_name, + label_type, + cohort_name, + 'production', + feature_start_time, + user_metadata, + ) + + matrix_metadata['matrix_id'] = str(as_of_date) + f'_model_id_{model_id}' + '_risklist' + + matrix_uuid = filename_friendly_hash(matrix_metadata) + + matrix_builder.build_matrix( + as_of_times=[as_of_date], + label_name=label_name, + label_type=label_type, + feature_dictionary=reconstructed_feature_dict, + matrix_metadata=matrix_metadata, + matrix_uuid=matrix_uuid, + matrix_type="production", + ) + + # 6. Predict the risk score for production + predictor = Predictor( + model_storage_engine=project_storage.model_storage_engine(), + db_engine=db_engine, + rank_order='best' + ) + + predictor.predict( + model_id=model_id, + matrix_store=matrix_storage_engine.get_store(matrix_uuid), + misc_db_parameters={}, + train_matrix_columns=matrix_storage_engine.get_store(train_matrix_uuid).columns() + ) + + +class Retrainer: + """Given a model_group_id and today, retrain a model using the all the data till today + Args: + db_engine (sqlalchemy.engine) + project_path (string) + model_group_id (string) + """ + def __init__(self, db_engine, project_path, model_group_id): + self.db_engine = db_engine + upgrade_db(db_engine=self.db_engine) + + self.project_storage = ProjectStorage(project_path) + self.model_group_id = model_group_id + self.model_trainer = None + self.matrix_storage_engine = self.project_storage.matrix_storage_engine() + self.training_label_timespan = self.experiment_config['temporal_config']['training_label_timespans'][0] + self.feature_start_time=self.experiment_config['temporal_config']['feature_start_time'] + self.label_name = self.experiment_config['label_config']['name'] + self.cohort_name = self.experiment_config['cohort_config']['name'] + self.user_metadata = self.experiment_config['user_metadata'] + self.model_group_info = get_model_group_info(self.db_engine, self.model_group_id) + + self.feature_dictionary_creator = FeatureDictionaryCreator( + features_schema_name='triage_production', db_engine=self.db_engine + ) + self.label_generator = LabelGenerator( + label_name=self.experiment_config['label_config'].get("name", None), + query=self.experiment_config['label_config']["query"], + replace=True, + db_engine=self.db_engine, + ) + + self.labels_table_name = "labels_{}_{}_production".format( + self.experiment_config['label_config'].get('name', 'default'), + 
filename_friendly_hash(self.experiment_config['label_config']['query']) + ) + + self.feature_generator = FeatureGenerator( + db_engine=self.db_engine, + features_schema_name="triage_production", + feature_start_time=self.feature_start_time, + ) + + self.model_trainer = ModelTrainer( + experiment_hash=None, + model_storage_engine=ModelStorageEngine(self.project_storage), + db_engine=self.db_engine, + replace=False, + run_id=None, + ) + + @property + def experiment_config(self): + experiment_config = experiment_config_from_model_group_id(self.db_engine, self.model_group_id) + return experiment_config + + def generate_all_labels(self, as_of_date): + self.label_generator.generate_all_labels( + labels_table=self.labels_table_name, + as_of_dates=[as_of_date], + label_timespans=[self.training_label_timespan] + ) + + def generate_entity_date_table(self, as_of_date, entity_date_table_name): + cohort_table_generator = EntityDateTableGenerator( + db_engine=self.db_engine, + query=self.experiment_config['cohort_config']['query'], + entity_date_table_name=entity_date_table_name + ) + cohort_table_generator.generate_entity_date_table(as_of_dates=[dt_from_str(as_of_date)]) + + def get_collate_aggregations(self, as_of_date, state_table): + collate_aggregations = self.feature_generator.aggregations( + feature_aggregation_config=self.experiment_config['feature_aggregations'], + feature_dates=[as_of_date], + state_table=state_table + ) + return collate_aggregations + + def get_feature_dict_and_imputation_task(self, collate_aggregations, model_id): + (train_matrix_uuid, matrix_metadata) = train_matrix_info_from_model_id(self.db_engine, model_id) + reconstructed_feature_dict = FeatureGroup() + imputation_table_tasks = OrderedDict() + + for aggregation in collate_aggregations: + feature_group, feature_names = get_feature_names(aggregation, matrix_metadata) + reconstructed_feature_dict[feature_group] = feature_names + + # Make sure that the features imputed in training should also be imputed in production + + features_imputed_in_train = get_feature_needs_imputation_in_train(aggregation, feature_names) + + features_imputed_in_production = get_feature_needs_imputation_in_production(aggregation, self.db_engine) + + total_impute_cols = set(features_imputed_in_production) | set(features_imputed_in_train) + total_nonimpute_cols = set(f for f in set(feature_names) if '_imp' not in f) - total_impute_cols + + task_generator = self.feature_generator._generate_imp_table_tasks_for + + imputation_table_tasks.update(task_generator( + aggregation, + impute_cols=list(total_impute_cols), + nonimpute_cols=list(total_nonimpute_cols) + ) + ) + return reconstructed_feature_dict, imputation_table_tasks + + def retrain(self, today): + """Retrain a model by going back one split from today, so the as_of_date for training would be (today - training_label_timespan) + + Args: + today (str) + """ + today = dt_from_str(today) + as_of_date = datetime.strftime(today - convert_str_to_relativedelta(self.training_label_timespan), "%Y-%m-%d") + + new_train_definition = { + 'first_as_of_time': dt_from_str(as_of_date), + 'last_as_of_time': dt_from_str(as_of_date), + 'matrix_info_end_time': today, + 'as_of_times': [dt_from_str(as_of_date)], + 'training_label_timespan': self.training_label_timespan, + 'training_as_of_date_frequency': self.experiment_config['temporal_config']['training_as_of_date_frequencies'], + 'max_training_history': self.experiment_config['temporal_config']['max_training_histories'][0], + } + cohort_table_name = 
f"triage_production.cohort_{self.experiment_config['cohort_config']['name']}_retrain" + + # 1. Generate all labels + self.generate_all_labels(as_of_date) + + # 2. Generate cohort + self.generate_entity_date_table(as_of_date, cohort_table_name) + + # 3. Generate feature aggregations + collate_aggregations = self.get_collate_aggregations(as_of_date, cohort_table_name) + feature_aggregation_table_tasks = self.feature_generator.generate_all_table_tasks( + collate_aggregations, + task_type='aggregation' + ) + self.feature_generator.process_table_tasks(feature_aggregation_table_tasks) + + # 4. Reconstruct feature disctionary from feature_names and generate imputation + feature_imputation_table_tasks = self.feature_generator.generate_all_table_tasks( + collate_aggregations, + task_type='imputation' + ) + self.feature_generator.process_table_tasks(feature_imputation_table_tasks) + + feature_dict = self.feature_dictionary_creator.feature_dictionary( + feature_table_names=feature_imputation_table_tasks.keys(), + index_column_lookup=self.feature_generator.index_column_lookup(collate_aggregations), + ) + feature_group_creator = FeatureGroupCreator({"all": [True]}) + feature_group_mixer = FeatureGroupMixer(["all"]) + feature_group_dict = feature_group_mixer.generate( + feature_group_creator.subsets(feature_dict) + )[0] + + # 5. Build new matrix + db_config = { + "features_schema_name": "triage_production", + "labels_schema_name": "public", + "cohort_table_name": cohort_table_name, + "labels_table_name": self.labels_table_name, + } + + matrix_builder = MatrixBuilder( + db_config=db_config, + matrix_storage_engine=self.matrix_storage_engine, + engine=self.db_engine, + experiment_hash=None, + replace=True, + ) + new_matrix_metadata = Planner.make_metadata( + matrix_definition=new_train_definition, + feature_dictionary=feature_group_dict, + label_name=self.label_name, + label_type='binary', + cohort_name=self.cohort_name, + matrix_type='train', + feature_start_time=self.feature_start_time, + user_metadata=self.user_metadata, + ) + + new_matrix_metadata['matrix_id'] = "_".join( + [ + self.label_name, + 'binary', + str(as_of_date), + 'retrain', + ] + ) + + matrix_uuid = filename_friendly_hash(new_matrix_metadata) + matrix_builder.build_matrix( + as_of_times=[as_of_date], + label_name=self.label_name, + label_type='binary', + feature_dictionary=feature_group_dict, + matrix_metadata=new_matrix_metadata, + matrix_uuid=matrix_uuid, + matrix_type="train", + ) + + misc_db_parameters = { + 'train_end_time': dt_from_str(as_of_date), + 'test': False, + 'train_matrix_uuid': matrix_uuid, + 'training_label_timespan': self.training_label_timespan, + 'model_comment': 'retrain_' + datetime.strftime(today, '%Y-%m-%d'), + } + retrained_model_id, retrained_model_hash = self.model_trainer._train_and_store_model( + matrix_store=self.matrix_storage_engine.get_store(matrix_uuid), + class_path=self.model_group_info['model_type'], + parameters=self.model_group_info['hyperparameters'], + model_hash=None, + misc_db_parameters=misc_db_parameters, + random_seed=random.randint(1,1e7), + retrain=True, + model_group_id=self.model_group_id, + ) + self.retrained_model_hash = retrained_model_hash + self.retrained_matrix_uuid = matrix_uuid + self.retrained_model_id = retrained_model_id + + def predict(self, today): + """Predict forward by creating a matrix using as_of_date = today and applying the retrained model on it + + Args: + today (str) + """ + cohort_table_name = 
f"triage_production.cohort_{self.experiment_config['cohort_config']['name']}_predict" + + # 1. Generate cohort + self.generate_entity_date_table(today, cohort_table_name) + + # 2. Generate feature aggregations + collate_aggregations = self.get_collate_aggregations(today, cohort_table_name) + self.feature_generator.process_table_tasks( + self.feature_generator.generate_all_table_tasks( + collate_aggregations, + task_type='aggregation' + ) + ) + # 3. Reconstruct feature disctionary from feature_names and generate imputation + reconstructed_feature_dict, imputation_table_tasks = self.get_feature_dict_and_imputation_task( + collate_aggregations, + self.retrained_model_id + ) + self.feature_generator.process_table_tasks(imputation_table_tasks) + + # 4. Build matrix + db_config = { + "features_schema_name": "triage_production", + "labels_schema_name": "public", + "cohort_table_name": cohort_table_name, + } + + matrix_builder = MatrixBuilder( + db_config=db_config, + matrix_storage_engine=self.matrix_storage_engine, + engine=self.db_engine, + experiment_hash=None, + replace=True, + ) + # Use timechop to get the time definition for production + temporal_config = self.experiment_config["temporal_config"] + timechopper = Timechop(**temporal_config) + prod_definitions = timechopper.define_test_matrices( + train_test_split_time=dt_from_str(today), + test_duration=temporal_config['test_durations'][0], + test_label_timespan=temporal_config['test_label_timespans'][0] + ) + + last_split_definition = prod_definitions[-1] + + matrix_metadata = Planner.make_metadata( + matrix_definition=last_split_definition, + feature_dictionary=reconstructed_feature_dict, + label_name=self.label_name, + label_type='binary', + cohort_name=self.cohort_name, + matrix_type='production', + feature_start_time=self.feature_start_time, + user_metadata=self.user_metadata, + ) + + matrix_metadata['matrix_id'] = str(today) + f'_model_id_{self.retrained_model_id}' + '_risklist' + + matrix_uuid = filename_friendly_hash(matrix_metadata) + + matrix_builder.build_matrix( + as_of_times=[today], + label_name=self.label_name, + label_type='binary', + feature_dictionary=reconstructed_feature_dict, + matrix_metadata=matrix_metadata, + matrix_uuid=matrix_uuid, + matrix_type="production", + ) + + # 5. 
Predict the risk score for production + predictor = Predictor( + model_storage_engine=self.project_storage.model_storage_engine(), + db_engine=self.db_engine, + rank_order='best' + ) + + predictor.predict( + model_id=self.retrained_model_id, + matrix_store=self.matrix_storage_engine.get_store(matrix_uuid), + misc_db_parameters={}, + train_matrix_columns=self.matrix_storage_engine.get_store(self.retrained_matrix_uuid).columns(), + ) + + self.predict_matrix_uuid = matrix_uuid diff --git a/src/triage/risklist/__init__.py b/src/triage/risklist/__init__.py deleted file mode 100644 index fbbde6ade..000000000 --- a/src/triage/risklist/__init__.py +++ /dev/null @@ -1,236 +0,0 @@ -from triage.component.results_schema import upgrade_db, Experiment, ExperimentModel -from triage.component.architect.entity_date_table_generators import EntityDateTableGenerator, DEFAULT_ACTIVE_STATE -from triage.component.architect.features import FeatureGenerator -from triage.component.architect.feature_group_creator import FeatureGroup -from triage.component.architect.builders import MatrixBuilder -from triage.component.architect.planner import Planner -from triage.component.timechop import Timechop -from triage.component.catwalk.predictors import Predictor -from triage.component.catwalk.utils import filename_friendly_hash -from triage.util.conf import convert_str_to_relativedelta, dt_from_str -from triage.util.db import scoped_session -from sqlalchemy import select - -from collections import OrderedDict -import json -import re - -import verboselogs, logging -logger = verboselogs.VerboseLogger(__name__) - - -def experiment_config_from_model_id(db_engine, model_id): - """Get original experiment config from model_id - Args: - db_engine (sqlalchemy.db.engine) - model_id (int) The id of a given model in the database - - Returns: (dict) experiment config - """ - get_experiment_query = '''select experiments.config - from triage_metadata.experiments - join triage_metadata.experiment_models using (experiment_hash) - join triage_metadata.models using (model_hash) - where model_id = %s - ''' - (config,) = db_engine.execute(get_experiment_query, model_id).first() - return config - - -def train_matrix_info_from_model_id(db_engine, model_id): - """Get original train matrix information from model_id - Args: - db_engine (sqlalchemy.db.engine) - model_id (int) The id of a given model in the database - - Returns: (str, dict) matrix uuid and matrix metadata - """ - get_train_matrix_query = """ - select matrix_uuid, matrices.matrix_metadata - from triage_metadata.matrices - join triage_metadata.models on (models.train_matrix_uuid = matrices.matrix_uuid) - where model_id = %s - """ - return db_engine.execute(get_train_matrix_query, model_id).first() - - -def get_feature_names(aggregation, matrix_metadata): - """Returns a feature group name and a list of feature names from a SpacetimeAggregation object""" - feature_prefix = aggregation.prefix - logger.spam("Feature prefix = %s", feature_prefix) - feature_group = aggregation.get_table_name(imputed=True).split('.')[1].replace('"', '') - logger.spam("Feature group = %s", feature_group) - feature_names_in_group = [f for f in matrix_metadata['feature_names'] if re.match(f'\\A{feature_prefix}', f)] - logger.spam("Feature names in group = %s", feature_names_in_group) - - return feature_group, feature_names_in_group - -def get_feature_needs_imputation_in_train(aggregation, feature_names): - """Returns features that needs imputation from training data - Args: - aggregation (SpacetimeAggregation) - 
feature_names (list) A list of feature names - """ - features_imputed_in_train = [ - f for f in set(feature_names) - if not f.endswith('_imp') - and aggregation.imputation_flag_base(f) + '_imp' in feature_names - ] - logger.spam("Features imputed in train = %s", features_imputed_in_train) - return features_imputed_in_train - - -def get_feature_needs_imputation_in_production(aggregation, db_engine): - """Returns features that needs imputation from production - Args: - aggregation (SpacetimeAggregation) - db_engine (sqlalchemy.db.engine) - """ - with db_engine.begin() as conn: - nulls_results = conn.execute(aggregation.find_nulls()) - - null_counts = nulls_results.first().items() - features_imputed_in_production = [col for (col, val) in null_counts if val > 0] - - return features_imputed_in_production - - -def generate_risk_list(db_engine, project_storage, model_id, as_of_date): - """Generate the risk list based model_id and as_of_date - - Args: - db_engine (sqlalchemy.db.engine) - project_storage (catwalk.storage.ProjectStorage) - model_id (int) The id of a given model in the database - as_of_date (string) a date string like "YYYY-MM-DD" - """ - logger.spam("In RISK LIST................") - upgrade_db(db_engine=db_engine) - matrix_storage_engine = project_storage.matrix_storage_engine() - # 1. Get feature and cohort config from database - (train_matrix_uuid, matrix_metadata) = train_matrix_info_from_model_id(db_engine, model_id) - experiment_config = experiment_config_from_model_id(db_engine, model_id) - - # 2. Generate cohort - cohort_table_name = f"production.cohort_{experiment_config['cohort_config']['name']}" - cohort_table_generator = EntityDateTableGenerator( - db_engine=db_engine, - query=experiment_config['cohort_config']['query'], - entity_date_table_name=cohort_table_name - ) - cohort_table_generator.generate_entity_date_table(as_of_dates=[dt_from_str(as_of_date)]) - - # 3. Generate feature aggregations - feature_generator = FeatureGenerator( - db_engine=db_engine, - features_schema_name="production", - feature_start_time=experiment_config['temporal_config']['feature_start_time'], - ) - collate_aggregations = feature_generator.aggregations( - feature_aggregation_config=experiment_config['feature_aggregations'], - feature_dates=[as_of_date], - state_table=cohort_table_name - ) - feature_generator.process_table_tasks( - feature_generator.generate_all_table_tasks( - collate_aggregations, - task_type='aggregation' - ) - ) - - # 4. 
Reconstruct feature disctionary from feature_names and generate imputation - - reconstructed_feature_dict = FeatureGroup() - imputation_table_tasks = OrderedDict() - - for aggregation in collate_aggregations: - feature_group, feature_names = get_feature_names(aggregation, matrix_metadata) - reconstructed_feature_dict[feature_group] = feature_names - - # Make sure that the features imputed in training should also be imputed in production - - features_imputed_in_train = get_feature_needs_imputation_in_train(aggregation, feature_names) - - features_imputed_in_production = get_feature_needs_imputation_in_production(aggregation, db_engine) - - total_impute_cols = set(features_imputed_in_production) | set(features_imputed_in_train) - total_nonimpute_cols = set(f for f in set(feature_names) if '_imp' not in f) - total_impute_cols - - task_generator = feature_generator._generate_imp_table_tasks_for - - imputation_table_tasks.update(task_generator( - aggregation, - impute_cols=list(total_impute_cols), - nonimpute_cols=list(total_nonimpute_cols) - ) - ) - feature_generator.process_table_tasks(imputation_table_tasks) - - # 5. Build matrix - db_config = { - "features_schema_name": "production", - "labels_schema_name": "public", - "cohort_table_name": cohort_table_name, - } - - matrix_builder = MatrixBuilder( - db_config=db_config, - matrix_storage_engine=matrix_storage_engine, - engine=db_engine, - experiment_hash=None, - replace=True, - ) - - feature_start_time = experiment_config['temporal_config']['feature_start_time'] - label_name = experiment_config['label_config']['name'] - label_type = 'binary' - cohort_name = experiment_config['cohort_config']['name'] - user_metadata = experiment_config['user_metadata'] - - # Use timechop to get the time definition for production - temporal_config = experiment_config["temporal_config"] - timechopper = Timechop(**temporal_config) - prod_definitions = timechopper.define_test_matrices( - dt_from_str(as_of_date), - temporal_config['test_durations'][0], - temporal_config['test_label_timespans'][0] - ) - - matrix_metadata = Planner.make_metadata( - prod_definitions[-1], - reconstructed_feature_dict, - label_name, - label_type, - cohort_name, - 'production', - feature_start_time, - user_metadata, - ) - - matrix_metadata['matrix_id'] = str(as_of_date) + f'_model_id_{model_id}' + '_risklist' - - matrix_uuid = filename_friendly_hash(matrix_metadata) - - matrix_builder.build_matrix( - as_of_times=[as_of_date], - label_name=label_name, - label_type=label_type, - feature_dictionary=reconstructed_feature_dict, - matrix_metadata=matrix_metadata, - matrix_uuid=matrix_uuid, - matrix_type="production", - ) - - # 6. 
Predict the risk score for production - predictor = Predictor( - model_storage_engine=project_storage.model_storage_engine(), - db_engine=db_engine, - rank_order='best' - ) - - predictor.predict( - model_id=model_id, - matrix_store=matrix_storage_engine.get_store(matrix_uuid), - misc_db_parameters={}, - train_matrix_columns=matrix_storage_engine.get_store(train_matrix_uuid).columns() - ) From b429540f792d4d834c08ff2c02343904eb3979bd Mon Sep 17 00:00:00 2001 From: tweddielin Date: Thu, 15 Apr 2021 10:38:32 -0400 Subject: [PATCH 23/52] rename to retrain_definition --- src/triage/predictlist/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/triage/predictlist/__init__.py b/src/triage/predictlist/__init__.py index 003a6d9f0..717c8655f 100644 --- a/src/triage/predictlist/__init__.py +++ b/src/triage/predictlist/__init__.py @@ -391,7 +391,7 @@ def retrain(self, today): today = dt_from_str(today) as_of_date = datetime.strftime(today - convert_str_to_relativedelta(self.training_label_timespan), "%Y-%m-%d") - new_train_definition = { + retrain_definition = { 'first_as_of_time': dt_from_str(as_of_date), 'last_as_of_time': dt_from_str(as_of_date), 'matrix_info_end_time': today, @@ -449,7 +449,7 @@ def retrain(self, today): replace=True, ) new_matrix_metadata = Planner.make_metadata( - matrix_definition=new_train_definition, + matrix_definition=retrain_definition, feature_dictionary=feature_group_dict, label_name=self.label_name, label_type='binary', From 0045aa5644d2e3f30b77d82cb5bdce83068b0a35 Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Fri, 7 May 2021 22:47:09 -0700 Subject: [PATCH 24/52] reusing random seeds from existing models --- .../component/catwalk/model_trainers.py | 24 +++++++++++- src/triage/component/catwalk/utils.py | 39 +++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/src/triage/component/catwalk/model_trainers.py b/src/triage/component/catwalk/model_trainers.py index ed1ea45ae..9ec67455c 100644 --- a/src/triage/component/catwalk/model_trainers.py +++ b/src/triage/component/catwalk/model_trainers.py @@ -27,6 +27,8 @@ retrieve_model_id_from_hash, db_retry, save_db_objects, + retrieve_existing_model_random_seeds, + retrieve_experiment_seed_from_hash, ) NO_FEATURE_IMPORTANCE = ( @@ -71,6 +73,7 @@ def __init__( self.db_engine = db_engine self.replace = replace self.run_id = run_id + self.experiment_random_seed = retrieve_experiment_seed_from_hash(self.db_engine, self.experiment_hash) @property def sessionmaker(self): @@ -405,6 +408,17 @@ def process_train_task( def flattened_grid_config(grid_config): return flatten_grid_config(grid_config) + + def get_or_generate_random_seed(self, model_group_id, matrix_metadata, train_matrix_uuid): + train_end_time = matrix_metadata["end_time"] + training_label_timespan = matrix_metadata["label_timespan"] + existing_seeds = retrieve_existing_model_random_seeds(self.db_engine, model_group_id, train_end_time, train_matrix_uuid, training_label_timespan, self.experiment_random_seed) + if existing_seeds: + return existing_seeds[0] + else: + return generate_python_random_seed() + + def generate_train_tasks(self, grid_config, misc_db_parameters, matrix_store=None): """Train and store configured models, yielding the ids one by one @@ -432,7 +446,15 @@ def generate_train_tasks(self, grid_config, misc_db_parameters, matrix_store=Non tasks = [] for class_path, parameters in self.flattened_grid_config(grid_config): - random_seed = generate_python_random_seed() + + unique_parameters = 
self.unique_parameters(parameters) + model_group_id = self.model_grouper.get_model_group_id( + class_path, unique_parameters, matrix_store.metadata, self.db_engine + ) + random_seed = self.get_or_generate_random_seed( + model_group_id, matrix_store.metadata, matrix_store.uuid + ) + model_hash = self._model_hash( matrix_store.metadata, class_path, diff --git a/src/triage/component/catwalk/utils.py b/src/triage/component/catwalk/utils.py index dd38a735f..98c1ab662 100644 --- a/src/triage/component/catwalk/utils.py +++ b/src/triage/component/catwalk/utils.py @@ -232,6 +232,45 @@ def retrieve_model_hash_from_id(db_engine, model_id): session.close() +@db_retry +def retrieve_existing_model_random_seeds(db_engine, model_group_id, train_end_time, train_matrix_uuid, training_label_timespan, experiment_random_seed): + """Retrieve existing model random seeds matching the model parameters and + experiment-level random seed to allow for reusing seeds before creating a + new one. + """ + query = f""" + select models.random_seed + from {ExperimentModel.__table__.fullname} experiment_models + join {Model.__table__.fullname} models + on (experiment_models.model_hash = models.model_hash) + join {Experiment.__table__.fullname} experiments + on (experiment_models.experiment_hash = experiments.experiment_hash) + where models.model_group_id = %s + and models.train_end_time = %s + and models.train_matrix_uuid = %s + and models.training_label_timespan = %s + and experiments.random_seed = %s + order by models.run_time DESC, random() + """ + return [row[0] for row in db_engine.execute(query, model_group_id, train_end_time, train_matrix_uuid, training_label_timespan, experiment_random_seed)] + + +@db_retry +def retrieve_experiment_seed_from_hash(db_engine, experiment_hash): + """Retrieves the random seed associated with a given experiment hash + + Args: + experiment_hash (str) The hash of a given experiment in the database + + Returns: (int) the stored random seed from the experiment + """ + session = sessionmaker(bind=db_engine)() + try: + return session.query(Experiment).get(experiment_hash).random_seed + finally: + session.close() + + def _write_csv(file_like, db_objects, type_of_object): writer = csv.writer(file_like, quoting=csv.QUOTE_MINIMAL, lineterminator='\n') for db_object in db_objects: From 9dc3697b56e248fc41ff4e59e2ef3c04efcd9456 Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Mon, 10 May 2021 14:26:02 -0700 Subject: [PATCH 25/52] fix tests (write experiment to test db) --- .../catwalk_tests/test_model_trainers.py | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/src/tests/catwalk_tests/test_model_trainers.py b/src/tests/catwalk_tests/test_model_trainers.py index bf4100acc..15fe981aa 100644 --- a/src/tests/catwalk_tests/test_model_trainers.py +++ b/src/tests/catwalk_tests/test_model_trainers.py @@ -5,6 +5,7 @@ from triage.component.catwalk.model_grouping import ModelGrouper from triage.component.catwalk.model_trainers import ModelTrainer +from triage.component.catwalk.utils import save_experiment_and_get_hash from tests.utils import get_matrix_store @@ -22,8 +23,13 @@ def grid_config(): @pytest.fixture(scope="function") def default_model_trainer(db_engine_with_results_schema, project_storage): model_storage_engine = project_storage.model_storage_engine() + experiment_hash = save_experiment_and_get_hash( + config={'foo': 'bar'}, + random_seed=112358, + db_engine=db_engine_with_results_schema + ) trainer = ModelTrainer( - experiment_hash=None, + 
experiment_hash=experiment_hash, model_storage_engine=model_storage_engine, db_engine=db_engine_with_results_schema, model_grouper=ModelGrouper(), @@ -132,8 +138,13 @@ def set_test_seed(): "select max(batch_run_time) from triage_metadata.models" ) ][0] + experiment_hash = save_experiment_and_get_hash( + config={'foo': 'bar'}, + random_seed=112358, + db_engine=db_engine + ) trainer = ModelTrainer( - experiment_hash=None, + experiment_hash=experiment_hash, model_storage_engine=model_storage_engine, model_grouper=ModelGrouper( model_group_keys=["label_name", "label_timespan"] @@ -212,8 +223,13 @@ def test_baseline_exception_handling(default_model_trainer): def test_custom_groups(grid_config, db_engine_with_results_schema, project_storage): model_storage_engine = project_storage.model_storage_engine() + experiment_hash = save_experiment_and_get_hash( + config={'foo': 'bar'}, + random_seed=112358, + db_engine=db_engine_with_results_schema + ) trainer = ModelTrainer( - experiment_hash=None, + experiment_hash=experiment_hash, model_storage_engine=model_storage_engine, model_grouper=ModelGrouper(["class_path"]), db_engine=db_engine_with_results_schema, From da870d50fa2150618cad99dd422496e587ebd982 Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Mon, 10 May 2021 16:04:39 -0700 Subject: [PATCH 26/52] unit test for reusing model random seeds --- .../catwalk_tests/test_model_trainers.py | 92 ++++++++++++++++++- 1 file changed, 89 insertions(+), 3 deletions(-) diff --git a/src/tests/catwalk_tests/test_model_trainers.py b/src/tests/catwalk_tests/test_model_trainers.py index 15fe981aa..407c49fae 100644 --- a/src/tests/catwalk_tests/test_model_trainers.py +++ b/src/tests/catwalk_tests/test_model_trainers.py @@ -25,7 +25,7 @@ def default_model_trainer(db_engine_with_results_schema, project_storage): model_storage_engine = project_storage.model_storage_engine() experiment_hash = save_experiment_and_get_hash( config={'foo': 'bar'}, - random_seed=112358, + random_seed=5, db_engine=db_engine_with_results_schema ) trainer = ModelTrainer( @@ -140,7 +140,7 @@ def set_test_seed(): ][0] experiment_hash = save_experiment_and_get_hash( config={'foo': 'bar'}, - random_seed=112358, + random_seed=5, db_engine=db_engine ) trainer = ModelTrainer( @@ -225,7 +225,7 @@ def test_custom_groups(grid_config, db_engine_with_results_schema, project_stora model_storage_engine = project_storage.model_storage_engine() experiment_hash = save_experiment_and_get_hash( config={'foo': 'bar'}, - random_seed=112358, + random_seed=5, db_engine=db_engine_with_results_schema ) trainer = ModelTrainer( @@ -251,6 +251,92 @@ def test_custom_groups(grid_config, db_engine_with_results_schema, project_stora assert records[0] == model_ids[0] +def test_reuse_model_random_seeds(grid_config, default_model_trainer): + trainer = default_model_trainer + db_engine = trainer.db_engine + project_storage = trainer.model_storage_engine.project_storage + model_storage_engine = trainer.model_storage_engine + + # re-using the random seeds requires the association between experiments and models + # to exist, which we're not getting in these tests since we aren't using the experiment + # architecture, so back-fill these associations after each train_models() run + def update_experiment_models(db_engine): + sql = """ + INSERT INTO triage_metadata.experiment_models(experiment_hash,model_hash) + SELECT m.built_by_experiment, m.model_hash + FROM triage_metadata.models m + LEFT JOIN triage_metadata.experiment_models em + ON m.model_hash = em.model_hash + AND 
m.built_by_experiment = em.experiment_hash
+            WHERE em.experiment_hash IS NULL
+        """
+        db_engine.execute(sql)
+        db_engine.execute('COMMIT;')
+
+    random.seed(5)
+    model_ids = trainer.train_models(
+        grid_config=grid_config,
+        misc_db_parameters=dict(),
+        matrix_store=get_matrix_store(project_storage),
+    )
+    update_experiment_models(db_engine)
+
+    # simulate running a new experiment where the experiment hash has changed
+    # (e.g. because the model grid is different), but experiment seed is the
+    # same, so previously-trained models should not get new seeds
+    experiment_hash = save_experiment_and_get_hash(
+        config={'baz': 'qux'},
+        random_seed=5,
+        db_engine=db_engine
+    )
+    trainer = ModelTrainer(
+        experiment_hash=experiment_hash,
+        model_storage_engine=model_storage_engine,
+        db_engine=db_engine,
+        model_grouper=ModelGrouper(),
+    )
+    new_grid = grid_config.copy()
+    new_grid['sklearn.tree.DecisionTreeClassifier']['min_samples_split'] = [3,10,100]
+    random.seed(5)
+    new_model_ids = trainer.train_models(
+        grid_config=new_grid,
+        misc_db_parameters=dict(),
+        matrix_store=get_matrix_store(project_storage),
+    )
+    update_experiment_models(db_engine)
+
+    # should have received 6 models
+    assert len(new_model_ids) == 6
+
+    # all the original model ids should be in the new set
+    assert len(set(new_model_ids) & set(model_ids)) == len(model_ids)
+
+    # however, we should NOT re-use the random seeds (and so get new model_ids)
+    # if the experiment-level seed is different
+    experiment_hash = save_experiment_and_get_hash(
+        config={'lorem': 'ipsum'},
+        random_seed=42,
+        db_engine=db_engine
+    )
+    trainer = ModelTrainer(
+        experiment_hash=experiment_hash,
+        model_storage_engine=model_storage_engine,
+        db_engine=db_engine,
+        model_grouper=ModelGrouper(),
+    )
+    random.seed(42) # different from above
+    newer_model_ids = trainer.train_models(
+        grid_config=new_grid,
+        misc_db_parameters=dict(),
+        matrix_store=get_matrix_store(project_storage),
+    )
+    update_experiment_models(db_engine)
+
+    # should get entirely new models now (different IDs)
+    assert len(newer_model_ids) == 6
+    assert len(set(new_model_ids) & set(newer_model_ids)) == 0
+
+
 def test_n_jobs_not_new_model(default_model_trainer):
     grid_config = {
         "sklearn.ensemble.AdaBoostClassifier": {"n_estimators": [10, 100, 1000]},

From 6768ee5c6bc8a6c53b4dea97a49144726eef34c3 Mon Sep 17 00:00:00 2001
From: Kit Rodolfa
Date: Mon, 10 May 2021 16:19:13 -0700
Subject: [PATCH 27/52] add docstring

---
 src/triage/component/catwalk/model_trainers.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/triage/component/catwalk/model_trainers.py b/src/triage/component/catwalk/model_trainers.py
index 9ec67455c..780ebe6b5 100644
--- a/src/triage/component/catwalk/model_trainers.py
+++ b/src/triage/component/catwalk/model_trainers.py
@@ -410,6 +410,15 @@ def flattened_grid_config(grid_config):
 
     def get_or_generate_random_seed(self, model_group_id, matrix_metadata, train_matrix_uuid):
+        """Look for an existing model with the same model group, train matrix metadata, and experiment-level
+        random seed and reuse this model's random seed if found, otherwise generate a new one. If multiple
+        matching models are found, we'll use the one with the most recent run time.
+
+        Args:
+            model_group_id (int): unique id for the model group this model is associated with
+            matrix_metadata (dict): metadata associated with the model's training matrix
+            train_matrix_uuid (str): unique identifier for the model's training matrix
+        """
         train_end_time = matrix_metadata["end_time"]
         training_label_timespan = matrix_metadata["label_timespan"]
         existing_seeds = retrieve_existing_model_random_seeds(self.db_engine, model_group_id, train_end_time, train_matrix_uuid, training_label_timespan, self.experiment_random_seed)

From 7d6a420c10d67874d1ade9228ae52ca4e3d4f415 Mon Sep 17 00:00:00 2001
From: Kit Rodolfa
Date: Thu, 20 May 2021 12:17:59 -0700
Subject: [PATCH 28/52] only store random seed in experiment runs

---
 src/tests/catwalk_tests/test_integration.py   |  2 +-
 .../catwalk_tests/test_model_trainers.py      | 41 ++++++++++++++++---
 src/tests/catwalk_tests/test_utils.py         |  4 +-
 src/triage/component/catwalk/utils.py         | 11 +++--
 src/triage/component/results_schema/schema.py |  1 -
 src/triage/experiments/base.py                |  2 +-
 6 files changed, 46 insertions(+), 15 deletions(-)

diff --git a/src/tests/catwalk_tests/test_integration.py b/src/tests/catwalk_tests/test_integration.py
index 2a5682f07..3a7408c86 100644
--- a/src/tests/catwalk_tests/test_integration.py
+++ b/src/tests/catwalk_tests/test_integration.py
@@ -19,7 +19,7 @@ def test_ModelTrainTester_generate_tasks(db_engine_with_results_schema, project_
     model_storage_engine = ModelStorageEngine(project_storage)
     matrix_storage_engine = MatrixStorageEngine(project_storage)
     sample_matrix_store = get_matrix_store(project_storage)
-    experiment_hash = save_experiment_and_get_hash({}, 1234, db_engine)
+    experiment_hash = save_experiment_and_get_hash({}, db_engine)
     # instantiate pipeline objects
     trainer = ModelTrainer(
         experiment_hash=experiment_hash,
diff --git a/src/tests/catwalk_tests/test_model_trainers.py b/src/tests/catwalk_tests/test_model_trainers.py
index 407c49fae..2ec7be4d8 100644
--- a/src/tests/catwalk_tests/test_model_trainers.py
+++ b/src/tests/catwalk_tests/test_model_trainers.py
@@ -6,6 +6,7 @@
 from triage.component.catwalk.model_grouping import ModelGrouper
 from triage.component.catwalk.model_trainers import ModelTrainer
 from triage.component.catwalk.utils import save_experiment_and_get_hash
+from triage.tracking import initialize_tracking_and_get_run_id
 from tests.utils import get_matrix_store
 
@@ -25,9 +26,15 @@ def default_model_trainer(db_engine_with_results_schema, project_storage):
     model_storage_engine = project_storage.model_storage_engine()
     experiment_hash = save_experiment_and_get_hash(
         config={'foo': 'bar'},
-        random_seed=5,
         db_engine=db_engine_with_results_schema
     )
+    run_id = initialize_tracking_and_get_run_id(
+        experiment_hash,
+        experiment_class_path="",
+        random_seed=5,
+        experiment_kwargs={},
+        db_engine=db_engine_with_results_schema
+    )
     trainer = ModelTrainer(
         experiment_hash=experiment_hash,
         model_storage_engine=model_storage_engine,
@@ -140,9 +147,15 @@ def set_test_seed():
     ][0]
     experiment_hash = save_experiment_and_get_hash(
         config={'foo': 'bar'},
-        random_seed=5,
         db_engine=db_engine
     )
+    run_id = initialize_tracking_and_get_run_id(
+        experiment_hash,
+        experiment_class_path="",
+        random_seed=5,
+        experiment_kwargs={},
+        db_engine=db_engine_with_results_schema
+    )
     trainer = ModelTrainer(
         experiment_hash=experiment_hash,
         model_storage_engine=model_storage_engine,
         model_grouper=ModelGrouper(
             model_group_keys=["label_name", "label_timespan"]
         ),
         db_engine=db_engine,
         replace=True,
     )
@@ -225,9 +238,15 @@ def test_custom_groups(grid_config, db_engine_with_results_schema, project_stora
     model_storage_engine =
project_storage.model_storage_engine() experiment_hash = save_experiment_and_get_hash( config={'foo': 'bar'}, - random_seed=5, db_engine=db_engine_with_results_schema ) + run_id = initialize_tracking_and_get_run_id( + experiment_hash, + experiment_class_path="", + random_seed=5, + experiment_kwargs={}, + db_engine=db_engine_with_results_schema + ) trainer = ModelTrainer( experiment_hash=experiment_hash, model_storage_engine=model_storage_engine, @@ -286,9 +305,15 @@ def update_experiment_models(db_engine): # same, so previously-trained models should not get new seeds experiment_hash = save_experiment_and_get_hash( config={'baz': 'qux'}, - random_seed=5, db_engine=db_engine ) + run_id = initialize_tracking_and_get_run_id( + experiment_hash, + experiment_class_path="", + random_seed=5, + experiment_kwargs={}, + db_engine=db_engine_with_results_schema + ) trainer = ModelTrainer( experiment_hash=experiment_hash, model_storage_engine=model_storage_engine, @@ -315,9 +340,15 @@ def update_experiment_models(db_engine): # if the experiment-level seed is different experiment_hash = save_experiment_and_get_hash( config={'lorem': 'ipsum'}, - random_seed=42, db_engine=db_engine ) + run_id = initialize_tracking_and_get_run_id( + experiment_hash, + experiment_class_path="", + random_seed=42, + experiment_kwargs={}, + db_engine=db_engine_with_results_schema + ) trainer = ModelTrainer( experiment_hash=experiment_hash, model_storage_engine=model_storage_engine, diff --git a/src/tests/catwalk_tests/test_utils.py b/src/tests/catwalk_tests/test_utils.py index 3e7dccf6b..f1ab77a53 100644 --- a/src/tests/catwalk_tests/test_utils.py +++ b/src/tests/catwalk_tests/test_utils.py @@ -64,9 +64,9 @@ def test_save_experiment_and_get_hash(): with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) ensure_db(engine) - exp_hash = save_experiment_and_get_hash(experiment_config, 1234, engine) + exp_hash = save_experiment_and_get_hash(experiment_config, engine) assert isinstance(exp_hash, str) - new_hash = save_experiment_and_get_hash(experiment_config, 1234, engine) + new_hash = save_experiment_and_get_hash(experiment_config, engine) assert new_hash == exp_hash diff --git a/src/triage/component/catwalk/utils.py b/src/triage/component/catwalk/utils.py index 98c1ab662..7af014088 100644 --- a/src/triage/component/catwalk/utils.py +++ b/src/triage/component/catwalk/utils.py @@ -24,6 +24,7 @@ Model, ExperimentMatrix, ExperimentModel, + ExperimentRun, ) @@ -62,10 +63,10 @@ def retry_if_db_error(exception): @db_retry -def save_experiment_and_get_hash(config, random_seed, db_engine): +def save_experiment_and_get_hash(config, db_engine): experiment_hash = filename_friendly_hash(config) session = sessionmaker(bind=db_engine)() - session.merge(Experiment(experiment_hash=experiment_hash, random_seed=random_seed, config=config)) + session.merge(Experiment(experiment_hash=experiment_hash, config=config)) session.commit() session.close() return experiment_hash @@ -243,13 +244,13 @@ def retrieve_existing_model_random_seeds(db_engine, model_group_id, train_end_ti from {ExperimentModel.__table__.fullname} experiment_models join {Model.__table__.fullname} models on (experiment_models.model_hash = models.model_hash) - join {Experiment.__table__.fullname} experiments - on (experiment_models.experiment_hash = experiments.experiment_hash) + join {ExperimentRun.__table__.fullname} experiment_runs + on (experiment_models.experiment_hash = experiment_runs.experiment_hash) where models.model_group_id = %s and 
models.train_end_time = %s and models.train_matrix_uuid = %s and models.training_label_timespan = %s - and experiments.random_seed = %s + and experiment_runs.random_seed = %s order by models.run_time DESC, random() """ return [row[0] for row in db_engine.execute(query, model_group_id, train_end_time, train_matrix_uuid, training_label_timespan, experiment_random_seed)] diff --git a/src/triage/component/results_schema/schema.py b/src/triage/component/results_schema/schema.py index a5a647135..0f2c53ed6 100644 --- a/src/triage/component/results_schema/schema.py +++ b/src/triage/component/results_schema/schema.py @@ -68,7 +68,6 @@ class Experiment(Base): matrices_needed = Column(Integer) grid_size = Column(Integer) models_needed = Column(Integer) - random_seed = Column(Integer) class ExperimentRunStatus(enum.Enum): diff --git a/src/triage/experiments/base.py b/src/triage/experiments/base.py index 01dbd1499..2898a1624 100644 --- a/src/triage/experiments/base.py +++ b/src/triage/experiments/base.py @@ -214,7 +214,7 @@ def __init__( ###################### RUBICON ###################### - self.experiment_hash = save_experiment_and_get_hash(self.config, self.random_seed, self.db_engine) + self.experiment_hash = save_experiment_and_get_hash(self.config, self.db_engine) logger.debug(f"Experiment hash [{self.experiment_hash}] assigned") self.run_id = initialize_tracking_and_get_run_id( self.experiment_hash, From b8fe6d8a73e29e194055b46fa3b3e39912931170 Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Thu, 20 May 2021 15:57:04 -0700 Subject: [PATCH 29/52] DB migration to remove random seed from experiments table --- .../component/results_schema/alembic.ini | 3 + .../component/results_schema/alembic/env.py | 33 ++++++- ...829_remove_random_seed_from_experiments.py | 86 +++++++++++++++++++ 3 files changed, 121 insertions(+), 1 deletion(-) create mode 100644 src/triage/component/results_schema/alembic/versions/b097e47ba829_remove_random_seed_from_experiments.py diff --git a/src/triage/component/results_schema/alembic.ini b/src/triage/component/results_schema/alembic.ini index 1ab81496b..3be219c32 100644 --- a/src/triage/component/results_schema/alembic.ini +++ b/src/triage/component/results_schema/alembic.ini @@ -1,2 +1,5 @@ [alembic] script_location = %(here)s/alembic + +[exclude] +tables = predictions_\d+ diff --git a/src/triage/component/results_schema/alembic/env.py b/src/triage/component/results_schema/alembic/env.py index 47644b7e1..d8b105d9f 100644 --- a/src/triage/component/results_schema/alembic/env.py +++ b/src/triage/component/results_schema/alembic/env.py @@ -3,6 +3,7 @@ import os import yaml +import re from alembic import context from sqlalchemy import create_engine from sqlalchemy import pool @@ -26,6 +27,35 @@ # my_important_option = config.get_main_option("my_important_option") # ... etc. 
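+
+# The [exclude] section of alembic.ini holds comma-separated regex patterns
+# (e.g. predictions_\d+) naming tables and indices that autogenerate should
+# skip when comparing the declared schema to the live database.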
+def get_excludes_from_config(config_, type_="tables"): + excludes = config_.get(type_, None) + if excludes is not None: + excludes = excludes.split(",") + excludes = excludes or [] + return excludes + + +excluded_tables = get_excludes_from_config(config.get_section('exclude'), "tables") +excluded_indices = get_excludes_from_config(config.get_section('exclude'), "indices") + + +def include_object(obj, name, type_, reflected, compare_to): + if type_ == "table": + for table_pat in excluded_tables: + if re.match(table_pat, name): + return False + return True + + elif type_ == "index": + for index_pat in excluded_indices: + if re.match(index_pat, name): + return False + return True + + else: + return True + + url = None if "url" in config.attributes: @@ -52,7 +82,6 @@ port=config["port"], ) - def run_migrations_offline(): """Run migrations in 'offline' mode. @@ -70,6 +99,7 @@ def run_migrations_offline(): target_metadata=target_metadata, literal_binds=True, version_table="results_schema_versions", + include_object=include_object, ) with context.begin_transaction(): @@ -92,6 +122,7 @@ def run_migrations_online(): target_metadata=target_metadata, version_table="results_schema_versions", include_schemas=True, + include_object=include_object, ) connection.execute('set search_path to "{}", public'.format("results")) diff --git a/src/triage/component/results_schema/alembic/versions/b097e47ba829_remove_random_seed_from_experiments.py b/src/triage/component/results_schema/alembic/versions/b097e47ba829_remove_random_seed_from_experiments.py new file mode 100644 index 000000000..8c9e30ce4 --- /dev/null +++ b/src/triage/component/results_schema/alembic/versions/b097e47ba829_remove_random_seed_from_experiments.py @@ -0,0 +1,86 @@ +"""empty message + +Revision ID: b097e47ba829 +Revises: 45219f25072b +Create Date: 2021-05-20 15:40:47.288721 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = 'b097e47ba829' +down_revision = '45219f25072b' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.alter_column('experiment_runs', 'matrices_errored', + existing_type=sa.INTEGER(), + nullable=True, + schema='triage_metadata') + op.alter_column('experiment_runs', 'matrices_made', + existing_type=sa.INTEGER(), + nullable=True, + schema='triage_metadata') + op.alter_column('experiment_runs', 'matrices_skipped', + existing_type=sa.INTEGER(), + nullable=True, + schema='triage_metadata') + op.alter_column('experiment_runs', 'models_errored', + existing_type=sa.INTEGER(), + nullable=True, + schema='triage_metadata') + op.alter_column('experiment_runs', 'models_made', + existing_type=sa.INTEGER(), + nullable=True, + schema='triage_metadata') + op.alter_column('experiment_runs', 'models_skipped', + existing_type=sa.INTEGER(), + nullable=True, + schema='triage_metadata') + op.drop_column('experiments', 'random_seed', schema='triage_metadata') + op.create_index(op.f('ix_triage_metadata_matrices_matrix_uuid'), 'matrices', ['matrix_uuid'], unique=True, schema='triage_metadata') + op.drop_index('ix_model_metadata_matrices_matrix_uuid', table_name='matrices', schema='triage_metadata') + op.create_index(op.f('ix_triage_metadata_models_model_hash'), 'models', ['model_hash'], unique=True, schema='triage_metadata') + op.drop_index('ix_model_metadata_models_model_hash', table_name='models', schema='triage_metadata') + op.create_foreign_key(None, 'models', 'experiment_runs', ['built_in_experiment_run'], ['id'], source_schema='triage_metadata', referent_schema='triage_metadata') + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_constraint(None, 'models', schema='triage_metadata', type_='foreignkey') + op.create_index('ix_model_metadata_models_model_hash', 'models', ['model_hash'], unique=True, schema='triage_metadata') + op.drop_index(op.f('ix_triage_metadata_models_model_hash'), table_name='models', schema='triage_metadata') + op.create_index('ix_model_metadata_matrices_matrix_uuid', 'matrices', ['matrix_uuid'], unique=True, schema='triage_metadata') + op.drop_index(op.f('ix_triage_metadata_matrices_matrix_uuid'), table_name='matrices', schema='triage_metadata') + op.add_column('experiments', sa.Column('random_seed', sa.INTEGER(), autoincrement=False, nullable=True), schema='triage_metadata') + op.alter_column('experiment_runs', 'models_skipped', + existing_type=sa.INTEGER(), + nullable=False, + schema='triage_metadata') + op.alter_column('experiment_runs', 'models_made', + existing_type=sa.INTEGER(), + nullable=False, + schema='triage_metadata') + op.alter_column('experiment_runs', 'models_errored', + existing_type=sa.INTEGER(), + nullable=False, + schema='triage_metadata') + op.alter_column('experiment_runs', 'matrices_skipped', + existing_type=sa.INTEGER(), + nullable=False, + schema='triage_metadata') + op.alter_column('experiment_runs', 'matrices_made', + existing_type=sa.INTEGER(), + nullable=False, + schema='triage_metadata') + op.alter_column('experiment_runs', 'matrices_errored', + existing_type=sa.INTEGER(), + nullable=False, + schema='triage_metadata') + # ### end Alembic commands ### From 8207fcd996133163f7b4384be9032016b756c5a9 Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Thu, 20 May 2021 16:21:33 -0700 Subject: [PATCH 30/52] debugging --- src/triage/component/catwalk/model_trainers.py | 4 ++-- src/triage/component/catwalk/utils.py | 8 ++++---- .../b097e47ba829_remove_random_seed_from_experiments.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git 
a/src/triage/component/catwalk/model_trainers.py b/src/triage/component/catwalk/model_trainers.py index 780ebe6b5..f2532fa7f 100644 --- a/src/triage/component/catwalk/model_trainers.py +++ b/src/triage/component/catwalk/model_trainers.py @@ -28,7 +28,7 @@ db_retry, save_db_objects, retrieve_existing_model_random_seeds, - retrieve_experiment_seed_from_hash, + retrieve_experiment_seed_from_run_id, ) NO_FEATURE_IMPORTANCE = ( @@ -73,7 +73,7 @@ def __init__( self.db_engine = db_engine self.replace = replace self.run_id = run_id - self.experiment_random_seed = retrieve_experiment_seed_from_hash(self.db_engine, self.experiment_hash) + self.experiment_random_seed = retrieve_experiment_seed_from_run_id(self.db_engine, self.run_id) @property def sessionmaker(self): diff --git a/src/triage/component/catwalk/utils.py b/src/triage/component/catwalk/utils.py index 7af014088..9f5f29294 100644 --- a/src/triage/component/catwalk/utils.py +++ b/src/triage/component/catwalk/utils.py @@ -257,17 +257,17 @@ def retrieve_existing_model_random_seeds(db_engine, model_group_id, train_end_ti @db_retry -def retrieve_experiment_seed_from_hash(db_engine, experiment_hash): - """Retrieves the random seed associated with a given experiment hash +def retrieve_experiment_seed_from_run_id(db_engine, run_id): + """Retrieves the random seed associated with a given experiment run Args: - experiment_hash (str) The hash of a given experiment in the database + run_id (int) The id of a given experiment run in the database Returns: (int) the stored random seed from the experiment """ session = sessionmaker(bind=db_engine)() try: - return session.query(Experiment).get(experiment_hash).random_seed + return session.query(ExperimentRun).get(run_id).random_seed finally: session.close() diff --git a/src/triage/component/results_schema/alembic/versions/b097e47ba829_remove_random_seed_from_experiments.py b/src/triage/component/results_schema/alembic/versions/b097e47ba829_remove_random_seed_from_experiments.py index 8c9e30ce4..e28e8794f 100644 --- a/src/triage/component/results_schema/alembic/versions/b097e47ba829_remove_random_seed_from_experiments.py +++ b/src/triage/component/results_schema/alembic/versions/b097e47ba829_remove_random_seed_from_experiments.py @@ -47,13 +47,13 @@ def upgrade(): op.drop_index('ix_model_metadata_matrices_matrix_uuid', table_name='matrices', schema='triage_metadata') op.create_index(op.f('ix_triage_metadata_models_model_hash'), 'models', ['model_hash'], unique=True, schema='triage_metadata') op.drop_index('ix_model_metadata_models_model_hash', table_name='models', schema='triage_metadata') - op.create_foreign_key(None, 'models', 'experiment_runs', ['built_in_experiment_run'], ['id'], source_schema='triage_metadata', referent_schema='triage_metadata') + op.create_foreign_key('models_built_in_experiment_run_fkey', 'models', 'experiment_runs', ['built_in_experiment_run'], ['id'], source_schema='triage_metadata', referent_schema='triage_metadata') # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! 
### - op.drop_constraint(None, 'models', schema='triage_metadata', type_='foreignkey') + op.drop_constraint('models_built_in_experiment_run_fkey', 'models', schema='triage_metadata', type_='foreignkey') op.create_index('ix_model_metadata_models_model_hash', 'models', ['model_hash'], unique=True, schema='triage_metadata') op.drop_index(op.f('ix_triage_metadata_models_model_hash'), table_name='models', schema='triage_metadata') op.create_index('ix_model_metadata_matrices_matrix_uuid', 'matrices', ['matrix_uuid'], unique=True, schema='triage_metadata') From 45c9d684194fa8746db6f2cf6d505a3d228dcaa1 Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Thu, 20 May 2021 17:12:37 -0700 Subject: [PATCH 31/52] debug model trainer tests --- src/tests/catwalk_tests/test_model_trainers.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/tests/catwalk_tests/test_model_trainers.py b/src/tests/catwalk_tests/test_model_trainers.py index 2ec7be4d8..8e51c66c9 100644 --- a/src/tests/catwalk_tests/test_model_trainers.py +++ b/src/tests/catwalk_tests/test_model_trainers.py @@ -35,11 +35,13 @@ def default_model_trainer(db_engine_with_results_schema, project_storage): experiment_kwargs={}, db_engine=db_engine_with_results_schema ) + # import pdb; pdb.set_trace() trainer = ModelTrainer( experiment_hash=experiment_hash, model_storage_engine=model_storage_engine, db_engine=db_engine_with_results_schema, model_grouper=ModelGrouper(), + run_id=run_id, ) yield trainer @@ -154,7 +156,7 @@ def set_test_seed(): experiment_class_path="", random_seed=5, experiment_kwargs={}, - db_engine=db_engine_with_results_schema + db_engine=db_engine ) trainer = ModelTrainer( experiment_hash=experiment_hash, @@ -164,6 +166,7 @@ def set_test_seed(): ), db_engine=db_engine, replace=True, + run_id=run_id, ) set_test_seed() new_model_ids = trainer.train_models( @@ -252,6 +255,7 @@ def test_custom_groups(grid_config, db_engine_with_results_schema, project_stora model_storage_engine=model_storage_engine, model_grouper=ModelGrouper(["class_path"]), db_engine=db_engine_with_results_schema, + run_id=run_id, ) # create training set model_ids = trainer.train_models( @@ -312,13 +316,14 @@ def update_experiment_models(db_engine): experiment_class_path="", random_seed=5, experiment_kwargs={}, - db_engine=db_engine_with_results_schema + db_engine=db_engine ) trainer = ModelTrainer( experiment_hash=experiment_hash, model_storage_engine=model_storage_engine, db_engine=db_engine, model_grouper=ModelGrouper(), + run_id=run_id, ) new_grid = grid_config.copy() new_grid['sklearn.tree.DecisionTreeClassifier']['min_samples_split'] = [3,10,100] @@ -347,13 +352,14 @@ def update_experiment_models(db_engine): experiment_class_path="", random_seed=42, experiment_kwargs={}, - db_engine=db_engine_with_results_schema + db_engine=db_engine ) trainer = ModelTrainer( experiment_hash=experiment_hash, model_storage_engine=model_storage_engine, db_engine=db_engine, model_grouper=ModelGrouper(), + run_id=run_id, ) random.seed(42) # different from above newer_model_ids = trainer.train_models( From a665e7eaf3a9d7f9f6fe2482e90ad32e5a6e1422 Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Thu, 20 May 2021 17:15:36 -0700 Subject: [PATCH 32/52] debug catwalk utils tests --- src/tests/catwalk_tests/test_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tests/catwalk_tests/test_utils.py b/src/tests/catwalk_tests/test_utils.py index f1ab77a53..863fe1dd1 100644 --- a/src/tests/catwalk_tests/test_utils.py +++ 
b/src/tests/catwalk_tests/test_utils.py
@@ -75,7 +75,7 @@ def test_missing_model_hashes():
         db_engine = create_engine(postgresql.url())
         ensure_db(db_engine)
 
-        experiment_hash = save_experiment_and_get_hash({}, 1234, db_engine)
+        experiment_hash = save_experiment_and_get_hash({}, db_engine)
         model_hashes = ['abcd', 'bcde', 'cdef']
 
         # if we associate model hashes with an experiment but don't actually train the models
@@ -96,7 +96,7 @@ def test_missing_matrix_uuids():
         db_engine = create_engine(postgresql.url())
         ensure_db(db_engine)
 
-        experiment_hash = save_experiment_and_get_hash({}, 1234, db_engine)
+        experiment_hash = save_experiment_and_get_hash({}, db_engine)
         matrix_uuids = ['abcd', 'bcde', 'cdef']
 
         # if we associate matrix uuids with an experiment but don't actually build the matrices

From ead882be806165e24ad6c9beffa35e555d735205 Mon Sep 17 00:00:00 2001
From: Kit Rodolfa
Date: Thu, 20 May 2021 18:27:54 -0700
Subject: [PATCH 33/52] debug catwalk integration test

---
 src/tests/catwalk_tests/test_integration.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/tests/catwalk_tests/test_integration.py b/src/tests/catwalk_tests/test_integration.py
index 3a7408c86..db4344eed 100644
--- a/src/tests/catwalk_tests/test_integration.py
+++ b/src/tests/catwalk_tests/test_integration.py
@@ -6,6 +6,7 @@
     MatrixStore,
     MatrixStorageEngine,
 )
+from triage.tracking import initialize_tracking_and_get_run_id
 from tests.utils import (
     get_matrix_store,
     matrix_metadata_creator,
@@ -20,11 +21,19 @@ def test_ModelTrainTester_generate_tasks(db_engine_with_results_schema, project_
     matrix_storage_engine = MatrixStorageEngine(project_storage)
     sample_matrix_store = get_matrix_store(project_storage)
     experiment_hash = save_experiment_and_get_hash({}, db_engine)
+    run_id = initialize_tracking_and_get_run_id(
+        experiment_hash,
+        experiment_class_path="",
+        random_seed=5,
+        experiment_kwargs={},
+        db_engine=db_engine_with_results_schema
+    )
     # instantiate pipeline objects
     trainer = ModelTrainer(
         experiment_hash=experiment_hash,
         model_storage_engine=model_storage_engine,
         db_engine=db_engine,
+        run_id=run_id,
     )
     train_tester = ModelTrainTester(
         matrix_storage_engine=matrix_storage_engine,

From de85f10187c63df954283a5f21479187727be43f Mon Sep 17 00:00:00 2001
From: tweddielin
Date: Sun, 30 May 2021 19:03:10 -0400
Subject: [PATCH 34/52] use public method

---
 docs/mkdocs.yml                          |  2 +-
 src/tests/test_predictlist.py            | 13 +++---
 .../component/catwalk/model_trainers.py  |  2 +-
 src/triage/predictlist/__init__.py       | 46 +++++++++++--------
 4 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index f3437e6ef..631d35bdd 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -120,7 +120,7 @@ nav:
   - Using Postmodeling: postmodeling/index.md
   - Postmodeling & Crosstabs Configuration: postmodeling/postmodeling-config.md
   - Model governance: dirtyduck/ml_governance.md
-  - Risklist: risklist/index.md
+  - Predictlist: predictlist/index.md
   - Scaling up: dirtyduck/aws_batch.md
   - API Reference:
     - Audition:
diff --git a/src/tests/test_predictlist.py b/src/tests/test_predictlist.py
index a64d51e6a..9deaaa5a0 100644
--- a/src/tests/test_predictlist.py
+++ b/src/tests/test_predictlist.py
@@ -3,7 +3,7 @@
 def test_predict_forward_with_existed_model_should_write_predictions(finished_experiment):
-    # given a model id and as-of-date <= today
+    # given a model id and as-of-date <= today
     # and the model id is trained and is linked to an experiment with feature and cohort config
     # generate records in triage_production.predictions
     # the # of records should equal the size of the cohort for that date
@@ -72,31 +72,32 @@ def test_train_matrix_info_from_model_id(finished_experiment):
 
 def test_retrain_should_write_model(finished_experiment):
-    # given a model id and today
+    # given a model id and prediction_date
     # and the model id is trained and is linked to an experiment with feature and cohort config
     # create matrix for retraining a model
     # generate records in production models
     # retrain_model_hash should be the same with model_hash in triage_metadata.models
     model_group_id = 1
-    today = '2014-03-01'
+    prediction_date = '2014-03-01'
     retrainer = Retrainer(
         db_engine=finished_experiment.db_engine,
         project_path=finished_experiment.project_storage.project_path,
         model_group_id=model_group_id,
     )
-    retrainer.retrain(today)
+    retrain_info = retrainer.retrain(prediction_date)
+    model_comment = retrain_info['retrain_model_comment']
 
     records = [
         row
         for row in finished_experiment.db_engine.execute(
-            "select model_hash from triage_metadata.models where model_comment = 'retrain_2014-03-01'"
+            f"select model_hash from triage_metadata.models where model_comment = '{model_comment}'"
         )
     ]
     assert len(records) == 1
     assert retrainer.retrained_model_hash == records[0][0]
 
-    retrainer.predict(today)
+    retrainer.predict(prediction_date)
 
     table_should_have_data(
         db_engine=finished_experiment.db_engine,
diff --git a/src/triage/component/catwalk/model_trainers.py b/src/triage/component/catwalk/model_trainers.py
index 48fcbfc52..7b1f96ba7 100644
--- a/src/triage/component/catwalk/model_trainers.py
+++ b/src/triage/component/catwalk/model_trainers.py
@@ -409,7 +409,7 @@ def process_train_task(
             f"(reason to train: {reason})"
         )
         try:
-            model_id, _ = self._train_and_store_model(
+            model_id, model_hash = self._train_and_store_model(
                 matrix_store, class_path, parameters, model_hash, misc_db_parameters, random_seed, retrain, model_group_id
             )
         except BaselineFeatureNotInMatrix:
diff --git a/src/triage/predictlist/__init__.py b/src/triage/predictlist/__init__.py
index 717c8655f..146b6977b 100644
--- a/src/triage/predictlist/__init__.py
+++ b/src/triage/predictlist/__init__.py
@@ -15,7 +15,7 @@
 from triage.component.catwalk import ModelTrainer
 from triage.component.catwalk.model_trainers import flatten_grid_config
 from triage.component.catwalk.predictors import Predictor
-from triage.component.catwalk.utils import filename_friendly_hash
+from triage.component.catwalk.utils import retrieve_model_hash_from_id, filename_friendly_hash
 from triage.util.conf import convert_str_to_relativedelta, dt_from_str
 from triage.util.db import scoped_session
 
@@ -276,7 +276,7 @@ def predict_forward_with_existed_model(db_engine, project_path, model_id, as_of_
 
 class Retrainer:
-    """Given a model_group_id and today, retrain a model using the all the data till today
+    """Given a model_group_id and prediction_date, retrain a model using all the data up to prediction_date
     Args:
         db_engine (sqlalchemy.engine)
         project_path (string)
@@ -322,7 +322,7 @@ def __init__(self, db_engine, project_path, model_group_id):
             experiment_hash=None,
             model_storage_engine=ModelStorageEngine(self.project_storage),
             db_engine=self.db_engine,
-            replace=False,
+            replace=True,
             run_id=None,
         )
 
@@ -382,19 +382,19 @@ def get_feature_dict_and_imputation_task(self, collate_aggregations, model_id):
         )
         return reconstructed_feature_dict, imputation_table_tasks
 
-    def retrain(self, today):
-        """Retrain a model by going back one split from today, so the as_of_date for training would be (today
- training_label_timespan) + def retrain(self, prediction_date): + """Retrain a model by going back one split from prediction_date, so the as_of_date for training would be (prediction_date - training_label_timespan) Args: - today (str) + prediction_date(str) """ - today = dt_from_str(today) - as_of_date = datetime.strftime(today - convert_str_to_relativedelta(self.training_label_timespan), "%Y-%m-%d") + prediction_date = dt_from_str(prediction_date) + as_of_date = datetime.strftime(prediction_date - convert_str_to_relativedelta(self.training_label_timespan), "%Y-%m-%d") retrain_definition = { 'first_as_of_time': dt_from_str(as_of_date), 'last_as_of_time': dt_from_str(as_of_date), - 'matrix_info_end_time': today, + 'matrix_info_end_time': prediction_date, 'as_of_times': [dt_from_str(as_of_date)], 'training_label_timespan': self.training_label_timespan, 'training_as_of_date_frequency': self.experiment_config['temporal_config']['training_as_of_date_frequencies'], @@ -427,6 +427,7 @@ def retrain(self, today): feature_table_names=feature_imputation_table_tasks.keys(), index_column_lookup=self.feature_generator.index_column_lookup(collate_aggregations), ) + feature_group_creator = FeatureGroupCreator({"all": [True]}) feature_group_mixer = FeatureGroupMixer(["all"]) feature_group_dict = feature_group_mixer.generate( @@ -478,15 +479,17 @@ def retrain(self, today): matrix_uuid=matrix_uuid, matrix_type="train", ) + + retrain_model_comment = 'retrain_' + str(datetime.now()) misc_db_parameters = { 'train_end_time': dt_from_str(as_of_date), 'test': False, 'train_matrix_uuid': matrix_uuid, 'training_label_timespan': self.training_label_timespan, - 'model_comment': 'retrain_' + datetime.strftime(today, '%Y-%m-%d'), + 'model_comment': retrain_model_comment, } - retrained_model_id, retrained_model_hash = self.model_trainer._train_and_store_model( + retrained_model_id = self.model_trainer.process_train_task( matrix_store=self.matrix_storage_engine.get_store(matrix_uuid), class_path=self.model_group_info['model_type'], parameters=self.model_group_info['hyperparameters'], @@ -496,23 +499,26 @@ def retrain(self, today): retrain=True, model_group_id=self.model_group_id, ) - self.retrained_model_hash = retrained_model_hash + + self.retrained_model_hash = retrieve_model_hash_from_id(self.db_engine, retrained_model_id) self.retrained_matrix_uuid = matrix_uuid self.retrained_model_id = retrained_model_id + + return {'retrain_model_comment': retrain_model_comment} - def predict(self, today): - """Predict forward by creating a matrix using as_of_date = today and applying the retrained model on it + def predict(self, prediction_date): + """Predict forward by creating a matrix using as_of_date = prediction_date and applying the retrained model on it Args: - today (str) + prediction_date(str) """ cohort_table_name = f"triage_production.cohort_{self.experiment_config['cohort_config']['name']}_predict" # 1. Generate cohort - self.generate_entity_date_table(today, cohort_table_name) + self.generate_entity_date_table(prediction_date, cohort_table_name) # 2. 
Generate feature aggregations - collate_aggregations = self.get_collate_aggregations(today, cohort_table_name) + collate_aggregations = self.get_collate_aggregations(prediction_date, cohort_table_name) self.feature_generator.process_table_tasks( self.feature_generator.generate_all_table_tasks( collate_aggregations, @@ -544,7 +550,7 @@ def predict(self, today): temporal_config = self.experiment_config["temporal_config"] timechopper = Timechop(**temporal_config) prod_definitions = timechopper.define_test_matrices( - train_test_split_time=dt_from_str(today), + train_test_split_time=dt_from_str(prediction_date), test_duration=temporal_config['test_durations'][0], test_label_timespan=temporal_config['test_label_timespans'][0] ) @@ -562,12 +568,12 @@ def predict(self, today): user_metadata=self.user_metadata, ) - matrix_metadata['matrix_id'] = str(today) + f'_model_id_{self.retrained_model_id}' + '_risklist' + matrix_metadata['matrix_id'] = str(prediction_date) + f'_model_id_{self.retrained_model_id}' + '_risklist' matrix_uuid = filename_friendly_hash(matrix_metadata) matrix_builder.build_matrix( - as_of_times=[today], + as_of_times=[prediction_date], label_name=self.label_name, label_type='binary', feature_dictionary=reconstructed_feature_dict, From 40466d55db608984f91e2fba41771436393ca253 Mon Sep 17 00:00:00 2001 From: tweddielin Date: Sun, 30 May 2021 23:48:37 -0400 Subject: [PATCH 35/52] alembic merge --- ...8b_merge_b097e47ba829_with_cdd0dc9d9870.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 src/triage/component/results_schema/alembic/versions/079a74c15e8b_merge_b097e47ba829_with_cdd0dc9d9870.py diff --git a/src/triage/component/results_schema/alembic/versions/079a74c15e8b_merge_b097e47ba829_with_cdd0dc9d9870.py b/src/triage/component/results_schema/alembic/versions/079a74c15e8b_merge_b097e47ba829_with_cdd0dc9d9870.py new file mode 100644 index 000000000..73021015a --- /dev/null +++ b/src/triage/component/results_schema/alembic/versions/079a74c15e8b_merge_b097e47ba829_with_cdd0dc9d9870.py @@ -0,0 +1,24 @@ +"""merge b097e47ba829 with cdd0dc9d9870 + +Revision ID: 079a74c15e8b +Revises: b097e47ba829, cdd0dc9d9870 +Create Date: 2021-05-30 20:49:19.039280 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision = '079a74c15e8b' +down_revision = ('b097e47ba829', 'cdd0dc9d9870') +branch_labels = None +depends_on = None + + +def upgrade(): + pass + + +def downgrade(): + pass From 83c73856f42d9e41dac8b55aecfb7edc3a652fd8 Mon Sep 17 00:00:00 2001 From: tweddielin Date: Sun, 30 May 2021 23:49:46 -0400 Subject: [PATCH 36/52] reuse random seed --- .../test_model_group_evaluator.py | 2 +- src/tests/utils.py | 2 +- src/triage/predictlist/__init__.py | 43 ++++++++++++------- 3 files changed, 29 insertions(+), 18 deletions(-) diff --git a/src/tests/postmodeling_tests/test_model_group_evaluator.py b/src/tests/postmodeling_tests/test_model_group_evaluator.py index 06a31482b..7db44e625 100644 --- a/src/tests/postmodeling_tests/test_model_group_evaluator.py +++ b/src/tests/postmodeling_tests/test_model_group_evaluator.py @@ -11,7 +11,7 @@ def model_group_evaluator(finished_experiment): def test_ModelGroupEvaluator_metadata(model_group_evaluator): assert isinstance(model_group_evaluator.metadata, list) - assert len(model_group_evaluator.metadata) == 8 # 8 model groups expected from basic experiment + assert len(model_group_evaluator.metadata) == 2 # 2 models expected for a model_group from basic experiment for row in model_group_evaluator.metadata: assert isinstance(row, dict) diff --git a/src/tests/utils.py b/src/tests/utils.py index a1d43fe7e..6265b9f00 100644 --- a/src/tests/utils.py +++ b/src/tests/utils.py @@ -437,7 +437,7 @@ def sample_config(): "label_config": label_config, "entity_column_name": "entity_id", "model_comment": "test2-final-final", - "model_group_keys": ["label_name", "label_type", "custom_key"], + "model_group_keys": ["label_name", "label_type", "custom_key", "class_path", "parameters"], "feature_aggregations": feature_config, "cohort_config": cohort_config, "temporal_config": temporal_config, diff --git a/src/triage/predictlist/__init__.py b/src/triage/predictlist/__init__.py index 146b6977b..cafef1117 100644 --- a/src/triage/predictlist/__init__.py +++ b/src/triage/predictlist/__init__.py @@ -54,21 +54,26 @@ def experiment_config_from_model_group_id(db_engine, model_group_id): Returns: (dict) experiment config """ - get_experiment_query = '''select experiments.config + get_experiment_query = ''' + select experiment_runs.id as run_id, experiments.config from triage_metadata.experiments - join triage_metadata.models on (experiments.experiment_hash = models.built_by_experiment) + join triage_metadata.models + on (experiments.experiment_hash = models.built_by_experiment) + join triage_metadata.experiment_runs + on (experiment_runs.experiment_hash = models.built_by_experiment) where model_group_id = %s ''' - (config,) = db_engine.execute(get_experiment_query, model_group_id).first() - return config + (run_id, config) = db_engine.execute(get_experiment_query, model_group_id).first() + return run_id, config def get_model_group_info(db_engine, model_group_id): query = """ - SELECT m.model_group_id, m.model_type, m.hyperparameters + SELECT m.model_group_id, m.model_type, m.hyperparameters, m.model_id as model_id_last_split FROM triage_metadata.models m JOIN triage_metadata.model_groups mg using (model_group_id) WHERE model_group_id = %s + ORDER BY m.train_end_time DESC """ model_group_info = db_engine.execute(query, model_group_id).fetchone() return dict(model_group_info) @@ -285,11 +290,11 @@ class Retrainer: def __init__(self, db_engine, project_path, model_group_id): self.db_engine = db_engine upgrade_db(db_engine=self.db_engine) - self.project_storage = ProjectStorage(project_path) 
self.model_group_id = model_group_id self.model_trainer = None self.matrix_storage_engine = self.project_storage.matrix_storage_engine() + self.run_id, self.experiment_config = experiment_config_from_model_group_id(self.db_engine, self.model_group_id) self.training_label_timespan = self.experiment_config['temporal_config']['training_label_timespans'][0] self.feature_start_time=self.experiment_config['temporal_config']['feature_start_time'] self.label_name = self.experiment_config['label_config']['name'] @@ -323,14 +328,9 @@ def __init__(self, db_engine, project_path, model_group_id): model_storage_engine=ModelStorageEngine(self.project_storage), db_engine=self.db_engine, replace=True, - run_id=None, + run_id=self.run_id, ) - @property - def experiment_config(self): - experiment_config = experiment_config_from_model_group_id(self.db_engine, self.model_group_id) - return experiment_config - def generate_all_labels(self, as_of_date): self.label_generator.generate_all_labels( labels_table=self.labels_table_name, @@ -401,7 +401,7 @@ def retrain(self, prediction_date): 'max_training_history': self.experiment_config['temporal_config']['max_training_histories'][0], } cohort_table_name = f"triage_production.cohort_{self.experiment_config['cohort_config']['name']}_retrain" - + # 1. Generate all labels self.generate_all_labels(as_of_date) @@ -479,7 +479,6 @@ def retrain(self, prediction_date): matrix_uuid=matrix_uuid, matrix_type="train", ) - retrain_model_comment = 'retrain_' + str(datetime.now()) misc_db_parameters = { @@ -489,15 +488,27 @@ def retrain(self, prediction_date): 'training_label_timespan': self.training_label_timespan, 'model_comment': retrain_model_comment, } + + last_split_train_matrix_uuid, last_split_matrix_metadata = train_matrix_info_from_model_id( + self.db_engine, + model_id=self.model_group_info['model_id_last_split'] + ) + + random_seed = self.model_trainer.get_or_generate_random_seed( + model_group_id=self.model_group_id, + matrix_metadata=last_split_matrix_metadata, + train_matrix_uuid=last_split_train_matrix_uuid + ) + retrained_model_id = self.model_trainer.process_train_task( matrix_store=self.matrix_storage_engine.get_store(matrix_uuid), class_path=self.model_group_info['model_type'], parameters=self.model_group_info['hyperparameters'], model_hash=None, misc_db_parameters=misc_db_parameters, - random_seed=random.randint(1,1e7), + random_seed=random_seed, retrain=True, - model_group_id=self.model_group_id, + model_group_id=self.model_group_id ) self.retrained_model_hash = retrieve_model_hash_from_id(self.db_engine, retrained_model_id) From f97089bf3e6a28d09ea991b1928f3843e9ff80d7 Mon Sep 17 00:00:00 2001 From: tweddielin Date: Wed, 30 Jun 2021 17:52:34 -0400 Subject: [PATCH 37/52] use timechop for getting retrain information --- src/triage/predictlist/__init__.py | 36 +++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/src/triage/predictlist/__init__.py b/src/triage/predictlist/__init__.py index cafef1117..225216440 100644 --- a/src/triage/predictlist/__init__.py +++ b/src/triage/predictlist/__init__.py @@ -296,6 +296,7 @@ def __init__(self, db_engine, project_path, model_group_id): self.matrix_storage_engine = self.project_storage.matrix_storage_engine() self.run_id, self.experiment_config = experiment_config_from_model_group_id(self.db_engine, self.model_group_id) self.training_label_timespan = self.experiment_config['temporal_config']['training_label_timespans'][0] + self.test_label_timespan = 
self.experiment_config['temporal_config']['test_label_timespans'][0] self.feature_start_time=self.experiment_config['temporal_config']['feature_start_time'] self.label_name = self.experiment_config['label_config']['name'] self.cohort_name = self.experiment_config['cohort_config']['name'] @@ -389,19 +390,33 @@ def retrain(self, prediction_date): prediction_date(str) """ prediction_date = dt_from_str(prediction_date) - as_of_date = datetime.strftime(prediction_date - convert_str_to_relativedelta(self.training_label_timespan), "%Y-%m-%d") - + # as_of_date = datetime.strftime(prediction_date - convert_str_to_relativedelta(self.training_label_timespan), "%Y-%m-%d") + temporal_config = self.experiment_config['temporal_config'].copy() + temporal_config['feature_end_time'] = datetime.strftime(prediction_date, "%Y-%m-%d") + temporal_config['label_start_time'] = datetime.strftime( + prediction_date - + convert_str_to_relativedelta(self.training_label_timespan) - + convert_str_to_relativedelta(self.test_label_timespan), + "%Y-%m-%d") + temporal_config['label_end_time'] = datetime.strftime( + prediction_date + convert_str_to_relativedelta(self.test_label_timespan), + "%Y-%m-%d") + temporal_config['model_update_frequency'] = self.test_label_timespan + timechopper = Timechop(**temporal_config) + chops = timechopper.chop_time() + assert len(chops) == 1 + chops_train_matrix = chops[0]['train_matrix'] retrain_definition = { - 'first_as_of_time': dt_from_str(as_of_date), - 'last_as_of_time': dt_from_str(as_of_date), - 'matrix_info_end_time': prediction_date, - 'as_of_times': [dt_from_str(as_of_date)], - 'training_label_timespan': self.training_label_timespan, - 'training_as_of_date_frequency': self.experiment_config['temporal_config']['training_as_of_date_frequencies'], - 'max_training_history': self.experiment_config['temporal_config']['max_training_histories'][0], + 'first_as_of_time': chops_train_matrix['first_as_of_time'], + 'last_as_of_time': chops_train_matrix['last_as_of_time'], + 'matrix_info_end_time': chops_train_matrix['matrix_info_end_time'], + 'as_of_times': chops_train_matrix['as_of_times'], + 'training_label_timespan': chops_train_matrix['training_label_timespan'], + 'max_training_history': chops_train_matrix['max_training_history'], } - cohort_table_name = f"triage_production.cohort_{self.experiment_config['cohort_config']['name']}_retrain" + as_of_date = datetime.strftime(chops_train_matrix['last_as_of_time'], "%Y-%m-%d") + cohort_table_name = f"triage_production.cohort_{self.experiment_config['cohort_config']['name']}_retrain" # 1. 
Generate all labels self.generate_all_labels(as_of_date) @@ -514,7 +529,6 @@ def retrain(self, prediction_date): self.retrained_model_hash = retrieve_model_hash_from_id(self.db_engine, retrained_model_id) self.retrained_matrix_uuid = matrix_uuid self.retrained_model_id = retrained_model_id - return {'retrain_model_comment': retrain_model_comment} def predict(self, prediction_date): From 6f0af1c15c56f83ebe4bb9aeb18d21eef6f18ec0 Mon Sep 17 00:00:00 2001 From: tweddielin Date: Wed, 30 Jun 2021 19:57:30 -0400 Subject: [PATCH 38/52] create retrain model hash in retrain level instead of model_trainer level --- src/triage/component/catwalk/model_trainers.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/triage/component/catwalk/model_trainers.py b/src/triage/component/catwalk/model_trainers.py index 586b95675..c36823382 100644 --- a/src/triage/component/catwalk/model_trainers.py +++ b/src/triage/component/catwalk/model_trainers.py @@ -283,13 +283,11 @@ def _train_and_store_model( unique_parameters = self.unique_parameters(parameters) - if model_hash is None and retrain and model_group_id: - model_hash = self._model_hash( - matrix_store.metadata, - class_path, - parameters, - random_seed, - ) + if retrain: + # if retrain, use the provided model_group_id + if not model_group_id: + raise ValueError("model_group_id should be provided when retrain") + else: model_group_id = self.model_grouper.get_model_group_id( class_path, unique_parameters, matrix_store.metadata, self.db_engine From 42bccaa09335eac799850396361e8bb464899740 Mon Sep 17 00:00:00 2001 From: tweddielin Date: Wed, 30 Jun 2021 19:58:12 -0400 Subject: [PATCH 39/52] move util functions to utils --- src/triage/predictlist/__init__.py | 168 +++++++---------------------- src/triage/predictlist/utils.py | 114 ++++++++++++++++++++ 2 files changed, 154 insertions(+), 128 deletions(-) create mode 100644 src/triage/predictlist/utils.py diff --git a/src/triage/predictlist/__init__.py b/src/triage/predictlist/__init__.py index 225216440..7968020b4 100644 --- a/src/triage/predictlist/__init__.py +++ b/src/triage/predictlist/__init__.py @@ -18,10 +18,19 @@ from triage.component.catwalk.utils import retrieve_model_hash_from_id, filename_friendly_hash from triage.util.conf import convert_str_to_relativedelta, dt_from_str from triage.util.db import scoped_session +from .utils import ( + experiment_config_from_model_id, + experiment_config_from_model_group_id, + get_model_group_info, + train_matrix_info_from_model_id, + get_feature_names, + get_feature_needs_imputation_in_train, + get_feature_needs_imputation_in_production, +) + from collections import OrderedDict import json -import re import random from datetime import datetime @@ -29,114 +38,6 @@ logger = verboselogs.VerboseLogger(__name__) -def experiment_config_from_model_id(db_engine, model_id): - """Get original experiment config from model_id - Args: - db_engine (sqlalchemy.db.engine) - model_id (int) The id of a given model in the database - - Returns: (dict) experiment config - """ - get_experiment_query = '''select experiments.config - from triage_metadata.experiments - join triage_metadata.models on (experiments.experiment_hash = models.built_by_experiment) - where model_id = %s - ''' - (config,) = db_engine.execute(get_experiment_query, model_id).first() - return config - - -def experiment_config_from_model_group_id(db_engine, model_group_id): - """Get original experiment config from model_id - Args: - db_engine (sqlalchemy.db.engine) - model_id (int) The id 
of a given model in the database - - Returns: (dict) experiment config - """ - get_experiment_query = ''' - select experiment_runs.id as run_id, experiments.config - from triage_metadata.experiments - join triage_metadata.models - on (experiments.experiment_hash = models.built_by_experiment) - join triage_metadata.experiment_runs - on (experiment_runs.experiment_hash = models.built_by_experiment) - where model_group_id = %s - ''' - (run_id, config) = db_engine.execute(get_experiment_query, model_group_id).first() - return run_id, config - - -def get_model_group_info(db_engine, model_group_id): - query = """ - SELECT m.model_group_id, m.model_type, m.hyperparameters, m.model_id as model_id_last_split - FROM triage_metadata.models m - JOIN triage_metadata.model_groups mg using (model_group_id) - WHERE model_group_id = %s - ORDER BY m.train_end_time DESC - """ - model_group_info = db_engine.execute(query, model_group_id).fetchone() - return dict(model_group_info) - - -def train_matrix_info_from_model_id(db_engine, model_id): - """Get original train matrix information from model_id - Args: - db_engine (sqlalchemy.db.engine) - model_id (int) The id of a given model in the database - - Returns: (str, dict) matrix uuid and matrix metadata - """ - get_train_matrix_query = """ - select matrix_uuid, matrices.matrix_metadata - from triage_metadata.matrices - join triage_metadata.models on (models.train_matrix_uuid = matrices.matrix_uuid) - where model_id = %s - """ - return db_engine.execute(get_train_matrix_query, model_id).first() - - -def get_feature_names(aggregation, matrix_metadata): - """Returns a feature group name and a list of feature names from a SpacetimeAggregation object""" - feature_prefix = aggregation.prefix - logger.spam("Feature prefix = %s", feature_prefix) - feature_group = aggregation.get_table_name(imputed=True).split('.')[1].replace('"', '') - logger.spam("Feature group = %s", feature_group) - feature_names_in_group = [f for f in matrix_metadata['feature_names'] if re.match(f'\\A{feature_prefix}_', f)] - logger.spam("Feature names in group = %s", feature_names_in_group) - - return feature_group, feature_names_in_group - - -def get_feature_needs_imputation_in_train(aggregation, feature_names): - """Returns features that needs imputation from training data - Args: - aggregation (SpacetimeAggregation) - feature_names (list) A list of feature names - """ - features_imputed_in_train = [ - f for f in set(feature_names) - if not f.endswith('_imp') - and aggregation.imputation_flag_base(f) + '_imp' in feature_names - ] - logger.spam("Features imputed in train = %s", features_imputed_in_train) - return features_imputed_in_train - - -def get_feature_needs_imputation_in_production(aggregation, db_engine): - """Returns features that needs imputation from triage_production - Args: - aggregation (SpacetimeAggregation) - db_engine (sqlalchemy.db.engine) - """ - with db_engine.begin() as conn: - nulls_results = conn.execute(aggregation.find_nulls()) - - null_counts = nulls_results.first().items() - features_imputed_in_production = [col for (col, val) in null_counts if val is not None and val > 0] - - return features_imputed_in_production - def predict_forward_with_existed_model(db_engine, project_path, model_id, as_of_date): """Predict forward given model_id and as_of_date and store the prediction in database @@ -147,7 +48,7 @@ def predict_forward_with_existed_model(db_engine, project_path, model_id, as_of_ model_id (int) The id of a given model in the database as_of_date (string) a date 
string like "YYYY-MM-DD" """ - logger.spam("In RISK LIST................") + logger.spam("In PREDICT LIST................") upgrade_db(db_engine=db_engine) project_storage = ProjectStorage(project_path) matrix_storage_engine = project_storage.matrix_storage_engine() @@ -292,7 +193,6 @@ def __init__(self, db_engine, project_path, model_group_id): upgrade_db(db_engine=self.db_engine) self.project_storage = ProjectStorage(project_path) self.model_group_id = model_group_id - self.model_trainer = None self.matrix_storage_engine = self.project_storage.matrix_storage_engine() self.run_id, self.experiment_config = experiment_config_from_model_group_id(self.db_engine, self.model_group_id) self.training_label_timespan = self.experiment_config['temporal_config']['training_label_timespans'][0] @@ -331,6 +231,21 @@ def __init__(self, db_engine, project_path, model_group_id): replace=True, run_id=self.run_id, ) + + def get_temporal_config_for_retrain(self, prediction_date): + temporal_config = self.experiment_config['temporal_config'].copy() + temporal_config['feature_end_time'] = datetime.strftime(prediction_date, "%Y-%m-%d") + temporal_config['label_start_time'] = datetime.strftime( + prediction_date - + convert_str_to_relativedelta(self.training_label_timespan) - + convert_str_to_relativedelta(self.test_label_timespan), + "%Y-%m-%d") + temporal_config['label_end_time'] = datetime.strftime( + prediction_date + convert_str_to_relativedelta(self.test_label_timespan), + "%Y-%m-%d") + temporal_config['model_update_frequency'] = self.test_label_timespan + + return temporal_config def generate_all_labels(self, as_of_date): self.label_generator.generate_all_labels( @@ -390,18 +305,7 @@ def retrain(self, prediction_date): prediction_date(str) """ prediction_date = dt_from_str(prediction_date) - # as_of_date = datetime.strftime(prediction_date - convert_str_to_relativedelta(self.training_label_timespan), "%Y-%m-%d") - temporal_config = self.experiment_config['temporal_config'].copy() - temporal_config['feature_end_time'] = datetime.strftime(prediction_date, "%Y-%m-%d") - temporal_config['label_start_time'] = datetime.strftime( - prediction_date - - convert_str_to_relativedelta(self.training_label_timespan) - - convert_str_to_relativedelta(self.test_label_timespan), - "%Y-%m-%d") - temporal_config['label_end_time'] = datetime.strftime( - prediction_date + convert_str_to_relativedelta(self.test_label_timespan), - "%Y-%m-%d") - temporal_config['model_update_frequency'] = self.test_label_timespan + temporal_config = self.get_temporal_config_for_retrain(prediction_date) timechopper = Timechop(**temporal_config) chops = timechopper.chop_time() assert len(chops) == 1 @@ -503,7 +407,8 @@ def retrain(self, prediction_date): 'training_label_timespan': self.training_label_timespan, 'model_comment': retrain_model_comment, } - + + # get the random seed fromthe last split last_split_train_matrix_uuid, last_split_matrix_metadata = train_matrix_info_from_model_id( self.db_engine, model_id=self.model_group_info['model_id_last_split'] @@ -514,12 +419,20 @@ def retrain(self, prediction_date): matrix_metadata=last_split_matrix_metadata, train_matrix_uuid=last_split_train_matrix_uuid ) + + # create retrained model hash + retrained_model_hash = self.model_trainer._model_hash( + self.matrix_storage_engine.get_store(matrix_uuid).metadata, + class_path=self.model_group_info['model_type'], + parameters=self.model_group_info['hyperparameters'], + random_seed=random_seed, + ) retrained_model_id = self.model_trainer.process_train_task( 
matrix_store=self.matrix_storage_engine.get_store(matrix_uuid), class_path=self.model_group_info['model_type'], parameters=self.model_group_info['hyperparameters'], - model_hash=None, + model_hash=retrained_model_hash, misc_db_parameters=misc_db_parameters, random_seed=random_seed, retrain=True, @@ -572,16 +485,15 @@ def predict(self, prediction_date): replace=True, ) # Use timechop to get the time definition for production - temporal_config = self.experiment_config["temporal_config"] + # temporal_config = self.experiment_config["temporal_config"] + temporal_config = self.get_temporal_config_for_retrain(dt_from_str(prediction_date)) timechopper = Timechop(**temporal_config) prod_definitions = timechopper.define_test_matrices( train_test_split_time=dt_from_str(prediction_date), test_duration=temporal_config['test_durations'][0], test_label_timespan=temporal_config['test_label_timespans'][0] ) - last_split_definition = prod_definitions[-1] - matrix_metadata = Planner.make_metadata( matrix_definition=last_split_definition, feature_dictionary=reconstructed_feature_dict, diff --git a/src/triage/predictlist/utils.py b/src/triage/predictlist/utils.py new file mode 100644 index 000000000..79508c4f2 --- /dev/null +++ b/src/triage/predictlist/utils.py @@ -0,0 +1,114 @@ +import re +import verboselogs, logging +logger = verboselogs.VerboseLogger(__name__) + + +def experiment_config_from_model_id(db_engine, model_id): + """Get original experiment config from model_id + Args: + db_engine (sqlalchemy.db.engine) + model_id (int) The id of a given model in the database + + Returns: (dict) experiment config + """ + get_experiment_query = '''select experiments.config + from triage_metadata.experiments + join triage_metadata.models on (experiments.experiment_hash = models.built_by_experiment) + where model_id = %s + ''' + (config,) = db_engine.execute(get_experiment_query, model_id).first() + return config + + +def experiment_config_from_model_group_id(db_engine, model_group_id): + """Get original experiment config from model_id + Args: + db_engine (sqlalchemy.db.engine) + model_id (int) The id of a given model in the database + + Returns: (dict) experiment config + """ + get_experiment_query = ''' + select experiment_runs.id as run_id, experiments.config + from triage_metadata.experiments + join triage_metadata.models + on (experiments.experiment_hash = models.built_by_experiment) + join triage_metadata.experiment_runs + on (experiment_runs.id = models.built_in_experiment_run) + where model_group_id = %s + order by experiment_runs.start_time desc + ''' + (run_id, config) = db_engine.execute(get_experiment_query, model_group_id).first() + return run_id, config + + +def get_model_group_info(db_engine, model_group_id): + query = """ + SELECT model_group_id, model_type, hyperparameters, model_id as model_id_last_split + FROM triage_metadata.models + WHERE model_group_id = %s + ORDER BY train_end_time DESC + """ + model_group_info = db_engine.execute(query, model_group_id).fetchone() + return dict(model_group_info) + + +def train_matrix_info_from_model_id(db_engine, model_id): + """Get original train matrix information from model_id + Args: + db_engine (sqlalchemy.db.engine) + model_id (int) The id of a given model in the database + + Returns: (str, dict) matrix uuid and matrix metadata + """ + get_train_matrix_query = """ + select matrix_uuid, matrices.matrix_metadata + from triage_metadata.matrices + join triage_metadata.models on (models.train_matrix_uuid = matrices.matrix_uuid) + where model_id = %s + """ + 
return db_engine.execute(get_train_matrix_query, model_id).first()
+
+
+def get_feature_names(aggregation, matrix_metadata):
+    """Returns a feature group name and a list of feature names from a SpacetimeAggregation object"""
+    feature_prefix = aggregation.prefix
+    logger.spam("Feature prefix = %s", feature_prefix)
+    feature_group = aggregation.get_table_name(imputed=True).split('.')[1].replace('"', '')
+    logger.spam("Feature group = %s", feature_group)
+    feature_names_in_group = [f for f in matrix_metadata['feature_names'] if re.match(f'\\A{feature_prefix}_', f)]
+    logger.spam("Feature names in group = %s", feature_names_in_group)
+
+    return feature_group, feature_names_in_group
+
+
+def get_feature_needs_imputation_in_train(aggregation, feature_names):
+    """Returns features that need imputation from training data
+    Args:
+        aggregation (SpacetimeAggregation)
+        feature_names (list) A list of feature names
+    """
+    features_imputed_in_train = [
+        f for f in set(feature_names)
+        if not f.endswith('_imp')
+        and aggregation.imputation_flag_base(f) + '_imp' in feature_names
+    ]
+    logger.spam("Features imputed in train = %s", features_imputed_in_train)
+    return features_imputed_in_train
+
+
+def get_feature_needs_imputation_in_production(aggregation, db_engine):
+    """Returns features that need imputation from triage_production
+    Args:
+        aggregation (SpacetimeAggregation)
+        db_engine (sqlalchemy.db.engine)
+    """
+    with db_engine.begin() as conn:
+        nulls_results = conn.execute(aggregation.find_nulls())
+
+    null_counts = nulls_results.first().items()
+    features_imputed_in_production = [col for (col, val) in null_counts if val is not None and val > 0]
+
+    return features_imputed_in_production
+
+

From 3ec377fefe71705f4af504c06378146bccbe1ccb Mon Sep 17 00:00:00 2001
From: tweddielin 
Date: Wed, 30 Jun 2021 21:05:04 -0400
Subject: [PATCH 40/52] fix cli and docs

---
 .../{risklist => predictlist}/index.md        | 79 ++++++++++---------
 src/triage/cli.py                             |  6 +-
 src/triage/predictlist/__init__.py            |  2 +-
 3 files changed, 44 insertions(+), 43 deletions(-)
 rename docs/sources/{risklist => predictlist}/index.md (99%)

diff --git a/docs/sources/risklist/index.md b/docs/sources/predictlist/index.md
similarity index 99%
rename from docs/sources/risklist/index.md
rename to docs/sources/predictlist/index.md
index c48d9acf8..e25e0f7cd 100644
--- a/docs/sources/risklist/index.md
+++ b/docs/sources/predictlist/index.md
@@ -1,85 +1,86 @@
-# Predictlist
-If you would like to generate a list of predictions on already-trained Triage model with new data, you can use the 'Predictlist' module.
-
-# Predict Foward with Existed Model
-Use an existing model object to generate predictions on new data.
+# Retrain and Predict
+Use an existing model group to retrain a new model on all the data up to the current date and then predict forward into the future.
 
 ## Examples
 Both examples assume you have already run a Triage Experiment in the past, and know these two pieces of information:
-1. A `model_id` from a Triage model that you want to use to generate predictions
-2. An `as_of_date` to generate your predictions on.
+1. A `model_group_id` from a Triage model group that you want to use to retrain a model and generate predictions (see the lookup sketch after this list)
+2. A `today` to generate your predictions on.
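+
+If you do not have a `model_group_id` at hand, one way to look it up (a minimal sketch, assuming the standard `triage_metadata.models` table and a configured database connection) is:
+
+```python
+from triage import create_engine
+
+engine = create_engine()  # assumes your database credentials are configured
+
+# list model groups, most recently trained first, to pick one to retrain
+query = """
+    SELECT model_group_id, model_type, MAX(train_end_time) AS last_train_end_time
+    FROM triage_metadata.models
+    GROUP BY model_group_id, model_type
+    ORDER BY last_train_end_time DESC
+"""
+for row in engine.execute(query):
+    print(row.model_group_id, row.model_type, row.last_train_end_time)
+```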
 ### CLI
-`triage predictlist <model_id> <as_of_date>`
+`triage retrainpredict <model_group_id> <prediction_date>`
 
 Example:
-`triage predictlist 46 2019-05-06`
+`triage retrainpredict 30 2021-04-04`
 
-The predictlist will assume the current path to be the 'project path' to find models and write matrices, but this can be overridden by sending the `--project-path` option.
+The `retrainpredict` will assume the current path to be the 'project path' to train models and write matrices, but this can be overridden by sending the `--project-path` option.
 
 ### Python
+The `Retrainer` class from the `triage.predictlist` module can be used to retrain a model and predict forward.
-The `predict_forward_with_existed_model` function from the `triage.predictlist` module can be used similarly to the CLI, with the addition of the database engine and project storage as inputs.
-```
-from triage.predictlist import generate predict_forward_with_existed_model
+```python
+from triage.predictlist import Retrainer
 from triage import create_engine
-predict_forward_with_existed_model(
+retrainer = Retrainer(
     db_engine=create_engine(),
     project_path='/home/you/triage/project2',
-    model_id=46,
-    as_of_date='2019-05-06'
+    model_group_id=36,
 )
+retrainer.retrain(today='2021-04-04')
+retrainer.predict(today='2021-04-04')
 ```
 
 ## Output
-The Predictlist is stored similarly to the matrices created during an Experiment:
+The retrained model is stored similarly to the matrices created during an Experiment:
 - Raw Matrix saved to the matrices directory in project storage
+- Raw Model saved to the trained_model directory in project storage
+- Retrained Model info saved in a table (triage_metadata.models) where model_comment = 'retrain_2021-04-04'
 - Predictions saved in a table (triage_production.predictions)
-- Prediction metadata (tiebreaking, random seed) saved in a table (triage_production.prediction_metadata)
+- Prediction metadata (tiebreaking, random seed) saved in a table (triage_production.prediction_metadata)
 
-## Notes
-- The cohort and features for the Predictlist are all inferred from the Experiment that trained the given model_id (as defined by the experiment_models table).
-- The feature list ensures that imputation flag columns are present for any columns that either needed to be imputed in the training process, or that needed to be imputed in the predictlist dataset.
+# Predictlist
+If you would like to generate a list of predictions on an already-trained Triage model with new data, you can use the 'Predictlist' module.
+
+# Predict Forward with an Existing Model
+Use an existing model object to generate predictions on new data.
 
 ## Examples
 Both examples assume you have already run a Triage Experiment in the past, and know these two pieces of information:
-1. A `model_group_id` from a Triage model group that you want to use to retrain a model and generate prediction
-2. A `today` to generate your predictions on.
+1. A `model_id` from a Triage model that you want to use to generate predictions (see the lookup sketch after this list)
+2. An `as_of_date` to generate your predictions on.
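+
+If you only know the model group, a quick way to pick a concrete `model_id` (a sketch under the same `triage_metadata.models` assumptions as above) is to take the most recently trained model in that group:
+
+```python
+from triage import create_engine
+
+engine = create_engine()  # assumes your database credentials are configured
+
+# newest model in a model group; 36 is just an illustrative model_group_id
+model_id = engine.execute(
+    """
+    SELECT model_id
+    FROM triage_metadata.models
+    WHERE model_group_id = %s
+    ORDER BY train_end_time DESC
+    LIMIT 1
+    """,
+    36,
+).scalar()
+print(model_id)
+```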
 ### CLI
-`triage retrainpredict <model_group_id> <prediction_date>`
+`triage predictlist <model_id> <as_of_date>`
 
 Example:
-`triage retrainpredict 30 2021-04-04`
+`triage predictlist 46 2019-05-06`
 
-The `retrainpredict` will assume the current path to be the 'project path' to train models and write matrices, but this can be overridden by sending the `--project-path` option
+The predictlist will assume the current path to be the 'project path' to find models and write matrices, but this can be overridden by sending the `--project-path` option.
 
 ### Python
-The `Retrainer` class from `triage.predictlist` module can be used to retrain a model and predict forward.
-```python
-from triage.predictlist import Retrainer
+The `predict_forward_with_existed_model` function from the `triage.predictlist` module can be used similarly to the CLI, with the addition of the database engine and project storage as inputs.
+```
+from triage.predictlist import predict_forward_with_existed_model
 from triage import create_engine
-retrainer = Retrainer(
+predict_forward_with_existed_model(
     db_engine=create_engine(),
     project_path='/home/you/triage/project2',
-    model_group_id=36,
+    model_id=46,
+    as_of_date='2019-05-06'
 )
-retrainer.retrain(today='2021-04-04')
-retrainer.predict(today='2021-04-04')
 ```
 
 ## Output
-The retrained model is sotred similariy to the matrices created during an Experiment:
+The Predictlist is stored similarly to the matrices created during an Experiment:
 - Raw Matrix saved to the matrices directory in project storage
-- Raw Model saved to the trained_model directory in project storage
-- Retrained Model info saved in a table (triage_metadata.models) where model_comment = 'retrain_2021-04-04'
 - Predictions saved in a table (triage_production.predictions)
-- Prediction metadata (tiebreaking, random seed) saved in a table (triage_produciton.prediction_metadata)
+- Prediction metadata (tiebreaking, random seed) saved in a table (triage_production.prediction_metadata)
+
+## Notes
+- The cohort and features for the Predictlist are all inferred from the Experiment that trained the given model_id (as defined by the experiment_models table).
+- The feature list ensures that imputation flag columns are present for any columns that either needed to be imputed in the training process, or that needed to be imputed in the predictlist dataset.
diff --git a/src/triage/cli.py b/src/triage/cli.py
index 7c3e16c7a..df58a2b9e 100755
--- a/src/triage/cli.py
+++ b/src/triage/cli.py
@@ -411,7 +411,7 @@ def __init__(self, parser):
         )
 
         parser.add_argument(
-            "today",
+            "prediction_date",
            type=valid_date,
            help="The date as of which to run features.
Format YYYY-MM-DD", ) @@ -428,8 +428,8 @@ def __call__(self, args): args.project_path, args.model_group_id, ) - retrainer.retrain(args.today) - retrainer.predict(args.today) + retrainer.retrain(args.prediction_date) + retrainer.predict(args.prediction_date) @Triage.register diff --git a/src/triage/predictlist/__init__.py b/src/triage/predictlist/__init__.py index 7968020b4..65d989335 100644 --- a/src/triage/predictlist/__init__.py +++ b/src/triage/predictlist/__init__.py @@ -442,7 +442,7 @@ def retrain(self, prediction_date): self.retrained_model_hash = retrieve_model_hash_from_id(self.db_engine, retrained_model_id) self.retrained_matrix_uuid = matrix_uuid self.retrained_model_id = retrained_model_id - return {'retrain_model_comment': retrain_model_comment} + return {'retrain_model_comment': retrain_model_comment, 'retrained_model_id': retrained_model_id} def predict(self, prediction_date): """Predict forward by creating a matrix using as_of_date = prediction_date and applying the retrained model on it From 1c4da24ec74186b5161150aee6ab0570aa6abff5 Mon Sep 17 00:00:00 2001 From: tweddielin Date: Wed, 30 Jun 2021 22:45:57 -0400 Subject: [PATCH 41/52] update docs --- docs/sources/predictlist/index.md | 3 ++- src/tests/test_predictlist.py | 2 +- src/triage/predictlist/__init__.py | 6 +++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/sources/predictlist/index.md b/docs/sources/predictlist/index.md index e25e0f7cd..e7b615432 100644 --- a/docs/sources/predictlist/index.md +++ b/docs/sources/predictlist/index.md @@ -28,13 +28,14 @@ retrainer = Retrainer( ) retrainer.retrain(today='2021-04-04') retrainer.predict(today='2021-04-04') + ``` ## Output The retrained model is sotred similariy to the matrices created during an Experiment: - Raw Matrix saved to the matrices directory in project storage - Raw Model saved to the trained_model directory in project storage -- Retrained Model info saved in a table (triage_metadata.models) where model_comment = 'retrain_2021-04-04' +- Retrained Model info saved in a table (triage_metadata.models) where model_comment = 'retrain_2021-04-04 21:19:09.975112' - Predictions saved in a table (triage_production.predictions) - Prediction metadata (tiebreaking, random seed) saved in a table (triage_produciton.prediction_metadata) diff --git a/src/tests/test_predictlist.py b/src/tests/test_predictlist.py index 9deaaa5a0..3b903832f 100644 --- a/src/tests/test_predictlist.py +++ b/src/tests/test_predictlist.py @@ -86,7 +86,7 @@ def test_retrain_should_write_model(finished_experiment): model_group_id=model_group_id, ) retrain_info = retrainer.retrain(prediction_date) - model_comment = retrain_info['retrain_model_comment'] + model_comment = retrain_info['retrained_model_comment'] records = [ row diff --git a/src/triage/predictlist/__init__.py b/src/triage/predictlist/__init__.py index 65d989335..579c91b3b 100644 --- a/src/triage/predictlist/__init__.py +++ b/src/triage/predictlist/__init__.py @@ -398,14 +398,14 @@ def retrain(self, prediction_date): matrix_uuid=matrix_uuid, matrix_type="train", ) - retrain_model_comment = 'retrain_' + str(datetime.now()) + retrained_model_comment = 'retrain_' + str(datetime.now()) misc_db_parameters = { 'train_end_time': dt_from_str(as_of_date), 'test': False, 'train_matrix_uuid': matrix_uuid, 'training_label_timespan': self.training_label_timespan, - 'model_comment': retrain_model_comment, + 'model_comment': retrained_model_comment, } # get the random seed fromthe last split @@ -442,7 +442,7 @@ def retrain(self, 
prediction_date): self.retrained_model_hash = retrieve_model_hash_from_id(self.db_engine, retrained_model_id) self.retrained_matrix_uuid = matrix_uuid self.retrained_model_id = retrained_model_id - return {'retrain_model_comment': retrain_model_comment, 'retrained_model_id': retrained_model_id} + return {'retrained_model_comment': retrained_model_comment, 'retrained_model_id': retrained_model_id} def predict(self, prediction_date): """Predict forward by creating a matrix using as_of_date = prediction_date and applying the retrained model on it From 35bd97851785ddf36c5734bed42713468504066c Mon Sep 17 00:00:00 2001 From: tweddielin Date: Thu, 1 Jul 2021 02:07:19 -0400 Subject: [PATCH 42/52] use reconstructed feature dict --- src/triage/predictlist/__init__.py | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/src/triage/predictlist/__init__.py b/src/triage/predictlist/__init__.py index 579c91b3b..c0b728cb6 100644 --- a/src/triage/predictlist/__init__.py +++ b/src/triage/predictlist/__init__.py @@ -274,11 +274,9 @@ def get_feature_dict_and_imputation_task(self, collate_aggregations, model_id): (train_matrix_uuid, matrix_metadata) = train_matrix_info_from_model_id(self.db_engine, model_id) reconstructed_feature_dict = FeatureGroup() imputation_table_tasks = OrderedDict() - for aggregation in collate_aggregations: feature_group, feature_names = get_feature_names(aggregation, matrix_metadata) reconstructed_feature_dict[feature_group] = feature_names - # Make sure that the features imputed in training should also be imputed in production features_imputed_in_train = get_feature_needs_imputation_in_train(aggregation, feature_names) @@ -317,6 +315,7 @@ def retrain(self, prediction_date): 'as_of_times': chops_train_matrix['as_of_times'], 'training_label_timespan': chops_train_matrix['training_label_timespan'], 'max_training_history': chops_train_matrix['max_training_history'], + 'training_as_of_date_frequency': chops_train_matrix['training_as_of_date_frequency'], } as_of_date = datetime.strftime(chops_train_matrix['last_as_of_time'], "%Y-%m-%d") @@ -336,23 +335,16 @@ def retrain(self, prediction_date): self.feature_generator.process_table_tasks(feature_aggregation_table_tasks) # 4. Reconstruct feature disctionary from feature_names and generate imputation - feature_imputation_table_tasks = self.feature_generator.generate_all_table_tasks( - collate_aggregations, - task_type='imputation' - ) - self.feature_generator.process_table_tasks(feature_imputation_table_tasks) - - feature_dict = self.feature_dictionary_creator.feature_dictionary( - feature_table_names=feature_imputation_table_tasks.keys(), - index_column_lookup=self.feature_generator.index_column_lookup(collate_aggregations), + reconstructed_feature_dict, imputation_table_tasks = self.get_feature_dict_and_imputation_task( + collate_aggregations, + self.model_group_info['model_id_last_split'], ) - - feature_group_creator = FeatureGroupCreator({"all": [True]}) + feature_group_creator = FeatureGroupCreator(self.experiment_config['feature_group_definition']) feature_group_mixer = FeatureGroupMixer(["all"]) feature_group_dict = feature_group_mixer.generate( - feature_group_creator.subsets(feature_dict) + feature_group_creator.subsets(reconstructed_feature_dict) )[0] - + self.feature_generator.process_table_tasks(imputation_table_tasks) # 5. 
Build new matrix db_config = { "features_schema_name": "triage_production", @@ -375,7 +367,7 @@ def retrain(self, prediction_date): label_type='binary', cohort_name=self.cohort_name, matrix_type='train', - feature_start_time=self.feature_start_time, + feature_start_time=dt_from_str(self.feature_start_time), user_metadata=self.user_metadata, ) From 9f5a09947a699d0638339b234c7f312898e84dc9 Mon Sep 17 00:00:00 2001 From: tweddielin Date: Thu, 29 Jul 2021 12:46:34 -0400 Subject: [PATCH 43/52] add RetrainModel and Retrain --- .../component/catwalk/model_trainers.py | 5 +- .../component/results_schema/__init__.py | 4 + .../versions/5dd2ba8222b1_add_run_type.py | 48 +++++++++++ src/triage/component/results_schema/schema.py | 42 +++++++++- src/triage/predictlist/__init__.py | 79 ++++++++++++++++--- src/triage/predictlist/utils.py | 25 +++++- src/triage/tracking.py | 1 + 7 files changed, 191 insertions(+), 13 deletions(-) create mode 100644 src/triage/component/results_schema/alembic/versions/5dd2ba8222b1_add_run_type.py diff --git a/src/triage/component/catwalk/model_trainers.py b/src/triage/component/catwalk/model_trainers.py index c36823382..fbb12c427 100644 --- a/src/triage/component/catwalk/model_trainers.py +++ b/src/triage/component/catwalk/model_trainers.py @@ -218,11 +218,14 @@ def _write_model_to_db( return model_id else: if retrain: + logger.debug("Retrain model...") model = Model( model_group_id=model_group_id, model_hash=model_hash, model_type=class_path, hyperparameters=parameters, + built_by_retrain=self.experiment_hash, + built_in_triage_run=self.run_id, model_size=model_size, **misc_db_parameters, ) @@ -234,7 +237,7 @@ def _write_model_to_db( hyperparameters=parameters, model_group_id=model_group_id, built_by_experiment=self.experiment_hash, - built_in_experiment_run=self.run_id, + built_in_triage_run=self.run_id, model_size=model_size, **misc_db_parameters, ) diff --git a/src/triage/component/results_schema/__init__.py b/src/triage/component/results_schema/__init__.py index 97a8b27a9..20f92884e 100644 --- a/src/triage/component/results_schema/__init__.py +++ b/src/triage/component/results_schema/__init__.py @@ -13,12 +13,14 @@ from .schema import ( Base, Experiment, + Retrain, FeatureImportance, IndividualImportance, ListPrediction, ExperimentMatrix, Matrix, ExperimentModel, + RetrainModel, ExperimentRun, ExperimentRunStatus, Model, @@ -39,11 +41,13 @@ __all__ = ( "Base", "Experiment", + "Retrain", "FeatureImportance", "IndividualImportance", "ListPrediction", "ExperimentMatrix", "Matrix", + "RetrainModel", "ExperimentModel", "ExperimentRun", "ExperimentRunStatus", diff --git a/src/triage/component/results_schema/alembic/versions/5dd2ba8222b1_add_run_type.py b/src/triage/component/results_schema/alembic/versions/5dd2ba8222b1_add_run_type.py new file mode 100644 index 000000000..9653e2a0f --- /dev/null +++ b/src/triage/component/results_schema/alembic/versions/5dd2ba8222b1_add_run_type.py @@ -0,0 +1,48 @@ +"""add run_type + +Revision ID: 5dd2ba8222b1 +Revises: 079a74c15e8b +Create Date: 2021-07-22 23:53:04.043651 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. 
+revision = '5dd2ba8222b1' +down_revision = '079a74c15e8b' +branch_labels = None +depends_on = None + + +def upgrade(): + op.create_table('retrain', + sa.Column('retrain_hash', sa.Text(), nullable=False), + sa.Column('config', postgresql.JSONB(astext_type=sa.Text()), nullable=True), + sa.Column('prediction_date', sa.DateTime(), nullable=True), + sa.PrimaryKeyConstraint('retrain_hash'), + schema='triage_metadata', + ) + op.add_column('experiment_runs', sa.Column('run_type', sa.Text(), nullable=True), schema='triage_metadata') + op.add_column('experiment_runs', sa.Column('retrain_hash', sa.Text(), nullable=True), schema='triage_metadata') + op.alter_column('models', 'built_in_experiment_run', nullable=False, new_column_name='built_in_triage_run', schema='triage_metadata') + op.add_column('models', sa.Column('built_by_retrain', sa.Text(), nullable=True), schema='triage_metadata') + + op.create_table('retrain_models', + sa.Column('retrain_hash', sa.String(), nullable=False), + sa.Column('model_hash', sa.String(), nullable=False), + sa.ForeignKeyConstraint(['retrain_hash'], ['triage_metadata.retrain.retrain_hash'], ), + sa.PrimaryKeyConstraint('retrain_hash', 'model_hash'), + schema='triage_metadata' + ) + + +def downgrade(): + op.drop_column('experiment_runs', 'run_type', schema='triage_metadata') + op.drop_column('experiment_runs', 'retrain_hash', schema='triage_metadata') + op.drop_table('retrain_models', schema='triage_metadata') + op.drop_table('retrain', schema='triage_metadata') + op.drop_column('models', 'built_by_retrain', schema='triage_metadata') + op.alter_column('models', 'built_in_triage_run', nullable=False, new_column_name='built_in_experiment_run', schema='triage_metadata') + diff --git a/src/triage/component/results_schema/schema.py b/src/triage/component/results_schema/schema.py index 8f4a3b547..2d6cbc730 100644 --- a/src/triage/component/results_schema/schema.py +++ b/src/triage/component/results_schema/schema.py @@ -23,6 +23,7 @@ from sqlalchemy.orm import relationship from sqlalchemy.types import ARRAY, Enum from sqlalchemy.sql import func +from sqlalchemy.ext.hybrid import hybrid_property # One declarative_base object for each schema created Base = declarative_base() @@ -71,6 +72,15 @@ class Experiment(Base): models_needed = Column(Integer) +class Retrain(Base): + __tablename__ = "retrain" + __table_args__ = {"schema": "triage_metadata"} + + retrain_hash = Column(String, primary_key=True) + config = Column(JSONB) + prediction_date = Column(DateTime) + + class ExperimentRunStatus(enum.Enum): started = 1 completed = 2 @@ -78,6 +88,7 @@ class ExperimentRunStatus(enum.Enum): class ExperimentRun(Base): +# class TriageRun(Base): __tablename__ = "experiment_runs" __table_args__ = {"schema": "triage_metadata"} @@ -88,10 +99,15 @@ class ExperimentRun(Base): git_hash = Column(String) triage_version = Column(String) python_version = Column(String) + run_type = Column(String) experiment_hash = Column( String, ForeignKey("triage_metadata.experiments.experiment_hash") ) + retrain_hash = Column( + String, + ForeignKey("triage_metadata.retrain.retrain_hash") + ) platform = Column(Text) os_user = Column(Text) working_directory = Column(Text) @@ -113,6 +129,11 @@ class ExperimentRun(Base): stacktrace = Column(Text) random_seed = Column(Integer) experiment_rel = relationship("Experiment") + retrain_rel = relationship("Retrain") + + @hybrid_property + def external_hash(self): + return self.experiment_hash or self.retrain_hash class Subset(Base): @@ -225,8 +246,11 @@ class Model(Base): 
built_by_experiment = Column( String, ForeignKey("triage_metadata.experiments.experiment_hash") ) - built_in_experiment_run = Column( - Integer, ForeignKey("triage_metadata.experiment_runs.id") + built_by_retrain = Column( + String, ForeignKey("triage_metadata.retrain.retrain_hash") + ) + built_in_triage_run = Column( + Integer, ForeignKey("triage_metadata.experiment_runs.id"), nullable=True ) train_end_time = Column(DateTime) test = Column(Boolean) @@ -261,6 +285,20 @@ class ExperimentModel(Base): experiment_rel = relationship("Experiment") +class RetrainModel(Base): + __tablename__ = "retrain_models" + __table_args__ = {"schema": "triage_metadata"} + + retrain_hash = Column( + String, + ForeignKey("triage_metadata.retrain.retrain_hash"), + primary_key=True + ) + model_hash = Column(String, primary_key=True) + model_rel = relationship("Model", primaryjoin=(Model.model_hash == model_hash), foreign_keys=model_hash) + retrain_rel = relationship("Retrain") + + class FeatureImportance(Base): __tablename__ = "feature_importances" diff --git a/src/triage/predictlist/__init__.py b/src/triage/predictlist/__init__.py index c0b728cb6..d3e5f2458 100644 --- a/src/triage/predictlist/__init__.py +++ b/src/triage/predictlist/__init__.py @@ -1,4 +1,4 @@ -from triage.component.results_schema import upgrade_db +from triage.component.results_schema import upgrade_db, Retrain, ExperimentRun, ExperimentRunStatus from triage.component.architect.entity_date_table_generators import EntityDateTableGenerator, DEFAULT_ACTIVE_STATE from triage.component.architect.features import ( FeatureGenerator, @@ -15,9 +15,19 @@ from triage.component.catwalk import ModelTrainer from triage.component.catwalk.model_trainers import flatten_grid_config from triage.component.catwalk.predictors import Predictor -from triage.component.catwalk.utils import retrieve_model_hash_from_id, filename_friendly_hash +from triage.component.catwalk.utils import retrieve_model_hash_from_id, filename_friendly_hash, retrieve_experiment_seed_from_run_id from triage.util.conf import convert_str_to_relativedelta, dt_from_str -from triage.util.db import scoped_session +from triage.util.db import scoped_session, get_for_update +from triage.util.introspection import classpath +from triage.tracking import ( + infer_git_hash, + infer_ec2_instance_type, + infer_installed_libraries, + infer_python_version, + infer_triage_version, + infer_log_location, + +) from .utils import ( experiment_config_from_model_id, experiment_config_from_model_group_id, @@ -26,12 +36,17 @@ get_feature_names, get_feature_needs_imputation_in_train, get_feature_needs_imputation_in_production, + associate_models_with_retrain, + save_retrain_and_get_hash, ) from collections import OrderedDict import json import random +import platform +import getpass +import os from datetime import datetime import verboselogs, logging @@ -189,12 +204,13 @@ class Retrainer: model_group_id (string) """ def __init__(self, db_engine, project_path, model_group_id): + self.retrain_hash = None self.db_engine = db_engine upgrade_db(db_engine=self.db_engine) self.project_storage = ProjectStorage(project_path) self.model_group_id = model_group_id self.matrix_storage_engine = self.project_storage.matrix_storage_engine() - self.run_id, self.experiment_config = experiment_config_from_model_group_id(self.db_engine, self.model_group_id) + self.experiment_run_id, self.experiment_config = experiment_config_from_model_group_id(self.db_engine, self.model_group_id) self.training_label_timespan = 
self.experiment_config['temporal_config']['training_label_timespans'][0] self.test_label_timespan = self.experiment_config['temporal_config']['test_label_timespans'][0] self.feature_start_time=self.experiment_config['temporal_config']['feature_start_time'] @@ -229,7 +245,7 @@ def __init__(self, db_engine, project_path, model_group_id): model_storage_engine=ModelStorageEngine(self.project_storage), db_engine=self.db_engine, replace=True, - run_id=self.run_id, + run_id=self.experiment_run_id, ) def get_temporal_config_for_retrain(self, prediction_date): @@ -295,13 +311,24 @@ def get_feature_dict_and_imputation_task(self, collate_aggregations, model_id): ) ) return reconstructed_feature_dict, imputation_table_tasks - + def retrain(self, prediction_date): """Retrain a model by going back one split from prediction_date, so the as_of_date for training would be (prediction_date - training_label_timespan) Args: prediction_date(str) """ + + retrain_config = { + "model_group_id": self.model_group_id, + "prediction_date": prediction_date, + } + self.retrain_hash = save_retrain_and_get_hash(retrain_config, self.db_engine) + + with get_for_update(self.db_engine, Retrain, self.retrain_hash) as retrain: + retrain.prediction_date = prediction_date + + # Timechop prediction_date = dt_from_str(prediction_date) temporal_config = self.get_temporal_config_for_retrain(prediction_date) timechopper = Timechop(**temporal_config) @@ -319,11 +346,42 @@ def retrain(self, prediction_date): } as_of_date = datetime.strftime(chops_train_matrix['last_as_of_time'], "%Y-%m-%d") - cohort_table_name = f"triage_production.cohort_{self.experiment_config['cohort_config']['name']}_retrain" + # Set ExperimentRun + run = ExperimentRun( + start_time=datetime.now(), + git_hash=infer_git_hash(), + triage_version=infer_triage_version(), + python_version=infer_python_version(), + run_type="retrain", + retrain_hash=self.retrain_hash, + last_updated_time=datetime.now(), + current_status=ExperimentRunStatus.started, + installed_libraries=infer_installed_libraries(), + platform=platform.platform(), + os_user=getpass.getuser(), + working_directory=os.getcwd(), + ec2_instance_type=infer_ec2_instance_type(), + log_location=infer_log_location(), + experiment_class_path=classpath(self.__class__), + random_seed = retrieve_experiment_seed_from_run_id(self.db_engine, self.experiment_run_id), + ) + run_id = None + with scoped_session(self.db_engine) as session: + session.add(run) + session.commit() + run_id = run.run_id + if not run_id: + raise ValueError("Failed to retrieve run_id from saved row") + + # set ModelTrainer's run_id and experiment_hash for Retrain run + self.model_trainer.run_id = run_id + self.model_trainer.experiment_hash = self.retrain_hash + # 1. Generate all labels self.generate_all_labels(as_of_date) # 2. Generate cohort + cohort_table_name = f"triage_production.cohort_{self.experiment_config['cohort_config']['name']}_retrain" self.generate_entity_date_table(as_of_date, cohort_table_name) # 3. 
Generate feature aggregations @@ -400,7 +458,7 @@ def retrain(self, prediction_date): 'model_comment': retrained_model_comment, } - # get the random seed fromthe last split + # get the random seed from the last split last_split_train_matrix_uuid, last_split_matrix_metadata = train_matrix_info_from_model_id( self.db_engine, model_id=self.model_group_info['model_id_last_split'] @@ -420,6 +478,8 @@ def retrain(self, prediction_date): random_seed=random_seed, ) + associate_models_with_retrain(self.retrain_hash, (retrained_model_hash, ), self.db_engine) + retrained_model_id = self.model_trainer.process_train_task( matrix_store=self.matrix_storage_engine.get_store(matrix_uuid), class_path=self.model_group_info['model_type'], @@ -434,6 +494,8 @@ def retrain(self, prediction_date): self.retrained_model_hash = retrieve_model_hash_from_id(self.db_engine, retrained_model_id) self.retrained_matrix_uuid = matrix_uuid self.retrained_model_id = retrained_model_id + # import ipdb + # ipdb.set_trace() return {'retrained_model_comment': retrained_model_comment, 'retrained_model_id': retrained_model_id} def predict(self, prediction_date): @@ -524,5 +586,4 @@ def predict(self, prediction_date): misc_db_parameters={}, train_matrix_columns=self.matrix_storage_engine.get_store(self.retrained_matrix_uuid).columns(), ) - self.predict_matrix_uuid = matrix_uuid diff --git a/src/triage/predictlist/utils.py b/src/triage/predictlist/utils.py index 79508c4f2..5c93cdf92 100644 --- a/src/triage/predictlist/utils.py +++ b/src/triage/predictlist/utils.py @@ -1,4 +1,8 @@ +from triage.component.results_schema import RetrainModel, Retrain +from triage.component.catwalk.utils import db_retry, filename_friendly_hash + import re +from sqlalchemy.orm import sessionmaker import verboselogs, logging logger = verboselogs.VerboseLogger(__name__) @@ -34,7 +38,7 @@ def experiment_config_from_model_group_id(db_engine, model_group_id): join triage_metadata.models on (experiments.experiment_hash = models.built_by_experiment) join triage_metadata.experiment_runs - on (experiment_runs.id = models.built_in_experiment_run) + on (experiment_runs.id = models.built_in_triage_run) where model_group_id = %s order by experiment_runs.start_time desc ''' @@ -112,3 +116,22 @@ def get_feature_needs_imputation_in_production(aggregation, db_engine): return features_imputed_in_production +@db_retry +def associate_models_with_retrain(retrain_hash, model_hashes, db_engine): + session = sessionmaker(bind=db_engine)() + for model_hash in model_hashes: + session.merge(RetrainModel(retrain_hash=retrain_hash, model_hash=model_hash)) + session.commit() + session.close() + logger.spam("Associated models with retrain in database") + +@db_retry +def save_retrain_and_get_hash(config, db_engine): + retrain_hash = filename_friendly_hash(config) + session = sessionmaker(bind=db_engine)() + session.merge(Retrain(retrain_hash=retrain_hash, config=config)) + session.commit() + session.close() + return retrain_hash + + diff --git a/src/triage/tracking.py b/src/triage/tracking.py index 0803d6b5e..08f7b791e 100644 --- a/src/triage/tracking.py +++ b/src/triage/tracking.py @@ -120,6 +120,7 @@ def initialize_tracking_and_get_run_id( git_hash=infer_git_hash(), triage_version=infer_triage_version(), python_version=infer_python_version(), + run_type="experiment", experiment_hash=experiment_hash, last_updated_time=datetime.datetime.now(), current_status=ExperimentRunStatus.started, From ba8482253fac256edfc63de57d266f777032130e Mon Sep 17 00:00:00 2001 From: tweddielin Date: Thu, 29 
Jul 2021 14:41:21 -0400 Subject: [PATCH 44/52] remove break point --- src/triage/predictlist/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/triage/predictlist/__init__.py b/src/triage/predictlist/__init__.py index d3e5f2458..be684b444 100644 --- a/src/triage/predictlist/__init__.py +++ b/src/triage/predictlist/__init__.py @@ -494,8 +494,6 @@ def retrain(self, prediction_date): self.retrained_model_hash = retrieve_model_hash_from_id(self.db_engine, retrained_model_id) self.retrained_matrix_uuid = matrix_uuid self.retrained_model_id = retrained_model_id - # import ipdb - # ipdb.set_trace() return {'retrained_model_comment': retrained_model_comment, 'retrained_model_id': retrained_model_id} def predict(self, prediction_date): From 83e0f6653cc9e3d55c626fff97d4d54e543ae237 Mon Sep 17 00:00:00 2001 From: tweddielin Date: Sat, 21 Aug 2021 14:17:46 -0400 Subject: [PATCH 45/52] change experiment_runs to triage_runs --- .../catwalk_tests/test_model_trainers.py | 11 ++-- src/tests/results_tests/factories.py | 8 +-- src/tests/test_tracking_experiments.py | 33 +++++----- .../component/catwalk/model_trainers.py | 4 +- src/triage/component/catwalk/utils.py | 10 +-- .../component/results_schema/__init__.py | 8 +-- .../versions/5dd2ba8222b1_add_run_type.py | 16 +++-- src/triage/component/results_schema/schema.py | 65 +++++++++++-------- src/triage/predictlist/__init__.py | 14 ++-- src/triage/predictlist/utils.py | 19 +++--- src/triage/tracking.py | 40 ++++++------ 11 files changed, 124 insertions(+), 104 deletions(-) diff --git a/src/tests/catwalk_tests/test_model_trainers.py b/src/tests/catwalk_tests/test_model_trainers.py index 8e51c66c9..9b202cce0 100644 --- a/src/tests/catwalk_tests/test_model_trainers.py +++ b/src/tests/catwalk_tests/test_model_trainers.py @@ -60,7 +60,6 @@ def set_test_seed(): misc_db_parameters=dict(), matrix_store=get_matrix_store(project_storage), ) - # assert # 1. 
that the models and feature importances table entries are present records = [ @@ -286,11 +285,13 @@ def test_reuse_model_random_seeds(grid_config, default_model_trainer): def update_experiment_models(db_engine): sql = """ INSERT INTO triage_metadata.experiment_models(experiment_hash,model_hash) - SELECT m.built_by_experiment, m.model_hash - FROM triage_metadata.models m + SELECT er.run_hash, m.model_hash + FROM triage_metadata.models m + LEFT JOIN triage_metadata.triage_runs er + ON m.built_in_triage_run = er.id LEFT JOIN triage_metadata.experiment_models em - ON m.model_hash = em.model_hash - AND m.built_by_experiment = em.experiment_hash + ON m.model_hash = em.model_hash + AND er.run_hash = em.experiment_hash WHERE em.experiment_hash IS NULL """ db_engine.execute(sql) diff --git a/src/tests/results_tests/factories.py b/src/tests/results_tests/factories.py index 7b9c512ae..0b7cf2b2c 100644 --- a/src/tests/results_tests/factories.py +++ b/src/tests/results_tests/factories.py @@ -181,12 +181,12 @@ class Meta: matrix_uuid = factory.SelfAttribute("matrix_rel.matrix_uuid") -class ExperimentRunFactory(factory.alchemy.SQLAlchemyModelFactory): +class TriageRunFactory(factory.alchemy.SQLAlchemyModelFactory): class Meta: - model = schema.ExperimentRun + model = schema.TriageRun sqlalchemy_session = session - experiment_rel = factory.SubFactory(ExperimentFactory) + # experiment_rel = factory.SubFactory(ExperimentFactory) start_time = factory.fuzzy.FuzzyNaiveDateTime(datetime(2008, 1, 1)) start_method = "run" @@ -210,7 +210,7 @@ class Meta: models_skipped = 0 models_errored = 0 last_updated_time = factory.fuzzy.FuzzyNaiveDateTime(datetime(2008, 1, 1)) - current_status = schema.ExperimentRunStatus.started + current_status = schema.TriageRunStatus.started stacktrace = "" diff --git a/src/tests/test_tracking_experiments.py b/src/tests/test_tracking_experiments.py index 0fc2f064a..993560866 100644 --- a/src/tests/test_tracking_experiments.py +++ b/src/tests/test_tracking_experiments.py @@ -1,8 +1,8 @@ from tests.utils import sample_config, populate_source_data from triage.util.db import scoped_session from triage.experiments import MultiCoreExperiment, SingleThreadedExperiment -from triage.component.results_schema import ExperimentRun, ExperimentRunStatus -from tests.results_tests.factories import ExperimentFactory, ExperimentRunFactory, session as factory_session +from triage.component.results_schema import TriageRun, TriageRunStatus +from tests.results_tests.factories import ExperimentFactory, TriageRunFactory, session as factory_session from sqlalchemy.orm import Session import pytest import datetime @@ -30,9 +30,10 @@ def test_experiment_tracker(test_engine, project_path): project_path=project_path, n_processes=4, ) - experiment_run = Session(bind=test_engine).query(ExperimentRun).get(experiment.run_id) - assert experiment_run.current_status == ExperimentRunStatus.started - assert experiment_run.experiment_hash == experiment.experiment_hash + experiment_run = Session(bind=test_engine).query(TriageRun).get(experiment.run_id) + assert experiment_run.current_status == TriageRunStatus.started + assert experiment_run.run_hash == experiment.experiment_hash + assert experiment_run.run_type == 'experiment' assert experiment_run.experiment_class_path == 'triage.experiments.multicore.MultiCoreExperiment' assert experiment_run.platform assert experiment_run.os_user @@ -45,7 +46,7 @@ def test_experiment_tracker(test_engine, project_path): assert experiment_run.models_made == 0 experiment.run() - experiment_run = 
Session(bind=test_engine).query(ExperimentRun).get(experiment.run_id) + experiment_run = Session(bind=test_engine).query(TriageRun).get(experiment.run_id) assert experiment_run.start_method == "run" assert experiment_run.matrices_made == len(experiment.matrix_build_tasks) assert experiment_run.matrices_skipped == 0 @@ -57,7 +58,7 @@ def test_experiment_tracker(test_engine, project_path): assert isinstance(experiment_run.model_building_started, datetime.datetime) assert isinstance(experiment_run.last_updated_time, datetime.datetime) assert not experiment_run.stacktrace - assert experiment_run.current_status == ExperimentRunStatus.completed + assert experiment_run.current_status == TriageRunStatus.completed def test_experiment_tracker_exception(db_engine, project_path): @@ -71,8 +72,8 @@ def test_experiment_tracker_exception(db_engine, project_path): experiment.run() with scoped_session(db_engine) as session: - experiment_run = session.query(ExperimentRun).get(experiment.run_id) - assert experiment_run.current_status == ExperimentRunStatus.failed + experiment_run = session.query(TriageRun).get(experiment.run_id) + assert experiment_run.current_status == TriageRunStatus.failed assert isinstance(experiment_run.last_updated_time, datetime.datetime) assert experiment_run.stacktrace @@ -86,7 +87,7 @@ def test_experiment_tracker_in_parts(test_engine, project_path): experiment.generate_matrices() experiment.train_and_test_models() with scoped_session(test_engine) as session: - experiment_run = session.query(ExperimentRun).get(experiment.run_id) + experiment_run = session.query(TriageRun).get(experiment.run_id) assert experiment_run.start_method == "generate_matrices" @@ -103,8 +104,8 @@ def test_initialize_tracking_and_get_run_id(db_engine_with_results_schema): ) assert run_id with scoped_session(db_engine_with_results_schema) as session: - experiment_run = session.query(ExperimentRun).get(run_id) - assert experiment_run.experiment_hash == experiment_hash + experiment_run = session.query(TriageRun).get(run_id) + assert experiment_run.run_hash == experiment_hash assert experiment_run.experiment_class_path == 'mymodule.MyClassName' assert experiment_run.random_seed == 1234 assert experiment_run.experiment_kwargs == {'key': 'value'} @@ -119,7 +120,7 @@ def test_initialize_tracking_and_get_run_id(db_engine_with_results_schema): def test_get_run_for_update(db_engine_with_results_schema): - experiment_run = ExperimentRunFactory() + experiment_run = TriageRunFactory() factory_session.commit() with get_run_for_update( db_engine=db_engine_with_results_schema, @@ -128,16 +129,16 @@ def test_get_run_for_update(db_engine_with_results_schema): run_obj.stacktrace = "My stacktrace" with scoped_session(db_engine_with_results_schema) as session: - experiment_run_from_db = session.query(ExperimentRun).get(experiment_run.run_id) + experiment_run_from_db = session.query(TriageRun).get(experiment_run.run_id) assert experiment_run_from_db.stacktrace == "My stacktrace" def test_increment_field(db_engine_with_results_schema): - experiment_run = ExperimentRunFactory() + experiment_run = TriageRunFactory() factory_session.commit() increment_field('matrices_made', experiment_run.run_id, db_engine_with_results_schema) increment_field('matrices_made', experiment_run.run_id, db_engine_with_results_schema) with scoped_session(db_engine_with_results_schema) as session: - experiment_run_from_db = session.query(ExperimentRun).get(experiment_run.run_id) + experiment_run_from_db = session.query(TriageRun).get(experiment_run.run_id) 
assert experiment_run_from_db.matrices_made == 2 diff --git a/src/triage/component/catwalk/model_trainers.py b/src/triage/component/catwalk/model_trainers.py index fbb12c427..805c7e9a5 100644 --- a/src/triage/component/catwalk/model_trainers.py +++ b/src/triage/component/catwalk/model_trainers.py @@ -224,7 +224,7 @@ def _write_model_to_db( model_hash=model_hash, model_type=class_path, hyperparameters=parameters, - built_by_retrain=self.experiment_hash, + # built_by_retrain=self.experiment_hash, built_in_triage_run=self.run_id, model_size=model_size, **misc_db_parameters, @@ -236,7 +236,7 @@ def _write_model_to_db( model_type=class_path, hyperparameters=parameters, model_group_id=model_group_id, - built_by_experiment=self.experiment_hash, + # built_by_experiment=self.experiment_hash, built_in_triage_run=self.run_id, model_size=model_size, **misc_db_parameters, diff --git a/src/triage/component/catwalk/utils.py b/src/triage/component/catwalk/utils.py index 9f5f29294..24a78959c 100644 --- a/src/triage/component/catwalk/utils.py +++ b/src/triage/component/catwalk/utils.py @@ -24,7 +24,7 @@ Model, ExperimentMatrix, ExperimentModel, - ExperimentRun, + TriageRun, ) @@ -244,13 +244,13 @@ def retrieve_existing_model_random_seeds(db_engine, model_group_id, train_end_ti from {ExperimentModel.__table__.fullname} experiment_models join {Model.__table__.fullname} models on (experiment_models.model_hash = models.model_hash) - join {ExperimentRun.__table__.fullname} experiment_runs - on (experiment_models.experiment_hash = experiment_runs.experiment_hash) + join {TriageRun.__table__.fullname} triage_runs + on (experiment_models.experiment_hash = triage_runs.run_hash) where models.model_group_id = %s and models.train_end_time = %s and models.train_matrix_uuid = %s and models.training_label_timespan = %s - and experiment_runs.random_seed = %s + and triage_runs.random_seed = %s order by models.run_time DESC, random() """ return [row[0] for row in db_engine.execute(query, model_group_id, train_end_time, train_matrix_uuid, training_label_timespan, experiment_random_seed)] @@ -267,7 +267,7 @@ def retrieve_experiment_seed_from_run_id(db_engine, run_id): """ session = sessionmaker(bind=db_engine)() try: - return session.query(ExperimentRun).get(run_id).random_seed + return session.query(TriageRun).get(run_id).random_seed finally: session.close() diff --git a/src/triage/component/results_schema/__init__.py b/src/triage/component/results_schema/__init__.py index 20f92884e..3d02cb695 100644 --- a/src/triage/component/results_schema/__init__.py +++ b/src/triage/component/results_schema/__init__.py @@ -21,8 +21,8 @@ Matrix, ExperimentModel, RetrainModel, - ExperimentRun, - ExperimentRunStatus, + TriageRun, + TriageRunStatus, Model, ModelGroup, Subset, @@ -49,8 +49,8 @@ "Matrix", "RetrainModel", "ExperimentModel", - "ExperimentRun", - "ExperimentRunStatus", + "TriageRun", + "TriageRunStatus", "Model", "ModelGroup", "Subset", diff --git a/src/triage/component/results_schema/alembic/versions/5dd2ba8222b1_add_run_type.py b/src/triage/component/results_schema/alembic/versions/5dd2ba8222b1_add_run_type.py index 9653e2a0f..0a8bbaf45 100644 --- a/src/triage/component/results_schema/alembic/versions/5dd2ba8222b1_add_run_type.py +++ b/src/triage/component/results_schema/alembic/versions/5dd2ba8222b1_add_run_type.py @@ -17,6 +17,12 @@ def upgrade(): + op.add_column('experiment_runs', sa.Column('run_type', sa.Text(), nullable=True), schema='triage_metadata') + + op.add_column('experiment_runs', sa.Column('run_hash', sa.Text(), 
nullable=True), schema='triage_metadata') + op.drop_column('experiment_runs', 'experiment_hash', schema='triage_metadata') + op.execute("ALTER TABLE triage_metadata.experiment_runs RENAME TO triage_runs") + op.create_table('retrain', sa.Column('retrain_hash', sa.Text(), nullable=False), sa.Column('config', postgresql.JSONB(astext_type=sa.Text()), nullable=True), @@ -24,10 +30,9 @@ def upgrade(): sa.PrimaryKeyConstraint('retrain_hash'), schema='triage_metadata', ) - op.add_column('experiment_runs', sa.Column('run_type', sa.Text(), nullable=True), schema='triage_metadata') - op.add_column('experiment_runs', sa.Column('retrain_hash', sa.Text(), nullable=True), schema='triage_metadata') + op.alter_column('models', 'built_in_experiment_run', nullable=False, new_column_name='built_in_triage_run', schema='triage_metadata') - op.add_column('models', sa.Column('built_by_retrain', sa.Text(), nullable=True), schema='triage_metadata') + op.drop_column('models', 'built_by_experiment', schema='triage_metadata') op.create_table('retrain_models', sa.Column('retrain_hash', sa.String(), nullable=False), @@ -39,10 +44,11 @@ def upgrade(): def downgrade(): + op.execute("ALTER TABLE triage_metadata.triage_runs RENAME TO experiment_runs") op.drop_column('experiment_runs', 'run_type', schema='triage_metadata') - op.drop_column('experiment_runs', 'retrain_hash', schema='triage_metadata') + op.drop_column('experiment_runs', 'run_hash', schema='triage_metadata') op.drop_table('retrain_models', schema='triage_metadata') op.drop_table('retrain', schema='triage_metadata') - op.drop_column('models', 'built_by_retrain', schema='triage_metadata') + op.add_column('models', sa.Column('built_by_experiment', sa.Text(), nullable=True), schema='triage_metadata') op.alter_column('models', 'built_in_triage_run', nullable=False, new_column_name='built_in_experiment_run', schema='triage_metadata') diff --git a/src/triage/component/results_schema/schema.py b/src/triage/component/results_schema/schema.py index 2d6cbc730..bb646b70f 100644 --- a/src/triage/component/results_schema/schema.py +++ b/src/triage/component/results_schema/schema.py @@ -60,7 +60,11 @@ class Experiment(Base): __tablename__ = "experiments" __table_args__ = {"schema": "triage_metadata"} - experiment_hash = Column(String, primary_key=True) + experiment_hash = Column( + String, + # ForeignKey("triage_metadata.experiment_runs.run_hash"), + primary_key=True + ) config = Column(JSONB) time_splits = Column(Integer) as_of_times = Column(Integer) @@ -70,27 +74,33 @@ class Experiment(Base): matrices_needed = Column(Integer) grid_size = Column(Integer) models_needed = Column(Integer) + # run_rel = relationship("ExperimentRun") class Retrain(Base): __tablename__ = "retrain" __table_args__ = {"schema": "triage_metadata"} - retrain_hash = Column(String, primary_key=True) + retrain_hash = Column( + String, + # ForeignKey("triage_metadata.experiment_runs.run_hash"), + primary_key=True + ) config = Column(JSONB) prediction_date = Column(DateTime) + # run_rel = relationship("ExperimentRun") -class ExperimentRunStatus(enum.Enum): +class TriageRunStatus(enum.Enum): started = 1 completed = 2 failed = 3 -class ExperimentRun(Base): -# class TriageRun(Base): +# class ExperimentRun(Base): +class TriageRun(Base): - __tablename__ = "experiment_runs" + __tablename__ = "triage_runs" __table_args__ = {"schema": "triage_metadata"} run_id = Column("id", Integer, primary_key=True) @@ -100,14 +110,15 @@ class ExperimentRun(Base): triage_version = Column(String) python_version = Column(String) 
run_type = Column(String) - experiment_hash = Column( - String, - ForeignKey("triage_metadata.experiments.experiment_hash") - ) - retrain_hash = Column( - String, - ForeignKey("triage_metadata.retrain.retrain_hash") - ) + run_hash = Column(String) + # experiment_hash = Column( + # String, + # ForeignKey("triage_metadata.experiments.experiment_hash") + # ) + # retrain_hash = Column( + # String, + # ForeignKey("triage_metadata.retrain.retrain_hash") + # ) platform = Column(Text) os_user = Column(Text) working_directory = Column(Text) @@ -125,15 +136,15 @@ class ExperimentRun(Base): models_skipped = Column(Integer, default=0) models_errored = Column(Integer, default=0) last_updated_time = Column(DateTime, onupdate=datetime.datetime.now) - current_status = Column(Enum(ExperimentRunStatus)) + current_status = Column(Enum(TriageRunStatus)) stacktrace = Column(Text) random_seed = Column(Integer) - experiment_rel = relationship("Experiment") - retrain_rel = relationship("Retrain") + # experiment_rel = relationship("Experiment") + # retrain_rel = relationship("Retrain") - @hybrid_property - def external_hash(self): - return self.experiment_hash or self.retrain_hash + # @hybrid_property + # def external_hash(self): + # return self.experiment_hash or self.retrain_hash class Subset(Base): @@ -243,14 +254,14 @@ class Model(Base): model_comment = Column(Text) batch_comment = Column(Text) config = Column(JSON) - built_by_experiment = Column( - String, ForeignKey("triage_metadata.experiments.experiment_hash") - ) - built_by_retrain = Column( - String, ForeignKey("triage_metadata.retrain.retrain_hash") - ) + # built_by_experiment = Column( + # String, ForeignKey("triage_metadata.experiments.experiment_hash") + # ) + # built_by_retrain = Column( + # String, ForeignKey("triage_metadata.retrain.retrain_hash") + # ) built_in_triage_run = Column( - Integer, ForeignKey("triage_metadata.experiment_runs.id"), nullable=True + Integer, ForeignKey("triage_metadata.triage_runs.id"), nullable=True ) train_end_time = Column(DateTime) test = Column(Boolean) diff --git a/src/triage/predictlist/__init__.py b/src/triage/predictlist/__init__.py index be684b444..054d683cd 100644 --- a/src/triage/predictlist/__init__.py +++ b/src/triage/predictlist/__init__.py @@ -1,4 +1,4 @@ -from triage.component.results_schema import upgrade_db, Retrain, ExperimentRun, ExperimentRunStatus +from triage.component.results_schema import upgrade_db, Retrain, TriageRun, TriageRunStatus from triage.component.architect.entity_date_table_generators import EntityDateTableGenerator, DEFAULT_ACTIVE_STATE from triage.component.architect.features import ( FeatureGenerator, @@ -210,7 +210,7 @@ def __init__(self, db_engine, project_path, model_group_id): self.project_storage = ProjectStorage(project_path) self.model_group_id = model_group_id self.matrix_storage_engine = self.project_storage.matrix_storage_engine() - self.experiment_run_id, self.experiment_config = experiment_config_from_model_group_id(self.db_engine, self.model_group_id) + self.triage_run_id, self.experiment_config = experiment_config_from_model_group_id(self.db_engine, self.model_group_id) self.training_label_timespan = self.experiment_config['temporal_config']['training_label_timespans'][0] self.test_label_timespan = self.experiment_config['temporal_config']['test_label_timespans'][0] self.feature_start_time=self.experiment_config['temporal_config']['feature_start_time'] @@ -245,7 +245,7 @@ def __init__(self, db_engine, project_path, model_group_id): 
model_storage_engine=ModelStorageEngine(self.project_storage), db_engine=self.db_engine, replace=True, - run_id=self.experiment_run_id, + run_id=self.triage_run_id, ) def get_temporal_config_for_retrain(self, prediction_date): @@ -347,15 +347,15 @@ def retrain(self, prediction_date): as_of_date = datetime.strftime(chops_train_matrix['last_as_of_time'], "%Y-%m-%d") # Set ExperimentRun - run = ExperimentRun( + run = TriageRun( start_time=datetime.now(), git_hash=infer_git_hash(), triage_version=infer_triage_version(), python_version=infer_python_version(), run_type="retrain", - retrain_hash=self.retrain_hash, + run_hash=self.retrain_hash, last_updated_time=datetime.now(), - current_status=ExperimentRunStatus.started, + current_status=TriageRunStatus.started, installed_libraries=infer_installed_libraries(), platform=platform.platform(), os_user=getpass.getuser(), @@ -363,7 +363,7 @@ def retrain(self, prediction_date): ec2_instance_type=infer_ec2_instance_type(), log_location=infer_log_location(), experiment_class_path=classpath(self.__class__), - random_seed = retrieve_experiment_seed_from_run_id(self.db_engine, self.experiment_run_id), + random_seed = retrieve_experiment_seed_from_run_id(self.db_engine, self.triage_run_id), ) run_id = None with scoped_session(self.db_engine) as session: diff --git a/src/triage/predictlist/utils.py b/src/triage/predictlist/utils.py index 5c93cdf92..6a1f523db 100644 --- a/src/triage/predictlist/utils.py +++ b/src/triage/predictlist/utils.py @@ -16,8 +16,9 @@ def experiment_config_from_model_id(db_engine, model_id): Returns: (dict) experiment config """ get_experiment_query = '''select experiments.config - from triage_metadata.experiments - join triage_metadata.models on (experiments.experiment_hash = models.built_by_experiment) + from triage_metadata.triage_runs + join triage_metadata.models on (triage_runs.id = models.built_in_triage_run) + join triage_metadata.experiments on (experiments.experiment_hash = triage_runs.run_hash) where model_id = %s ''' (config,) = db_engine.execute(get_experiment_query, model_id).first() @@ -33,14 +34,14 @@ def experiment_config_from_model_group_id(db_engine, model_group_id): Returns: (dict) experiment config """ get_experiment_query = ''' - select experiment_runs.id as run_id, experiments.config - from triage_metadata.experiments - join triage_metadata.models - on (experiments.experiment_hash = models.built_by_experiment) - join triage_metadata.experiment_runs - on (experiment_runs.id = models.built_in_triage_run) + select triage_runs.id as run_id, experiments.config + from triage_metadata.triage_runs + join triage_metadata.models + on (triage_runs.id = models.built_in_triage_run) + join triage_metadata.experiments + on (experiments.experiment_hash = triage_runs.run_hash) where model_group_id = %s - order by experiment_runs.start_time desc + order by triage_runs.start_time desc ''' (run_id, config) = db_engine.execute(get_experiment_query, model_group_id).first() return run_id, config diff --git a/src/triage/tracking.py b/src/triage/tracking.py index 08f7b791e..380c1507b 100644 --- a/src/triage/tracking.py +++ b/src/triage/tracking.py @@ -21,7 +21,7 @@ pip_freeze = None -from triage.component.results_schema import ExperimentRun, ExperimentRunStatus +from triage.component.results_schema import TriageRun, TriageRunStatus def infer_git_hash(): @@ -100,7 +100,7 @@ def initialize_tracking_and_get_run_id( experiment_kwargs, db_engine ): - """Create a row in the ExperimentRun table with some initial info and return the created 
run_id
+    """Create a row in the TriageRun table with some initial info and return the created run_id

     Args:
         experiment_hash (str) An experiment hash that exists in the experiments table
@@ -115,15 +115,15 @@ def initialize_tracking_and_get_run_id(
         k: (classpath(v) if isinstance(v, type) else v)
         for k, v in experiment_kwargs.items()
     }
-    run = ExperimentRun(
+    run = TriageRun(
         start_time=datetime.datetime.now(),
         git_hash=infer_git_hash(),
         triage_version=infer_triage_version(),
         python_version=infer_python_version(),
         run_type="experiment",
-        experiment_hash=experiment_hash,
+        run_hash=experiment_hash,
         last_updated_time=datetime.datetime.now(),
-        current_status=ExperimentRunStatus.started,
+        current_status=TriageRunStatus.started,
         installed_libraries=infer_installed_libraries(),
         platform=platform.platform(),
         os_user=getpass.getuser(),
@@ -145,7 +145,7 @@ def initialize_tracking_and_get_run_id(


 def get_run_for_update(db_engine, run_id):
-    """Yields an ExperimentRun at the given run_id for update
+    """Yields a TriageRun at the given run_id for update

     Will kick the last_update_time timestamp of the row each time.

@@ -153,7 +153,7 @@ def get_run_for_update(db_engine, run_id):
         db_engine (sqlalchemy.engine)
         run_id (int) The identifier/primary key of the run
     """
-    return get_for_update(db_engine, ExperimentRun, run_id)
+    return get_for_update(db_engine, TriageRun, run_id)


 def experiment_entrypoint(entrypoint_func):
@@ -162,8 +162,8 @@ def experiment_entrypoint(entrypoint_func):
     To update the database, it requires the instance of the wrapped method to have a db_engine
     and run_id.

-    Upon method entry, will update the ExperimentRun row with the wrapped method name.
-    Upon method exit, will update the ExperimentRun row with the status (either failed or completed)
+    Upon method entry, will update the TriageRun row with the wrapped method name.
+    Upon method exit, will update the TriageRun row with the status (either failed or completed)
     """
     @wraps(entrypoint_func)
     def with_entrypoint(self, *args, **kwargs):
@@ -175,12 +175,12 @@ def with_entrypoint(self, *args, **kwargs):
             return_value = entrypoint_func(self, *args, **kwargs)
         except Exception as exc:
             with get_run_for_update(self.db_engine, self.run_id) as run_obj:
-                run_obj.current_status = ExperimentRunStatus.failed
+                run_obj.current_status = TriageRunStatus.failed
                 run_obj.stacktrace = str(exc)
             raise exc

         with get_run_for_update(self.db_engine, self.run_id) as run_obj:
-            run_obj.current_status = ExperimentRunStatus.completed
+            run_obj.current_status = TriageRunStatus.completed

         return return_value

@@ -188,7 +188,7 @@ def with_entrypoint(self, *args, **kwargs):


 def increment_field(field, run_id, db_engine):
-    """Increment an ExperimentRun's named field.
+    """Increment a TriageRun's named field.

     Expects that the field is an integer in the database.
@@ -202,8 +202,8 @@ def increment_field(field, run_id, db_engine): with scoped_session(db_engine) as session: # Use an update query instead of a session merge so it happens in one atomic query # and protect against race conditions - session.query(ExperimentRun).filter_by(run_id=run_id).update({ - field: getattr(ExperimentRun, field) + 1, + session.query(TriageRun).filter_by(run_id=run_id).update({ + field: getattr(TriageRun, field) + 1, 'last_updated_time': datetime.datetime.now() }) @@ -231,7 +231,7 @@ def record_model_building_started(run_id, db_engine): def built_matrix(run_id, db_engine): - """Increment the matrix build counter for the ExperimentRun + """Increment the matrix build counter for the TriageRun Args: run_id (int) The identifier/primary key of the run @@ -241,7 +241,7 @@ def built_matrix(run_id, db_engine): def skipped_matrix(run_id, db_engine): - """Increment the matrix skip counter for the ExperimentRun + """Increment the matrix skip counter for the TriageRun Args: run_id (int) The identifier/primary key of the run @@ -251,7 +251,7 @@ def skipped_matrix(run_id, db_engine): def errored_matrix(run_id, db_engine): - """Increment the matrix error counter for the ExperimentRun + """Increment the matrix error counter for the TriageRun Args: run_id (int) The identifier/primary key of the run @@ -261,7 +261,7 @@ def errored_matrix(run_id, db_engine): def built_model(run_id, db_engine): - """Increment the model build counter for the ExperimentRun + """Increment the model build counter for the TriageRun Args: run_id (int) The identifier/primary key of the run @@ -271,7 +271,7 @@ def built_model(run_id, db_engine): def skipped_model(run_id, db_engine): - """Increment the model skip counter for the ExperimentRun + """Increment the model skip counter for the TriageRun Args: run_id (int) The identifier/primary key of the run @@ -281,7 +281,7 @@ def skipped_model(run_id, db_engine): def errored_model(run_id, db_engine): - """Increment the model error counter for the ExperimentRun + """Increment the model error counter for the TriageRun Args: run_id (int) The identifier/primary key of the run From d6f14f5da6b5ffa612ff5e18fdd623e2f60492f6 Mon Sep 17 00:00:00 2001 From: tweddielin Date: Sat, 21 Aug 2021 21:15:46 -0400 Subject: [PATCH 46/52] get retrain_config --- src/tests/test_predictlist.py | 4 +- src/triage/component/results_schema/schema.py | 25 --------- src/triage/predictlist/__init__.py | 54 +++++++++++-------- src/triage/predictlist/utils.py | 12 +++++ 4 files changed, 45 insertions(+), 50 deletions(-) diff --git a/src/tests/test_predictlist.py b/src/tests/test_predictlist.py index 3b903832f..9c95abece 100644 --- a/src/tests/test_predictlist.py +++ b/src/tests/test_predictlist.py @@ -86,7 +86,7 @@ def test_retrain_should_write_model(finished_experiment): model_group_id=model_group_id, ) retrain_info = retrainer.retrain(prediction_date) - model_comment = retrain_info['retrained_model_comment'] + model_comment = retrain_info['retrain_model_comment'] records = [ row @@ -95,7 +95,7 @@ def test_retrain_should_write_model(finished_experiment): ) ] assert len(records) == 1 - assert retrainer.retrained_model_hash == records[0][0] + assert retrainer.retrain_model_hash == records[0][0] retrainer.predict(prediction_date) diff --git a/src/triage/component/results_schema/schema.py b/src/triage/component/results_schema/schema.py index bb646b70f..08694e298 100644 --- a/src/triage/component/results_schema/schema.py +++ b/src/triage/component/results_schema/schema.py @@ -62,7 +62,6 @@ class 
Experiment(Base): experiment_hash = Column( String, - # ForeignKey("triage_metadata.experiment_runs.run_hash"), primary_key=True ) config = Column(JSONB) @@ -74,7 +73,6 @@ class Experiment(Base): matrices_needed = Column(Integer) grid_size = Column(Integer) models_needed = Column(Integer) - # run_rel = relationship("ExperimentRun") class Retrain(Base): @@ -83,12 +81,10 @@ class Retrain(Base): retrain_hash = Column( String, - # ForeignKey("triage_metadata.experiment_runs.run_hash"), primary_key=True ) config = Column(JSONB) prediction_date = Column(DateTime) - # run_rel = relationship("ExperimentRun") class TriageRunStatus(enum.Enum): @@ -97,7 +93,6 @@ class TriageRunStatus(enum.Enum): failed = 3 -# class ExperimentRun(Base): class TriageRun(Base): __tablename__ = "triage_runs" @@ -111,14 +106,6 @@ class TriageRun(Base): python_version = Column(String) run_type = Column(String) run_hash = Column(String) - # experiment_hash = Column( - # String, - # ForeignKey("triage_metadata.experiments.experiment_hash") - # ) - # retrain_hash = Column( - # String, - # ForeignKey("triage_metadata.retrain.retrain_hash") - # ) platform = Column(Text) os_user = Column(Text) working_directory = Column(Text) @@ -139,13 +126,7 @@ class TriageRun(Base): current_status = Column(Enum(TriageRunStatus)) stacktrace = Column(Text) random_seed = Column(Integer) - # experiment_rel = relationship("Experiment") - # retrain_rel = relationship("Retrain") - # @hybrid_property - # def external_hash(self): - # return self.experiment_hash or self.retrain_hash - class Subset(Base): @@ -254,12 +235,6 @@ class Model(Base): model_comment = Column(Text) batch_comment = Column(Text) config = Column(JSON) - # built_by_experiment = Column( - # String, ForeignKey("triage_metadata.experiments.experiment_hash") - # ) - # built_by_retrain = Column( - # String, ForeignKey("triage_metadata.retrain.retrain_hash") - # ) built_in_triage_run = Column( Integer, ForeignKey("triage_metadata.triage_runs.id"), nullable=True ) diff --git a/src/triage/predictlist/__init__.py b/src/triage/predictlist/__init__.py index 054d683cd..9e7d5362d 100644 --- a/src/triage/predictlist/__init__.py +++ b/src/triage/predictlist/__init__.py @@ -38,6 +38,7 @@ get_feature_needs_imputation_in_production, associate_models_with_retrain, save_retrain_and_get_hash, + get_retrain_config_from_model_id, ) @@ -213,6 +214,7 @@ def __init__(self, db_engine, project_path, model_group_id): self.triage_run_id, self.experiment_config = experiment_config_from_model_group_id(self.db_engine, self.model_group_id) self.training_label_timespan = self.experiment_config['temporal_config']['training_label_timespans'][0] self.test_label_timespan = self.experiment_config['temporal_config']['test_label_timespans'][0] + self.test_duration = self.experiment_config['temporal_config']['test_durations'][0] self.feature_start_time=self.experiment_config['temporal_config']['feature_start_time'] self.label_name = self.experiment_config['label_config']['name'] self.cohort_name = self.experiment_config['cohort_config']['name'] @@ -318,16 +320,20 @@ def retrain(self, prediction_date): Args: prediction_date(str) """ - + # Retrain config and hash retrain_config = { "model_group_id": self.model_group_id, "prediction_date": prediction_date, + "test_label_timespan": self.test_label_timespan, + "test_duration": self.test_duration, + } self.retrain_hash = save_retrain_and_get_hash(retrain_config, self.db_engine) with get_for_update(self.db_engine, Retrain, self.retrain_hash) as retrain: retrain.prediction_date = 
prediction_date + # Timechop prediction_date = dt_from_str(prediction_date) temporal_config = self.get_temporal_config_for_retrain(prediction_date) @@ -335,17 +341,17 @@ def retrain(self, prediction_date): chops = timechopper.chop_time() assert len(chops) == 1 chops_train_matrix = chops[0]['train_matrix'] + as_of_date = datetime.strftime(chops_train_matrix['last_as_of_time'], "%Y-%m-%d") retrain_definition = { 'first_as_of_time': chops_train_matrix['first_as_of_time'], 'last_as_of_time': chops_train_matrix['last_as_of_time'], 'matrix_info_end_time': chops_train_matrix['matrix_info_end_time'], - 'as_of_times': chops_train_matrix['as_of_times'], + 'as_of_times': [as_of_date], 'training_label_timespan': chops_train_matrix['training_label_timespan'], 'max_training_history': chops_train_matrix['max_training_history'], 'training_as_of_date_frequency': chops_train_matrix['training_as_of_date_frequency'], } - as_of_date = datetime.strftime(chops_train_matrix['last_as_of_time'], "%Y-%m-%d") - + # Set ExperimentRun run = TriageRun( start_time=datetime.now(), @@ -448,14 +454,14 @@ def retrain(self, prediction_date): matrix_uuid=matrix_uuid, matrix_type="train", ) - retrained_model_comment = 'retrain_' + str(datetime.now()) + retrain_model_comment = 'retrain_' + str(datetime.now()) misc_db_parameters = { 'train_end_time': dt_from_str(as_of_date), 'test': False, 'train_matrix_uuid': matrix_uuid, 'training_label_timespan': self.training_label_timespan, - 'model_comment': retrained_model_comment, + 'model_comment': retrain_model_comment, } # get the random seed from the last split @@ -470,34 +476,34 @@ def retrain(self, prediction_date): train_matrix_uuid=last_split_train_matrix_uuid ) - # create retrained model hash - retrained_model_hash = self.model_trainer._model_hash( + # create retrain model hash + retrain_model_hash = self.model_trainer._model_hash( self.matrix_storage_engine.get_store(matrix_uuid).metadata, class_path=self.model_group_info['model_type'], parameters=self.model_group_info['hyperparameters'], random_seed=random_seed, ) - associate_models_with_retrain(self.retrain_hash, (retrained_model_hash, ), self.db_engine) + associate_models_with_retrain(self.retrain_hash, (retrain_model_hash, ), self.db_engine) - retrained_model_id = self.model_trainer.process_train_task( + retrain_model_id = self.model_trainer.process_train_task( matrix_store=self.matrix_storage_engine.get_store(matrix_uuid), class_path=self.model_group_info['model_type'], parameters=self.model_group_info['hyperparameters'], - model_hash=retrained_model_hash, + model_hash=retrain_model_hash, misc_db_parameters=misc_db_parameters, random_seed=random_seed, retrain=True, model_group_id=self.model_group_id ) - self.retrained_model_hash = retrieve_model_hash_from_id(self.db_engine, retrained_model_id) - self.retrained_matrix_uuid = matrix_uuid - self.retrained_model_id = retrained_model_id - return {'retrained_model_comment': retrained_model_comment, 'retrained_model_id': retrained_model_id} + self.retrain_model_hash = retrieve_model_hash_from_id(self.db_engine, retrain_model_id) + self.retrain_matrix_uuid = matrix_uuid + self.retrain_model_id = retrain_model_id + return {'retrain_model_comment': retrain_model_comment, 'retrain_model_id': retrain_model_id} def predict(self, prediction_date): - """Predict forward by creating a matrix using as_of_date = prediction_date and applying the retrained model on it + """Predict forward by creating a matrix using as_of_date = prediction_date and applying the retrain model on it Args: 
prediction_date(str) @@ -518,7 +524,7 @@ def predict(self, prediction_date): # 3. Reconstruct feature disctionary from feature_names and generate imputation reconstructed_feature_dict, imputation_table_tasks = self.get_feature_dict_and_imputation_task( collate_aggregations, - self.retrained_model_id + self.retrain_model_id ) self.feature_generator.process_table_tasks(imputation_table_tasks) @@ -537,13 +543,15 @@ def predict(self, prediction_date): replace=True, ) # Use timechop to get the time definition for production - # temporal_config = self.experiment_config["temporal_config"] temporal_config = self.get_temporal_config_for_retrain(dt_from_str(prediction_date)) timechopper = Timechop(**temporal_config) + + retrain_config = get_retrain_config_from_model_id(self.db_engine, self.retrain_model_id) + prod_definitions = timechopper.define_test_matrices( train_test_split_time=dt_from_str(prediction_date), - test_duration=temporal_config['test_durations'][0], - test_label_timespan=temporal_config['test_label_timespans'][0] + test_duration=retrain_config['test_duration'], + test_label_timespan=retrain_config['test_label_timespan'] ) last_split_definition = prod_definitions[-1] matrix_metadata = Planner.make_metadata( @@ -557,7 +565,7 @@ def predict(self, prediction_date): user_metadata=self.user_metadata, ) - matrix_metadata['matrix_id'] = str(prediction_date) + f'_model_id_{self.retrained_model_id}' + '_risklist' + matrix_metadata['matrix_id'] = str(prediction_date) + f'_model_id_{self.retrain_model_id}' + '_risklist' matrix_uuid = filename_friendly_hash(matrix_metadata) @@ -579,9 +587,9 @@ def predict(self, prediction_date): ) predictor.predict( - model_id=self.retrained_model_id, + model_id=self.retrain_model_id, matrix_store=self.matrix_storage_engine.get_store(matrix_uuid), misc_db_parameters={}, - train_matrix_columns=self.matrix_storage_engine.get_store(self.retrained_matrix_uuid).columns(), + train_matrix_columns=self.matrix_storage_engine.get_store(self.retrain_matrix_uuid).columns(), ) self.predict_matrix_uuid = matrix_uuid diff --git a/src/triage/predictlist/utils.py b/src/triage/predictlist/utils.py index 6a1f523db..126870b3b 100644 --- a/src/triage/predictlist/utils.py +++ b/src/triage/predictlist/utils.py @@ -117,6 +117,18 @@ def get_feature_needs_imputation_in_production(aggregation, db_engine): return features_imputed_in_production +def get_retrain_config_from_model_id(db_engine, model_id): + query = """ + SELECT re.config FROM triage_metadata.models m + LEFT JOIN triage_metadata.triage_runs r ON m.built_in_triage_run = r.id + LEFT JOIN triage_metadata.retrain re on re.retrain_hash = r.run_hash + WHERE m.model_id = %s; + """ + + (config,) = db_engine.execute(query, model_id).first() + return config + + @db_retry def associate_models_with_retrain(retrain_hash, model_hashes, db_engine): session = sessionmaker(bind=db_engine)() From d76359b24fef9cf6d6053487fa14b7fe195bd186 Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Thu, 26 Aug 2021 19:14:49 -0400 Subject: [PATCH 47/52] explicitly include run_type in joins to triage_runs --- src/triage/predictlist/utils.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/triage/predictlist/utils.py b/src/triage/predictlist/utils.py index 126870b3b..6b74c35cb 100644 --- a/src/triage/predictlist/utils.py +++ b/src/triage/predictlist/utils.py @@ -18,7 +18,8 @@ def experiment_config_from_model_id(db_engine, model_id): get_experiment_query = '''select experiments.config from triage_metadata.triage_runs join 
triage_metadata.models on (triage_runs.id = models.built_in_triage_run) - join triage_metadata.experiments on (experiments.experiment_hash = triage_runs.run_hash) + join triage_metadata.experiments + on (experiments.experiment_hash = triage_runs.run_hash and triage_runs.run_type='experiment') where model_id = %s ''' (config,) = db_engine.execute(get_experiment_query, model_id).first() @@ -39,7 +40,7 @@ def experiment_config_from_model_group_id(db_engine, model_group_id): join triage_metadata.models on (triage_runs.id = models.built_in_triage_run) join triage_metadata.experiments - on (experiments.experiment_hash = triage_runs.run_hash) + on (experiments.experiment_hash = triage_runs.run_hash and triage_runs.run_type='experiment') where model_group_id = %s order by triage_runs.start_time desc ''' @@ -120,8 +121,10 @@ def get_feature_needs_imputation_in_production(aggregation, db_engine): def get_retrain_config_from_model_id(db_engine, model_id): query = """ SELECT re.config FROM triage_metadata.models m - LEFT JOIN triage_metadata.triage_runs r ON m.built_in_triage_run = r.id - LEFT JOIN triage_metadata.retrain re on re.retrain_hash = r.run_hash + LEFT JOIN triage_metadata.triage_runs r + ON m.built_in_triage_run = r.id + LEFT JOIN triage_metadata.retrain re + ON (re.retrain_hash = r.run_hash and r.run_type='retrain') WHERE m.model_id = %s; """ From 9698500043b9a0a2012e368b31e6f3f50c53984b Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Thu, 26 Aug 2021 19:24:11 -0400 Subject: [PATCH 48/52] DB migration updates --- .../alembic/versions/5dd2ba8222b1_add_run_type.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/triage/component/results_schema/alembic/versions/5dd2ba8222b1_add_run_type.py b/src/triage/component/results_schema/alembic/versions/5dd2ba8222b1_add_run_type.py index 0a8bbaf45..745057748 100644 --- a/src/triage/component/results_schema/alembic/versions/5dd2ba8222b1_add_run_type.py +++ b/src/triage/component/results_schema/alembic/versions/5dd2ba8222b1_add_run_type.py @@ -18,9 +18,10 @@ def upgrade(): op.add_column('experiment_runs', sa.Column('run_type', sa.Text(), nullable=True), schema='triage_metadata') + op.execute("UPATE triage_metadata.experiment_runs SET run_type='experiment' WHERE run_type IS NULL") + + op.alter_column('experiment_runs', 'experiment_hash', nullable=True, new_column_name='run_hash', schema='triage_metadata') - op.add_column('experiment_runs', sa.Column('run_hash', sa.Text(), nullable=True), schema='triage_metadata') - op.drop_column('experiment_runs', 'experiment_hash', schema='triage_metadata') op.execute("ALTER TABLE triage_metadata.experiment_runs RENAME TO triage_runs") op.create_table('retrain', @@ -32,6 +33,7 @@ def upgrade(): ) op.alter_column('models', 'built_in_experiment_run', nullable=False, new_column_name='built_in_triage_run', schema='triage_metadata') + op.execute("CREATE TABLE triage_metadata.deprecated_models_built_by_experiment AS SELECT model_id, model_hash, built_by_experiment FROM triage_metadata.models") op.drop_column('models', 'built_by_experiment', schema='triage_metadata') op.create_table('retrain_models', @@ -46,7 +48,7 @@ def upgrade(): def downgrade(): op.execute("ALTER TABLE triage_metadata.triage_runs RENAME TO experiment_runs") op.drop_column('experiment_runs', 'run_type', schema='triage_metadata') - op.drop_column('experiment_runs', 'run_hash', schema='triage_metadata') + op.alter_column('experiment_runs', 'run_hash', nullable=True, new_column_name='experiment_hash', schema='triage_metadata') 
schema='triage_metadata')
     op.drop_table('retrain_models', schema='triage_metadata')
     op.drop_table('retrain', schema='triage_metadata')
     op.add_column('models', sa.Column('built_by_experiment', sa.Text(), nullable=True), schema='triage_metadata')

From a8a29f1b20e8f09ecb99c3a1576b0f54b56fe7c2 Mon Sep 17 00:00:00 2001
From: Kit Rodolfa 
Date: Thu, 26 Aug 2021 19:27:44 -0400
Subject: [PATCH 49/52] update argument name in docs

---
 docs/sources/predictlist/index.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/sources/predictlist/index.md b/docs/sources/predictlist/index.md
index e7b615432..c5abcacd2 100644
--- a/docs/sources/predictlist/index.md
+++ b/docs/sources/predictlist/index.md
@@ -4,10 +4,10 @@ Use an existing model group to retrain a new model on all the data up to the cur
 ## Examples
 Both examples assume you have already run a Triage Experiment in the past, and know these two pieces of information:
 1. A `model_group_id` from a Triage model group that you want to use to retrain a model and generate prediction
-2. A `today` to generate your predictions on.
+2. A `prediction_date` to generate your predictions on.
 
 ### CLI
-`triage retrainpredict <model_group_id> <today>`
+`triage retrainpredict <model_group_id> <prediction_date>`
 
 Example:
 `triage retrainpredict 30 2021-04-04`
@@ -26,8 +26,8 @@ retrainer = Retrainer(
     project_path='/home/you/triage/project2'
     model_group_id=36,
 )
-retrainer.retrain(today='2021-04-04')
-retrainer.predict(today='2021-04-04')
+retrainer.retrain(prediction_date='2021-04-04')
+retrainer.predict(prediction_date='2021-04-04')
 ```


From 694edcc0a3c197d4112d3d671b51a28a930dc9be Mon Sep 17 00:00:00 2001
From: Kit Rodolfa 
Date: Thu, 26 Aug 2021 21:04:05 -0400
Subject: [PATCH 50/52] ensure correct temporal config is used for predicting
 forward

---
 src/triage/predictlist/__init__.py | 17 +++++++++-
 src/triage/predictlist/utils.py | 54 ++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+), 1 deletion(-)

diff --git a/src/triage/predictlist/__init__.py b/src/triage/predictlist/__init__.py
index 9e7d5362d..0d0c8d010 100644
--- a/src/triage/predictlist/__init__.py
+++ b/src/triage/predictlist/__init__.py
@@ -39,6 +39,7 @@
     associate_models_with_retrain,
     save_retrain_and_get_hash,
     get_retrain_config_from_model_id,
+    temporal_params_from_matrix_metadata,
 )


@@ -150,6 +151,7 @@ def predict_forward_with_existed_model(db_engine, project_path, model_id, as_of_

     # Use timechop to get the time definition for production
     temporal_config = experiment_config["temporal_config"]
+    temporal_config.update(temporal_params_from_matrix_metadata(db_engine, model_id))
     timechopper = Timechop(**temporal_config)
     prod_definitions = timechopper.define_test_matrices(
         train_test_split_time=dt_from_str(as_of_date),
@@ -210,16 +212,29 @@ def __init__(self, db_engine, project_path, model_group_id):
         upgrade_db(db_engine=self.db_engine)
         self.project_storage = ProjectStorage(project_path)
         self.model_group_id = model_group_id
+        self.model_group_info = get_model_group_info(self.db_engine, self.model_group_id)
         self.matrix_storage_engine = self.project_storage.matrix_storage_engine()
         self.triage_run_id, self.experiment_config = experiment_config_from_model_group_id(self.db_engine, self.model_group_id)
+
+        # This feels like it needs some refactoring since in some edge cases at least the test matrix temporal parameters
+        # might differ across models in the model group (the training ones shouldn't), but this should probably work for
+        # the vast majority of use cases... 
+ self.experiment_config['temporal_config'].update(self.db_engine, self.model_group_info['model_id_last_split']) + + # Since "testing" here is predicting forward to a single new date, the test_duration should always be '0day' + # (regardless of what it may have been before) + self.experiment_config['temporal_config']['test_durations'] = ['0day'] + + # These lists should now only contain one item (the value actually used for the last model in this group) self.training_label_timespan = self.experiment_config['temporal_config']['training_label_timespans'][0] self.test_label_timespan = self.experiment_config['temporal_config']['test_label_timespans'][0] self.test_duration = self.experiment_config['temporal_config']['test_durations'][0] self.feature_start_time=self.experiment_config['temporal_config']['feature_start_time'] + self.label_name = self.experiment_config['label_config']['name'] self.cohort_name = self.experiment_config['cohort_config']['name'] self.user_metadata = self.experiment_config['user_metadata'] - self.model_group_info = get_model_group_info(self.db_engine, self.model_group_id) + self.feature_dictionary_creator = FeatureDictionaryCreator( features_schema_name='triage_production', db_engine=self.db_engine diff --git a/src/triage/predictlist/utils.py b/src/triage/predictlist/utils.py index 6b74c35cb..4aacb3df3 100644 --- a/src/triage/predictlist/utils.py +++ b/src/triage/predictlist/utils.py @@ -76,6 +76,60 @@ def train_matrix_info_from_model_id(db_engine, model_id): return db_engine.execute(get_train_matrix_query, model_id).first() +def test_matrix_info_from_model_id(db_engine, model_id): + """Get original test matrix information from model_id + + Note: because a model may have been tested on multiple matrices, this + chooses the matrix associated with the most recently run experiment + (then randomly if multiple test matrices are associated with the model_id + in that experiment). Generally, this will be an edge case, but may be + worth considering providing more control over which to choose here. + + Args: + db_engine (sqlalchemy.db.engine) + model_id (int) The id of a given model in the database + + Returns: (str, dict) matrix uuid and matrix metadata + """ + get_test_matrix_query = """ + select matrix_uuid, mat.matrix_metadata + from triage_metadata.matrices mat + join test_results.prediction_metadata pm on (pm.matrix_uuid = mat.matrix_uuid) + join triage_metadata.triage_runs tr + on (mat.built_by_experiment = tr.run_hash AND tr.run_type='experiment') + where pm.model_id = %s + order by start_time DESC, RANDOM() + """ + return db_engine.execute(get_test_matrix_query, model_id).first() + + + +def temporal_params_from_matrix_metadata(db_engine, model_id): + """Read temporal parameters associated with model training/testing from the associated + matrices. Because a grid of multiple values may be provided in the experiment config + for these parameters, we need to find the specific values that were actually used for + the given model at runtime. 
+ + Args: + db_engine (sqlalchemy.db.engine) + model_id (int) The id of a given model in the database + + Returns: (dict) The parameters for use in a temporal config for timechop + """ + train_uuid, train_metadata = train_matrix_info_from_model_id(db_engine, model_id) + test_uuid, test_metadata = test_matrix_info_from_model_id(db_engine, model_id) + + temporal_params = {} + + temporal_params['training_as_of_date_frequencies'] = train_metadata['training_as_of_date_frequency'] + temporal_params['test_as_of_date_frequencies'] = test_metadata['test_as_of_date_frequency'] + temporal_params['max_training_histories'] = [ train_metadata['max_training_history'] ] + temporal_params['test_durations'] = [ test_metadata['test_duration'] ] + temporal_params['training_label_timespans'] = [ train_metadata.get('training_label_timespan', train_metadata['label_timespan']) ] + temporal_params['test_label_timespans'] = [ test_metadata.get('test_label_timespan', test_metadata['label_timespan']) ] + + return temporal_params + def get_feature_names(aggregation, matrix_metadata): """Returns a feature group name and a list of feature names from a SpacetimeAggregation object""" feature_prefix = aggregation.prefix From 583e9bddd67035f291fb9775bb4fd20f908ef416 Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Thu, 26 Aug 2021 22:18:12 -0400 Subject: [PATCH 51/52] debug --- .../alembic/versions/5dd2ba8222b1_add_run_type.py | 4 +++- src/triage/predictlist/__init__.py | 14 +++++++------- src/triage/predictlist/utils.py | 3 ++- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/triage/component/results_schema/alembic/versions/5dd2ba8222b1_add_run_type.py b/src/triage/component/results_schema/alembic/versions/5dd2ba8222b1_add_run_type.py index 745057748..3819bb05a 100644 --- a/src/triage/component/results_schema/alembic/versions/5dd2ba8222b1_add_run_type.py +++ b/src/triage/component/results_schema/alembic/versions/5dd2ba8222b1_add_run_type.py @@ -18,9 +18,10 @@ def upgrade(): op.add_column('experiment_runs', sa.Column('run_type', sa.Text(), nullable=True), schema='triage_metadata') - op.execute("UPATE triage_metadata.experiment_runs SET run_type='experiment' WHERE run_type IS NULL") + op.execute("UPDATE triage_metadata.experiment_runs SET run_type='experiment' WHERE run_type IS NULL") op.alter_column('experiment_runs', 'experiment_hash', nullable=True, new_column_name='run_hash', schema='triage_metadata') + op.drop_constraint('experiment_runs_experiment_hash_fkey', 'experiment_runs', type_='foreignkey', schema='triage_metadata') op.execute("ALTER TABLE triage_metadata.experiment_runs RENAME TO triage_runs") @@ -49,6 +50,7 @@ def downgrade(): op.execute("ALTER TABLE triage_metadata.triage_runs RENAME TO experiment_runs") op.drop_column('experiment_runs', 'run_type', schema='triage_metadata') op.alter_column('experiment_runs', 'run_hash', nullable=True, new_column_name='experiment_hash', schema='triage_metadata') + op.create_foreign_key('experiment_runs_experiment_hash_fkey', 'experiment_runs', 'experiments', ['experiment_hash'], ['experiment_hash'], schema='triage_metadata') op.drop_table('retrain_models', schema='triage_metadata') op.drop_table('retrain', schema='triage_metadata') op.add_column('models', sa.Column('built_by_experiment', sa.Text(), nullable=True), schema='triage_metadata') diff --git a/src/triage/predictlist/__init__.py b/src/triage/predictlist/__init__.py index 0d0c8d010..29050ec9b 100644 --- a/src/triage/predictlist/__init__.py +++ b/src/triage/predictlist/__init__.py @@ -219,7 +219,7 @@ 
def __init__(self, db_engine, project_path, model_group_id):
         # This feels like it needs some refactoring since in some edge cases at least the test matrix temporal parameters
         # might differ across models in the model group (the training ones shouldn't), but this should probably work for
         # the vast majority of use cases...
-        self.experiment_config['temporal_config'].update(self.db_engine, self.model_group_info['model_id_last_split'])
+        self.experiment_config['temporal_config'].update(temporal_params_from_matrix_metadata(self.db_engine, self.model_group_info['model_id_last_split']))

         # Since "testing" here is predicting forward to a single new date, the test_duration should always be '0day'
         # (regardless of what it may have been before)
@@ -268,15 +268,15 @@ def __init__(self, db_engine, project_path, model_group_id):
     def get_temporal_config_for_retrain(self, prediction_date):
         temporal_config = self.experiment_config['temporal_config'].copy()
         temporal_config['feature_end_time'] = datetime.strftime(prediction_date, "%Y-%m-%d")
-        temporal_config['label_start_time'] = datetime.strftime(
-            prediction_date -
-            convert_str_to_relativedelta(self.training_label_timespan) -
-            convert_str_to_relativedelta(self.test_label_timespan),
-            "%Y-%m-%d")
         temporal_config['label_end_time'] = datetime.strftime(
             prediction_date + convert_str_to_relativedelta(self.test_label_timespan),
             "%Y-%m-%d")
-        temporal_config['model_update_frequency'] = self.test_label_timespan
+        # just needs to be bigger than the gap between the label start and end times
+        # to ensure we only get one time split for the retraining
+        temporal_config['model_update_frequency'] = '%syears' % (
+            dt_from_str(temporal_config['label_end_time']).year -
+            dt_from_str(temporal_config['label_start_time']).year + 10
+        )

         return temporal_config

diff --git a/src/triage/predictlist/utils.py b/src/triage/predictlist/utils.py
index 4aacb3df3..9b5eeaf51 100644
--- a/src/triage/predictlist/utils.py
+++ b/src/triage/predictlist/utils.py
@@ -92,13 +92,14 @@ def test_matrix_info_from_model_id(db_engine, model_id):
     Returns: (str, dict) matrix uuid and matrix metadata
     """
     get_test_matrix_query = """
-        select matrix_uuid, mat.matrix_metadata
+        select mat.matrix_uuid, mat.matrix_metadata
         from triage_metadata.matrices mat
         join test_results.prediction_metadata pm on (pm.matrix_uuid = mat.matrix_uuid)
         join triage_metadata.triage_runs tr
           on (mat.built_by_experiment = tr.run_hash AND tr.run_type='experiment')
         where pm.model_id = %s
         order by start_time DESC, RANDOM()
+        limit 1
     """
     return db_engine.execute(get_test_matrix_query, model_id).first()


From 815a258a53fd3c2a397b7aba9fb33a8968b479f8 Mon Sep 17 00:00:00 2001
From: Kit Rodolfa 
Date: Thu, 26 Aug 2021 22:45:35 -0400
Subject: [PATCH 52/52] debug

---
 .../alembic/versions/5dd2ba8222b1_add_run_type.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/triage/component/results_schema/alembic/versions/5dd2ba8222b1_add_run_type.py b/src/triage/component/results_schema/alembic/versions/5dd2ba8222b1_add_run_type.py
index 3819bb05a..81b36615a 100644
--- a/src/triage/component/results_schema/alembic/versions/5dd2ba8222b1_add_run_type.py
+++ b/src/triage/component/results_schema/alembic/versions/5dd2ba8222b1_add_run_type.py
@@ -50,7 +50,7 @@ def downgrade():
     op.execute("ALTER TABLE triage_metadata.triage_runs RENAME TO experiment_runs")
    op.drop_column('experiment_runs', 'run_type', schema='triage_metadata')
     op.alter_column('experiment_runs', 'run_hash', nullable=True, new_column_name='experiment_hash', 
schema='triage_metadata') - op.create_foreign_key('experiment_runs_experiment_hash_fkey', 'experiment_runs', 'experiments', ['experiment_hash'], ['experiment_hash'], schema='triage_metadata') + op.create_foreign_key('experiment_runs_experiment_hash_fkey', 'experiment_runs', 'experiments', ['experiment_hash'], ['experiment_hash'], source_schema='triage_metadata', referent_schema='triage_metadata') op.drop_table('retrain_models', schema='triage_metadata') op.drop_table('retrain', schema='triage_metadata') op.add_column('models', sa.Column('built_by_experiment', sa.Text(), nullable=True), schema='triage_metadata')
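
Taken together, patches 50-52 make predicting forward with an existing model work like this: `temporal_params_from_matrix_metadata` collapses the experiment's temporal parameter grids down to the single values the model's train and test matrices actually used, and Timechop then produces the test split anchored at the prediction date. The sketch below is illustrative only, assuming a database already populated by a finished Triage experiment; the connection string and `model_id` are placeholders, while the helper names come from `src/triage/predictlist/utils.py` as modified in this series.

```python
# Illustrative sketch only: the engine URL and model_id are placeholders, and a
# database populated by a finished Triage experiment is assumed.
from sqlalchemy import create_engine

from triage.component.timechop import Timechop
from triage.predictlist.utils import (
    experiment_config_from_model_id,
    temporal_params_from_matrix_metadata,
)
from triage.util.conf import dt_from_str

db_engine = create_engine("postgresql://localhost/triage")  # placeholder URL
model_id = 1  # placeholder: a model built by a previous experiment

# Start from the temporal config of the experiment that built the model, then
# narrow its parameter grids to the values the model's matrices actually used,
# which is what patch 50 does before chopping.
temporal_config = experiment_config_from_model_id(db_engine, model_id)["temporal_config"]
temporal_config.update(temporal_params_from_matrix_metadata(db_engine, model_id))

# Timechop yields the test matrix definition(s) anchored at the prediction date;
# the last definition is the one used to build the production matrix.
chopper = Timechop(**temporal_config)
prod_definitions = chopper.define_test_matrices(
    train_test_split_time=dt_from_str("2021-04-04"),
    test_duration=temporal_config["test_durations"][0],
    test_label_timespan=temporal_config["test_label_timespans"][0],
)
print(prod_definitions[-1])
```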