From 680c83cf8eab01a3c33d0a3206173c448bc35bda Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Fri, 14 Dec 2018 10:36:31 -0600 Subject: [PATCH 01/22] Feature Generation Rewrite Introduces the 'FeatureBlock' as the main interface for Triage to create features, instead of directly interfacing with collate. - Add FeatureBlock abstract base class. The abstract methods are aimed at providing an easy, flexible interface for feature generators to implement. The concrete methods build on those abstract methods to provide a slightly easier, higher-level interface for Experiments et al. to use. - Splits up the bloated architect.FeatureGenerator into a couple of different places: the code that generates queries now lives in SpacetimeAggregations in feature_block_generators, and the code that runs the generated queries now lives in concrete methods of the FeatureBlock base class. - ExperimentBase now passes config on to generate_feature_blocks and owns a collection of resulting FeatureBlocks, instead of a FeatureGenerator object. - Instead of a FeatureDictionaryCreator object, the FeatureDictionary class is introduced; it knows how to build itself from a collection of FeatureBlocks. The Experiment holds references to these as before. - Continuing the trend of molding collate to more closely fit how Triage uses it and removing unneeded flexibility, the unused Aggregation base class is folded into SpacetimeAggregation so the latter can directly inherit from FeatureBlock. This lets us remove/condense many tests and sunset the collate integration tests, because the methods being tested no longer exist and are now covered by either SpacetimeAggregation or FeatureBlock tests. In addition, many of the arguments to SpacetimeAggregation are changed to their FeatureBlock equivalents so it fits more tightly as a FeatureBlock subclass, and some of the more generic helper methods within SpacetimeAggregation are moved to FeatureBlock as concrete methods. - With feature data now residing at a new config key, the experiment's config version is bumped to v7. 
--- docs/mkdocs.yml | 1 + .../sources/experiments/extending-features.md | 143 +++ docs/sources/experiments/feature-testing.md | 26 +- docs/sources/experiments/upgrade-to-v7.md | 66 ++ example/config/experiment.yaml | 65 +- .../test_feature_block_generators.py | 246 +++++ .../architect_tests/test_feature_blocks.py | 118 +++ .../test_feature_dictionary_creator.py | 87 -- .../test_feature_generators.py | 941 ------------------ src/tests/architect_tests/test_integration.py | 108 +- src/tests/collate_tests/test_collate.py | 48 +- .../collate_tests/test_imputation_output.py | 34 +- src/tests/collate_tests/test_integration.py | 118 --- src/tests/collate_tests/test_spacetime.py | 144 ++- src/tests/test_partial_experiments.py | 10 +- src/tests/test_utils_db.py | 23 + src/tests/utils.py | 4 +- src/triage/cli.py | 19 +- src/triage/component/architect/README.md | 2 +- src/triage/component/architect/builders.py | 2 +- .../architect/database_reflection.py | 14 + .../component/architect/feature_block.py | 235 +++++ .../architect/feature_block_generators.py | 291 ++++++ .../component/architect/feature_dictionary.py | 64 ++ .../architect/feature_dictionary_creator.py | 69 -- .../component/architect/feature_generators.py | 715 ------------- .../architect/feature_query_runners.py | 141 +++ src/triage/component/architect/features.py | 9 +- src/triage/component/architect/utils.py | 5 + src/triage/component/collate/__init__.py | 5 +- src/triage/component/collate/collate.py | 385 +------ src/triage/component/collate/imputations.py | 8 +- src/triage/component/collate/spacetime.py | 392 ++++++-- src/triage/database_reflection.py | 14 + src/triage/experiments/__init__.py | 2 +- src/triage/experiments/base.py | 105 +- src/triage/experiments/multicore.py | 27 +- src/triage/experiments/rq.py | 62 +- src/triage/experiments/singlethreaded.py | 5 +- src/triage/experiments/validate.py | 76 +- src/triage/util/db.py | 6 + 41 files changed, 2036 insertions(+), 2799 deletions(-) create mode 100644 docs/sources/experiments/extending-features.md create mode 100644 docs/sources/experiments/upgrade-to-v7.md create mode 100644 src/tests/architect_tests/test_feature_block_generators.py create mode 100644 src/tests/architect_tests/test_feature_blocks.py delete mode 100644 src/tests/architect_tests/test_feature_dictionary_creator.py delete mode 100644 src/tests/architect_tests/test_feature_generators.py delete mode 100755 src/tests/collate_tests/test_integration.py create mode 100644 src/tests/test_utils_db.py create mode 100644 src/triage/component/architect/feature_block.py create mode 100644 src/triage/component/architect/feature_block_generators.py create mode 100644 src/triage/component/architect/feature_dictionary.py delete mode 100644 src/triage/component/architect/feature_dictionary_creator.py delete mode 100644 src/triage/component/architect/feature_generators.py create mode 100644 src/triage/component/architect/feature_query_runners.py diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 97272ce8a..787533bd5 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -25,5 +25,6 @@ pages: - Feature Generation Recipe Book: experiments/features.md - Experiment Algorithm: experiments/algorithm.md - Experiment Architecture: experiments/architecture.md + - Extending Experiment Features: experiments/extending-features.md - Audition: https://github.com/dssg/triage/tree/master/src/triage/component/audition - Postmodeling: https://github.com/dssg/triage/tree/master/src/triage/component/postmodeling diff --git 
a/docs/sources/experiments/extending-features.md b/docs/sources/experiments/extending-features.md new file mode 100644 index 000000000..1ca5530db --- /dev/null +++ b/docs/sources/experiments/extending-features.md @@ -0,0 +1,143 @@ +# Extending Feature Generation + +This document describes how to extend Triage's feature generation capabilities by writing new FeatureBlock classes and incorporating them into Experiments. + +## What is a FeatureBlock? + +A FeatureBlock represents a single feature table in the database and how to generate it. If you're familiar with `collate` parlance, a `SpacetimeAggregation` is similar in scope to a FeatureBlock. A `FeatureBlock` class can be instantiated with whatever arguments it needs, and from there can provide queries to produce its output feature table. Full-size Triage experiments tend to contain multiple feature blocks. These all live in a collection as the `experiment.feature_blocks` property in the Experiment. + +## What existing FeatureBlock classes can I use? + +Class name | Experiment config key | Use ------------ | ------------- | ------------ triage.component.collate.SpacetimeAggregation | spacetime_aggregations | Temporal aggregations of event-based data + +## Writing a new FeatureBlock class + +The `FeatureBlock` base class defines a set of abstract methods that any child class must implement, as well as a number of initialization arguments that it must take and implement in order to fulfill the expectations Triage users have of feature generators. Triage expects these classes to define the queries they need to run, as opposed to generating the tables themselves, so that Triage can implement scaling by parallelization. + +### Abstract methods + +Any method here without parentheses afterwards is expected to be a property. + +Method | Task | Return Type ------------ | ------------- | ------------- final_feature_table_name | The name of the final table with all features filled in (no missing values) | string feature_columns | The list of feature columns in the final, postimputation table. Should exclude any index columns (e.g. entity id, date) | list preinsert_queries | Return all queries that should be run before inserting any data. The creation of your feature table should happen here; the table is expected to have `entity_id` (integer) and `as_of_date` (timestamp) columns. | list insert_queries | Return all inserts to populate this data. Each query in this list should be parallelizable, and should be valid after all `preinsert_queries` are run. | list postinsert_queries | Return all queries that should be run after inserting all data | list imputation_queries | Return all queries that should be run to fill in missing data with imputed values. | list + +Any of the query list properties can be empty: for instance, if your implementation doesn't have inserts separate from table creation and is just one big query (e.g. a `CREATE TABLE AS`), you could just define `preinsert_queries` to be that one mega-query and leave the other properties as empty lists (a sketch of this pattern follows the `SimpleQueryFeature` example below). + +### Properties Provided by Base Class + +The base class provides several attributes/properties that can be used within subclass implementations. Triage experiments take care of providing this data during runtime: if you want to instantiate a FeatureBlock object on your own, you'll have to provide them in the constructor. 
+ +Name | Type | Purpose ------------ | ------------- | ------------- as_of_dates | list | Features are created "as of" specific dates; each of these dates is expected to be populated with a row for each member of the cohort on that date. cohort_table | string | The final shape of the feature table should at least include every entity id/date pair in this cohort table. db_engine | sqlalchemy.engine | The engine to use to access the database. Although these instances are mostly returning queries, the engine may be useful for implementing imputation. features_schema_name | string | The database schema where all feature tables should reside. Defaults to None, which ends up in the public schema. feature_start_time | string/datetime | A time before which no data should be considered for features. This is generally only applicable if your FeatureBlock is doing temporal aggregations. Defaults to None, which means no data will be excluded. features_ignore_cohort | bool | If False (the default), features are only computed for members of the cohort. If True, the final feature table may include rows beyond the cohort. + +`FeatureBlock` child classes can, and in almost all cases will, include more configuration at initialization time that is specific to them. They probably also define many more methods to use internally. But as long as they adhere to this interface, they'll work with Triage. + +### Making the new FeatureBlock available to experiments + +Triage Experiments run on serializable configuration, and although it's possible to take fully generated `FeatureBlock` instances and bypass this (e.g. `experiment.feature_blocks = `), it's not recommended. The last step is to pick a config key for use within the `features` key of experiment configs, add that key to `triage.component.architect.feature_block_generators.FEATURE_BLOCK_GENERATOR_LOOKUP`, and point it at a function that instantiates a collection of your objects based on the config. + +## Example + +That's a lot of information! Let's see it in action. Say we want to create a very flexible type of feature that simply runs a configured query with a parametrized as-of-date and returns its result as a feature. + +```python +from triage.component.architect.feature_block import FeatureBlock + + +class SimpleQueryFeature(FeatureBlock): + def __init__(self, query, *args, **kwargs): + self.query = query + super().__init__(*args, **kwargs) + + @property + def final_feature_table_name(self): + return f"{self.features_schema_name}.mytable" + + @property + def feature_columns(self): + return ['myfeature'] + + @property + def preinsert_queries(self): + return [f"create table {self.final_feature_table_name} " "(entity_id bigint, as_of_date timestamp, myfeature float)"] + + @property + def insert_queries(self): + if self.features_ignore_cohort: + final_query = self.query + else: + final_query = f""" + select * from ({self.query}) raw + join {self.cohort_table} using (entity_id, as_of_date) + """ + return [ + final_query.format(as_of_date=date) + for date in self.as_of_dates + ] + + @property + def postinsert_queries(self): + return [f"create index on {self.final_feature_table_name} (entity_id, as_of_date)"] + + @property + def imputation_queries(self): + return [f"update {self.final_feature_table_name} set myfeature = 0.0 where myfeature is null"] +``` + +This class would allow many different uses: basically any query a user can come up with would be a feature. 
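For contrast, here is the "one mega-query" pattern mentioned in the abstract-methods section: all the work happens in `preinsert_queries` via `CREATE TABLE AS`, and the other query lists stay empty. This is a minimal sketch, assuming a query that already returns complete, null-free rows; the `OneShotQueryFeature` name and its table name are hypothetical:

```python
from triage.component.architect.feature_block import FeatureBlock


class OneShotQueryFeature(FeatureBlock):
    """Hypothetical block whose entire table is built by one mega-query."""

    def __init__(self, query, *args, **kwargs):
        self.query = query
        super().__init__(*args, **kwargs)

    @property
    def final_feature_table_name(self):
        return f"{self.features_schema_name}.oneshot"

    @property
    def feature_columns(self):
        return ["myfeature"]

    @property
    def preinsert_queries(self):
        # One CREATE TABLE AS does all the work; the query is assumed to
        # produce entity_id, as_of_date, and myfeature with no nulls,
        # so no separate inserts or imputation are needed.
        return [f"create table {self.final_feature_table_name} as ({self.query})"]

    @property
    def insert_queries(self):
        return []

    @property
    def postinsert_queries(self):
        return []

    @property
    def imputation_queries(self):
        return []
```

The remainder of this example returns to `SimpleQueryFeature`.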
To instantiate this class outside of triage with a simple query, you could: + +```python +feature_block = SimpleQueryFeature( + query="select entity_id, as_of_date, quantity from source_table where date < '{as_of_date}'", + as_of_dates=["2016-01-01"], + cohort_table="my_cohort_table", + db_engine=triage.create_engine(<..mydbinfo..>) +) + +feature_block.run_preimputation() +feature_block.run_imputation() +``` + +To use it from a Triage experiment, modify `triage/component/architect/feature_block_generators.py` and submit a pull request: + +Before: + +```python +FEATURE_BLOCK_GENERATOR_LOOKUP = { + 'spacetime_aggregations': generate_spacetime_aggregations +} +``` + +After: + +```python +FEATURE_BLOCK_GENERATOR_LOOKUP = { + 'spacetime_aggregations': generate_spacetime_aggregations, + 'simple_query': SimpleQueryFeature, +} +``` + +At this point, you could use it in an experiment configuration like this: + +```yaml +features: + simple_query: + - query: "select entity_id, as_of_date, quantity from source_table where date < '{as_of_date}'" + - query: "select entity_id, as_of_date, other_quantity from other_source_table where date < '{as_of_date}'" +``` diff --git a/docs/sources/experiments/feature-testing.md b/docs/sources/experiments/feature-testing.md index 19e70b5a8..35789c1fa 100644 --- a/docs/sources/experiments/feature-testing.md +++ b/docs/sources/experiments/feature-testing.md @@ -2,26 +2,27 @@ Developing features for Triage experiments can be a daunting task. There are a lot of things to configure, a small amount of configuration can result in a ton of SQL, and it can take a long time to validate your feature configuration in the context of an Experiment being run on real data. -To speed up the process of iterating on features, you can run a list of feature aggregations, without imputation, on just one as-of-date. This functionality can be accessed through the `triage` command line tool or called directly from code (say, in a Jupyter notebook) using the `FeatureGenerator` component. +To speed up the process of iterating on features, you can run a list of feature aggregations, without imputation, on just one as-of-date. This functionality can be accessed through the `triage` command line tool or called directly from code (say, in a Jupyter notebook) using the `feature_blocks_from_config` utility. ## Using Triage CLI ![triage featuretest cli help screen](featuretest-cli.png) The command-line interface for testing features takes in two arguments: - A feature config file. Refer to [example_feature_config.yaml](https://github.com/dssg/triage/blob/master/example/config/feature.yaml). Essentially this is the content of the [example_experiment_config.yaml](https://github.com/dssg/triage/blob/master/example/config/experiment.yaml)'s `feature_aggregations` section. It consists of a YAML list, with one or more feature_aggregation rows present. + - An experiment config file, with a feature section and optionally a cohort section. - An as-of-date. This should be in the format `2016-01-01`. -Example: `triage experiment featuretest example/config/feature.yaml 2016-01-01` +Example: `triage experiment featuretest example/config/experiment.yaml 2016-01-01` All given feature aggregations will be processed for the given date. You will see a bunch of queries pass by in your terminal, populating tables in the `features_test` schema which you can inspect afterwards. 
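For reference, a pared-down experiment config for feature testing could look like the following sketch; the `from_obj`, prefix, and column names are placeholders, and only the `features` section (plus an optional cohort section) is consulted:

```yaml
config_version: 'v7'

features:
  spacetime_aggregations:
    -
      prefix: 'aprefix'
      from_obj: 'data'
      knowledge_date_column: 'knowledge_date'
      aggregates_imputation:
        all:
          type: 'constant'
          value: 0
      aggregates:
        -
          quantity: 'quantity_one'
          metrics: ['sum', 'count']
      intervals: ['all']
      groups: ['entity_id']
```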
![triage feature test result](featuretest-result.png) ## Using Python Code -If you'd like to call this from a notebook or from any other Python code, the arguments look similar but are a bit different. You have to supply your own sqlalchemy database engine to create a 'FeatureGenerator' object, and then call the `create_features_before_imputation` method with your feature config as a list of dictionaries, along with an as-of-date as a string. Make sure your logging level is set to INFO if you want to see all of the queries. +If you'd like to call this from a notebook or from any other Python code, the arguments are similar, with a few additions. You supply your feature config and as-of-dates, along with a database engine and schema information, to the `feature_blocks_from_config` function to create a set of feature blocks, and then call the `run_preimputation` method on each feature block. Make sure your logging level is set to INFO if you want to see all of the queries. + ``` -from triage.component.architect.feature_generators import FeatureGenerator +from triage.component.architect.feature_block_generators import feature_blocks_from_config from triage.util.db import create_engine import logging import yaml @@ -32,7 +33,7 @@ logging.basicConfig(level=logging.INFO) db_url = 'your db url here' db_engine = create_engine(db_url) -feature_config = [{ +feature_config = {'spacetime_aggregations': [{ 'prefix': 'aprefix', 'aggregates': [ { @@ -50,10 +51,15 @@ feature_config = [{ 'intervals': ['all'], 'knowledge_date_column': 'knowledge_date', 'from_obj': 'data' -}] +}]} -FeatureGenerator(db_engine, 'features_test').create_features_before_imputation( - feature_aggregation_config=feature_config, - feature_dates=['2016-01-01'] +feature_blocks = feature_blocks_from_config( + feature_config, + as_of_dates=['2016-01-01'], + cohort_table=None, + db_engine=db_engine, + features_schema_name="features_test", ) +for feature_block in feature_blocks: + feature_block.run_preimputation(verbose=True) ``` diff --git a/docs/sources/experiments/upgrade-to-v7.md b/docs/sources/experiments/upgrade-to-v7.md new file mode 100644 index 000000000..89476e048 --- /dev/null +++ b/docs/sources/experiments/upgrade-to-v7.md @@ -0,0 +1,66 @@ +# Upgrading your experiment configuration to v7 + + +This document details the steps needed to update a triage v6 configuration to +v7, mimicking the old behavior. + +Experiment configuration v7 includes only one change from v6: the features are given at a different key. Instead of `feature_aggregations`, to make space for non-collate features to be added in the future, there is now a more generic `features` key, under which collate features reside at `spacetime_aggregations`. 
+ + +Old: + +``` +feature_aggregations: + - + prefix: 'prefix' + from_obj: 'cool_stuff' + knowledge_date_column: 'open_date' + aggregates_imputation: + all: + type: 'constant' + value: 0 + aggregates: + - + quantity: 'homeless::INT' + metrics: ['count', 'sum'] + intervals: ['1 year', '2 year'] + groups: ['entity_id'] +``` + +New: + +``` +features: + spacetime_aggregations: + - + prefix: 'prefix' + from_obj: 'cool_stuff' + knowledge_date_column: 'open_date' + aggregates_imputation: + all: + type: 'constant' + value: 0 + aggregates: + - + quantity: 'homeless::INT' + metrics: ['count', 'sum'] + intervals: ['1 year', '2 year'] + groups: ['entity_id'] +``` + +## Upgrading the experiment config version + +At this point, you should be able to bump the top-level experiment config version to v7: + +Old: + +``` +config_version: 'v6' +``` + +New: + +``` +config_version: 'v7' +``` + diff --git a/example/config/experiment.yaml b/example/config/experiment.yaml index 6c4f52643..f19f78c45 100644 --- a/example/config/experiment.yaml +++ b/example/config/experiment.yaml @@ -5,7 +5,7 @@ # old configuration files are released. Be sure to assign the config version # that matches the triage.experiments.CONFIG_VERSION in the triage release # you are developing against! -config_version: 'v6' +config_version: 'v7' # EXPERIMENT METADATA # model_comment (optional) will end up in the model_comment column of the @@ -72,37 +72,38 @@ label_config: # FEATURE GENERATION -# The aggregate features to generate for each train/test split -# -# Implemented by wrapping collate: https://github.com/dssg/collate -# Most terminology here is taken directly from collate -# -# Each entry describes a collate.SpacetimeAggregation object, and the -# arguments needed to create it. Generally, each of these entries controls -# the features from one source table, though in the case of multiple groups -# may result in multiple output tables -# -# Rules specifying how to handle imputation of null values must be explicitly -# defined in your config file. These can be specified in two places: either -# within each feature or overall for each type of feature (aggregates_imputation, -# categoricals_imputation, array_categoricals_imputation). In either case, a rule must be given for -# each aggregation function (e.g., sum, max, avg, etc) used, or a catch-all -# can be specified with `all`. Aggregation function-specific rules will take -# precedence over the `all` rule and feature-specific rules will take -# precedence over the higher-level rules. Several examples are provided below. -# -# Available Imputation Rules: -# * mean: The average value of the feature (for SpacetimeAggregation the -# mean is taken within-date). -# * constant: Fill with a constant value from a required `value` parameter. -# * zero: Fill with zero. -# * null_category: Only available for categorical features. Just flag null -# values with the null category column. -# * binary_mode: Only available for aggregate column types. Takes the modal -# value for a binary feature. -# * error: Raise an exception if any null values are encountered for this -# feature. -feature_aggregations: +features: + spacetime_aggregations: + # The aggregate features to generate for each train/test split + # + # Implemented by wrapping collate: https://github.com/dssg/collate + # Most terminology here is taken directly from collate + # + # Each entry describes a collate.SpacetimeAggregation object, and the + # arguments needed to create it. 
Generally, each of these entries controls + # the features from one source table, though in the case of multiple groups + # may result in multiple output tables + # + # Rules specifying how to handle imputation of null values must be explicitly + # defined in your config file. These can be specified in two places: either + # within each feature or overall for each type of feature (aggregates_imputation, + # categoricals_imputation, array_categoricals_imputation). In either case, a rule must be given for + # each aggregation function (e.g., sum, max, avg, etc) used, or a catch-all + # can be specified with `all`. Aggregation function-specific rules will take + # precedence over the `all` rule and feature-specific rules will take + # precedence over the higher-level rules. Several examples are provided below. + # + # Available Imputation Rules: + # * mean: The average value of the feature (for SpacetimeAggregation the + # mean is taken within-date). + # * constant: Fill with a constant value from a required `value` parameter. + # * zero: Fill with zero. + # * null_category: Only available for categorical features. Just flag null + # values with the null category column. + # * binary_mode: Only available for aggregate column types. Takes the modal + # value for a binary feature. + # * error: Raise an exception if any null values are encountered for this + # feature. - # prefix given to the resultant tables prefix: 'prefix' diff --git a/src/tests/architect_tests/test_feature_block_generators.py b/src/tests/architect_tests/test_feature_block_generators.py new file mode 100644 index 000000000..283ae0d8a --- /dev/null +++ b/src/tests/architect_tests/test_feature_block_generators.py @@ -0,0 +1,246 @@ +from datetime import datetime, date + +from triage.component.architect.feature_block_generators import generate_spacetime_aggregations +import triage.component.collate as collate + +import pytest +from unittest.mock import patch + + +def test_spacetime_generation(db_engine): + aggregation_config = [ + { + "prefix": "aprefix", + "aggregates": [ + { + "quantity": "quantity_one", + "metrics": ["sum", "count"], + "imputation": { + "sum": {"type": "constant", "value": 137}, + "count": {"type": "zero"}, + }, + } + ], + "categoricals_imputation": {"all": {"type": "null_category"}}, + "categoricals": [ + {"column": "cat_one", "choices": ["good", "bad"], "metrics": ["sum"]} + ], + "groups": ["entity_id", "zip_code"], + "intervals": ["all"], + "knowledge_date_column": "knowledge_date", + "from_obj": "data", + } + ] + aggregations = generate_spacetime_aggregations( + feature_aggregation_config=aggregation_config, + as_of_dates=["2017-01-02", "2017-02-02"], + cohort_table="my_cohort", + db_engine=db_engine, + features_schema_name="features", + feature_start_time="2011-01-01", + ) + assert len(aggregations) == 1 + aggregation = aggregations[0] + assert isinstance(aggregation, collate.SpacetimeAggregation) + assert aggregation.as_of_dates == ["2017-01-02", "2017-02-02"] + assert aggregation.feature_start_time == "2011-01-01" + assert aggregation.groups == {"entity_id": "entity_id", "zip_code": "zip_code"} + assert aggregation.intervals == {"entity_id": ["all"], "zip_code": ["all"]} + assert aggregation.from_obj == "data" + assert len(aggregation.aggregates) == 2 + for aggregate in aggregation.aggregates: + if isinstance(aggregate, collate.Categorical): + assert aggregate.quantities == { + "cat_one__NULL": ('(cat_one is NULL)::INT',), + "cat_one_bad": ("(cat_one = 'bad')::INT",), + "cat_one_good": ("(cat_one = 
'good')::INT",), + } + assert aggregate.functions == ["sum"] + else: + assert aggregate.quantities == {"quantity_one": ("quantity_one",)} + assert aggregate.functions == ["sum", "count"] + + + +INPUT_DATA = [ + # entity_id, knowledge_date, zip_code, cat_one, quantity_one + (1, date(2014, 1, 1), "60120", "good", 10000), + (1, date(2014, 10, 11), "60120", "good", None), + (3, date(2012, 6, 8), "60653", "bad", 342), + (3, date(2014, 12, 21), "60653", "inbetween", 600), + (4, date(2014, 4, 4), "60653", "bad", 1236), +] + +INPUT_STATES = [ + # entity_id, as_of_date + (1, date(2013, 9, 30)), + (1, date(2014, 9, 30)), + (1, date(2015, 1, 1)), + (3, date(2013, 9, 30)), + (3, date(2014, 9, 30)), + (3, date(2015, 1, 1)), + (4, date(2014, 9, 30)), + (4, date(2015, 1, 1)), +] + +@pytest.fixture(name='test_engine', scope='function') +def fixture_test_engine(db_engine): + """Local extension to the shared db_engine fixture to set up test + database tables. + + """ + db_engine.execute( + """\ + create table data ( + entity_id int, + knowledge_date date, + zip_code text, + cat_one varchar, + quantity_one float + ) + """ + ) + for row in INPUT_DATA: + db_engine.execute("insert into data values (%s, %s, %s, %s, %s)", row) + + db_engine.execute( + """\ + create table states ( + entity_id int, + as_of_date date + ) + """ + ) + for row in INPUT_STATES: + db_engine.execute("insert into states values (%s, %s)", row) + + return db_engine + + +def test_choice_query(test_engine): + aggregation_config = [ + { + "prefix": "aprefix", + "categoricals": [ + { + "column": "cat_one", + "choice_query": "select distinct(cat_one) from data", + "metrics": ["sum"], + "imputation": {"all": {"type": "null_category"}}, + } + ], + "groups": ["entity_id"], + "intervals": ["all"], + "knowledge_date_column": "knowledge_date", + "from_obj": "data", + } + ] + aggregations = generate_spacetime_aggregations( + feature_aggregation_config=aggregation_config, + as_of_dates=["2017-01-02", "2017-02-02"], + cohort_table="my_cohort", + db_engine=test_engine, + features_schema_name="features", + feature_start_time="2011-01-01", + ) + assert aggregations[0].aggregates[0].quantities == { + "cat_one__NULL": ('(cat_one is NULL)::INT',), + "cat_one_bad": ("(cat_one = 'bad')::INT",), + "cat_one_good": ("(cat_one = 'good')::INT",), + "cat_one_inbetween": ("(cat_one = 'inbetween')::INT",), + } + +def test_array_categoricals(test_engine): + aggregation_config = [ + { + "prefix": "aprefix", + "array_categoricals": [ + { + "column": "cat_one", + "choices": ["good", "bad", "inbetween"], + "metrics": ["sum"], + "imputation": {"all": {"type": "null_category"}}, + } + ], + "groups": ["entity_id"], + "intervals": ["all"], + "knowledge_date_column": "knowledge_date", + "from_obj": "data", + } + ] + aggregations = generate_spacetime_aggregations( + feature_aggregation_config=aggregation_config, + as_of_dates=["2017-01-02", "2017-02-02"], + cohort_table="my_cohort", + db_engine=test_engine, + features_schema_name="features", + feature_start_time="2011-01-01", + ) + + assert aggregations[0].aggregates[0].quantities == { + "cat_one__NULL": ('(cat_one is NULL)::INT',), + "cat_one_bad": ("(cat_one @> array['bad'::varchar])::INT",), + "cat_one_good": ("(cat_one @> array['good'::varchar])::INT",), + "cat_one_inbetween": ("(cat_one @> array['inbetween'::varchar])::INT",), + } + +def test_materialize_off(db_engine): + aggregation_config = [{ + "prefix": "aprefix", + "categoricals": [ + { + "column": "cat_one", + "choices": ["good", "bad"], + "metrics": ["sum"], + 
"imputation": {"all": {"type": "null_category"}}, + } + ], + "groups": ["entity_id", "zip_code"], + "intervals": ["all"], + "knowledge_date_column": "knowledge_date", + "from_obj": "data", + }] + + with patch("triage.component.architect.feature_block_generators.FromObj") as fromobj_mock: + feature_generator = generate_spacetime_aggregations( + feature_aggregation_config=aggregation_config, + as_of_dates=["2017-01-02", "2017-02-02"], + cohort_table="my_cohort", + db_engine=db_engine, + features_schema_name="features", + materialize_subquery_fromobjs=False + ) + assert not fromobj_mock.called + + +def test_aggregations_materialize_on(db_engine): + aggregation_config = [{ + "prefix": "aprefix", + "categoricals": [ + { + "column": "cat_one", + "choices": ["good", "bad"], + "metrics": ["sum"], + "imputation": {"all": {"type": "null_category"}}, + } + ], + "groups": ["entity_id", "zip_code"], + "intervals": ["all"], + "knowledge_date_column": "knowledge_date", + "from_obj": "data", + }] + + with patch("triage.component.architect.feature_block_generators.FromObj") as fromobj_mock: + feature_generator = generate_spacetime_aggregations( + feature_aggregation_config=aggregation_config, + as_of_dates=["2017-01-02", "2017-02-02"], + cohort_table="my_cohort", + db_engine=db_engine, + features_schema_name="features", + materialize_subquery_fromobjs=True + ) + fromobj_mock.assert_called_once_with( + from_obj="data", + knowledge_date_column="knowledge_date", + name="features.aprefix" + ) diff --git a/src/tests/architect_tests/test_feature_blocks.py b/src/tests/architect_tests/test_feature_blocks.py new file mode 100644 index 000000000..24153e88a --- /dev/null +++ b/src/tests/architect_tests/test_feature_blocks.py @@ -0,0 +1,118 @@ +from triage.component.architect.feature_block import FeatureBlock +import pytest + + +class FeatureBlockExample(FeatureBlock): + """A sample, functional FeatureBlock class + + Implements very simple versions of all of the abstract methods/properties + that allows testing of the concrete methods in the base class + """ + @property + def final_feature_table_name(self): + return "myfeatures" + + @property + def feature_columns(self): + return set(["feature_one", "feature_two"]) + + @property + def preinsert_queries(self): + return [ + "drop table if exists myfeatures", + "create table myfeatures (entity_id int, as_of_date timestamp, f_one int, f_two int)" + ] + + @property + def insert_queries(self): + return [ + "insert into myfeatures values (1, '2016-01-01', 1, 0)", + "insert into myfeatures values (1, '2016-02-01', 0, 0)", + "insert into myfeatures values (2, '2016-01-01', 0, 1)", + "insert into myfeatures values (2, '2016-02-01', 0, NULL)" + ] + + @property + def postinsert_queries(self): + return [ + "create index on myfeatures (as_of_date)" + ] + + @property + def imputation_queries(self): + return [ + "update myfeatures set f_one = 1 where f_one is null", + "update myfeatures set f_two = 1 where f_two is null", + ] + + +def populate_cohort(db_engine): + db_engine.execute("create table mycohort (entity_id int, as_of_date timestamp)") + db_engine.execute("insert into mycohort values (1, '2016-01-01'), " + "(1, '2016-02-01'), (2, '2016-01-01'), (2, '2016-02-01')") + + +def test_FeatureBlock_generate_preimpute_tasks(db_engine): + block = FeatureBlockExample(db_engine=db_engine, cohort_table="mycohort", as_of_dates=['2016-01-01', '2016-02-01']) + block.needs_features = lambda: True + assert block.generate_preimpute_tasks(replace=False) == { + "prepare": block.preinsert_queries, 
+ "inserts": block.insert_queries, + "finalize": block.postinsert_queries + } + block.needs_features = lambda: False + assert block.generate_preimpute_tasks(replace=False) == {} + + assert block.generate_preimpute_tasks(replace=True) == { + "prepare": block.preinsert_queries, + "inserts": block.insert_queries, + "finalize": block.postinsert_queries + } + + +def test_FeatureBlock_generate_impute_tasks(db_engine): + block = FeatureBlockExample(db_engine=db_engine, cohort_table="mycohort", as_of_dates=['2016-01-01', '2016-02-01']) + block.needs_features = lambda: True + assert block.generate_impute_tasks(replace=False) == { + "prepare": block.imputation_queries, + "inserts": [], + "finalize": [] + } + block.needs_features = lambda: False + assert block.generate_impute_tasks(replace=False) == {} + + assert block.generate_impute_tasks(replace=True) == { + "prepare": block.imputation_queries, + "inserts": [], + "finalize": [] + } + + +def test_FeatureBlock_needs_features(db_engine): + # needs_features should function as follows: + # if there are members of the cohort without features, needs_features should return true + # 1. a freshly created table should definitely need features + block = FeatureBlockExample(db_engine=db_engine, cohort_table="mycohort", as_of_dates=['2016-01-01', '2016-02-01']) + populate_cohort(db_engine) + assert block.needs_features() + block.run_preimputation() + block.run_imputation() + assert not block.needs_features() + + # 2. a table that already has features, but is merely a subset of the cohort, + # should also need features + db_engine.execute("insert into mycohort values (3, '2016-01-01')") + assert block.needs_features() + + +def test_FeatureBlock_verify_nonulls(db_engine): + # verify_no_nulls should raise a ValueError while any feature values are null, + # and pass once imputation has filled them in. As with needs_features, + # 
a freshly created table should definitely need features + block = FeatureBlockExample(db_engine=db_engine, cohort_table="mycohort", as_of_dates=['2016-01-01', '2016-02-01']) + populate_cohort(db_engine) + block.run_preimputation() + with pytest.raises(ValueError): + block.verify_no_nulls() + block.run_imputation() + block.verify_no_nulls() diff --git a/src/tests/architect_tests/test_feature_dictionary_creator.py b/src/tests/architect_tests/test_feature_dictionary_creator.py deleted file mode 100644 index 5ac63a8eb..000000000 --- a/src/tests/architect_tests/test_feature_dictionary_creator.py +++ /dev/null @@ -1,87 +0,0 @@ -from triage.component.architect.features import FeatureDictionaryCreator -import testing.postgresql -from sqlalchemy import create_engine - - -def test_feature_dictionary_creator(): - with testing.postgresql.Postgresql() as postgresql: - engine = create_engine(postgresql.url()) - engine.execute("create schema features") - engine.execute( - """ - create table features.prefix1_entity_id ( - entity_id int, - as_of_date date, - feature_one float, - feature_two float - ) - """ - ) - engine.execute( - """ - create table features.prefix1_zipcode ( - zipcode text, - as_of_date date, - feature_three float, - feature_four float - ) - """ - ) - engine.execute( - """ - create table features.prefix1_aggregation ( - entity_id int, - as_of_date date, - zipcode text, - feature_one float, - feature_two float, - feature_three float, - feature_four float - ) - """ - ) - engine.execute( - """ - create table features.prefix1_aggregation_imputed ( - entity_id int, - as_of_date date, - zipcode text, - feature_one float, - feature_two float, - feature_three float, - feature_three_imp int, - feature_four float - ) - """ - ) - engine.execute( - """ - create table features.random_other_table ( - another_column float - ) - """ - ) - - creator = FeatureDictionaryCreator( - features_schema_name="features", db_engine=engine - ) - feature_dictionary = creator.feature_dictionary( - feature_table_names=[ - "prefix1_entity_id", - "prefix1_zip_code", - "prefix1_aggregation", - "prefix1_aggregation_imputed", - ], - index_column_lookup={ - "prefix1_aggregation_imputed": ["entity_id", "zipcode", "as_of_date"] - }, - ) - assert feature_dictionary == { - "prefix1_aggregation_imputed": [ - "feature_one", - "feature_two", - "feature_three", - "feature_three_imp", - "feature_four", - ] - } diff --git a/src/tests/architect_tests/test_feature_generators.py b/src/tests/architect_tests/test_feature_generators.py deleted file mode 100644 index 88ae38ff0..000000000 --- a/src/tests/architect_tests/test_feature_generators.py +++ /dev/null @@ -1,941 +0,0 @@ -import copy -from datetime import date - -import pandas -import pytest -import sqlalchemy -from sqlalchemy import text as t - -from triage.component.architect.feature_generators import FeatureGenerator -from triage.component.collate import Aggregate, Categorical, SpacetimeAggregation - -from unittest.mock import patch - - -INPUT_DATA = [ - # entity_id, knowledge_date, zip_code, cat_one, quantity_one - (1, date(2014, 1, 1), "60120", "good", 10000), - (1, date(2014, 10, 11), "60120", "good", None), - (3, date(2012, 6, 8), "60653", "bad", 342), - (3, date(2014, 12, 21), "60653", "inbetween", 600), - (4, date(2014, 4, 4), "60653", "bad", 1236), -] - -INPUT_STATES = [ - # entity_id, as_of_date - (1, date(2013, 9, 30)), - (1, date(2014, 9, 30)), - (1, date(2015, 1, 1)), - (3, date(2013, 9, 30)), - (3, date(2014, 9, 30)), - (3, date(2015, 1, 1)), - (4, date(2014, 9, 30)), - 
(4, date(2015, 1, 1)), -] - - -@pytest.fixture(name='test_engine', scope='function') -def fixture_test_engine(db_engine): - """Local extension to the shared db_engine fixture to set up test - database tables. - - """ - db_engine.execute( - """\ - create table data ( - entity_id int, - knowledge_date date, - zip_code text, - cat_one varchar, - quantity_one float - ) - """ - ) - for row in INPUT_DATA: - db_engine.execute("insert into data values (%s, %s, %s, %s, %s)", row) - - db_engine.execute( - """\ - create table states ( - entity_id int, - as_of_date date - ) - """ - ) - for row in INPUT_STATES: - db_engine.execute("insert into states values (%s, %s)", row) - - return db_engine - - -def test_feature_generation(test_engine): - aggregate_config = [ - { - "prefix": "aprefix", - "aggregates": [ - { - "quantity": "quantity_one", - "metrics": ["sum", "count"], - "imputation": { - "sum": {"type": "constant", "value": 137}, - "count": {"type": "zero"}, - }, - } - ], - "categoricals_imputation": {"all": {"type": "null_category"}}, - "categoricals": [ - {"column": "cat_one", "choices": ["good", "bad"], "metrics": ["sum"]} - ], - "groups": ["entity_id", "zip_code"], - "intervals": ["all"], - "knowledge_date_column": "knowledge_date", - "from_obj": "data", - } - ] - - expected_output = { - "aprefix_aggregation_imputed": [ - { - "entity_id": 1, - "as_of_date": date(2013, 9, 30), - "zip_code": None, - "aprefix_entity_id_all_quantity_one_sum": 137, - "aprefix_entity_id_all_quantity_one_count": 0, - "aprefix_entity_id_all_cat_one_good_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 0, - "aprefix_entity_id_all_cat_one__NULL_sum": 1, - "aprefix_zip_code_all_quantity_one_sum": 137, - "aprefix_zip_code_all_quantity_one_count": 0, - "aprefix_zip_code_all_cat_one_good_sum": 0, - "aprefix_zip_code_all_cat_one_bad_sum": 0, - "aprefix_zip_code_all_cat_one__NULL_sum": 1, - "aprefix_entity_id_all_quantity_one_sum_imp": 1, - "aprefix_entity_id_all_quantity_one_count_imp": 1, - "aprefix_zip_code_all_quantity_one_sum_imp": 1, - "aprefix_zip_code_all_quantity_one_count_imp": 1, - }, - { - "entity_id": 1, - "as_of_date": date(2014, 9, 30), - "zip_code": "60120", - "aprefix_entity_id_all_quantity_one_sum": 10000, - "aprefix_entity_id_all_quantity_one_count": 1, - "aprefix_entity_id_all_cat_one_good_sum": 1, - "aprefix_entity_id_all_cat_one_bad_sum": 0, - "aprefix_entity_id_all_cat_one__NULL_sum": 0, - "aprefix_zip_code_all_quantity_one_sum": 10000, - "aprefix_zip_code_all_quantity_one_count": 1, - "aprefix_zip_code_all_cat_one_good_sum": 1, - "aprefix_zip_code_all_cat_one_bad_sum": 0, - "aprefix_zip_code_all_cat_one__NULL_sum": 0, - "aprefix_entity_id_all_quantity_one_sum_imp": 0, - "aprefix_entity_id_all_quantity_one_count_imp": 0, - "aprefix_zip_code_all_quantity_one_sum_imp": 0, - "aprefix_zip_code_all_quantity_one_count_imp": 0, - }, - { - "entity_id": 3, - "as_of_date": date(2013, 9, 30), - "zip_code": "60653", - "aprefix_entity_id_all_quantity_one_sum": 342, - "aprefix_entity_id_all_quantity_one_count": 1, - "aprefix_entity_id_all_cat_one_good_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 1, - "aprefix_entity_id_all_cat_one__NULL_sum": 0, - "aprefix_zip_code_all_quantity_one_sum": 342, - "aprefix_zip_code_all_quantity_one_count": 1, - "aprefix_zip_code_all_cat_one_good_sum": 0, - "aprefix_zip_code_all_cat_one_bad_sum": 1, - "aprefix_zip_code_all_cat_one__NULL_sum": 0, - "aprefix_entity_id_all_quantity_one_sum_imp": 0, - "aprefix_entity_id_all_quantity_one_count_imp": 0, - 
"aprefix_zip_code_all_quantity_one_sum_imp": 0, - "aprefix_zip_code_all_quantity_one_count_imp": 0, - }, - { - "entity_id": 3, - "as_of_date": date(2014, 9, 30), - "zip_code": "60653", - "aprefix_entity_id_all_quantity_one_sum": 342, - "aprefix_entity_id_all_quantity_one_count": 1, - "aprefix_entity_id_all_cat_one_good_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 1, - "aprefix_entity_id_all_cat_one__NULL_sum": 0, - "aprefix_zip_code_all_quantity_one_sum": 1578, - "aprefix_zip_code_all_quantity_one_count": 2, - "aprefix_zip_code_all_cat_one_good_sum": 0, - "aprefix_zip_code_all_cat_one_bad_sum": 2, - "aprefix_zip_code_all_cat_one__NULL_sum": 0, - "aprefix_entity_id_all_quantity_one_sum_imp": 0, - "aprefix_entity_id_all_quantity_one_count_imp": 0, - "aprefix_zip_code_all_quantity_one_sum_imp": 0, - "aprefix_zip_code_all_quantity_one_count_imp": 0, - }, - { - "entity_id": 4, - "as_of_date": date(2014, 9, 30), - "zip_code": "60653", - "aprefix_entity_id_all_quantity_one_sum": 1236, - "aprefix_entity_id_all_quantity_one_count": 1, - "aprefix_entity_id_all_cat_one_good_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 1, - "aprefix_entity_id_all_cat_one__NULL_sum": 0, - "aprefix_zip_code_all_quantity_one_sum": 1578, - "aprefix_zip_code_all_quantity_one_count": 2, - "aprefix_zip_code_all_cat_one_good_sum": 0, - "aprefix_zip_code_all_cat_one_bad_sum": 2, - "aprefix_zip_code_all_cat_one__NULL_sum": 0, - "aprefix_entity_id_all_quantity_one_sum_imp": 0, - "aprefix_entity_id_all_quantity_one_count_imp": 0, - "aprefix_zip_code_all_quantity_one_sum_imp": 0, - "aprefix_zip_code_all_quantity_one_count_imp": 0, - }, - ] - } - - features_schema_name = "features" - - output_tables = FeatureGenerator( - db_engine=test_engine, - features_schema_name=features_schema_name, - ).create_all_tables( - feature_dates=["2013-09-30", "2014-09-30"], - feature_aggregation_config=aggregate_config, - state_table="states", - ) - - for output_table in output_tables: - records = pandas.read_sql( - "select * from {}.{} order by entity_id, as_of_date".format( - features_schema_name, - output_table, - ), - test_engine, - ).to_dict("records") - - for record, expected_record in zip(records, expected_output[output_table]): - assert record == expected_record - - -def test_index_column_lookup(test_engine): - aggregations = [ - SpacetimeAggregation( - prefix="prefix1", - aggregates=[ - Categorical( - col="cat_one", - function="sum", - choices=["good", "bad", "inbetween"], - impute_rules={"coltype": "categorical", "all": {"type": "zero"}}, - ) - ], - groups=["entity_id"], - intervals=["all"], - date_column="knowledge_date", - output_date_column="as_of_date", - dates=["2013-09-30", "2014-09-30"], - state_table="states", - state_group="entity_id", - schema="features", - from_obj="data", - ), - SpacetimeAggregation( - prefix="prefix2", - aggregates=[ - Aggregate( - quantity="quantity_one", - function="count", - impute_rules={"coltype": "aggregate", "all": {"type": "zero"}}, - ) - ], - groups=["entity_id", "zip_code"], - intervals=["all"], - date_column="knowledge_date", - output_date_column="as_of_date", - dates=["2013-09-30", "2014-09-30"], - state_table="states", - state_group="entity_id", - schema="features", - from_obj="data", - ), - ] - - features_schema_name = "features" - feature_generator = FeatureGenerator( - db_engine=test_engine, - features_schema_name=features_schema_name, - ) - lookup = feature_generator.index_column_lookup(aggregations) - assert lookup == { - "prefix1_aggregation_imputed": ["as_of_date", 
"entity_id"], - "prefix2_aggregation_imputed": ["as_of_date", "entity_id", "zip_code"], - } - - -def test_feature_generation_feature_start_time(test_engine): - aggregate_config = [ - { - "prefix": "aprefix", - "aggregates_imputation": {"all": {"type": "constant", "value": 7}}, - "aggregates": [{"quantity": "quantity_one", "metrics": ["sum"]}], - "groups": ["entity_id"], - "intervals": ["all"], - "knowledge_date_column": "knowledge_date", - "from_obj": "data", - } - ] - - expected_output = { - "aprefix_aggregation_imputed": [ - { - "entity_id": 1, - "as_of_date": date(2015, 1, 1), - "aprefix_entity_id_all_quantity_one_sum": 10000, - }, - { - "entity_id": 3, - "as_of_date": date(2015, 1, 1), - "aprefix_entity_id_all_quantity_one_sum": 600, - }, - { - "entity_id": 4, - "as_of_date": date(2015, 1, 1), - "aprefix_entity_id_all_quantity_one_sum": 1236, - }, - ] - } - - features_schema_name = "features" - output_tables = FeatureGenerator( - db_engine=test_engine, - features_schema_name=features_schema_name, - feature_start_time="2013-01-01", - ).create_all_tables( - feature_dates=["2015-01-01"], - feature_aggregation_config=aggregate_config, - state_table="states", - ) - - for output_table in output_tables: - records = pandas.read_sql( - "select * from {}.{} order by as_of_date, entity_id".format( - features_schema_name, - output_table, - ), - test_engine, - ).to_dict("records") - - assert records == expected_output[output_table] - - -def test_dynamic_categoricals(test_engine): - aggregate_config = [ - { - "prefix": "aprefix", - "categoricals": [ - { - "column": "cat_one", - "choice_query": "select distinct(cat_one) from data", - "metrics": ["sum"], - "imputation": {"all": {"type": "null_category"}}, - } - ], - "groups": ["entity_id"], - "intervals": ["all"], - "knowledge_date_column": "knowledge_date", - "from_obj": "data", - } - ] - expected_output = { - "aprefix_aggregation_imputed": [ - { - "entity_id": 1, - "as_of_date": date(2013, 9, 30), - "aprefix_entity_id_all_cat_one_good_sum": 0, - "aprefix_entity_id_all_cat_one_inbetween_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 0, - "aprefix_entity_id_all_cat_one__NULL_sum": 1, - }, - { - "entity_id": 3, - "as_of_date": date(2013, 9, 30), - "aprefix_entity_id_all_cat_one_good_sum": 0, - "aprefix_entity_id_all_cat_one_inbetween_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 1, - "aprefix_entity_id_all_cat_one__NULL_sum": 0, - }, - { - "entity_id": 1, - "as_of_date": date(2014, 9, 30), - "aprefix_entity_id_all_cat_one_good_sum": 1, - "aprefix_entity_id_all_cat_one_inbetween_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 0, - "aprefix_entity_id_all_cat_one__NULL_sum": 0, - }, - { - "entity_id": 3, - "as_of_date": date(2014, 9, 30), - "aprefix_entity_id_all_cat_one_good_sum": 0, - "aprefix_entity_id_all_cat_one_inbetween_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 1, - "aprefix_entity_id_all_cat_one__NULL_sum": 0, - }, - { - "entity_id": 4, - "as_of_date": date(2014, 9, 30), - "aprefix_entity_id_all_cat_one_good_sum": 0, - "aprefix_entity_id_all_cat_one_inbetween_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 1, - "aprefix_entity_id_all_cat_one__NULL_sum": 0, - }, - ] - } - - features_schema_name = "features" - - output_tables = FeatureGenerator( - db_engine=test_engine, - features_schema_name=features_schema_name, - ).create_all_tables( - feature_dates=["2013-09-30", "2014-09-30"], - feature_aggregation_config=aggregate_config, - state_table="states", - ) - - for output_table in output_tables: - records = 
pandas.read_sql( - "select * from {}.{} order by as_of_date, entity_id".format( - features_schema_name, output_table - ), - test_engine, - ).to_dict("records") - - assert records == expected_output[output_table] - - -def test_array_categoricals(db_engine): - aggregate_config = [ - { - "prefix": "aprefix", - "array_categoricals": [ - { - "column": "cat_one", - "choices": ["good", "bad", "inbetween"], - "metrics": ["sum"], - "imputation": {"all": {"type": "null_category"}}, - } - ], - "groups": ["entity_id"], - "intervals": ["all"], - "knowledge_date_column": "knowledge_date", - "from_obj": "data", - } - ] - expected_output = { - "aprefix_aggregation_imputed": [ - { - "entity_id": 1, - "as_of_date": date(2013, 9, 30), - "aprefix_entity_id_all_cat_one_good_sum": 0, - "aprefix_entity_id_all_cat_one_inbetween_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 0, - "aprefix_entity_id_all_cat_one__NULL_sum": 1, - }, - { - "entity_id": 3, - "as_of_date": date(2013, 9, 30), - "aprefix_entity_id_all_cat_one_good_sum": 0, - "aprefix_entity_id_all_cat_one_inbetween_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 1, - "aprefix_entity_id_all_cat_one__NULL_sum": 0, - }, - { - "entity_id": 1, - "as_of_date": date(2014, 9, 30), - "aprefix_entity_id_all_cat_one_good_sum": 1, - "aprefix_entity_id_all_cat_one_inbetween_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 0, - "aprefix_entity_id_all_cat_one__NULL_sum": 0, - }, - { - "entity_id": 3, - "as_of_date": date(2014, 9, 30), - "aprefix_entity_id_all_cat_one_good_sum": 0, - "aprefix_entity_id_all_cat_one_inbetween_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 1, - "aprefix_entity_id_all_cat_one__NULL_sum": 0, - }, - { - "entity_id": 4, - "as_of_date": date(2014, 9, 30), - "aprefix_entity_id_all_cat_one_good_sum": 0, - "aprefix_entity_id_all_cat_one_inbetween_sum": 0, - "aprefix_entity_id_all_cat_one_bad_sum": 1, - "aprefix_entity_id_all_cat_one__NULL_sum": 0, - }, - ] - } - - input_data = [ - # entity_id, knowledge_date, cat_one, quantity_one - (1, date(2014, 1, 1), ["good", "good"], 10000), - (1, date(2014, 10, 11), ["good"], None), - (3, date(2012, 6, 8), ["bad"], 342), - (3, date(2014, 12, 21), ["inbetween"], 600), - (4, date(2014, 4, 4), ["bad"], 1236), - ] - - db_engine.execute( - """\ - create table data ( - entity_id int, - knowledge_date date, - cat_one varchar[], - quantity_one float - ) - """ - ) - for row in input_data: - db_engine.execute("insert into data values (%s, %s, %s, %s)", row) - - db_engine.execute( - """\ - create table states ( - entity_id int, - as_of_date date - ) - """ - ) - for row in INPUT_STATES: - db_engine.execute("insert into states values (%s, %s)", row) - - features_schema_name = "features" - - output_tables = FeatureGenerator( - db_engine=db_engine, - features_schema_name=features_schema_name, - ).create_all_tables( - feature_dates=["2013-09-30", "2014-09-30"], - feature_aggregation_config=aggregate_config, - state_table="states", - ) - - for output_table in output_tables: - records = pandas.read_sql( - "select * from {}.{} order by as_of_date, entity_id".format( - features_schema_name, output_table - ), - db_engine, - ).to_dict("records") - - assert records == expected_output[output_table] - - -def test_generate_table_tasks(test_engine): - test_engine.execute('create schema features') - aggregations = [ - SpacetimeAggregation( - prefix="prefix1", - aggregates=[ - Categorical( - col="cat_one", - function="sum", - choices=["good", "bad", "inbetween"], - impute_rules={"coltype": "categorical", "all": {"type": 
"zero"}}, - ) - ], - groups=["entity_id"], - intervals=["all"], - date_column="knowledge_date", - output_date_column="as_of_date", - dates=["2013-09-30", "2014-09-30"], - state_table="states", - state_group="entity_id", - schema="features", - from_obj="data", - ), - SpacetimeAggregation( - prefix="prefix2", - aggregates=[ - Aggregate( - quantity="quantity_one", - function="count", - impute_rules={"coltype": "aggregate", "all": {"type": "zero"}}, - ) - ], - groups=["entity_id"], - intervals=["all"], - date_column="knowledge_date", - output_date_column="as_of_date", - dates=["2013-09-30", "2014-09-30"], - state_table="states", - state_group="entity_id", - schema="features", - from_obj="data", - ), - ] - features_schema_name = "features" - - table_tasks = FeatureGenerator( - db_engine=test_engine, - features_schema_name=features_schema_name, - ).generate_all_table_tasks(aggregations, task_type="aggregation") - for table_name, task in table_tasks.items(): - assert "DROP TABLE" in task["prepare"][0] - assert "CREATE TABLE" in str(task["prepare"][1]) - assert "CREATE INDEX" in task["finalize"][0] - assert isinstance(task["inserts"], list) - - # build the aggregation tables to check the imputation tasks - FeatureGenerator( - db_engine=test_engine, - features_schema_name=features_schema_name, - ).process_table_tasks(table_tasks) - - table_tasks = FeatureGenerator( - db_engine=test_engine, - features_schema_name=features_schema_name, - ).generate_all_table_tasks(aggregations, task_type="imputation") - - for table_name, task in table_tasks.items(): - assert "DROP TABLE" in task["prepare"][0] - assert "CREATE TABLE" in str(task["prepare"][1]) - assert "CREATE INDEX" in task["finalize"][0] - assert isinstance(task["inserts"], list) - - -def test_aggregations(test_engine): - aggregate_config = [ - { - "prefix": "prefix1", - "categoricals": [ - { - "column": "cat_one", - "choice_query": "select distinct(cat_one) from data", - "metrics": ["sum"], - "imputation": {"all": {"type": "null_category"}}, - } - ], - "groups": ["entity_id"], - "intervals": ["all"], - "knowledge_date_column": "knowledge_date", - "from_obj": "data", - }, - { - "prefix": "prefix2", - "aggregates_imputation": {"all": {"type": "mean"}}, - "aggregates": [{"quantity": "quantity_one", "metrics": ["count"]}], - "groups": ["entity_id"], - "intervals": ["all"], - "knowledge_date_column": "knowledge_date", - "from_obj": "data", - }, - ] - features_schema_name = "features" - - aggregations = FeatureGenerator( - db_engine=test_engine, - features_schema_name=features_schema_name, - ).aggregations( - feature_dates=["2013-09-30", "2014-09-30"], - feature_aggregation_config=aggregate_config, - state_table="states", - ) - for aggregation in aggregations: - assert isinstance(aggregation, SpacetimeAggregation) - - -def test_replace(test_engine): - # test the replace=False functionality, wherein we see if the cohort is fully represented - # in the imputed table and reuse the features if so - aggregate_config = [ - { - "prefix": "aprefix", - "aggregates_imputation": {"all": {"type": "mean"}}, - "aggregates": [{"quantity": "quantity_one", "metrics": ["sum", "count"]}], - "categoricals": [ - { - "column": "cat_one", - "choices": ["good", "bad"], - "metrics": ["sum"], - "imputation": {"all": {"type": "null_category"}}, - } - ], - "groups": ["entity_id"], - "intervals": ["all"], - "knowledge_date_column": "knowledge_date", - "from_obj": "data", - } - ] - - features_schema_name = "features" - feature_tables = FeatureGenerator( - db_engine=test_engine, - 
features_schema_name=features_schema_name, - replace=False, - ).create_all_tables( - feature_dates=["2013-09-30", "2014-09-30", "2015-01-01"], - feature_aggregation_config=aggregate_config, - state_table="states", - ) - - assert len(feature_tables) == 1 - assert list(feature_tables)[0] == "aprefix_aggregation_imputed" - - # now try and run feature generation with replace=False. We should - # be able to see that the entire cohort is there and reuse the features - feature_generator = FeatureGenerator( - db_engine=test_engine, - features_schema_name=features_schema_name, - replace=False, - ) - aggregations = feature_generator.aggregations( - feature_dates=["2013-09-30", "2014-09-30", "2015-01-01"], - feature_aggregation_config=aggregate_config, - state_table="states", - ) - table_tasks = feature_generator.generate_all_table_tasks( - aggregations, - task_type="aggregation", - ) - - assert len(table_tasks["aprefix_entity_id"]) == 0 - assert len(table_tasks["aprefix_aggregation"]) == 0 - - imp_tasks = feature_generator.generate_all_table_tasks( - aggregations, - task_type="imputation", - ) - - assert len(imp_tasks["aprefix_aggregation_imputed"]) == 0 - - # add a new member of the cohort. now we should need to rebuild everything - test_engine.execute("insert into states values (%s, %s)", 999, "2015-01-01") - table_tasks = feature_generator.generate_all_table_tasks( - aggregations, - task_type="aggregation", - ) - assert len(table_tasks["aprefix_entity_id"]) == 3 - assert len(table_tasks["aprefix_aggregation"]) == 3 - feature_generator.process_table_tasks(table_tasks) - imp_tasks = feature_generator.generate_all_table_tasks( - aggregations, - task_type="imputation", - ) - - assert len(imp_tasks["aprefix_aggregation_imputed"]) == 3 - -def test_aggregations_materialize_off(test_engine): - aggregate_config = { - "prefix": "aprefix", - "categoricals": [ - { - "column": "cat_one", - "choices": ["good", "bad"], - "metrics": ["sum"], - "imputation": {"all": {"type": "null_category"}}, - } - ], - "groups": ["entity_id", "zip_code"], - "intervals": ["all"], - "knowledge_date_column": "knowledge_date", - "from_obj": "data", - } - - feature_generator = FeatureGenerator( - db_engine=test_engine, - features_schema_name="features", - materialize_subquery_fromobjs=False - ) - - with patch("triage.component.architect.feature_generators.FromObj") as fromobj_mock: - feature_generator.aggregations([aggregate_config], "2016-01-01", "states") - assert not fromobj_mock.called - - -def test_aggregations_materialize_on(test_engine): - aggregate_config = { - "prefix": "aprefix", - "categoricals": [ - { - "column": "cat_one", - "choices": ["good", "bad"], - "metrics": ["sum"], - "imputation": {"all": {"type": "null_category"}}, - } - ], - "groups": ["entity_id", "zip_code"], - "intervals": ["all"], - "knowledge_date_column": "knowledge_date", - "from_obj": "data", - } - - feature_generator = FeatureGenerator( - db_engine=test_engine, - features_schema_name="features", - ) - - with patch("triage.component.architect.feature_generators.FromObj") as fromobj_mock: - feature_generator.aggregations([aggregate_config], "2016-01-01", "states") - fromobj_mock.assert_called_once_with( - from_obj="data", - knowledge_date_column="knowledge_date", - name="features.aprefix" - ) - - -def test_transaction_error(test_engine): - """Database connections are cleaned up regardless of in-transaction - query errors. 
- - """ - aggregate_config = [ - { - "prefix": "aprefix", - "aggregates": [ - { - "quantity": "quantity_one", - "metrics": ["sum"], - "imputation": { - "sum": {"type": "constant", "value": 137}, - "count": {"type": "zero"}, - }, - } - ], - "groups": ["entity_id"], - "intervals": ["all"], - "knowledge_date_column": "knowledge_date", - "from_obj": "data", - } - ] - - feature_generator = FeatureGenerator( - db_engine=test_engine, - features_schema_name="features", - ) - - with pytest.raises(sqlalchemy.exc.ProgrammingError): - feature_generator.create_all_tables( - feature_dates=["2013-09-30", "2014-09-30"], - feature_aggregation_config=aggregate_config, - state_table="statez", # WRONG! - ) - - ((query_count,),) = test_engine.execute( - t("""\ - select count(1) from pg_stat_activity - where datname = :datname and - query not ilike '%%pg_stat_activity%%' - """), - datname=test_engine.url.database, - ) - - assert query_count == 0 - - -class TestValidations: - - @pytest.fixture - def base_config(self): - return { - "prefix": "aprefix", - "categoricals": [ - { - "column": "cat_one", - "choices": ["good", "bad"], - "metrics": ["sum"], - "imputation": {"all": {"type": "null_category"}}, - } - ], - "groups": ["entity_id", "zip_code"], - "intervals": ["all"], - "knowledge_date_column": "knowledge_date", - "from_obj": "data", - } - - @pytest.fixture - def feature_generator(self, test_engine): - return FeatureGenerator(test_engine, "features") - - def test_correct_keys(self, base_config, feature_generator): - feature_generator.validate([base_config]) - - with pytest.raises(ValueError): - no_group = copy.deepcopy(base_config) - del no_group["groups"] - feature_generator.validate([no_group]) - - with pytest.raises(ValueError): - no_intervals = copy.deepcopy(base_config) - del no_intervals["intervals"] - feature_generator.validate([no_intervals]) - - with pytest.raises(ValueError): - no_kdate = copy.deepcopy(base_config) - del no_kdate["knowledge_date_column"] - feature_generator.validate([no_kdate]) - - with pytest.raises(ValueError): - no_from_obj = copy.deepcopy(base_config) - del no_from_obj["from_obj"] - feature_generator.validate([no_from_obj]) - - with pytest.raises(ValueError): - no_aggs = copy.deepcopy(base_config) - del no_aggs["categoricals"] - feature_generator.validate([no_aggs]) - - with pytest.raises(ValueError): - no_imps = copy.deepcopy(base_config) - del no_imps["categoricals"][0]["imputation"] - feature_generator.validate([no_imps]) - - def test_bad_from_obj(self, base_config, feature_generator): - bad_from_obj = copy.deepcopy(base_config) - bad_from_obj["from_obj"] = "where thing is other_thing" - with pytest.raises(ValueError): - feature_generator.validate([bad_from_obj]) - - def test_bad_interval(self, base_config, feature_generator): - base_config["intervals"] = ["1y", "1fortnight"] - with pytest.raises(ValueError): - feature_generator.validate([base_config]) - - def test_bad_group(self, base_config, feature_generator): - base_config["groups"] = ["zip_code", "otherthing"] - with pytest.raises(ValueError): - feature_generator.validate([base_config]) - - def test_bad_choice_query(self, base_config, feature_generator): - del base_config["categoricals"][0]["choices"] - base_config["categoricals"][0][ - "choice_query" - ] = "select distinct cat_two from data" - with pytest.raises(ValueError): - feature_generator.validate([base_config]) - - def test_wrong_imp_fcn(self, base_config, feature_generator): - del base_config["categoricals"][0]["imputation"]["all"] - 
base_config["categoricals"][0]["imputation"]["max"] = { - "type": "null_category" - } - with pytest.raises(ValueError): - feature_generator.validate([base_config]) - - def test_bad_imp_rule(self, base_config, feature_generator): - base_config["categoricals"][0]["imputation"]["all"] = { - "type": "bad_rule_doesnt_exist" - } - with pytest.raises(ValueError): - feature_generator.validate([base_config]) - - def test_no_imp_rule_type(self, base_config, feature_generator): - base_config["categoricals"][0]["imputation"]["all"] = {"value": "good"} - with pytest.raises(ValueError): - feature_generator.validate([base_config]) - - def test_missing_imp_arg(self, base_config, feature_generator): - # constant value imputation requires a 'value' parameter - base_config["categoricals"][0]["imputation"]["all"] = {"type": "constant"} - with pytest.raises(ValueError): - feature_generator.validate([base_config]) diff --git a/src/tests/architect_tests/test_integration.py b/src/tests/architect_tests/test_integration.py index cb2be100e..6182dd676 100644 --- a/src/tests/architect_tests/test_integration.py +++ b/src/tests/architect_tests/test_integration.py @@ -10,11 +10,11 @@ from triage.component.results_schema import Base from triage.component.timechop import Timechop from triage.component.architect.features import ( - FeatureGenerator, - FeatureDictionaryCreator, FeatureGroupCreator, FeatureGroupMixer, + FeatureDictionary, ) +from triage.component.architect.feature_block_generators import feature_blocks_from_config from triage.component.architect.label_generators import LabelGenerator from triage.component.architect.cohort_table_generators import CohortTableGenerator from triage.component.architect.planner import Planner @@ -170,14 +170,6 @@ def basic_integration_test( db_engine=db_engine, query=sample_config()["label_config"]["query"] ) - feature_generator = FeatureGenerator( - db_engine=db_engine, features_schema_name="features", replace=True - ) - - feature_dictionary_creator = FeatureDictionaryCreator( - db_engine=db_engine, features_schema_name="features" - ) - feature_group_creator = FeatureGroupCreator(feature_group_create_rules) feature_group_mixer = FeatureGroupMixer(feature_group_mix_rules) @@ -227,64 +219,52 @@ def basic_integration_test( label_timespans=["6months"], ) - # create feature table tasks - # we would use FeatureGenerator#create_all_tables but want to use - # the tasks dict directly to create a feature dict - aggregations = feature_generator.aggregations( - feature_aggregation_config=[ - { - "prefix": "cat", - "from_obj": "cat_complaints", - "knowledge_date_column": "as_of_date", - "aggregates": [ - { - "quantity": "cat_sightings", - "metrics": ["count", "avg"], - "imputation": {"all": {"type": "mean"}}, - } - ], - "intervals": ["1y"], - "groups": ["entity_id"], - }, - { - "prefix": "dog", - "from_obj": "dog_complaints", - "knowledge_date_column": "as_of_date", - "aggregates_imputation": { - "count": {"type": "constant", "value": 7}, - "sum": {"type": "mean"}, - "avg": {"type": "zero"}, + feature_blocks = feature_blocks_from_config( + { + 'spacetime_aggregations': [ + { + "prefix": "cat", + "from_obj": "cat_complaints", + "knowledge_date_column": "as_of_date", + "aggregates": [ + { + "quantity": "cat_sightings", + "metrics": ["count", "avg"], + "imputation": {"all": {"type": "mean"}}, + } + ], + "intervals": ["1y"], + "groups": ["entity_id"], }, - "aggregates": [ - {"quantity": "dog_sightings", "metrics": ["count", "avg"]} - ], - "intervals": ["1y"], - "groups": ["entity_id"], - }, - ], - 
feature_dates=all_as_of_times, - state_table=cohort_table_generator.cohort_table_name, - ) - feature_table_agg_tasks = feature_generator.generate_all_table_tasks( - aggregations, task_type="aggregation" - ) - - # create feature aggregation tables - feature_generator.process_table_tasks(feature_table_agg_tasks) - - feature_table_imp_tasks = feature_generator.generate_all_table_tasks( - aggregations, task_type="imputation" + { + "prefix": "dog", + "from_obj": "dog_complaints", + "knowledge_date_column": "as_of_date", + "aggregates_imputation": { + "count": {"type": "constant", "value": 7}, + "sum": {"type": "mean"}, + "avg": {"type": "zero"}, + }, + "aggregates": [ + {"quantity": "dog_sightings", "metrics": ["count", "avg"]} + + ], + "intervals": ["1y"], + "groups": ["entity_id"], + }, + ] + }, + as_of_dates=all_as_of_times, + cohort_table=cohort_table_generator.cohort_table_name, + db_engine=db_engine, + features_schema_name='features', ) - # create feature imputation tables - feature_generator.process_table_tasks(feature_table_imp_tasks) + for feature_block in feature_blocks: + feature_block.run_preimputation() + feature_block.run_imputation() - # build feature dictionaries from feature tables and - # subsetting config - master_feature_dict = feature_dictionary_creator.feature_dictionary( - feature_table_names=feature_table_imp_tasks.keys(), - index_column_lookup=feature_generator.index_column_lookup(aggregations), - ) + master_feature_dict = FeatureDictionary(feature_blocks) feature_dicts = feature_group_mixer.generate( feature_group_creator.subsets(master_feature_dict) diff --git a/src/tests/collate_tests/test_collate.py b/src/tests/collate_tests/test_collate.py index a4585f20a..eb69c599b 100755 --- a/src/tests/collate_tests/test_collate.py +++ b/src/tests/collate_tests/test_collate.py @@ -4,7 +4,9 @@ Unit tests for `collate` module. 
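For orientation, the rewritten test above compresses the old FeatureGenerator/FeatureDictionaryCreator workflow into a few calls. A minimal sketch of that flow, assuming an existing SQLAlchemy db_engine and a populated cohort table (the names here are illustrative, not part of the patch):

    from triage.component.architect.feature_block_generators import (
        feature_blocks_from_config,
    )
    from triage.component.architect.features import FeatureDictionary

    # Build FeatureBlock objects from the new 'features' config block.
    feature_blocks = feature_blocks_from_config(
        {
            "spacetime_aggregations": [{
                "prefix": "cat",
                "from_obj": "cat_complaints",
                "knowledge_date_column": "as_of_date",
                "aggregates": [{
                    "quantity": "cat_sightings",
                    "metrics": ["count", "avg"],
                    "imputation": {"all": {"type": "mean"}},
                }],
                "intervals": ["1y"],
                "groups": ["entity_id"],
            }]
        },
        as_of_dates=["2016-01-01"],
        cohort_table="features.test_cohort",
        db_engine=db_engine,
        features_schema_name="features",
    )

    # Each block now owns its own build lifecycle.
    for feature_block in feature_blocks:
        feature_block.run_preimputation()
        feature_block.run_imputation()

    # The {table: feature names} dictionary builds itself from the blocks.
    master_feature_dict = FeatureDictionary(feature_blocks)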
""" -from triage.component.collate import Aggregate, Aggregation, Categorical +import testing.postgresql +import sqlalchemy +from triage.component.collate import Aggregate, Categorical def test_aggregate(): agg = Aggregate("*", "count", {}) @@ -116,50 +118,6 @@ def test_aggregate_format_kwargs(): ) == ["min('2012-01-01' - date)"] -def test_aggregation_table_name_no_schema(): - # no schema - assert ( - Aggregation( - [], from_obj="source", groups=[], state_table="tbl" - ).get_table_name() - == '"source_aggregation"' - ) - assert ( - Aggregation([], from_obj="source", groups=[], state_table="tbl").get_table_name( - imputed=True - ) - == '"source_aggregation_imputed"' - ) - - # prefix - assert ( - Aggregation( - [], from_obj="source", prefix="mysource", groups=[], state_table="tbl" - ).get_table_name() - == '"mysource_aggregation"' - ) - assert ( - Aggregation( - [], from_obj="source", prefix="mysource", groups=[], state_table="tbl" - ).get_table_name(imputed=True) - == '"mysource_aggregation_imputed"' - ) - - # schema - assert ( - Aggregation( - [], from_obj="source", schema="schema", groups=[], state_table="tbl" - ).get_table_name() - == '"schema"."source_aggregation"' - ) - assert ( - Aggregation( - [], from_obj="source", schema="schema", groups=[], state_table="tbl" - ).get_table_name(imputed=True) - == '"schema"."source_aggregation_imputed"' - ) - - def test_distinct(): assert list(map(str, Aggregate("distinct x", "count", {}).get_columns())) == [ "count(distinct x)" diff --git a/src/tests/collate_tests/test_imputation_output.py b/src/tests/collate_tests/test_imputation_output.py index 905401abe..c0cdc45cc 100644 --- a/src/tests/collate_tests/test_imputation_output.py +++ b/src/tests/collate_tests/test_imputation_output.py @@ -120,6 +120,7 @@ def test_imputation_output(feat_list, exp_imp_cols, feat_table): feat_sql = "\n".join( [", prefix_entity_id_1y_%s_max int" % f for f in feat_list] ) + engine.execute( """create table prefix_aggregation ( entity_id int @@ -157,40 +158,21 @@ def test_imputation_output(feat_list, exp_imp_cols, feat_table): ] st = SpacetimeAggregation( aggregates=aggs, + db_engine=engine, from_obj="prefix_events", prefix="prefix", groups=["entity_id"], intervals=["1y"], - dates=["2016-01-01", "2016-02-03", "2016-03-14"], - state_table="states", - state_group="entity_id", + as_of_dates=["2016-01-01", "2016-02-03", "2016-03-14"], + cohort_table="states", + entity_column="entity_id", date_column="as_of_date", - input_min_date="2000-01-01", + feature_start_time="2000-01-01", output_date_column="as_of_date", + drop_interim_tables=False, ) - conn = engine.connect() - - trans = conn.begin() - - # excute query to find columns with null values and create lists of columns - # that do and do not need imputation when creating the imputation table - res = conn.execute(st.find_nulls()) - null_counts = list(zip(res.keys(), res.fetchone())) - impute_cols = [col for col, val in null_counts if val > 0] - nonimpute_cols = [col for col, val in null_counts if val == 0] - - # sql to drop and create the imputation table - drop_imp = st.get_drop(imputed=True) - create_imp = st.get_impute_create( - impute_cols=impute_cols, nonimpute_cols=nonimpute_cols - ) - - # create the imputation table - conn.execute(drop_imp) - conn.execute(create_imp) - - trans.commit() + st.run_imputation() # check the results df = pd.read_sql("SELECT * FROM prefix_aggregation_imputed", engine) diff --git a/src/tests/collate_tests/test_integration.py b/src/tests/collate_tests/test_integration.py deleted file mode 100755 
index fd089d944..000000000 --- a/src/tests/collate_tests/test_integration.py +++ /dev/null @@ -1,118 +0,0 @@ -# -*- coding: utf-8 -*- -"""Integration tests for `collate` module.""" -import testing.postgresql -from sqlalchemy import create_engine -from sqlalchemy.sql import expression as ex - -from triage.component.collate import Aggregation, Aggregate -from triage.component.collate.spacetime import SpacetimeAggregation - -from . import initialize_db - - -IMPUTE_RULES = { - "coltype": "aggregate", - "count": {"type": "mean"}, - "mode": {"type": "mean"}, -} - -Postgresql = testing.postgresql.PostgresqlFactory( - cache_initialized_db=True, on_initialized=initialize_db.handler -) - - -def teardown_module(): - Postgresql.clear_cache() - - -def test_engine(): - with Postgresql() as postgresql: - engine = create_engine(postgresql.url()) - ((result,),) = engine.execute("SELECT COUNT(*) FROM food_inspections") - assert result == 966 - - -def test_st_explicit_execute(): - agg = Aggregate({"F": "results='Fail'"}, ["count"], IMPUTE_RULES) - mode = Aggregate("", "mode", IMPUTE_RULES, order="zip") - st = SpacetimeAggregation( - [agg, agg + agg, mode], - from_obj=ex.table("food_inspections"), - groups={"license": ex.column("license_no"), "zip": ex.column("zip")}, - intervals={"license": ["1 year", "2 years", "all"], "zip": ["1 year"]}, - dates=["2016-08-30", "2015-11-06"], - state_table="inspection_states", - state_group="license_no", - date_column="inspection_date", - prefix="food_inspections", - ) - with Postgresql() as postgresql: - engine = create_engine(postgresql.url()) - st.execute(engine.connect()) - - -def test_st_lazy_execute(): - agg = Aggregate("results='Fail'", ["count"], IMPUTE_RULES) - st = SpacetimeAggregation( - [agg], - from_obj="food_inspections", - groups=["license_no", "zip"], - intervals={"license_no": ["1 year", "2 years", "all"], "zip": ["1 year"]}, - dates=["2016-08-30", "2015-11-06"], - state_table="inspection_states", - state_group="license_no", - date_column='"inspection_date"', - ) - with Postgresql() as postgresql: - engine = create_engine(postgresql.url()) - st.execute(engine.connect()) - - -def test_st_execute_broadcast_intervals(): - agg = Aggregate("results='Fail'", ["count"], IMPUTE_RULES) - st = SpacetimeAggregation( - [agg], - from_obj="food_inspections", - groups=["license_no", "zip"], - intervals=["1 year", "2 years", "all"], - dates=["2016-08-30", "2015-11-06"], - state_table="inspection_states", - state_group="license_no", - date_column='"inspection_date"', - ) - with Postgresql() as postgresql: - engine = create_engine(postgresql.url()) - st.execute(engine.connect()) - - -def test_execute(): - agg = Aggregate("results='Fail'", ["count"], IMPUTE_RULES) - st = Aggregation( - [agg], - from_obj="food_inspections", - groups=["license_no", "zip"], - state_table="all_licenses", - state_group="license_no", - ) - with Postgresql() as postgresql: - engine = create_engine(postgresql.url()) - st.execute(engine.connect()) - - -def test_execute_schema_output_date_column(): - agg = Aggregate("results='Fail'", ["count"], IMPUTE_RULES) - st = SpacetimeAggregation( - [agg], - from_obj="food_inspections", - groups=["license_no", "zip"], - intervals={"license_no": ["1 year", "2 years", "all"], "zip": ["1 year"]}, - dates=["2016-08-30", "2015-11-06"], - state_table="inspection_states_diff_colname", - state_group="license_no", - schema="agg", - date_column='"inspection_date"', - output_date_column="aggregation_date", - ) - with Postgresql() as postgresql: - engine = 
create_engine(postgresql.url()) - st.execute(engine.connect()) diff --git a/src/tests/collate_tests/test_spacetime.py b/src/tests/collate_tests/test_spacetime.py index 0be1e9f17..c7e975a02 100755 --- a/src/tests/collate_tests/test_spacetime.py +++ b/src/tests/collate_tests/test_spacetime.py @@ -69,17 +69,19 @@ def test_basic_spacetime(): from_obj="events", groups=["entity_id"], intervals=["1y", "2y", "all"], - dates=["2016-01-01", "2015-01-01"], - state_table="states", - state_group="entity_id", + as_of_dates=["2016-01-01", "2015-01-01"], + features_schema_name="schema", + cohort_table="states", + entity_column="entity_id", date_column="event_date", output_date_column="as_of_date", + db_engine=engine, + drop_interim_tables=False, ) - - st.execute(engine.connect()) - + engine.execute(st.get_create_schema()) + st.run_preimputation() r = engine.execute( - "select * from events_entity_id order by entity_id, as_of_date" + "select * from schema.events_entity_id order by entity_id, as_of_date" ) rows = [x for x in r] assert rows[0]["entity_id"] == 1 @@ -144,9 +146,10 @@ def test_basic_spacetime(): assert rows[6]["events_entity_id_all_outcome::int_avg"] == 0 assert len(rows) == 7 + st.run_imputation() # check some imputation results r = engine.execute( - "select * from events_aggregation_imputed order by entity_id, as_of_date" + "select * from schema.events_aggregation_imputed order by entity_id, as_of_date" ) rows = [x for x in r] assert rows[6]["entity_id"] == 4 @@ -185,8 +188,23 @@ def test_basic_spacetime(): assert rows[7]["events_entity_id_all_outcome::int_avg_imp"] == 0 assert len(rows) == 8 - -def test_input_min_date(): + assert st.feature_columns == { + "events_entity_id_1y_outcome::int_sum", + "events_entity_id_1y_outcome::int_sum_imp", + "events_entity_id_1y_outcome::int_avg", + "events_entity_id_1y_outcome::int_avg_imp", + "events_entity_id_2y_outcome::int_sum", + "events_entity_id_2y_outcome::int_sum_imp", + "events_entity_id_2y_outcome::int_avg", + "events_entity_id_2y_outcome::int_avg_imp", + "events_entity_id_all_outcome::int_sum", + "events_entity_id_all_outcome::int_sum_imp", + "events_entity_id_all_outcome::int_avg", + "events_entity_id_all_outcome::int_avg_imp" + } + + +def test_feature_start_time(): with testing.postgresql.Postgresql() as psql: engine = sqlalchemy.create_engine(psql.url()) engine.execute("create table events (entity_id int, date date, outcome bool)") @@ -212,14 +230,16 @@ def test_input_min_date(): from_obj="events", groups=["entity_id"], intervals=["all"], - dates=["2016-01-01"], - state_table="states", - state_group="entity_id", + as_of_dates=["2016-01-01"], + cohort_table="states", + entity_column="entity_id", date_column='"date"', - input_min_date="2015-11-10", + feature_start_time="2015-11-10", + db_engine=engine, + drop_interim_tables=False, ) - st.execute(engine.connect()) + st.run_preimputation() r = engine.execute("select * from events_entity_id order by entity_id") rows = [x for x in r] @@ -240,19 +260,18 @@ def test_input_min_date(): from_obj="events", groups=["entity_id"], intervals=["1y", "all"], - dates=["2016-01-01", "2015-01-01"], - state_table="states", - state_group="entity_id", + as_of_dates=["2016-01-01", "2015-01-01"], + cohort_table="states", + entity_column="entity_id", date_column='"date"', - input_min_date="2014-11-10", + feature_start_time="2014-11-10", + db_engine=engine ) with pytest.raises(ValueError): st.validate(engine.connect()) - with pytest.raises(ValueError): - st.execute(engine.connect()) -def 
test_join_with_cohort_table(db_engine): +def test_features_ignore_cohort(db_engine): # if we specify joining with the cohort table # only entity_id/date pairs in the cohort table should show up db_engine.execute("create table events (entity_id int, date date, outcome bool)") @@ -271,7 +290,7 @@ def test_join_with_cohort_table(db_engine): for state in smaller_cohort: db_engine.execute("insert into cohort values (%s, %s)", state) - # create our test aggregation with the important 'join_with_cohort_table' flag + # create our test aggregation with the important 'features_ignore_cohort' flag agg = Aggregate( "outcome::int", ["sum", "avg"], @@ -287,14 +306,16 @@ def test_join_with_cohort_table(db_engine): from_obj="events", groups=["entity_id"], intervals=["all"], - dates=["2016-01-01", "2015-01-01"], - state_table="cohort", - state_group="entity_id", + as_of_dates=["2016-01-01", "2015-01-01"], + cohort_table="cohort", + entity_column="entity_id", date_column='"date"', - join_with_cohort_table=True, + features_ignore_cohort=False, + db_engine=db_engine, + drop_interim_tables=False, ) - st.execute(db_engine.connect()) + st.run_preimputation() r = db_engine.execute("select * from events_entity_id order by entity_id, date") rows = [x for x in r] @@ -320,3 +341,70 @@ def test_join_with_cohort_table(db_engine): assert rows[3]["date"] == date(2016, 1, 1) assert rows[3]["events_entity_id_all_outcome::int_sum"] == 1 assert rows[3]["events_entity_id_all_outcome::int_avg"] == 0.5 + + +def test_aggregation_table_name_no_schema(): + # no schema + assert ( + SpacetimeAggregation( + [], from_obj="source", groups=[], cohort_table="tbl", db_engine=None, as_of_dates=[], + ).get_table_name() + == '"source_aggregation"' + ) + assert ( + SpacetimeAggregation([], from_obj="source", groups=[], cohort_table="tbl", db_engine=None, as_of_dates=[]).get_table_name( + imputed=True + ) + == '"source_aggregation_imputed"' + ) + + # prefix + assert ( + SpacetimeAggregation( + [], from_obj="source", prefix="mysource", groups=[], cohort_table="tbl", db_engine=None, as_of_dates=[], + ).get_table_name() + == '"mysource_aggregation"' + ) + assert ( + SpacetimeAggregation( + [], from_obj="source", prefix="mysource", groups=[], cohort_table="tbl", db_engine=None, as_of_dates=[], + ).get_table_name(imputed=True) + == '"mysource_aggregation_imputed"' + ) + + # schema + assert ( + SpacetimeAggregation( + [], from_obj="source", features_schema_name="schema", groups=[], cohort_table="tbl", db_engine=None, as_of_dates=[], + ).get_table_name() + == '"schema"."source_aggregation"' + ) + assert ( + SpacetimeAggregation( + [], from_obj="source", features_schema_name="schema", groups=[], cohort_table="tbl", db_engine=None, as_of_dates=[], + ).get_table_name(imputed=True) + == '"schema"."source_aggregation_imputed"' + ) + + +def test_get_feature_columns(): + with testing.postgresql.Postgresql() as psql: + db_engine = sqlalchemy.create_engine(psql.url()) + n = Aggregate("x", "sum", {}) + d = Aggregate("1", "count", {}) + m = Aggregate("y", "avg", {}) + assert SpacetimeAggregation( + aggregates=[n, d, m], + from_obj="source", + features_schema_name="schema", + prefix="prefix", + groups=["entity_id"], + cohort_table="tbl", + db_engine=db_engine, + as_of_dates=[], + ).feature_columns == set([ + "prefix_entity_id_all_x_sum", + "prefix_entity_id_all_1_count", + "prefix_entity_id_all_y_avg" + ]) + diff --git a/src/tests/test_partial_experiments.py b/src/tests/test_partial_experiments.py index b40c71e4d..c753c3157 100644 --- 
a/src/tests/test_partial_experiments.py +++ b/src/tests/test_partial_experiments.py @@ -105,7 +105,7 @@ def test_validate_strict(self): class PreimputationFeatures(TestCase): config = { "temporal_config": sample_config()["temporal_config"], - "feature_aggregations": sample_config()["feature_aggregations"], + "features": sample_config()["features"], "config_version": sample_config()["config_version"], } @@ -120,7 +120,7 @@ def test_run(self): if "_aggregation" in table ] - assert len(generated_tables) == len(sample_config()["feature_aggregations"]) + assert len(generated_tables) == len(sample_config()["features"]["spacetime_aggregations"]) for table in generated_tables: table_should_have_data(table, experiment.db_engine) @@ -137,7 +137,7 @@ def test_validate_strict(self): class PostimputationFeatures(TestCase): config = { "temporal_config": sample_config()["temporal_config"], - "feature_aggregations": sample_config()["feature_aggregations"], + "features": sample_config()["features"], "cohort_config": sample_config()["cohort_config"], "config_version": sample_config()["config_version"], } @@ -153,7 +153,7 @@ def test_run(self): if "_aggregation_imputed" in table ] - assert len(generated_tables) == len(sample_config()["feature_aggregations"]) + assert len(generated_tables) == len(sample_config()["features"]["spacetime_aggregations"]) for table in generated_tables: table_should_have_data(table, experiment.db_engine) @@ -170,7 +170,7 @@ def test_validate_strict(self): class Matrices(TestCase): config = { "temporal_config": sample_config()["temporal_config"], - "feature_aggregations": sample_config()["feature_aggregations"], + "features": sample_config()["features"], "cohort_config": sample_config()["cohort_config"], "label_config": sample_config()["label_config"], "config_version": sample_config()["config_version"], diff --git a/src/tests/test_utils_db.py b/src/tests/test_utils_db.py new file mode 100644 index 000000000..c9976d3d9 --- /dev/null +++ b/src/tests/test_utils_db.py @@ -0,0 +1,23 @@ +from triage.util.db import run_statements +import pytest +import sqlalchemy +from sqlalchemy import text as t + + +def test_run_statements(db_engine): + """Test that database connections are cleaned up regardless of in-transaction + query errors. 
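The triage.util.db.run_statements helper covered by this new test takes over the connection handling that the deleted FeatureGenerator test exercised: it appears to run a list of statements in a single transaction and releases its connection even when one of them fails mid-transaction. A small usage sketch, with hypothetical statements:

    from triage.util.db import run_statements

    # If any statement raises, the batch is rolled back and the
    # connection is cleaned up (the pg_stat_activity check below
    # verifies that no connection is left behind).
    run_statements(
        [
            "drop table if exists my_features",
            "create table my_features (entity_id int)",
            "insert into my_features values (1)",
        ],
        db_engine,
    )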
+ """ + with pytest.raises(sqlalchemy.exc.ProgrammingError): + run_statements(['insert into blah'], db_engine) + + ((query_count,),) = db_engine.execute( + t("""\ + select count(1) from pg_stat_activity + where datname = :datname and + query not ilike '%%pg_stat_activity%%' + """), + datname=db_engine.url.database, + ) + + assert query_count == 0 diff --git a/src/tests/utils.py b/src/tests/utils.py index 6cc07c053..5313ff8f4 100644 --- a/src/tests/utils.py +++ b/src/tests/utils.py @@ -355,7 +355,7 @@ def sample_config(): } } - feature_config = [ + spacetime_agg_config = [ { "prefix": "entity_features", "from_obj": "cat_complaints", @@ -401,7 +401,7 @@ def sample_config(): "entity_column_name": "entity_id", "model_comment": "test2-final-final", "model_group_keys": ["label_name", "label_type", "custom_key"], - "feature_aggregations": feature_config, + "features": {"spacetime_aggregations": spacetime_agg_config}, "cohort_config": cohort_config, "temporal_config": temporal_config, "grid_config": grid_config, diff --git a/src/triage/cli.py b/src/triage/cli.py index bee32086e..242f7920d 100755 --- a/src/triage/cli.py +++ b/src/triage/cli.py @@ -10,7 +10,7 @@ from argcmdr import RootCommand, Command, main, cmdmethod from sqlalchemy.engine.url import URL -from triage.component.architect.feature_generators import FeatureGenerator +from triage.component.architect.feature_block_generators import feature_blocks_from_config from triage.component.architect.cohort_table_generators import CohortTableGenerator from triage.component.audition import AuditionRunner from triage.component.results_schema import upgrade_db, stamp_db, REVISION_MAPPING @@ -112,7 +112,7 @@ def __call__(self, args): self.root.setup() # Loading configuration (if exists) db_engine = create_engine(self.root.db_url) full_config = yaml.load(args.feature_config_file) - feature_config = full_config['feature_aggregations'] + feature_config = full_config['features'] cohort_config = full_config.get('cohort_config', None) if cohort_config: CohortTableGenerator( @@ -122,11 +122,18 @@ def __call__(self, args): replace=True ).generate_cohort_table(as_of_dates=[args.as_of_date]) - FeatureGenerator(db_engine, "features_test").create_features_before_imputation( - feature_aggregation_config=feature_config, - feature_dates=[args.as_of_date], - state_table="features_test.test_cohort" + feature_blocks = feature_blocks_from_config( + feature_config, + as_of_dates=[args.as_of_date], + cohort_table="features_test.test_cohort" if cohort_config else None, + db_engine=db_engine, + features_schema_name="features_test", + materialize_subquery_fromobjs=False, + features_ignore_cohort=bool(cohort_config), ) + for feature_block in feature_blocks: + feature_block.run_preimputation(verbose=True) + logging.info( "Features created for feature_config %s and date %s", feature_config, diff --git a/src/triage/component/architect/README.md b/src/triage/component/architect/README.md index f268b9936..72ef47a40 100644 --- a/src/triage/component/architect/README.md +++ b/src/triage/component/architect/README.md @@ -13,7 +13,7 @@ The Architect addresses these issues with functionality aimed at all tasks betwe ## Components - [LabelGenerator](architect/label_generators.py): Create binary labels suitable for a design matrix by querying a database table containing outcome events. -- [FeatureGenerator](architect/feature_generators.py): Create aggregate features suitable for a design matrix from a set of database tables containing events. 
Uses [collate](https://github.com/dssg/collate/) to build aggregation SQL queries. +- [FeatureBlockGenerator](architect/feature_block_generators.py): Create features suitable for a design matrix from a set of database tables containing events. Uses [collate](https://github.com/dssg/collate/) to build aggregation SQL queries. - [FeatureGroupCreator](architect/feature_group_creator.py), [FeatureGroupMixer](architect/feature_group_mixer.py): Create groupings of features, and mix them using different strategies (like 'leave one out') to test their effectiveness. - [Planner](architect/planner.py), [Builder](architect/builders.py): Build all design matrices needed for an experiment, taking into account different labels, state configurations, and feature groups. diff --git a/src/triage/component/architect/builders.py b/src/triage/component/architect/builders.py index 1fd05916e..f781dac17 100644 --- a/src/triage/component/architect/builders.py +++ b/src/triage/component/architect/builders.py @@ -436,7 +436,7 @@ def load_features_data( table=entity_date_table_name, ), # collate imputation shouldn't leave any nulls and we double-check - # the imputed table in FeatureGenerator.create_all_tables() but as + # the imputed table in FeatureBlock.run_imputation() but as # a final check, raise a divide by zero error on export if the # database encounters any during the outer join right_column_selections=[', "{0}"'.format(fn) for fn in feature_names], diff --git a/src/triage/component/architect/database_reflection.py b/src/triage/component/architect/database_reflection.py index f56697379..7d993f94d 100644 --- a/src/triage/component/architect/database_reflection.py +++ b/src/triage/component/architect/database_reflection.py @@ -98,6 +98,20 @@ def table_has_column(table_name, column, db_engine): return column in reflected_table(table_name, db_engine).columns +def table_columns(table_name, db_engine): + """Retrieve a list of columns. + + The table is expected to exist. + + Args: + table_name (string) A table name (with schema) + db_engine (sqlalchemy.engine) + + Returns: (list) Every column currently in the table + """ + return reflected_table(table_name, db_engine).columns + + def column_type(table_name, column, db_engine): """Find the database type of the given column in the given table diff --git a/src/triage/component/architect/feature_block.py b/src/triage/component/architect/feature_block.py new file mode 100644 index 000000000..bd4e00941 --- /dev/null +++ b/src/triage/component/architect/feature_block.py @@ -0,0 +1,235 @@ +from abc import ABC, abstractmethod + +import logging +import sqlparse +from triage.database_reflection import table_exists, table_columns +from triage.util.db import run_statements + + +class FeatureBlock(ABC): + def __init__( + self, + db_engine, + cohort_table, + as_of_dates, + features_schema_name=None, + feature_start_time=None, + features_ignore_cohort=False, + ): + self.db_engine = db_engine + self.cohort_table_name = cohort_table + self.as_of_dates = as_of_dates + self.features_schema_name = features_schema_name + self.feature_start_time = feature_start_time + self.features_ignore_cohort = features_ignore_cohort + + @property + @abstractmethod + def final_feature_table_name(self): + "The name of the final table with all features filled in (no missing values)" + pass + + @property + @abstractmethod + def feature_columns(self): + """ + The list of feature columns in the final, postimputation table + + Should exclude any index columns (e.g. 
entity id, date) + """ + pass + + @property + @abstractmethod + def preinsert_queries(self): + """ + Return all queries that should be run before inserting any data. + + Returns a list of queries/executable statements + """ + pass + + @property + @abstractmethod + def insert_queries(self): + """ + Return all inserts to populate this data. Each query in this list should be parallelizable. + + Returns a list of queries/executable statements + """ + pass + + @property + @abstractmethod + def postinsert_queries(self): + """ + Return all queries that should be run after inserting all data + + Returns a list of queries/executable statements + """ + pass + + @property + @abstractmethod + def imputation_queries(self): + """ + Return all queries that should be run to fill in missing data with imputed values. + + Returns a list of queries/executable statements + """ + pass + + def _cohort_table_sub(self): + """Helper function to ensure we only include state table records + in our set of input dates and after the feature_start_time. + """ + datestr = ", ".join(["'%s'::date" % dt for dt in self.as_of_dates]) + mindtstr = ( + " AND as_of_date >= '%s'::date" % (self.feature_start_time,) + if self.feature_start_time is not None + else "" + ) + return """( + SELECT * + FROM {st} + WHERE as_of_date IN ({datestr}) + {mindtstr})""".format( + st=self.cohort_table_name, + datestr=datestr, + mindtstr=mindtstr, + ) + + def verify_no_nulls(self): + """ + Verify that there are no nulls remaining in the imputed table + + Should raise an error if there are any. + """ + + query_template = """ + SELECT {cols} + FROM {state_tbl} t1 + LEFT JOIN {aggs_tbl} t2 USING(entity_id, as_of_date) + """ + cols_sql = ",\n".join( + [ + """SUM(CASE WHEN "{col}" IS NULL THEN 1 ELSE 0 END) AS "{col}" """.format( + col=column.name + ) + for column in table_columns(self.final_feature_table_name, self.db_engine) + ] + ) + + results = self.db_engine.execute(query_template.format( + cols=cols_sql, + state_tbl=self._cohort_table_sub(), + aggs_tbl=self.final_feature_table_name, + )) + null_counts = results.first().items() + nullcols = [col for (col, val) in null_counts if val > 0] + + if len(nullcols) > 0: + raise ValueError( + "Imputation failed for {} columns. 
Null values remain in: {}".format( + len(nullcols), nullcols + ) + ) + + def needs_features(self): + imputed_table = self.final_feature_table_name + + if table_exists(imputed_table, self.db_engine): + check_query = ( + f"select 1 from {self.cohort_table_name} " + f"left join {imputed_table} " + "using (entity_id, as_of_date) " + f"where {imputed_table}.entity_id is null limit 1" + ) + if self.db_engine.execute(check_query).scalar(): + logging.warning( + "Imputed feature table %s did not contain rows from the " + "entire cohort, need to rebuild features", imputed_table) + return True + else: + logging.warning("Imputed feature table %s did not exist, " + "need to build features", imputed_table) + return True + logging.warning("Imputed feature table %s looks good, " + "skipping feature building!", imputed_table) + return False + + def generate_preimpute_tasks(self, replace): + if not replace and not self.needs_features(): + return {} + return { + "prepare": self.preinsert_queries, + "inserts": self.insert_queries, + "finalize": self.postinsert_queries + } + + def generate_impute_tasks(self, replace): + if not replace and not self.needs_features(): + return {} + return { + "prepare": self.imputation_queries, + "inserts": [], + "finalize": [] + } + + def process_table_task(self, task, verbose=False): + if verbose: + self.log_verbose_task_info(task) + run_statements(task.get("prepare", []), self.db_engine) + run_statements(task.get("inserts", []), self.db_engine) + run_statements(task.get("finalize", []), self.db_engine) + + def run_preimputation(self, verbose=False): + self.process_table_task(self.generate_preimpute_tasks(replace=True), verbose=verbose) + + def run_imputation(self, verbose=False): + self.process_table_task(self.generate_impute_tasks(replace=True), verbose=verbose) + self.verify_no_nulls() + + def log_verbose_task_info(self, task): + prepares = task.get("prepare", []) + inserts = task.get("inserts", []) + finalize = task.get("finalize", []) + logging.info("------------------") + logging.info( + "%s prepare queries, %s insert queries, %s finalize queries", + len(prepares), + len(inserts), + len(finalize), + ) + logging.info("------------------") + logging.info("") + logging.info("------------------") + logging.info("PREPARATION QUERIES") + logging.info("------------------") + for query_num, query in enumerate(prepares, 1): + logging.info("") + logging.info( + "prepare query %s: %s", + query_num, + sqlparse.format(str(query), reindent=True), + ) + logging.info("------------------") + logging.info("INSERT QUERIES") + logging.info("------------------") + for query_num, query in enumerate(inserts, 1): + logging.info("") + logging.info( + "insert query %s: %s", + query_num, + sqlparse.format(str(query), reindent=True), + ) + logging.info("------------------") + logging.info("FINALIZE QUERIES") + logging.info("------------------") + for query_num, query in enumerate(finalize, 1): + logging.info("") + logging.info( + "finalize query %s: %s", + query_num, + sqlparse.format(str(query), reindent=True), + ) diff --git a/src/triage/component/architect/feature_block_generators.py b/src/triage/component/architect/feature_block_generators.py new file mode 100644 index 000000000..fd9b2995c --- /dev/null +++ b/src/triage/component/architect/feature_block_generators.py @@ -0,0 +1,291 @@ +import logging + +from triage.component.collate import ( + Aggregate, + Categorical, + Compare, + SpacetimeAggregation, + FromObj +) + + +def generate_spacetime_aggregations( + feature_aggregation_config, + 
as_of_dates, + cohort_table, + db_engine, + features_schema_name, + feature_start_time=None, + materialize_subquery_fromobjs=True, + features_ignore_cohort=False, +): + """Creates collate.SpacetimeAggregations from the given arguments + + Args: + feature_aggregation_config (list) all values, except for feature + date, necessary to instantiate a collate.SpacetimeAggregation + as_of_dates (list) dates to generate features as of + cohort_table (string) schema.table_name for state table with all entity/date pairs + db_engine (sqlalchemy.db.engine) + features_schema_name (string) Name of schema where feature + tables should be written to + feature_start_time (string/datetime, optional) point in time before which + data should not be included in features + materialize_subquery_fromobjs (boolean, optional) Whether or not to inspect from_obj + values and create persistent tables out of ones that look like subqueries, for the + purposes of making runs on many as-of-dates faster + features_ignore_cohort (boolean, optional) Whether or not features should be built + independently of the cohort. Takes longer but means that features can be reused + for different cohorts. + + Returns: (list) collate.SpacetimeAggregations + """ + if not cohort_table: + logging.warning("No cohort table passed. Imputation will not be possible.") + features_ignore_cohort = True + + return SpacetimeAggregationGenerator( + db_engine=db_engine, + features_schema_name=features_schema_name, + feature_start_time=feature_start_time, + materialize_subquery_fromobjs=materialize_subquery_fromobjs, + features_ignore_cohort=features_ignore_cohort, + ).aggregations( + feature_aggregation_config, + as_of_dates, + cohort_table + ) + + +class SpacetimeAggregationGenerator(object): + def __init__( + self, + db_engine, + features_schema_name, + feature_start_time=None, + materialize_subquery_fromobjs=True, + features_ignore_cohort=False, + ): + """Generates aggregate features using collate + + Args: + db_engine (sqlalchemy.db.engine) + features_schema_name (string) Name of schema where feature + tables should be written to + feature_start_time (string/datetime, optional) point in time before which + data should not be included in features + materialize_subquery_fromobjs (boolean, optional) Whether or not to inspect from_obj + values and create persistent tables out of ones that look like subqueries, for the + purposes of making runs on many as-of-dates faster + features_ignore_cohort (boolean, optional) Whether or not features should be built + independently of the cohort. Takes longer but means that features can be reused + for different cohorts.
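Putting the signature and docstring above together, a hypothetical direct call looks like the following, assuming `engine` and a "dog_complaints" table exist:

    from triage.component.architect.feature_block_generators import (
        generate_spacetime_aggregations,
    )

    aggregations = generate_spacetime_aggregations(
        [{
            "prefix": "dog",
            "from_obj": "dog_complaints",
            "knowledge_date_column": "as_of_date",
            "aggregates_imputation": {"all": {"type": "zero"}},
            "aggregates": [{"quantity": "dog_sightings", "metrics": ["count"]}],
            "intervals": ["1y"],
            "groups": ["entity_id"],
        }],
        as_of_dates=["2016-01-01"],
        cohort_table="features.test_cohort",
        db_engine=engine,
        features_schema_name="features",
        feature_start_time="2010-01-01",
    )

    # Each returned SpacetimeAggregation is a FeatureBlock and runs itself.
    for aggregation in aggregations:
        aggregation.run_preimputation()
        aggregation.run_imputation()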
+ """ + self.db_engine = db_engine + self.features_schema_name = features_schema_name + self.categorical_cache = {} + self.feature_start_time = feature_start_time + self.materialize_subquery_fromobjs = materialize_subquery_fromobjs + self.features_ignore_cohort = features_ignore_cohort + self.entity_id_column = "entity_id" + self.from_objs = {} + + def _compute_choices(self, choice_query): + if choice_query not in self.categorical_cache: + with self.db_engine.begin() as conn: + self.categorical_cache[choice_query] = [ + row[0] for row in conn.execute(choice_query) + ] + + logging.info( + "Computed list of categoricals: %s for choice query: %s", + self.categorical_cache[choice_query], + choice_query, + ) + + return self.categorical_cache[choice_query] + + def _build_choices(self, categorical): + logging.info( + "Building categorical choices for column %s, metrics %s", + categorical["column"], + categorical["metrics"], + ) + if "choices" in categorical: + logging.info("Found list of configured choices: %s", categorical["choices"]) + return categorical["choices"] + else: + return self._compute_choices(categorical["choice_query"]) + + def _build_categoricals(self, categorical_config, impute_rules): + # TODO: only include null flag where necessary + return [ + Categorical( + col=categorical["column"], + choices=self._build_choices(categorical), + function=categorical["metrics"], + impute_rules=dict( + impute_rules, + coltype="categorical", + **categorical.get("imputation", {}) + ), + include_null=True, + coltype=categorical.get('coltype', None), + ) + for categorical in categorical_config + ] + + def _build_array_categoricals(self, categorical_config, impute_rules): + # TODO: only include null flag where necessary + return [ + Compare( + col=categorical["column"], + op="@>", + choices={ + choice: "array['{}'::varchar]".format(choice) + for choice in self._build_choices(categorical) + }, + function=categorical["metrics"], + impute_rules=dict( + impute_rules, + coltype="array_categorical", + **categorical.get("imputation", {}) + ), + op_in_name=False, + quote_choices=False, + include_null=True, + coltype=categorical.get('coltype', None) + ) + for categorical in categorical_config + ] + + def _aggregation(self, aggregation_config, feature_dates, state_table): + logging.info( + "Building collate.SpacetimeAggregation for config %s and %s as_of_dates", + aggregation_config, + len(feature_dates), + ) + + # read top-level imputation rules from the aggregation config; we'll allow + # these to be overridden by imputation rules at the individual feature + # level as those get parsed as well + agimp = aggregation_config.get("aggregates_imputation", {}) + catimp = aggregation_config.get("categoricals_imputation", {}) + arrcatimp = aggregation_config.get("array_categoricals_imputation", {}) + + aggregates = [ + Aggregate( + aggregate["quantity"], + aggregate["metrics"], + dict(agimp, coltype="aggregate", **aggregate.get("imputation", {})), + coltype=aggregate.get('coltype', None) + ) + for aggregate in aggregation_config.get("aggregates", []) + ] + logging.info("Found %s quantity aggregates", len(aggregates)) + categoricals = self._build_categoricals( + aggregation_config.get("categoricals", []), catimp + ) + logging.info("Found %s categorical aggregates", len(categoricals)) + array_categoricals = self._build_array_categoricals( + aggregation_config.get("array_categoricals", []), arrcatimp + ) + logging.info("Found %s array categorical aggregates", len(array_categoricals)) + return SpacetimeAggregation( + 
aggregates + categoricals + array_categoricals, + from_obj=aggregation_config["from_obj"], + intervals=aggregation_config["intervals"], + groups=aggregation_config["groups"], + as_of_dates=feature_dates, + cohort_table=state_table, + entity_column=self.entity_id_column, + date_column=aggregation_config["knowledge_date_column"], + output_date_column="as_of_date", + db_engine=self.db_engine, + feature_start_time=self.feature_start_time, + features_schema_name=self.features_schema_name, + prefix=aggregation_config["prefix"], + features_ignore_cohort=self.features_ignore_cohort + ) + + def aggregations(self, feature_aggregation_config, feature_dates, state_table): + return [ + self.preprocess_aggregation( + self._aggregation(aggregation_config, feature_dates, state_table) + ) + for aggregation_config in feature_aggregation_config + ] + + def preprocess_aggregation(self, aggregation): + create_schema = aggregation.get_create_schema() + + if create_schema is not None: + with self.db_engine.begin() as conn: + conn.execute(create_schema) + + if self.materialize_subquery_fromobjs: + # materialize from obj + from_obj = FromObj( + from_obj=aggregation.from_obj.text, + name=f"{aggregation.features_schema_name}.{aggregation.prefix}", + knowledge_date_column=aggregation.date_column + ) + from_obj.maybe_materialize(self.db_engine) + aggregation.from_obj = from_obj.table + return aggregation + + +FEATURE_BLOCK_GENERATOR_LOOKUP = { + 'spacetime_aggregations': generate_spacetime_aggregations +} + + +def feature_blocks_from_config( + config, + as_of_dates, + cohort_table, + db_engine, + features_schema_name, + feature_start_time=None, + features_ignore_cohort=False, + **kwargs +): + """ + Create a list of feature blocks from a block of configuration + Args: + config (dict) feature config, consisting of: + a key corresponding to a known feature generator (in FEATURE_BLOCK_GENERATOR_LOOKUP) + a value corresponding to any config needed for that feature generator + as_of_dates (list) dates to generate features as of + cohort_table (string) schema.table_name for cohort table with all entity/date pairs + db_engine (sqlalchemy.db.engine) + features_schema_name (string) Name of schema where feature + tables should be written to + feature_start_time (string/datetime, optional) point in time before which + data should not be included in features + features_ignore_cohort (boolean, optional) Whether or not features should be built + independently of the cohort. Takes longer but means that features can be reused + for different cohorts. + + Returns: (list) of FeatureBlock objects + """ + feature_blocks = [] + for config_key, config_value in config.items(): + feature_block_generator = FEATURE_BLOCK_GENERATOR_LOOKUP.get(config_key, None) + if not feature_block_generator: + raise ValueError(f"feature config key {config_key} does not correspond to a recognized" + " feature generator.
Recognized feature generator keys: " + f"{FEATURE_BLOCK_GENERATOR_LOOKUP.keys()}") + + for feature_block in feature_block_generator( + config_value, + as_of_dates=as_of_dates, + cohort_table=cohort_table, + db_engine=db_engine, + features_schema_name=features_schema_name, + feature_start_time=feature_start_time, + features_ignore_cohort=features_ignore_cohort, + **kwargs + ): + feature_blocks.append(feature_block) + return feature_blocks diff --git a/src/triage/component/architect/feature_dictionary.py b/src/triage/component/architect/feature_dictionary.py new file mode 100644 index 000000000..5bc29bcbc --- /dev/null +++ b/src/triage/component/architect/feature_dictionary.py @@ -0,0 +1,64 @@ +from triage.component.architect.utils import remove_schema_from_table_name +from triage.util.structs import FeatureNameList +from collections.abc import Iterable +from collections.abc import MutableMapping + + +class FeatureDictionary(MutableMapping): + """A feature dictionary, consisting of table names as keys and column names as values + + If a list of feature_blocks is passed, will initialize the feature dictionary with their data. + """ + def __init__(self, feature_blocks=None, *args, **kwargs): + self.tables = dict() + self.update(dict(*args, **kwargs)) # use the free update to set keys + for feature_block in (feature_blocks or []): + cleaned_table = remove_schema_from_table_name( + feature_block.final_feature_table_name + ) + self[cleaned_table] = feature_block.feature_columns + + def __getitem__(self, key): + return FeatureNameList(self.tables[key]) + + def __setitem__(self, table, feature_names): + if not isinstance(table, str): + raise ValueError("key of FeatureDictionary objects represents a table " + "name and must be a string") + if not isinstance(feature_names, Iterable): + raise ValueError("value of FeatureDictionary objects represents a list of " + "feature names, and therefore must be iterable") + for feature_name in feature_names: + if not isinstance(feature_name, str): + raise ValueError(f"invalid value: {feature_name}. 
" + f"invalid type: {type(feature_name)} " + "The value of FeatureDictionary objects represents a list of " + "feature names, and therefore each item must be a string") + self.tables[table] = feature_names + + def __delitem__(self, key): + del self.tables[key] + + def __iter__(self): + return iter(self.tables) + + def __len__(self): + return len(self.tables) + + def __sub__(self, other): + not_in_other = FeatureDictionary() + for table_name, feature_list in self.items(): + if table_name not in other: + not_in_other[table_name] = feature_list + continue + missing_feature_names = [ + feature_name + for feature_name in feature_list + if feature_name not in other[table_name] + ] + if missing_feature_names: + not_in_other[table_name] = missing_feature_names + return not_in_other + + def __repr__(self): + return str(self.tables) diff --git a/src/triage/component/architect/feature_dictionary_creator.py b/src/triage/component/architect/feature_dictionary_creator.py deleted file mode 100644 index 4d0082001..000000000 --- a/src/triage/component/architect/feature_dictionary_creator.py +++ /dev/null @@ -1,69 +0,0 @@ -import logging -from triage.component.architect.utils import str_in_sql -from triage.util.structs import FeatureNameList - - -class FeatureDictionaryCreator(object): - def __init__(self, features_schema_name, db_engine): - self.features_schema_name = features_schema_name - self.db_engine = db_engine - - def _tables_to_include(self, feature_table_names): - return [ - feature_table - for feature_table in feature_table_names - if "aggregation_imputed" in feature_table - ] - - def feature_dictionary(self, feature_table_names, index_column_lookup): - """ Create a dictionary of feature names, where keys are feature tables - and values are lists of feature names. - - :return: feature_dictionary - :rtype: dict - """ - feature_dictionary = {} - - # iterate! store each table name + features names as key-value pair - for feature_table_name in self._tables_to_include(feature_table_names): - feature_names = [ - row[0] - for row in self.db_engine.execute( - self._build_feature_names_query( - feature_table_name, index_column_lookup[feature_table_name] - ) - ) - ] - feature_dictionary[feature_table_name] = FeatureNameList(feature_names) - logging.info("Feature dictionary built: %s", feature_dictionary) - return feature_dictionary - - def _build_feature_names_query(self, table_name, index_columns): - """ For a given feature table, get the names of the feature columns. 
- - :param table_name: name of the feature table - :type table_name: str - - :return: names of the feature columns in given table - :rtype: list - """ - # format the query that gets column names, - # excluding indices from result - feature_names_query = """ - SELECT column_name - FROM information_schema.columns - WHERE table_name = '{table}' AND - table_schema = '{schema}' AND - column_name NOT IN ({index_columns}) - """.format( - table=table_name, - schema=self.features_schema_name, - index_columns=str_in_sql(index_columns), - ) - logging.info( - "Extracting all possible feature names for table %s with query %s", - table_name, - feature_names_query, - ) - - return feature_names_query diff --git a/src/triage/component/architect/feature_generators.py b/src/triage/component/architect/feature_generators.py deleted file mode 100644 index d7c386d42..000000000 --- a/src/triage/component/architect/feature_generators.py +++ /dev/null @@ -1,715 +0,0 @@ -import logging -from collections import OrderedDict - -import sqlalchemy -import sqlparse - -from triage.util.conf import convert_str_to_relativedelta -from triage.database_reflection import table_exists - -from triage.component.collate import ( - Aggregate, - Categorical, - Compare, - SpacetimeAggregation, - FromObj -) - - -class FeatureGenerator(object): - def __init__( - self, - db_engine, - features_schema_name, - replace=True, - feature_start_time=None, - materialize_subquery_fromobjs=True, - features_ignore_cohort=False, - ): - """Generates aggregate features using collate - - Args: - db_engine (sqlalchemy.db.engine) - features_schema_name (string) Name of schema where feature - tables should be written to - replace (boolean, optional) Whether or not existing features - should be replaced - feature_start_time (string/datetime, optional) point in time before which - should not be included in features - features_ignore_cohort (boolean, optional) Whether or not features should be built - independently of the cohort. Takes longer but means that features can be reused - for different cohorts. - """ - self.db_engine = db_engine - self.features_schema_name = features_schema_name - self.categorical_cache = {} - self.replace = replace - self.feature_start_time = feature_start_time - self.materialize_subquery_fromobjs = materialize_subquery_fromobjs - self.features_ignore_cohort = features_ignore_cohort - self.entity_id_column = "entity_id" - self.from_objs = {} - - def _validate_keys(self, aggregation_config): - for key in [ - "from_obj", - "intervals", - "groups", - "knowledge_date_column", - "prefix", - ]: - if key not in aggregation_config: - raise ValueError( - "{} required as key: aggregation config: {}".format( - key, aggregation_config - ) - ) - - def _validate_aggregates(self, aggregation_config): - if ( - "aggregates" not in aggregation_config - and "categoricals" not in aggregation_config - and "array_categoricals" not in aggregation_config - ): - raise ValueError( - "Need either aggregates, categoricals, or array_categoricals" - + " in {}".format(aggregation_config) - ) - - def _validate_categoricals(self, categoricals): - for categorical in categoricals: - if "choice_query" in categorical: - logging.info("Validating choice query") - - try: - with self.db_engine.begin() as conn: - conn.execute("explain {}".format(categorical["choice_query"])) - except Exception as exc: - raise ValueError( - "choice query does not run. 
\n" - 'choice query: "{}"\n' - "Full error: {}".format(categorical["choice_query"], exc) - ) - - def _validate_from_obj(self, from_obj): - logging.info("Validating from_obj") - try: - with self.db_engine.begin() as conn: - conn.execute("explain select * from {}".format(from_obj)) - except Exception as exc: - raise ValueError( - "from_obj query does not run. \n" - 'from_obj: "{}"\n' - "Full error: {}".format(from_obj, exc) - ) - - def _validate_time_intervals(self, intervals): - logging.info("Validating time intervals") - for interval in intervals: - if interval != "all": - convert_str_to_relativedelta(interval) - - def _validate_groups(self, groups): - if "entity_id" not in groups: - raise ValueError( - "One of the aggregation groups is required to be entity_id" - ) - - def _validate_imputation_rule(self, aggregate_type, impute_rule): - """Validate the imputation rule for a given aggregation type.""" - # dictionary of imputation type : required parameters - valid_imputations = { - "all": { - "mean": [], - "constant": ["value"], - "zero": [], - "zero_noflag": [], - "error": [], - }, - "aggregates": {"binary_mode": []}, - "categoricals": {"null_category": []}, - } - valid_imputations["array_categoricals"] = valid_imputations["categoricals"] - - # the valid imputation rules for the specific aggregation type being checked - valid_types = dict( - valid_imputations["all"], **valid_imputations[aggregate_type] - ) - - # no imputation rule was specified - if "type" not in impute_rule.keys(): - raise ValueError("Imputation type must be specified") - - # a rule was specified, but not valid for this type of aggregate - if impute_rule["type"] not in valid_types.keys(): - raise ValueError( - "Invalid imputation type %s for %s" - % (impute_rule["type"], aggregate_type) - ) - - # check that all required parameters exist in the keys of the imputation rule - required_params = valid_types[impute_rule["type"]] - for param in required_params: - if param not in impute_rule.keys(): - raise ValueError( - "Missing param %s for %s" % (param, impute_rule["type"]) - ) - - def _validate_imputations(self, aggregation_config): - """Validate the imputation rules in an aggregation config, looping - through all three types of aggregates. Most of the work here is - done by _validate_imputation_rule() to check the requirements of - each imputation rule found - """ - agg_types = ["aggregates", "categoricals", "array_categoricals"] - - for agg_type in agg_types: - # base_imp are the top-level rules, `such as aggregates_imputation` - base_imp = aggregation_config.get(agg_type + "_imputation", {}) - - # loop through the individual aggregates - for agg in aggregation_config.get(agg_type, []): - # combine any aggregate-level imputation rules with top-level ones - imp_dict = dict(base_imp, **agg.get("imputation", {})) - - # imputation rules are metric-specific, so check each metric's rule - for metric in agg["metrics"]: - # metric rules may be defined by the metric name (e.g., 'max') - # or with the 'all' catch-all, with named metrics taking - # precedence. If we fall back to {}, the rule validator will - # error out on no metric found. 
- impute_rule = imp_dict.get(metric, imp_dict.get("all", {})) - self._validate_imputation_rule(agg_type, impute_rule) - - def _validate_aggregation(self, aggregation_config): - logging.info("Validating aggregation config %s", aggregation_config) - self._validate_keys(aggregation_config) - self._validate_aggregates(aggregation_config) - self._validate_categoricals(aggregation_config.get("categoricals", [])) - self._validate_from_obj(aggregation_config["from_obj"]) - self._validate_time_intervals(aggregation_config["intervals"]) - self._validate_groups(aggregation_config["groups"]) - self._validate_imputations(aggregation_config) - - def validate(self, feature_aggregation_config): - """Validate a feature aggregation config applied to this object - - The validations range from basic type checks, key presence checks, - as well as validating the sql in from objects. - - Args: - feature_aggregation_config (list) all values, except for feature - date, necessary to instantiate a collate.SpacetimeAggregation - - Raises: ValueError if any part of the config is found to be invalid - """ - for aggregation in feature_aggregation_config: - self._validate_aggregation(aggregation) - - def _compute_choices(self, choice_query): - if choice_query not in self.categorical_cache: - with self.db_engine.begin() as conn: - self.categorical_cache[choice_query] = [ - row[0] for row in conn.execute(choice_query) - ] - - logging.info( - "Computed list of categoricals: %s for choice query: %s", - self.categorical_cache[choice_query], - choice_query, - ) - - return self.categorical_cache[choice_query] - - def _build_choices(self, categorical): - logging.info( - "Building categorical choices for column %s, metrics %s", - categorical["column"], - categorical["metrics"], - ) - if "choices" in categorical: - logging.info("Found list of configured choices: %s", categorical["choices"]) - return categorical["choices"] - else: - return self._compute_choices(categorical["choice_query"]) - - def _build_categoricals(self, categorical_config, impute_rules): - # TODO: only include null flag where necessary - return [ - Categorical( - col=categorical["column"], - choices=self._build_choices(categorical), - function=categorical["metrics"], - impute_rules=dict( - impute_rules, - coltype="categorical", - **categorical.get("imputation", {}) - ), - include_null=True, - coltype=categorical.get('coltype', None), - ) - for categorical in categorical_config - ] - - def _build_array_categoricals(self, categorical_config, impute_rules): - # TODO: only include null flag where necessary - return [ - Compare( - col=categorical["column"], - op="@>", - choices={ - choice: "array['{}'::varchar]".format(choice) - for choice in self._build_choices(categorical) - }, - function=categorical["metrics"], - impute_rules=dict( - impute_rules, - coltype="array_categorical", - **categorical.get("imputation", {}) - ), - op_in_name=False, - quote_choices=False, - include_null=True, - coltype=categorical.get('coltype', None) - ) - for categorical in categorical_config - ] - - def _aggregation(self, aggregation_config, feature_dates, state_table): - logging.info( - "Building collate.SpacetimeAggregation for config %s and %s as_of_dates", - aggregation_config, - len(feature_dates), - ) - - # read top-level imputation rules from the aggregation config; we'll allow - # these to be overridden by imputation rules at the individual feature - # level as those get parsed as well - agimp = aggregation_config.get("aggregates_imputation", {}) - catimp = 
aggregation_config.get("categoricals_imputation", {}) - arrcatimp = aggregation_config.get("array_categoricals_imputation", {}) - - aggregates = [ - Aggregate( - aggregate["quantity"], - aggregate["metrics"], - dict(agimp, coltype="aggregate", **aggregate.get("imputation", {})), - coltype=aggregate.get('coltype', None) - ) - for aggregate in aggregation_config.get("aggregates", []) - ] - logging.info("Found %s quantity aggregates", len(aggregates)) - categoricals = self._build_categoricals( - aggregation_config.get("categoricals", []), catimp - ) - logging.info("Found %s categorical aggregates", len(categoricals)) - array_categoricals = self._build_array_categoricals( - aggregation_config.get("array_categoricals", []), arrcatimp - ) - logging.info("Found %s array categorical aggregates", len(array_categoricals)) - return SpacetimeAggregation( - aggregates + categoricals + array_categoricals, - from_obj=aggregation_config["from_obj"], - intervals=aggregation_config["intervals"], - groups=aggregation_config["groups"], - dates=feature_dates, - state_table=state_table, - state_group=self.entity_id_column, - date_column=aggregation_config["knowledge_date_column"], - output_date_column="as_of_date", - input_min_date=self.feature_start_time, - schema=self.features_schema_name, - prefix=aggregation_config["prefix"], - join_with_cohort_table=not self.features_ignore_cohort - ) - - def aggregations(self, feature_aggregation_config, feature_dates, state_table): - """Creates collate.SpacetimeAggregations from the given arguments - - Args: - feature_aggregation_config (list) all values, except for feature - date, necessary to instantiate a collate.SpacetimeAggregation - feature_dates (list) dates to generate features as of - state_table (string) schema.table_name for state table with all entity/date pairs - - Returns: (list) collate.SpacetimeAggregations - """ - return [ - self.preprocess_aggregation( - self._aggregation(aggregation_config, feature_dates, state_table) - ) - for aggregation_config in feature_aggregation_config - ] - - def preprocess_aggregation(self, aggregation): - create_schema = aggregation.get_create_schema() - - if create_schema is not None: - with self.db_engine.begin() as conn: - conn.execute(create_schema) - - if self.materialize_subquery_fromobjs: - # materialize from obj - from_obj = FromObj( - from_obj=aggregation.from_obj.text, - name=f"{aggregation.schema}.{aggregation.prefix}", - knowledge_date_column=aggregation.date_column - ) - from_obj.maybe_materialize(self.db_engine) - aggregation.from_obj = from_obj.table - return aggregation - - def generate_all_table_tasks(self, aggregations, task_type): - """Generates SQL commands for creating, populating, and indexing - feature group tables - - Args: - aggregations (list) collate.SpacetimeAggregation objects - type (str) either 'aggregation' or 'imputation' - - Returns: (dict) keys are group table names, values are themselves dicts, - each with keys for different stages of table creation (prepare, inserts, finalize) - and with values being lists of SQL commands - """ - - logging.debug("---------------------") - - # pick the method to use for generating tasks depending on whether we're - # building the aggregations or imputations - if task_type == "aggregation": - task_generator = self._generate_agg_table_tasks_for - logging.debug("---------FEATURE GENERATION------------") - elif task_type == "imputation": - task_generator = self._generate_imp_table_tasks_for - logging.debug("---------FEATURE IMPUTATION------------") - else: - 
raise ValueError("Table task type must be aggregation or imputation") - - logging.debug("---------------------") - - table_tasks = OrderedDict() - for aggregation in aggregations: - table_tasks.update(task_generator(aggregation)) - logging.info("Created %s tables", len(table_tasks.keys())) - return table_tasks - - def create_features_before_imputation( - self, feature_aggregation_config, feature_dates, state_table=None - ): - """Create features before imputation for a set of dates""" - all_tasks = self.generate_all_table_tasks( - self.aggregations( - feature_aggregation_config, feature_dates, state_table=state_table - ), - task_type="aggregation", - ) - logging.info("Generated a total of %s table tasks", len(all_tasks)) - for task_num, task in enumerate(all_tasks.values(), 1): - prepares = task.get("prepare", []) - inserts = task.get("inserts", []) - finalize = task.get("finalize", []) - logging.info("------------------") - logging.info("TASK %s ", task_num) - logging.info( - "%s prepare queries, %s insert queries, %s finalize queries", - len(prepares), - len(inserts), - len(finalize), - ) - logging.info("------------------") - logging.info("") - logging.info("------------------") - logging.info("PREPARATION QUERIES") - logging.info("------------------") - for query_num, query in enumerate(prepares, 1): - logging.info("") - logging.info( - "prepare query %s: %s", - query_num, - sqlparse.format(str(query), reindent=True), - ) - logging.info("------------------") - logging.info("INSERT QUERIES") - logging.info("------------------") - for query_num, query in enumerate(inserts, 1): - logging.info("") - logging.info( - "insert query %s: %s", - query_num, - sqlparse.format(str(query), reindent=True), - ) - logging.info("------------------") - logging.info("FINALIZE QUERIES") - logging.info("------------------") - for query_num, query in enumerate(finalize, 1): - logging.info("") - logging.info( - "finalize query %s: %s", - query_num, - sqlparse.format(str(query), reindent=True), - ) - self.process_table_task(task) - - def create_all_tables(self, feature_aggregation_config, feature_dates, state_table): - """Create all feature tables. - - First builds the aggregation tables, and then performs - imputation on any null values, (requiring a two-step process to - determine which columns contain nulls after the initial - aggregation tables are built). 
- - Args: - feature_aggregation_config (list) all values, except for - feature date, necessary to instantiate a - `collate.SpacetimeAggregation` - feature_dates (list) dates to generate features as of - state_table (string) schema.table_name for state table with - all entity/date pairs - - Returns: (list) table names - - """ - aggs = self.aggregations(feature_aggregation_config, feature_dates, state_table) - - # first, generate and run table tasks for aggregations - table_tasks_aggregate = self.generate_all_table_tasks( - aggs, task_type="aggregation" - ) - self.process_table_tasks(table_tasks_aggregate) - - # second, perform the imputations (this will query the tables - # constructed above to identify features containing nulls) - table_tasks_impute = self.generate_all_table_tasks(aggs, task_type="imputation") - impute_keys = self.process_table_tasks(table_tasks_impute) - - # double-check that the imputation worked and no nulls remain - # in the data: - nullcols = [] - with self.db_engine.begin() as conn: - for agg in aggs: - results = conn.execute(agg.find_nulls(imputed=True)) - null_counts = results.first().items() - nullcols += [col for (col, val) in null_counts if val > 0] - - if len(nullcols) > 0: - raise ValueError( - "Imputation failed for {} columns. Null values remain in: {}".format( - len(nullcols), nullcols - ) - ) - - return impute_keys - - def process_table_task(self, task): - self.run_commands(task.get("prepare", [])) - self.run_commands(task.get("inserts", [])) - self.run_commands(task.get("finalize", [])) - - def process_table_tasks(self, table_tasks): - for table_name, task in table_tasks.items(): - logging.info("Running feature table queries for %s", table_name) - self.process_table_task(task) - return table_tasks.keys() - - def _explain_selects(self, aggregations): - with self.db_engine.begin() as conn: - for aggregation in aggregations: - for selectlist in aggregation.get_selects().values(): - for select in selectlist: - query = "explain " + str(select) - results = list(conn.execute(query)) - logging.debug(str(select)) - logging.debug(results) - - def _clean_table_name(self, table_name): - # remove the schema and quotes from the name - return table_name.split(".")[1].replace('"', "") - - def _table_exists(self, table_name): - try: - with self.db_engine.begin() as conn: - conn.execute( - "select 1 from {}.{} limit 1".format( - self.features_schema_name, table_name - ) - ).first() - except sqlalchemy.exc.ProgrammingError: - return False - else: - return True - - def run_commands(self, command_list): - with self.db_engine.begin() as conn: - for command in command_list: - logging.debug("Executing feature generation query: %s", command) - conn.execute(command) - - def _aggregation_index_query(self, aggregation, imputed=False): - return "CREATE INDEX ON {} ({}, {})".format( - aggregation.get_table_name(imputed=imputed), - self.entity_id_column, - aggregation.output_date_column, - ) - - def _aggregation_index_columns(self, aggregation): - return sorted( - [group for group in aggregation.groups.keys()] - + [aggregation.output_date_column] - ) - - def index_column_lookup(self, aggregations, imputed=True): - return dict( - ( - self._clean_table_name(aggregation.get_table_name(imputed=imputed)), - self._aggregation_index_columns(aggregation), - ) - for aggregation in aggregations - ) - - def _needs_features(self, aggregation): - imputed_table = self._clean_table_name( - aggregation.get_table_name(imputed=True) - ) - - if self._table_exists(imputed_table): - check_query = ( - 
f"select 1 from {aggregation.state_table} " - f"left join {self.features_schema_name}.{imputed_table} " - "using (entity_id, as_of_date) " - f"where {self.features_schema_name}.{imputed_table}.entity_id is null limit 1" - ) - if self.db_engine.execute(check_query).scalar(): - logging.warning( - "Imputed feature table %s did not contain rows from the " - "entire cohort, need to rebuild features", imputed_table) - return True - else: - logging.warning("Imputed feature table %s did not exist, " - "need to build features", imputed_table) - return True - logging.warning("Imputed feature table %s looks good, " - "skipping feature building!", imputed_table) - return False - - def _generate_agg_table_tasks_for(self, aggregation): - """Generates SQL commands for preparing, populating, and finalizing - each feature group table in the given aggregation - - Args: - aggregation (collate.SpacetimeAggregation) - - Returns: (dict) of structure { - 'prepare': list of commands to prepare table for population - 'inserts': list of commands to populate table - 'finalize': list of commands to finalize table after population - } - """ - creates = aggregation.get_creates() - drops = aggregation.get_drops() - indexes = aggregation.get_indexes() - inserts = aggregation.get_inserts() - table_tasks = OrderedDict() - for group in aggregation.groups: - group_table = self._clean_table_name( - aggregation.get_table_name(group=group) - ) - if self.replace or self._needs_features(aggregation): - table_tasks[group_table] = { - "prepare": [drops[group], creates[group]], - "inserts": inserts[group], - "finalize": [indexes[group]], - } - logging.info("Created table tasks for %s", group_table) - else: - logging.info("Skipping feature table creation for %s", group_table) - table_tasks[group_table] = {} - logging.info("Created table tasks for aggregation") - if self.replace or self._needs_features(aggregation): - table_tasks[self._clean_table_name(aggregation.get_table_name())] = { - "prepare": [aggregation.get_drop(), aggregation.get_create()], - "inserts": [], - "finalize": [self._aggregation_index_query(aggregation)], - } - else: - table_tasks[self._clean_table_name(aggregation.get_table_name())] = {} - - return table_tasks - - def _generate_imp_table_tasks_for(self, aggregation, drop_preagg=True): - """Generate SQL statements for preparing, populating, and - finalizing imputations, for each feature group table in the - given aggregation. - - Requires the existance of the underlying feature and aggregation - tables defined in `_generate_agg_table_tasks_for()`. 
-
-        Args:
-            aggregation (collate.SpacetimeAggregation)
-            drop_preagg: boolean to specify dropping pre-imputation
-                tables
-
-        Returns: (dict) of structure {
-            'prepare': list of commands to prepare table for population
-            'inserts': list of commands to populate table
-            'finalize': list of commands to finalize table after population
-        }
-
-        """
-        table_tasks = OrderedDict()
-        imp_tbl_name = self._clean_table_name(aggregation.get_table_name(imputed=True))
-
-        if not self.replace and not self._needs_features(aggregation):
-            logging.warning("Skipping imputation table creation for %s", imp_tbl_name)
-            table_tasks[imp_tbl_name] = {}
-            return table_tasks
-
-        if not aggregation.state_table:
-            logging.warning(
-                "No state table defined in aggregation, cannot create imputation table for %s",
-                imp_tbl_name,
-            )
-            table_tasks[imp_tbl_name] = {}
-            return table_tasks
-
-        if not table_exists(aggregation.state_table, self.db_engine):
-            logging.warning(
-                "State table %s does not exist, cannot create imputation table for %s",
-                aggregation.state_table,
-                imp_tbl_name,
-            )
-            table_tasks[imp_tbl_name] = {}
-            return table_tasks
-
-        # excute query to find columns with null values and create lists of columns
-        # that do and do not need imputation when creating the imputation table
-        with self.db_engine.begin() as conn:
-            results = conn.execute(aggregation.find_nulls())
-            null_counts = results.first().items()
-        impute_cols = [col for (col, val) in null_counts if val > 0]
-        nonimpute_cols = [col for (col, val) in null_counts if val == 0]
-
-        # table tasks for imputed aggregation table, most of the work is done here
-        # by collate's get_impute_create()
-        table_tasks[imp_tbl_name] = {
-            "prepare": [
-                aggregation.get_drop(imputed=True),
-                aggregation.get_impute_create(
-                    impute_cols=impute_cols, nonimpute_cols=nonimpute_cols
-                ),
-            ],
-            "inserts": [],
-            "finalize": [self._aggregation_index_query(aggregation, imputed=True)],
-        }
-        logging.info("Created table tasks for imputation: %s", imp_tbl_name)
-
-        # do some cleanup:
-        # drop the group-level and aggregation tables, just leaving the
-        # imputation table if drop_preagg=True
-        if drop_preagg:
-            drops = aggregation.get_drops()
-            table_tasks[imp_tbl_name]["finalize"] += list(drops.values()) + [
-                aggregation.get_drop()
-            ]
-            logging.info("Added drop table cleanup tasks: %s", imp_tbl_name)
-
-        return table_tasks
diff --git a/src/triage/component/architect/feature_query_runners.py b/src/triage/component/architect/feature_query_runners.py
new file mode 100644
index 000000000..25891c613
--- /dev/null
+++ b/src/triage/component/architect/feature_query_runners.py
@@ -0,0 +1,141 @@
+import logging
+
+import sqlparse
+
+from triage.database_reflection import table_exists
+
+
+def run_statements(statement_list, db_engine):
+    with db_engine.begin() as conn:
+        for statement in statement_list:
+            logging.debug("Executing feature generation query: %s", statement)
+            conn.execute(statement)
+
+
+def process_table_task(task, db_engine):
+    run_statements(task.get("prepare", []), db_engine)
+    run_statements(task.get("inserts", []), db_engine)
+    run_statements(task.get("finalize", []), db_engine)
+
+
+def process_table_tasks(table_tasks, db_engine, verbose=False):
+    for task_num, task in enumerate(table_tasks, 1):
+        if verbose:
+            log_verbose_task_info(task, task_num)
+        process_table_task(task, db_engine)
+
+
+def needs_features(feature_block, db_engine):
+    imputed_table = feature_block.get_final_feature_table_name()
+
+    if table_exists(imputed_table, db_engine):
+        check_query = (
f"select 1 from {feature_block.cohort_table} " + f"left join {imputed_table} " + "using (entity_id, as_of_date) " + f"where {imputed_table}.entity_id is null limit 1" + ) + if db_engine.execute(check_query).scalar(): + logging.warning( + "Imputed feature table %s did not contain rows from the " + "entire cohort, need to rebuild features", imputed_table) + return True + else: + logging.warning("Imputed feature table %s did not exist, " + "need to build features", imputed_table) + return True + logging.warning("Imputed feature table %s looks good, " + "skipping feature building!", imputed_table) + return False + + +def generate_preimpute_tasks(feature_blocks, replace, db_engine): + table_tasks = [] + for block in feature_blocks: + if replace or needs_features(block, db_engine): + table_tasks.append({ + "prepare": block.get_preinsert_queries(), + "inserts": block.get_inserts(), + "finalize": block.get_postinsert_queries() + }) + logging.info("Generated tasks to create %s feature block tables", len(table_tasks)) + else: + logging.info("Skipping feature table creation for %s", block) + return table_tasks + + +def generate_impute_tasks(feature_blocks, replace, db_engine): + table_tasks = [] + for block in feature_blocks: + if replace or needs_features(block, db_engine): + table_tasks.append({ + "prepare": block.get_impute_queries(), + "inserts": [], + "finalize": [] + }) + logging.info("Generated tasks to create %s feature block tables", len(table_tasks)) + else: + logging.info("Skipping feature table creation for %s", block) + return table_tasks + + +def create_all_tables(feature_blocks, replace, db_engine): + """Create all feature tables. + + First builds the aggregation tables, and then performs + imputation on any null values, (requiring a two-step process to + determine which columns contain nulls after the initial + aggregation tables are built). 
+ """ + process_table_tasks(generate_preimpute_tasks(feature_blocks, replace, db_engine)) + process_table_tasks(generate_impute_tasks(feature_blocks, replace, db_engine)) + + # perform a sanity check that no nulls were left after imputation + for feature_block in feature_blocks: + feature_block.verify_no_nulls() + + +def log_verbose_task_info(task, task_num): + prepares = task.get("prepare", []) + inserts = task.get("inserts", []) + finalize = task.get("finalize", []) + logging.info("------------------") + logging.info("TASK %s ", task_num) + logging.info( + "%s prepare queries, %s insert queries, %s finalize queries", + len(prepares), + len(inserts), + len(finalize), + ) + logging.info("------------------") + logging.info("") + logging.info("------------------") + logging.info("PREPARATION QUERIES") + logging.info("------------------") + for query_num, query in enumerate(prepares, 1): + logging.info("") + logging.info( + "prepare query %s: %s", + query_num, + sqlparse.format(str(query), reindent=True), + ) + logging.info("------------------") + logging.info("INSERT QUERIES") + logging.info("------------------") + for query_num, query in enumerate(inserts, 1): + logging.info("") + logging.info( + "insert query %s: %s", + query_num, + sqlparse.format(str(query), reindent=True), + ) + logging.info("------------------") + logging.info("FINALIZE QUERIES") + logging.info("------------------") + for query_num, query in enumerate(finalize, 1): + logging.info("") + logging.info( + "finalize query %s: %s", + query_num, + sqlparse.format(str(query), reindent=True), + ) diff --git a/src/triage/component/architect/features.py b/src/triage/component/architect/features.py index d297d9b4d..e843b83a7 100644 --- a/src/triage/component/architect/features.py +++ b/src/triage/component/architect/features.py @@ -1,13 +1,10 @@ -from triage.component.architect.feature_generators import FeatureGenerator -from triage.component.architect.feature_dictionary_creator import ( - FeatureDictionaryCreator, -) +from triage.component.architect.feature_dictionary import FeatureDictionary from triage.component.architect.feature_group_creator import FeatureGroupCreator from triage.component.architect.feature_group_mixer import FeatureGroupMixer + __all__ = ( - "FeatureGenerator", - "FeatureDictionaryCreator", + "FeatureDictionary", "FeatureGroupCreator", "FeatureGroupMixer", ) diff --git a/src/triage/component/architect/utils.py b/src/triage/component/architect/utils.py index d20337bfd..3d38a1ae8 100644 --- a/src/triage/component/architect/utils.py +++ b/src/triage/component/architect/utils.py @@ -39,6 +39,11 @@ def feature_list(feature_dictionary): )) +def remove_schema_from_table_name(table_name): + # remove the schema and quotes from the name + return table_name.split(".")[1].replace('"', "") + + def convert_string_column_to_date(column): return [datetime.datetime.strptime(date, "%Y-%m-%d").date() for date in column] diff --git a/src/triage/component/collate/__init__.py b/src/triage/component/collate/__init__.py index 9bc977b84..33055b035 100644 --- a/src/triage/component/collate/__init__.py +++ b/src/triage/component/collate/__init__.py @@ -1,11 +1,10 @@ # -*- coding: utf-8 -*- -from .collate import available_imputations, Aggregation, Aggregate, Compare, Categorical +from .collate import Aggregate, Compare, Categorical from .from_obj import FromObj -from .spacetime import SpacetimeAggregation +from .spacetime import SpacetimeAggregation, available_imputations __all__ = [ "available_imputations", - "Aggregation", 
"Aggregate", "FromObj", "Compare", diff --git a/src/triage/component/collate/collate.py b/src/triage/component/collate/collate.py index ecc47a81e..725b688c0 100644 --- a/src/triage/component/collate/collate.py +++ b/src/triage/component/collate/collate.py @@ -1,29 +1,13 @@ # -*- coding: utf-8 -*- from numbers import Number -from itertools import product, chain +from itertools import product import sqlalchemy.sql.expression as ex import re +import logging +from triage.database_reflection import table_exists +from triage.component.architect.utils import remove_schema_from_table_name from .sql import make_sql_clause, to_sql_name, CreateTableAs, InsertFromSelect -from .imputations import ( - ImputeMean, - ImputeConstant, - ImputeZero, - ImputeZeroNoFlag, - ImputeNullCategory, - ImputeBinaryMode, - ImputeError, -) - -available_imputations = { - "mean": ImputeMean, - "constant": ImputeConstant, - "zero": ImputeZero, - "zero_noflag": ImputeZeroNoFlag, - "null_category": ImputeNullCategory, - "binary_mode": ImputeBinaryMode, - "error": ImputeError, -} def make_list(a): @@ -432,364 +416,3 @@ def __init__( op_in_name=op_in_name, **kwargs ) - - -class Aggregation(object): - def __init__( - self, - aggregates, - groups, - from_obj, - state_table, - state_group=None, - prefix=None, - suffix=None, - schema=None, - ): - """ - Args: - aggregates: collection of Aggregate objects. - from_obj: defines the from clause, e.g. the name of the table. can use - groups: a list of expressions to group by in the aggregation or a dictionary - pairs group: expr pairs where group is the alias (used in column names) - state_table: schema.table to query for comprehensive set of state_group entities - regardless of what exists in the from_obj - state_group: the group level found in the state table (e.g., "entity_id") - prefix: prefix for aggregation tables and column names, defaults to from_obj - suffix: suffix for aggregation table, defaults to "aggregation" - schema: schema for aggregation tables - - The from_obj and group expressions are passed directly to the - SQLAlchemy Select object so could be anything supported there. - For details see: - http://docs.sqlalchemy.org/en/latest/core/selectable.html - - Aggregates will have {collate_date} in their quantities substituted with the date - of aggregation. 
- """ - self.aggregates = aggregates - self.from_obj = make_sql_clause(from_obj, ex.text) - self.groups = ( - groups if isinstance(groups, dict) else {str(g): g for g in groups} - ) - self.state_table = state_table - self.state_group = state_group if state_group else "entity_id" - self.prefix = prefix if prefix else str(from_obj) - self.suffix = suffix if suffix else "aggregation" - self.schema = schema - - def _get_aggregates_sql(self, group): - """ - Helper for getting aggregates sql - Args: - group: group clause, for naming columns - Returns: collection of aggregate column SQL strings - """ - prefix = "{prefix}_{group}_".format(prefix=self.prefix, group=group) - - return chain(*[a.get_columns(prefix=prefix) for a in self.aggregates]) - - def get_selects(self): - """ - Constructs select queries for this aggregation - - Returns: a dictionary of group : queries pairs where - group are the same keys as groups - queries is a list of Select queries, one for each date in dates - """ - queries = {} - - for group, groupby in self.groups.items(): - columns = [groupby] - columns += self._get_aggregates_sql(group) - - gb_clause = make_sql_clause(groupby, ex.literal_column) - query = ex.select(columns=columns, from_obj=self.from_obj).group_by( - gb_clause - ) - - queries[group] = [query] - - return queries - - def get_imputation_rules(self): - """ - Constructs a dictionary to lookup an imputation rule from an associated - column name. - - Returns: a dictionary of column : imputation_rule pairs - """ - imprules = {} - for group, groupby in self.groups.items(): - prefix = "{prefix}_{group}_".format(prefix=self.prefix, group=group) - for a in self.aggregates: - imprules.update(a.column_imputation_lookup(prefix=prefix)) - return imprules - - def get_table_name(self, group=None, imputed=False): - """ - Returns name for table for the given group - """ - if group is None and not imputed: - name = '"%s_%s"' % (self.prefix, self.suffix) - elif group is None and imputed: - name = '"%s_%s_%s"' % (self.prefix, self.suffix, "imputed") - elif imputed: - name = '"%s"' % to_sql_name("%s_%s_%s" % (self.prefix, group, "imputed")) - else: - name = '"%s"' % to_sql_name("%s_%s" % (self.prefix, group)) - schema = '"%s".' 
% self.schema if self.schema else "" - return "%s%s" % (schema, name) - - def get_creates(self): - """ - Construct create queries for this aggregation - Args: - selects: the dictionary of select queries to use - if None, use self.get_selects() - this allows you to customize select queries before creation - - Returns: - a dictionary of group : create pairs where - group are the same keys as groups - create is a CreateTableAs object - """ - return { - group: CreateTableAs(self.get_table_name(group), next(iter(sels)).limit(0)) - for group, sels in self.get_selects().items() - } - - def get_inserts(self): - """ - Construct insert queries from this aggregation - Args: - selects: the dictionary of select queries to use - if None, use self.get_selects() - this allows you to customize select queries before creation - - Returns: - a dictionary of group : inserts pairs where - group are the same keys as groups - inserts is a list of InsertFromSelect objects - """ - return { - group: [InsertFromSelect(self.get_table_name(group), sel) for sel in sels] - for group, sels in self.get_selects().items() - } - - def get_drops(self): - """ - Generate drop queries for this aggregation - - Returns: a dictionary of group : drop pairs where - group are the same keys as groups - drop is a raw drop table query for the corresponding table - """ - return { - group: "DROP TABLE IF EXISTS %s;" % self.get_table_name(group) - for group in self.groups - } - - def get_indexes(self): - """ - Generate create index queries for this aggregation - - Returns: a dictionary of group : index pairs where - group are the same keys as groups - index is a raw create index query for the corresponding table - """ - return { - group: "CREATE INDEX ON %s (%s);" % (self.get_table_name(group), groupby) - for group, groupby in self.groups.items() - } - - def get_join_table(self): - """ - Generate a query for a join table - """ - return ex.Select(columns=self.groups.values(), from_obj=self.from_obj).group_by( - *self.groups.values() - ) - - def get_create(self, join_table=None): - """ - Generate a single aggregation table creation query by joining - together the results of get_creates() - Returns: a CREATE TABLE AS query - """ - if not join_table: - join_table = "(%s) t1" % self.get_join_table() - - query = "SELECT * FROM %s\n" % join_table - for group, groupby in self.groups.items(): - query += "LEFT JOIN %s USING (%s)" % (self.get_table_name(group), groupby) - - return "CREATE TABLE %s AS (%s);" % (self.get_table_name(), query) - - def get_drop(self, imputed=False): - """ - Generate a drop table statement for the aggregation table - Returns: string sql query - """ - return "DROP TABLE IF EXISTS %s" % self.get_table_name(imputed=imputed) - - def get_create_schema(self): - """ - Generate a create schema statement - """ - if self.schema is not None: - return "CREATE SCHEMA IF NOT EXISTS %s" % self.schema - - def find_nulls(self, imputed=False): - """ - Generate query to count number of nulls in each column in the aggregation table - - Returns: a SQL SELECT statement - """ - query_template = """ - SELECT {cols} - FROM {state_tbl} t1 - LEFT JOIN {aggs_tbl} t2 USING({group}) - """ - cols_sql = ",\n".join( - [ - """SUM(CASE WHEN "{col}" IS NULL THEN 1 ELSE 0 END) AS "{col}" """.format( - col=column - ) - for column in self.get_imputation_rules().keys() - ] - ) - - return query_template.format( - cols=cols_sql, - state_tbl=self.state_table, - aggs_tbl=self.get_table_name(imputed=imputed), - group=self.state_group, - ) - - def 
_get_impute_select(self, impute_cols, nonimpute_cols, partitionby=None): - - imprules = self.get_imputation_rules() - - # check if we're missing any columns relative to the full set and raise an - # exception if we are - missing_cols = set(imprules.keys()) - set(nonimpute_cols + impute_cols) - if len(missing_cols) > 0: - raise ValueError("Missing columns in get_impute_create: %s" % missing_cols) - - # key columns and date column - query = "" - - # pre-sort and iterate through the combined set to ensure column order - for col in sorted(nonimpute_cols + impute_cols): - # just pass through columns that don't require imputation (no nulls found) - if col in nonimpute_cols: - query += '\n,"%s"' % col - - # for columns that do require imputation, include SQL to do the imputation work - # and a flag for whether the value was imputed - if col in impute_cols: - - impute_rule = imprules[col] - - try: - imputer = available_imputations[impute_rule["type"]] - except KeyError as err: - raise ValueError( - "Invalid imputation type %s for column %s" - % (impute_rule.get("type", ""), col) - ) from err - - imputer = imputer(column=col, partitionby=partitionby, **impute_rule) - - query += "\n,%s" % imputer.to_sql() - if not imputer.noflag: - # Add an imputation flag for non-categorical columns (this is handeled - # for categorical columns with a separate NULL category) - query += "\n,%s" % imputer.imputed_flag_sql() - - return query - - def get_impute_create(self, impute_cols, nonimpute_cols): - """ - Generates the CREATE TABLE query for the aggregation table with imputation. - - Args: - impute_cols: a list of column names with null values - nonimpute_cols: a list of column names without null values - - Returns: a CREATE TABLE AS query - """ - - # key columns and date column - query = "SELECT %s" % ", ".join(map(str, self.groups.values())) - - # columns with imputation filling as needed - query += self._get_impute_select(impute_cols, nonimpute_cols) - - # imputation starts from the state table and left joins into the aggregation table - query += "\nFROM %s t1" % self.state_table - query += "\nLEFT JOIN %s t2 USING(%s)" % ( - self.get_table_name(), - self.state_group, - ) - - return "CREATE TABLE %s AS (%s)" % (self.get_table_name(imputed=True), query) - - def execute(self, conn, join_table=None): - """ - Execute all SQL statements to create final aggregation table. 
- Args: - conn: the SQLAlchemy connection on which to execute - """ - self.validate(conn) - create_schema = self.get_create_schema() - creates = self.get_creates() - drops = self.get_drops() - indexes = self.get_indexes() - inserts = self.get_inserts() - drop = self.get_drop() - create = self.get_create(join_table=join_table) - - trans = conn.begin() - - if create_schema is not None: - conn.execute(create_schema) - - for group in self.groups: - conn.execute(drops[group]) - conn.execute(creates[group]) - for insert in inserts[group]: - conn.execute(insert) - conn.execute(indexes[group]) - - # create the aggregation table - conn.execute(drop) - conn.execute(create) - - # excute query to find columns with null values and create lists of columns - # that do and do not need imputation when creating the imputation table - res = conn.execute(self.find_nulls()) - null_counts = list(zip(res.keys(), res.fetchone())) - impute_cols = [col for col, val in null_counts if val > 0] - nonimpute_cols = [col for col, val in null_counts if val == 0] - res.close() - - # sql to drop and create the imputation table - drop_imp = self.get_drop(imputed=True) - create_imp = self.get_impute_create( - impute_cols=impute_cols, nonimpute_cols=nonimpute_cols - ) - - # create the imputation table - conn.execute(drop_imp) - conn.execute(create_imp) - - trans.commit() - - def validate(self, conn): - """ - Validate the Aggregation to ensure that it will perform as expected. - This is done against an active SQL connection in order to enable - validation of the SQL itself. - """ diff --git a/src/triage/component/collate/imputations.py b/src/triage/component/collate/imputations.py index 1a262f135..721c1ed11 100644 --- a/src/triage/component/collate/imputations.py +++ b/src/triage/component/collate/imputations.py @@ -1,3 +1,6 @@ +IMPUTATION_COLNAME_SUFFIX = "_imp" + + class BaseImputation(object): """Base class for various imputation methods """ @@ -24,8 +27,9 @@ def _base_sql(self): def imputed_flag_sql(self): if not self.noflag: - return """CASE WHEN "{col}" IS NULL THEN 1::SMALLINT ELSE 0::SMALLINT END AS "{col}_imp" """.format( - col=self.column + return """CASE WHEN "{col}" IS NULL THEN 1::SMALLINT ELSE 0::SMALLINT END AS "{col}{suffix}" """.format( + col=self.column, + suffix=IMPUTATION_COLNAME_SUFFIX ) else: # don't need to create a flag for categoricals (since this is handled with the diff --git a/src/triage/component/collate/spacetime.py b/src/triage/component/collate/spacetime.py index 9b8b4a968..fdccd569a 100644 --- a/src/triage/component/collate/spacetime.py +++ b/src/triage/component/collate/spacetime.py @@ -1,90 +1,307 @@ # -*- coding: utf-8 -*- from itertools import chain import sqlalchemy.sql.expression as ex - -from .sql import make_sql_clause -from .collate import Aggregation - - -class SpacetimeAggregation(Aggregation): +import logging + +from .sql import make_sql_clause, to_sql_name, CreateTableAs, InsertFromSelect +from triage.component.architect.utils import remove_schema_from_table_name +from triage.database_reflection import table_exists +from triage.component.architect.feature_block import FeatureBlock + +from .imputations import ( + ImputeMean, + ImputeConstant, + ImputeZero, + ImputeZeroNoFlag, + ImputeNullCategory, + ImputeBinaryMode, + ImputeError, + IMPUTATION_COLNAME_SUFFIX +) + +available_imputations = { + "mean": ImputeMean, + "constant": ImputeConstant, + "zero": ImputeZero, + "zero_noflag": ImputeZeroNoFlag, + "null_category": ImputeNullCategory, + "binary_mode": ImputeBinaryMode, + "error": 
ImputeError,
+}
+
+
+class SpacetimeAggregation(FeatureBlock):
     def __init__(
         self,
         aggregates,
         groups,
-        intervals,
         from_obj,
-        dates,
-        state_table,
-        state_group=None,
+        intervals=None,
+        entity_column=None,
         prefix=None,
         suffix=None,
-        schema=None,
         date_column=None,
         output_date_column=None,
-        input_min_date=None,
-        join_with_cohort_table=False,
+        drop_interim_tables=True,
+        *args,
+        **kwargs
     ):
         """
         Args:
+            aggregates: collection of Aggregate objects.
+            from_obj: defines the from clause, e.g. the name of the table
+            groups: a list of expressions to group by in the aggregation or a dictionary of
+                group: expr pairs where group is the alias (used in column names)
+            prefix: prefix for aggregation tables and column names, defaults to from_obj
+            suffix: suffix for aggregation table, defaults to "aggregation"
             intervals: the intervals to aggregate over. either a list of
                 datetime intervals, e.g. ["1 month", "1 year"], or
                 a dictionary of group : intervals pairs where
                 group is a group in groups and intervals is a collection
                 of datetime intervals, e.g. {"address_id": ["1 month", "1 year"]}
-            dates: list of PostgreSQL date strings,
-                e.g. ["2012-01-01", "2013-01-01"]
-            state_table: schema.table to query for valid state_group/date combinations
-            state_group: the group level found in the state table (e.g., "entity_id")
+            entity_column: the group level found in the cohort table (e.g., "entity_id")
             date_column: name of date column in from_obj, defaults to "date"
             output_date_column: name of date column in aggregated output, defaults to "date"
-            input_min_date: minimum date for which rows shall be included, defaults
-                to no absolute time restrictions on the minimum date of included rows
-
-        For all other arguments see collate.Aggregation
-        """
-        Aggregation.__init__(
-            self,
-            aggregates=aggregates,
-            from_obj=from_obj,
-            groups=groups,
-            state_table=state_table,
-            state_group=state_group,
-            prefix=prefix,
-            suffix=suffix,
-            schema=schema,
+        """
+        super().__init__(*args, **kwargs)
+        self.groups = (
+            groups if isinstance(groups, dict) else {str(g): g for g in groups}
         )
-
         if isinstance(intervals, dict):
             self.intervals = intervals
-        else:
+        elif intervals:
             self.intervals = {g: intervals for g in self.groups}
-        self.dates = dates
+        else:
+            self.intervals = {g: ["all"] for g in self.groups}
+
         self.date_column = date_column if date_column else "date"
         self.output_date_column = output_date_column if output_date_column else "date"
-        self.input_min_date = input_min_date
-        self.join_with_cohort_table = join_with_cohort_table
-
-    def _state_table_sub(self):
-        """Helper function to ensure we only include state table records
-        in our set of input dates and after the input_min_date.
- """ - datestr = ", ".join(["'%s'::date" % dt for dt in self.dates]) - mindtstr = ( - " AND %s >= '%s'::date" % (self.output_date_column, self.input_min_date) - if self.input_min_date is not None - else "" + self.aggregates = aggregates + self.from_obj = make_sql_clause(from_obj, ex.text) + self.entity_column = entity_column if entity_column else "entity_id" + self.prefix = prefix if prefix else str(from_obj) + self.suffix = suffix if suffix else "aggregation" + self.drop_interim_tables = drop_interim_tables + + def get_table_name(self, group=None, imputed=False): + """ + Returns name for table for the given group + """ + if group is None and not imputed: + name = '"%s_%s"' % (self.prefix, self.suffix) + elif group is None and imputed: + name = '"%s_%s_%s"' % (self.prefix, self.suffix, "imputed") + elif imputed: + name = '"%s"' % to_sql_name("%s_%s_%s" % (self.prefix, group, "imputed")) + else: + name = '"%s"' % to_sql_name("%s_%s" % (self.prefix, group)) + schema = '"%s".' % self.features_schema_name if self.features_schema_name else "" + return "%s%s" % (schema, name) + + def get_drops(self): + """ + Generate drop queries for this aggregation + + Returns: a dictionary of group : drop pairs where + group are the same keys as groups + drop is a raw drop table query for the corresponding table + """ + return [ + "DROP TABLE IF EXISTS %s;" % self.get_table_name(group) + for group in self.groups + ] + + def get_drop(self, imputed=False): + """ + Generate a drop table statement for the aggregation table + Returns: string sql query + """ + return "DROP TABLE IF EXISTS %s" % self.get_table_name(imputed=imputed) + + def get_create_schema(self): + """ + Generate a create schema statement + """ + if self.features_schema_name is not None: + return "CREATE SCHEMA IF NOT EXISTS %s" % self.features_schema_name + + def imputed_flag_column_names(self): + # format the query that gets column names, + # excluding indices from result + feature_names_query = """ + SELECT column_name + FROM information_schema.columns + WHERE table_name = '{table}' AND + table_schema = '{schema}' AND + column_name like '%%{suffix}' + """.format( + table=remove_schema_from_table_name(self.get_table_name(imputed=True)), + schema=self.features_schema_name or 'public', + suffix=IMPUTATION_COLNAME_SUFFIX ) - return """( - SELECT * - FROM {st} - WHERE {datecol} IN ({datestr}) - {mindtstr})""".format( - st=self.state_table, - datecol=self.output_date_column, - datestr=datestr, - mindtstr=mindtstr, + print(feature_names_query) + feature_names = [ + row[0] + for row in self.db_engine.execute(feature_names_query) + ] + return feature_names + + def _get_impute_select(self, impute_cols, nonimpute_cols, partitionby=None): + + imprules = self.get_imputation_rules() + + # check if we're missing any columns relative to the full set and raise an + # exception if we are + missing_cols = set(imprules.keys()) - set(nonimpute_cols + impute_cols) + if len(missing_cols) > 0: + raise ValueError("Missing columns in get_impute_create: %s" % missing_cols) + + # key columns and date column + query = "" + + # pre-sort and iterate through the combined set to ensure column order + for col in sorted(nonimpute_cols + impute_cols): + # just pass through columns that don't require imputation (no nulls found) + if col in nonimpute_cols: + query += '\n,"%s"' % col + + # for columns that do require imputation, include SQL to do the imputation work + # and a flag for whether the value was imputed + if col in impute_cols: + + impute_rule = imprules[col] + + try: + 
+                    imputer = available_imputations[impute_rule["type"]]
+                except KeyError as err:
+                    raise ValueError(
+                        "Invalid imputation type %s for column %s"
+                        % (impute_rule.get("type", ""), col)
+                    ) from err
+
+                imputer = imputer(column=col, partitionby=partitionby, **impute_rule)
+
+                query += "\n,%s" % imputer.to_sql()
+                if not imputer.noflag:
+                    # Add an imputation flag for non-categorical columns (this is handled
+                    # for categorical columns with a separate NULL category)
+                    query += "\n,%s" % imputer.imputed_flag_sql()
+
+        return query
+
+    def get_index(self, imputed=False):
+        return "CREATE INDEX ON {} ({})".format(
+            self.get_table_name(imputed=imputed),
+            self.entity_column,
+        )
+
+    def get_creates(self):
+        return {
+            group: CreateTableAs(self.get_table_name(group), next(iter(sels)).limit(0))
+            for group, sels in self.get_selects().items()
+        }
+
+    # implement the FeatureBlock interface
+    @property
+    def feature_columns(self):
+        """
+        The list of feature columns in the final, postimputation table
+
+        Should exclude any index columns (e.g. entity id, date)
+        """
+        columns = self.feature_columns_sans_impflags
+        imputation_flag_feature_cols = self.imputed_flag_column_names()
+        for imp_flag_col in imputation_flag_feature_cols:
+            if imp_flag_col[:-len(IMPUTATION_COLNAME_SUFFIX)] in columns:
+                columns.add(imp_flag_col)
+        return columns
+
+    @property
+    def final_feature_table_name(self):
+        "The name of the final table with all features filled in (no missing values)"
+        return self.get_table_name(imputed=True)
+
+    @property
+    def preinsert_queries(self):
+        """
+        Return all queries that should be run before inserting any data.
+
+        Consists of all queries to drop tables from previous runs, as well as all creates
+        needed for this run.
+
+        Returns a list of queries/executable statements
+        """
+        return [self.get_drop()] + self.get_drops() + list(self.get_creates().values())
+
+    @property
+    def insert_queries(self):
+        """
+        Return all inserts to populate this data. Each query in this list should be parallelizable.
+
+        Returns a list of queries/executable statements
+        """
+        return [
+            InsertFromSelect(self.get_table_name(group), sel)
+            for group, sels in self.get_selects().items()
+            for sel in sels
+        ]
+
+    @property
+    def postinsert_queries(self):
+        """
+        Return all queries that should be run after inserting all data
+
+        Consists of indexing queries for each group table as well as a
+        query to create the aggregation table that encompasses all groups.
+
+        Returns a list of queries/executable statements
+        """
+        postinserts = [
+            "CREATE INDEX ON %s (%s);" % (self.get_table_name(group), groupby)
+            for group, groupby in self.groups.items()
+        ] + [self.get_create(), self.get_index()]
+        if self.drop_interim_tables:
+            postinserts += self.get_drops()
+        return postinserts
+
+    @property
+    def imputation_queries(self):
+        """
+        Return all queries that should be run to fill in missing data with imputed values.
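+
+        A sketch of how a caller might run them in order (assuming a
+        SQLAlchemy engine named db_engine and a block instance named block):
+
+            for query in block.imputation_queries:
+                db_engine.execute(query)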
+ + Returns a list of queries/executable statements + """ + if not self.cohort_table_name: + logging.warning( + "No cohort table defined in feature_block, cannot create imputation table for %s", + self.get_table_name(imputed=True), + ) + return [] + + if not table_exists(self.cohort_table_name, self.db_engine): + logging.warning( + "Cohort table %s does not exist, cannot create imputation table for %s", + self.cohort_table_name, + self.get_table_name(imputed=True), + ) + return [] + + with self.db_engine.begin() as conn: + results = conn.execute(self.find_nulls()) + null_counts = results.first().items() + impute_cols = [col for (col, val) in null_counts if val > 0] + nonimpute_cols = [col for (col, val) in null_counts if val == 0] + imp_queries = [ + self.get_drop(imputed=True), # clear out old imputed data + self._get_impute_create(impute_cols, nonimpute_cols), # create the imputed table + self.get_index(imputed=True), # index the imputed table + ] + if self.drop_interim_tables: + imp_queries.append(self.get_drop(imputed=False)) # drop the old aggregation table + return imp_queries + def _get_aggregates_sql(self, interval, date, group): """ Helper for getting aggregates sql @@ -116,6 +333,19 @@ def _get_aggregates_sql(self, interval, date, group): ] ) + def index_query(self, imputed=False): + return "CREATE INDEX ON {} ({}, {})".format( + self.get_table_name(imputed=imputed), + self.entity_column, + self.output_date_column, + ) + + def index_columns(self): + return sorted( + [group for group in self.groups.keys()] + + [self.output_date_column] + ) + def get_selects(self): """ Constructs select queries for this aggregation @@ -129,7 +359,7 @@ def get_selects(self): for group, groupby in self.groups.items(): intervals = self.intervals[group] queries[group] = [] - for date in self.dates: + for date in self.as_of_dates: columns = [ groupby, ex.literal_column("'%s'::date" % date).label( @@ -143,10 +373,10 @@ def get_selects(self): ) gb_clause = make_sql_clause(groupby, ex.literal_column) - if self.join_with_cohort_table: + if not self.features_ignore_cohort: from_obj = ex.text( f"(select from_obj.* from (" - f"(select * from {self.from_obj}) from_obj join {self.state_table} cohort on ( " + f"(select * from {self.from_obj}) from_obj join {self.cohort_table_name} cohort on ( " "cohort.entity_id = from_obj.entity_id and " f"cohort.{self.output_date_column} = '{date}'::date)" ")) cohorted_from_obj") @@ -202,9 +432,9 @@ def where(self, date, intervals): w += "AND {date_column} >= {min_date}".format( date_column=self.date_column, min_date=min_date ) - if self.input_min_date is not None: + if self.feature_start_time is not None: w += "AND {date_column} >= '{bot}'::date".format( - date_column=self.date_column, bot=self.input_min_date + date_column=self.date_column, bot=self.feature_start_time ) return ex.text(w) @@ -231,7 +461,7 @@ def get_join_table(self): intervals = list(set(chain(*self.intervals.values()))) queries = [] - for date in self.dates: + for date in self.as_of_dates: columns = groups + [ ex.literal_column("'%s'::date" % date).label(self.output_date_column) ] @@ -266,9 +496,9 @@ def validate(self, conn): SpacetimeAggregations ensure that no intervals extend beyond the absolute minimum time. 
""" - if self.input_min_date is not None: + if self.feature_start_time is not None: all_intervals = set(*self.intervals.values()) - for date in self.dates: + for date in self.as_of_dates: for interval in all_intervals: if interval == "all": continue @@ -276,23 +506,23 @@ def validate(self, conn): # it this way allows for nicer error messages. r = conn.execute( "select ('%s'::date - '%s'::interval) < '%s'::date" - % (date, interval, self.input_min_date) + % (date, interval, self.feature_start_time) ) if r.fetchone()[0]: raise ValueError( - "date '%s' - '%s' is before input_min_date ('%s')" - % (date, interval, self.input_min_date) + "date '%s' - '%s' is before feature_start_time ('%s')" + % (date, interval, self.feature_start_time) ) r.close() - for date in self.dates: + for date in self.as_of_dates: r = conn.execute( "select count(*) from %s where %s = '%s'::date" - % (self.state_table, self.output_date_column, date) + % (self.cohort_table_name, self.output_date_column, date) ) if r.fetchone()[0] == 0: raise ValueError( "date '%s' is not present in states table ('%s')" - % (date, self.state_table) + % (date, self.cohort_table_name) ) r.close() @@ -318,13 +548,13 @@ def find_nulls(self, imputed=False): return query_template.format( cols=cols_sql, - state_tbl=self._state_table_sub(), + state_tbl=self._cohort_table_sub(), aggs_tbl=self.get_table_name(imputed=imputed), - group=self.state_group, + group=self.entity_column, date_col=self.output_date_column, ) - def get_impute_create(self, impute_cols, nonimpute_cols): + def _get_impute_create(self, impute_cols, nonimpute_cols): """ Generates the CREATE TABLE query for the aggregation table with imputation. @@ -347,11 +577,23 @@ def get_impute_create(self, impute_cols, nonimpute_cols): ) # imputation starts from the state table and left joins into the aggregation table - query += "\nFROM %s t1" % self._state_table_sub() + query += "\nFROM %s t1" % self._cohort_table_sub() query += "\nLEFT JOIN %s t2 USING(%s, %s)" % ( self.get_table_name(), - self.state_group, + self.entity_column, self.output_date_column, ) return "CREATE TABLE %s AS (%s)" % (self.get_table_name(imputed=True), query) + + @property + def feature_columns_sans_impflags(self): + columns = [] + for group, groupby in self.groups.items(): + intervals = self.intervals[group] + columns += list( + chain( + *[self._get_aggregates_sql(i, "2016-01-01", group) for i in intervals] + ) + ) + return set(label_obj.name for label_obj in columns) diff --git a/src/triage/database_reflection.py b/src/triage/database_reflection.py index 406209bbb..1d99f09b8 100644 --- a/src/triage/database_reflection.py +++ b/src/triage/database_reflection.py @@ -131,6 +131,20 @@ def column_type(table_name, column, db_engine): return type(reflected_table(table_name, db_engine).columns[column].type) +def table_columns(table_name, db_engine): + """Retrieve a list of columns. + + The table is expected to exist. 
+
+    Args:
+        table_name (string) A table name (with schema)
+        db_engine (sqlalchemy.engine)
+
+    Returns: (iterable of Column) every column currently in the table
+    """
+    return reflected_table(table_name, db_engine).columns
+
+
 def schema_tables(schema_name, db_engine):
     meta = MetaData(schema=schema_name, bind=db_engine)
     meta.reflect()
diff --git a/src/triage/experiments/__init__.py b/src/triage/experiments/__init__.py
index 95d3081de..b83f9827f 100644
--- a/src/triage/experiments/__init__.py
+++ b/src/triage/experiments/__init__.py
@@ -1,5 +1,5 @@
 # Avoid circular import (required by base)
-CONFIG_VERSION = "v6"  # noqa: E402
+CONFIG_VERSION = "v7"  # noqa: E402
 
 from .base import ExperimentBase
 from .multicore import MultiCoreExperiment
diff --git a/src/triage/experiments/base.py b/src/triage/experiments/base.py
index 5418c3d1a..ff4ebab66 100644
--- a/src/triage/experiments/base.py
+++ b/src/triage/experiments/base.py
@@ -14,11 +14,12 @@
 )
 
 from triage.component.architect.features import (
-    FeatureGenerator,
-    FeatureDictionaryCreator,
     FeatureGroupCreator,
     FeatureGroupMixer,
+    FeatureDictionary,
 )
+
+from triage.component.architect.feature_block_generators import feature_blocks_from_config
 from triage.component.architect.planner import Planner
 from triage.component.architect.builders import MatrixBuilder
 from triage.component.architect.cohort_table_generators import (
@@ -55,6 +56,7 @@
 
 from triage.database_reflection import table_has_data
 from triage.util.conf import dt_from_str
+from triage.util.db import run_statements
 
 
 class ExperimentBase(ABC):
@@ -196,19 +198,6 @@ def initialize_components(self):
                 "you will not be able to make matrices."
             )
 
-        self.feature_dictionary_creator = FeatureDictionaryCreator(
-            features_schema_name=self.features_schema_name, db_engine=self.db_engine
-        )
-
-        self.feature_generator = FeatureGenerator(
-            features_schema_name=self.features_schema_name,
-            replace=self.replace,
-            db_engine=self.db_engine,
-            feature_start_time=split_config["feature_start_time"],
-            materialize_subquery_fromobjs=self.materialize_subquery_fromobjs,
-            features_ignore_cohort=self.features_ignore_cohort
-        )
-
         self.feature_group_creator = FeatureGroupCreator(
             self.config.get("feature_group_definition", {"all": [True]})
         )
@@ -375,56 +364,25 @@ def all_as_of_times(self):
         return distinct_as_of_times
 
     @cachedproperty
-    def collate_aggregations(self):
-        """Collation of ``Aggregation`` objects used by this experiment.
+    def feature_blocks(self):
+        """The ``FeatureBlock`` objects used by this experiment.
 
-        Returns: (list) of ``collate.Aggregation`` objects
+        Returns: (list) of ``FeatureBlock`` objects
 
         """
-        logging.info("Creating collate aggregations")
-        if "feature_aggregations" not in self.config:
-            logging.warning("No feature_aggregation config is available")
+        if "features" not in self.config:
+            logging.warning("No feature config is available")
             return []
-        return self.feature_generator.aggregations(
-            feature_aggregation_config=self.config["feature_aggregations"],
-            feature_dates=self.all_as_of_times,
-            state_table=self.cohort_table_name,
-        )
-
-    @cachedproperty
-    def feature_aggregation_table_tasks(self):
-        """All feature table query tasks specified by this
-        ``Experiment``.
- - Returns: (dict) keys are group table names, values are - themselves dicts, each with keys for different stages of - table creation (prepare, inserts, finalize) and with values - being lists of SQL commands - - """ - logging.info( - "Calculating feature tasks for %s as_of_times", len(self.all_as_of_times) - ) - return self.feature_generator.generate_all_table_tasks( - self.collate_aggregations, task_type="aggregation" - ) - - @cachedproperty - def feature_imputation_table_tasks(self): - """All feature imputation query tasks specified by this - ``Experiment``. - - Returns: (dict) keys are group table names, values are - themselves dicts, each with keys for different stages of - table creation (prepare, inserts, finalize) and with values - being lists of SQL commands - - """ - logging.info( - "Calculating feature tasks for %s as_of_times", len(self.all_as_of_times) - ) - return self.feature_generator.generate_all_table_tasks( - self.collate_aggregations, task_type="imputation" + logging.info("Creating feature blocks from config") + return feature_blocks_from_config( + config=self.config["features"], + as_of_dates=self.all_as_of_times, + cohort_table=self.cohort_table_name, + features_schema_name=self.features_schema_name, + db_engine=self.db_engine, + feature_start_time=self.config["temporal_config"]["feature_start_time"], + features_ignore_cohort=self.features_ignore_cohort, + materialize_subquery_fromobjs=self.materialize_subquery_fromobjs, ) @cachedproperty @@ -436,12 +394,7 @@ def master_feature_dictionary(self): values being lists of feature names """ - result = self.feature_dictionary_creator.feature_dictionary( - feature_table_names=self.feature_imputation_table_tasks.keys(), - index_column_lookup=self.feature_generator.index_column_lookup( - self.collate_aggregations - ), - ) + result = FeatureDictionary(feature_blocks=self.feature_blocks) logging.info("Computed master feature dictionary: %s", result) return result @@ -537,7 +490,7 @@ def process_train_test_tasks(self, train_tasks): pass @abstractmethod - def process_query_tasks(self, query_tasks): + def process_inserts(self, inserts): pass @abstractmethod @@ -545,18 +498,24 @@ def process_matrix_build_tasks(self, matrix_build_tasks): pass def generate_preimputation_features(self): - self.process_query_tasks(self.feature_aggregation_table_tasks) - logging.info( - "Finished running preimputation feature queries. The final results are in tables: %s", - ",".join(agg.get_table_name() for agg in self.collate_aggregations), - ) + for feature_block in self.feature_blocks: + tasks = feature_block.generate_preimpute_tasks(self.replace) + run_statements(tasks.get("prepare", []), self.db_engine) + self.process_inserts(tasks.get("inserts", [])) + run_statements(tasks.get("finalize", []), self.db_engine) + logging.info("Finished running preimputation feature queries.") def impute_missing_features(self): - self.process_query_tasks(self.feature_imputation_table_tasks) + for feature_block in self.feature_blocks: + tasks = feature_block.generate_impute_tasks(self.replace) + run_statements(tasks.get("prepare", []), self.db_engine) + self.process_inserts(tasks.get("inserts", [])) + run_statements(tasks.get("finalize", []), self.db_engine) + logging.info( "Finished running postimputation feature queries. 
The final results are in tables: %s", ",".join( - agg.get_table_name(imputed=True) for agg in self.collate_aggregations + block.final_feature_table_name for block in self.feature_blocks ), ) diff --git a/src/triage/experiments/multicore.py b/src/triage/experiments/multicore.py index 74602a7be..b0d61ac0e 100644 --- a/src/triage/experiments/multicore.py +++ b/src/triage/experiments/multicore.py @@ -4,6 +4,8 @@ from pebble import ProcessPool from multiprocessing.reduction import ForkingPickler +from triage.util.db import run_statements + from triage.component.catwalk.utils import Batch from triage.experiments import ExperimentBase @@ -60,21 +62,16 @@ def process_train_test_tasks(self, tasks): parallelize(partial_test, tasks, self.n_processes) logging.info("Cleaned up concurrent pool") - def process_query_tasks(self, query_tasks): + def process_inserts(self, inserts): logging.info("Processing query tasks with %s processes", self.n_db_processes) - for table_name, tasks in query_tasks.items(): - logging.info("Processing features for %s", table_name) - self.feature_generator.run_commands(tasks.get("prepare", [])) - partial_insert = partial( - insert_into_table, feature_generator=self.feature_generator - ) + partial_insert = partial( + insert_into_table, db_engine=self.db_engine + ) - insert_batches = [ - list(task_batch) for task_batch in Batch(tasks.get("inserts", []), 25) - ] - parallelize(partial_insert, insert_batches, n_processes=self.n_db_processes) - self.feature_generator.run_commands(tasks.get("finalize", [])) - logging.info("%s completed", table_name) + insert_batches = [ + list(task_batch) for task_batch in Batch(inserts, 25) + ] + parallelize(partial_insert, insert_batches, n_processes=self.n_db_processes) def process_matrix_build_tasks(self, matrix_build_tasks): partial_build_matrix = partial( @@ -90,10 +87,10 @@ def process_matrix_build_tasks(self, matrix_build_tasks): ) -def insert_into_table(insert_statements, feature_generator): +def insert_into_table(insert_statements, db_engine): try: logging.info("Beginning insert batch") - feature_generator.run_commands(insert_statements) + run_statements(insert_statements, db_engine) return True except Exception: logging.error("Child error: %s", traceback.format_exc()) diff --git a/src/triage/experiments/rq.py b/src/triage/experiments/rq.py index cf56a5f13..1ae39d8df 100644 --- a/src/triage/experiments/rq.py +++ b/src/triage/experiments/rq.py @@ -1,6 +1,7 @@ import logging import time from triage.component.catwalk.utils import Batch +from triage.util.db import run_statements from triage.experiments import ExperimentBase try: @@ -74,51 +75,22 @@ def wait_for(self, jobs): logging.info("Sleeping for %s seconds", self.sleep_time) time.sleep(self.sleep_time) - def process_query_tasks(self, query_tasks): - """Run queries by table - - Will run preparation (e.g. create table) and finalize (e.g. create index) tasks - in the main process, - but delegate inserts to rq Jobs in batches of 25 - - Args: query_tasks (dict) - keys should be table names and values should be dicts. 
- Each inner dict should have up to three keys, each with a list of queries: - 'prepare' (setting up the table), - 'inserts' (insert commands to populate the table), - 'finalize' (finishing table setup after all inserts have run) - - Example: { - 'table_one': { - 'prepare': ['create table table_one (col1 varchar)'], - 'inserts': [ - 'insert into table_one values (\'a\')', - 'insert into table_one values (\'b'\')' - ] - 'finalize': ['create index on table_one (col1)'] - } - } - """ - for table_name, tasks in query_tasks.items(): - logging.info("Processing features for %s", table_name) - self.feature_generator.run_commands(tasks.get("prepare", [])) - - insert_batches = [ - list(task_batch) for task_batch in Batch(tasks.get("inserts", []), 25) - ] - jobs = [ - self.queue.enqueue( - self.feature_generator.run_commands, - insert_batch, - timeout=DEFAULT_TIMEOUT, - result_ttl=DEFAULT_TIMEOUT, - ttl=DEFAULT_TIMEOUT, - ) - for insert_batch in insert_batches - ] - self.wait_for(jobs) - - self.feature_generator.run_commands(tasks.get("finalize", [])) - logging.info("%s completed", table_name) + def process_inserts(self, inserts): + insert_batches = [ + list(task_batch) for task_batch in Batch(inserts, 25) + ] + jobs = [ + self.queue.enqueue( + run_statements, + insert_batch, + self.db_engine, + timeout=DEFAULT_TIMEOUT, + result_ttl=DEFAULT_TIMEOUT, + ttl=DEFAULT_TIMEOUT, + ) + for insert_batch in insert_batches + ] + self.wait_for(jobs) def process_matrix_build_tasks(self, matrix_build_tasks): """Run matrix build tasks using RQ diff --git a/src/triage/experiments/singlethreaded.py b/src/triage/experiments/singlethreaded.py index e414cef92..d57d57b0b 100644 --- a/src/triage/experiments/singlethreaded.py +++ b/src/triage/experiments/singlethreaded.py @@ -1,9 +1,10 @@ from triage.experiments import ExperimentBase +from triage.util.db import run_statements class SingleThreadedExperiment(ExperimentBase): - def process_query_tasks(self, query_tasks): - self.feature_generator.process_table_tasks(query_tasks) + def process_inserts(self, inserts): + run_statements(inserts, self.db_engine) def process_matrix_build_tasks(self, matrix_build_tasks): self.matrix_builder.build_all_matrices(matrix_build_tasks) diff --git a/src/triage/experiments/validate.py b/src/triage/experiments/validate.py index 4a69a484b..db333aec1 100644 --- a/src/triage/experiments/validate.py +++ b/src/triage/experiments/validate.py @@ -126,7 +126,7 @@ def dt_from_str(dt_str): ) -class FeatureAggregationsValidator(Validator): +class SpacetimeAggregationValidator(Validator): def _validate_keys(self, aggregation_config): for key in [ "from_obj", @@ -139,7 +139,7 @@ def _validate_keys(self, aggregation_config): raise ValueError( dedent( """ - Section: feature_aggregations - + Section: features->spacetime_aggregations - '{} required as key: aggregation config: {}""".format( key, aggregation_config ) @@ -155,7 +155,7 @@ def _validate_aggregates(self, aggregation_config): raise ValueError( dedent( """ - Section: feature_aggregations - + Section: features->spacetime_aggregations - Need either aggregates, categoricals, or array_categoricals in {}""".format( aggregation_config @@ -499,7 +499,7 @@ def _run(self, cohort_config): class FeatureGroupDefinitionValidator(Validator): - def _run(self, feature_group_definition, feature_aggregation_config): + def _run(self, feature_group_definition): if not isinstance(feature_group_definition, dict): raise ValueError( dedent( @@ -536,45 +536,6 @@ def _run(self, feature_group_definition, 
feature_aggregation_config): ) ) - if "prefix" in feature_group_definition: - available_prefixes = { - aggregation["prefix"] for aggregation in feature_aggregation_config - } - bad_prefixes = set(feature_group_definition["prefix"]) - available_prefixes - if bad_prefixes: - raise ValueError( - dedent( - """ - Section: feature_group_definition - - The following given feature group prefixes: '{}' - are invalid. Available prefixes from this experiment's feature - aggregations are: '{}' - """.format( - bad_prefixes, available_prefixes - ) - ) - ) - - if "tables" in feature_group_definition: - available_tables = { - aggregation["prefix"] + "_aggregation_imputed" - for aggregation in feature_aggregation_config - } - bad_tables = set(feature_group_definition["tables"]) - available_tables - if bad_tables: - raise ValueError( - dedent( - """ - Section: feature_group_definition - - The following given feature group tables: '{}' - are invalid. Available tables from this experiment's feature - aggregations are: '{}' - """.format( - bad_tables, available_tables - ) - ) - ) - class FeatureGroupStrategyValidator(Validator): def _run(self, feature_group_strategies): @@ -769,13 +730,37 @@ def _run(self, scoring_config): ) +class FeatureValidator(Validator): + def _run(self, feature_config): + feature_lookup = architect.feature_block_generators.FEATURE_BLOCK_GENERATOR_LOOKUP + available_keys = set(feature_lookup.keys()) + given_keys = set(feature_config.keys()) + bad_keys = given_keys - available_keys + if bad_keys: + raise ValueError( + dedent( + """Section: features - + The following given feature types '{}' are unavailable. + Available metrics are: '{}' + """.format( + bad_keys, available_keys + ) + ) + ) + if 'spacetime_aggregations' in feature_config: + SpacetimeAggregationValidator( + self.db_engine, + strict=self.strict + ).run(feature_config['spacetime_aggregations']) + + class ExperimentValidator(Validator): def run(self, experiment_config): TemporalValidator(strict=self.strict).run( experiment_config.get("temporal_config", {}) ) - FeatureAggregationsValidator(self.db_engine, strict=self.strict).run( - experiment_config.get("feature_aggregations", {}) + FeatureValidator(self.db_engine, strict=self.strict).run( + experiment_config.get("features", {}) ) LabelConfigValidator(self.db_engine, strict=self.strict).run( experiment_config.get("label_config", None) @@ -785,7 +770,6 @@ def run(self, experiment_config): ) FeatureGroupDefinitionValidator(strict=self.strict).run( experiment_config.get("feature_group_definition", {}), - experiment_config.get("feature_aggregations", {}), ) FeatureGroupStrategyValidator(strict=self.strict).run( experiment_config.get("feature_group_strategies", []) diff --git a/src/triage/util/db.py b/src/triage/util/db.py index 57142a46f..c9c4f36ad 100644 --- a/src/triage/util/db.py +++ b/src/triage/util/db.py @@ -34,3 +34,9 @@ def __reconstruct__(cls, url, creator, kwargs): create_engine = SerializableDbEngine + + +def run_statements(statement_list, db_engine): + with db_engine.begin() as conn: + for statement in statement_list: + conn.execute(statement) From 6bc3a291745578978025de068275360130a8abd9 Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Thu, 21 Feb 2019 14:06:20 -0600 Subject: [PATCH 02/22] Remove unnecessary quotes in split_table --- src/triage/database_reflection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/triage/database_reflection.py b/src/triage/database_reflection.py index 1d99f09b8..4b316c013 100644 --- 
a/src/triage/database_reflection.py +++ b/src/triage/database_reflection.py @@ -10,7 +10,7 @@ def split_table(table_name): Returns: (tuple) of schema and table name """ - table_parts = table_name.split(".") + table_parts = table_name.replace('"', '').split(".") if len(table_parts) == 2: return tuple(table_parts) elif len(table_parts) == 1: From ca9207ea96bf3a9b726dbd8a9ec0feb1331fa6c6 Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Thu, 21 Feb 2019 14:43:30 -0600 Subject: [PATCH 03/22] Test for verbose task info --- src/tests/architect_tests/test_feature_blocks.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/tests/architect_tests/test_feature_blocks.py b/src/tests/architect_tests/test_feature_blocks.py index 24153e88a..bdc4a6277 100644 --- a/src/tests/architect_tests/test_feature_blocks.py +++ b/src/tests/architect_tests/test_feature_blocks.py @@ -88,6 +88,13 @@ def test_FeatureBlock_generate_impute_tasks(db_engine): } +def test_FeatureBlock_log_verbose_task_info(db_engine): + block = FeatureBlockExample(db_engine=db_engine, cohort_table="mycohort", as_of_dates=['2016-01-01', '2016-02-01']) + task = block.generate_impute_tasks(replace=True) + # just want to make sure that the logging doesn't error, no assertions + block.log_verbose_task_info(task) + + def test_FeatureBlock_needs_features(db_engine): # needs_features should function as following: # if there are members of the cohort without features, needs_features should return true From cf96a6c2121c66b8c32dc208279545fd18edc6ff Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Wed, 27 Feb 2019 10:34:24 -0600 Subject: [PATCH 04/22] Remove unused FeatureQueryRunner --- .../architect/feature_query_runners.py | 141 ------------------ 1 file changed, 141 deletions(-) delete mode 100644 src/triage/component/architect/feature_query_runners.py diff --git a/src/triage/component/architect/feature_query_runners.py b/src/triage/component/architect/feature_query_runners.py deleted file mode 100644 index 25891c613..000000000 --- a/src/triage/component/architect/feature_query_runners.py +++ /dev/null @@ -1,141 +0,0 @@ -import logging - -import sqlparse - -from triage.database_reflection import table_exists - - -def run_statements(statement_list, db_engine): - with db_engine.begin() as conn: - for statement in statement_list: - logging.debug("Executing feature generation query: %s", statement) - conn.execute(statement) - - -def process_table_task(task, db_engine): - run_statements(task.get("prepare", []), db_engine) - run_statements(task.get("inserts", []), db_engine) - run_statements(task.get("finalize", []), db_engine) - - -def process_table_tasks(table_tasks, db_engine, verbose=False): - for task, task_num in enumerate(table_tasks, 1): - if verbose: - log_verbose_task_info(task, task_num) - process_table_task(task, db_engine) - - -def needs_features(feature_block, db_engine): - imputed_table = feature_block.get_final_feature_table_name() - - if table_exists(imputed_table, db_engine): - check_query = ( - f"select 1 from {feature_block.cohort_table} " - f"left join {imputed_table} " - "using (entity_id, as_of_date) " - f"where {imputed_table}.entity_id is null limit 1" - ) - if db_engine.execute(check_query).scalar(): - logging.warning( - "Imputed feature table %s did not contain rows from the " - "entire cohort, need to rebuild features", imputed_table) - return True - else: - logging.warning("Imputed feature table %s did not exist, " - "need to build features", imputed_table) - return True - logging.warning("Imputed feature 
table %s looks good, " - "skipping feature building!", imputed_table) - return False - - -def generate_preimpute_tasks(feature_blocks, replace, db_engine): - table_tasks = [] - for block in feature_blocks: - if replace or needs_features(block, db_engine): - table_tasks.append({ - "prepare": block.get_preinsert_queries(), - "inserts": block.get_inserts(), - "finalize": block.get_postinsert_queries() - }) - logging.info("Generated tasks to create %s feature block tables", len(table_tasks)) - else: - logging.info("Skipping feature table creation for %s", block) - return table_tasks - - -def generate_impute_tasks(feature_blocks, replace, db_engine): - table_tasks = [] - for block in feature_blocks: - if replace or needs_features(block, db_engine): - table_tasks.append({ - "prepare": block.get_impute_queries(), - "inserts": [], - "finalize": [] - }) - logging.info("Generated tasks to create %s feature block tables", len(table_tasks)) - else: - logging.info("Skipping feature table creation for %s", block) - return table_tasks - - -def create_all_tables(feature_blocks, replace, db_engine): - """Create all feature tables. - - First builds the aggregation tables, and then performs - imputation on any null values, (requiring a two-step process to - determine which columns contain nulls after the initial - aggregation tables are built). - """ - process_table_tasks(generate_preimpute_tasks(feature_blocks, replace, db_engine)) - process_table_tasks(generate_impute_tasks(feature_blocks, replace, db_engine)) - - # perform a sanity check that no nulls were left after imputation - for feature_block in feature_blocks: - feature_block.verify_no_nulls() - - -def log_verbose_task_info(task, task_num): - prepares = task.get("prepare", []) - inserts = task.get("inserts", []) - finalize = task.get("finalize", []) - logging.info("------------------") - logging.info("TASK %s ", task_num) - logging.info( - "%s prepare queries, %s insert queries, %s finalize queries", - len(prepares), - len(inserts), - len(finalize), - ) - logging.info("------------------") - logging.info("") - logging.info("------------------") - logging.info("PREPARATION QUERIES") - logging.info("------------------") - for query_num, query in enumerate(prepares, 1): - logging.info("") - logging.info( - "prepare query %s: %s", - query_num, - sqlparse.format(str(query), reindent=True), - ) - logging.info("------------------") - logging.info("INSERT QUERIES") - logging.info("------------------") - for query_num, query in enumerate(inserts, 1): - logging.info("") - logging.info( - "insert query %s: %s", - query_num, - sqlparse.format(str(query), reindent=True), - ) - logging.info("------------------") - logging.info("FINALIZE QUERIES") - logging.info("------------------") - for query_num, query in enumerate(finalize, 1): - logging.info("") - logging.info( - "finalize query %s: %s", - query_num, - sqlparse.format(str(query), reindent=True), - ) From 6bff2bce01d68f7d6a61f92a491a8791463caab4 Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Thu, 28 Feb 2019 16:44:11 -0600 Subject: [PATCH 05/22] Updates to some docs, remove no-longer-useful feature YAML --- docs/mkdocs.yml | 5 +- docs/sources/experiments/feature-testing.md | 7 +- docs/sources/experiments/upgrading.md | 5 - example/config/feature.yaml | 100 -------------------- 4 files changed, 8 insertions(+), 109 deletions(-) delete mode 100644 docs/sources/experiments/upgrading.md delete mode 100644 example/config/feature.yaml diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 787533bd5..2eb7eea8f 
100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -19,7 +19,10 @@ pages: - Defining an Experiment: experiments/defining.md - Testing Feature Configuration: experiments/feature-testing.md - Running an Experiment: experiments/running.md - - Upgrading an Experiment: experiments/upgrading.md + - Upgrading an Experiment: + to v5: experiments/upgrade-to-v5.md + to v6: experiments/upgrade-to-v6.md + to v7: experiments/upgrade-to-v7.md - Temporal Validation Deep Dive: experiments/temporal-validation.md - Cohort and Label Deep Dive: experiments/cohort-labels.md - Feature Generation Recipe Book: experiments/features.md diff --git a/docs/sources/experiments/feature-testing.md b/docs/sources/experiments/feature-testing.md index 35789c1fa..800bbe34f 100644 --- a/docs/sources/experiments/feature-testing.md +++ b/docs/sources/experiments/feature-testing.md @@ -5,11 +5,11 @@ Developing features for Triage experiments can be a daunting task. There are a l To speed up the process of iterating on features, you can run a list of feature aggregations, without imputation, on just one as-of-date. This functionality can be accessed through the `triage` command line tool or called directly from code (say, in a Jupyter notebook) using the `feature_blocks_from_config` utility. ## Using Triage CLI -![triage featuretest cli help screen](featuretest-cli.png) The command-line interface for testing features takes in two arguments: - - An experiment config file, with a feature section and optionally a cohort section. - - An as-of-date. This should be in the format `2016-01-01`. + +- An experiment config file. It should have at least a `features` section, and if a `cohort_config` section is present, it will use that to limit the number of feature rows it creates to the cohort at the given date. Other keys can be in there but are ignored. In other words, you can use your experiment config file either before or after it's fully completed. +- An as-of-date. This should be in the format `2016-01-01`. Example: `triage experiment featuretest example/config/experiment.yaml 2016-01-01` @@ -39,6 +39,7 @@ feature_config = {'spacetime_aggregations': [{ { 'quantity': 'quantity_one', 'metrics': ['sum', 'count'], + } ], 'categoricals': [ { diff --git a/docs/sources/experiments/upgrading.md b/docs/sources/experiments/upgrading.md deleted file mode 100644 index f08a49ab7..000000000 --- a/docs/sources/experiments/upgrading.md +++ /dev/null @@ -1,5 +0,0 @@ -# Upgrading an Experiment config - -* [v5 → v6](experiments/upgrade-to-v6.md) -* [v3/v4 → v5](experiments/upgrade-to-v5.md) - diff --git a/example/config/feature.yaml b/example/config/feature.yaml deleted file mode 100644 index 9de66b39d..000000000 --- a/example/config/feature.yaml +++ /dev/null @@ -1,100 +0,0 @@ -### EXAMPLE FEATURE CONFIG -# -### - - - # prefix given to the resultant tables - prefix: 'prefix' - # from_obj is usually a source table but can be an expression, such as - # a join (ie 'cool_stuff join other_stuff using (stuff_id)') - from_obj: 'cool_stuff' - # The date column to use for specifying which records to include - # in temporal features. It is important that the column used specifies - # the date at which the event is known about, which may be different - # from the date the event happened.
- knowledge_date_column: 'open_date' - - # top-level imputation rules that will apply to all aggregates functions - # can also specify categoricals_imputation or array_categoricals_imputation - # - # You must specified at least one of the top-level or feature-level imputation - # to cover ever feature being defined. - aggregates_imputation: - # The `all` rule will apply to all aggregation functions, unless over- - # ridden by a more specific one - all: - # every imputation rule must have a `type` parameter, while some - # (like 'constant') have other required parameters (`value` here) - type: 'constant' - value: 0 - # specifying `max` here will take precedence over the `all` rule for - # aggregations using a MAX() function - max: - type: 'mean' - - # aggregates and categoricals define the actual features created. So - # at least one is required - # - # Aggregates of numerical columns. Each quantity is a number of some - # sort, and the list of metrics are applied to each quantity - aggregates: - - - quantity: 'homeless::INT' - # Imputation rules specified at the level of specific features - # will take precedence over the higer-level rules specified - # above. Note that the 'count' and 'sum' metrics will be - # imputed differently here. - imputation: - count: - type: 'mean' - sum: - type: 'constant' - value: 137 - metrics: - - 'count' - - 'sum' - - - # since we're specifying `aggregates_imputation` above, - # a feature-specific imputation rule can be omitted - quantity: 'some_flag' - metrics: - - 'max' - - 'sum' - # Categorical features. The column given can be of any type, but the - # choices must comparable to that type for equality within SQL - # The result will be one feature for each choice/metric combination - categoricals: - - - column: 'color' - # note that we haven't specified a top level `categoricals_imputation` - # set of rules, so we have to include feature-specific imputation - # rules for both of our categoricals here. - imputation: - sum: - type: 'null_category' - max: - type: 'mean' - choices: - - 'red' - - 'blue' - - 'green' - metrics: - - 'sum' - - - column: 'shape' - # as with the top-level imputation rules, `all` can be used - # for the feature-level rules to specify the same type of - # imputation for all aggregation functions - imputation: - all: - type: 'zero' - choice_query: 'select distinct shape from cool_stuff' - metrics: - - 'sum' - # The time intervals over which to aggregate features - intervals: - - '1 year' - - '2 years' - - 'all' - # A list of different columns to separately group by - groups: - - 'entity_id' From 7e6fa3e8e0b2db2f73323829e8cdae67822bef99 Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Thu, 28 Feb 2019 17:15:51 -0600 Subject: [PATCH 06/22] Changes from review --- .../component/architect/feature_block.py | 22 ++++------ .../component/architect/feature_dictionary.py | 41 +++---------------- 2 files changed, 13 insertions(+), 50 deletions(-) diff --git a/src/triage/component/architect/feature_block.py b/src/triage/component/architect/feature_block.py index bd4e00941..ef0eaf0fc 100644 --- a/src/triage/component/architect/feature_block.py +++ b/src/triage/component/architect/feature_block.py @@ -83,21 +83,17 @@ def _cohort_table_sub(self): """Helper function to ensure we only include state table records in our set of input dates and after the feature_start_time. 
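 
        A rough sketch of the emitted subquery (hypothetical values), given
        as_of_dates of ['2016-01-01'] and a feature_start_time of '2015-01-01':
 
            (SELECT *
            FROM <cohort table>
            WHERE as_of_date IN ('2016-01-01'::date)
             AND as_of_date >= '2015-01-01'::date)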
""" - datestr = ", ".join(["'%s'::date" % dt for dt in self.as_of_dates]) + datestr = ", ".join(f"'{dt}'::date" for dt in self.as_of_dates) mindtstr = ( - " AND as_of_date >= '%s'::date" % (self.feature_start_time,) + f" AND as_of_date >= '{self.feature_start_time}'::date" if self.feature_start_time is not None else "" ) - return """( + return f"""( SELECT * - FROM {st} + FROM {self.cohort_table_name} WHERE as_of_date IN ({datestr}) - {mindtstr})""".format( - st=self.cohort_table_name, - datestr=datestr, - mindtstr=mindtstr, - ) + {mindtstr})""" def verify_no_nulls(self): """ @@ -112,12 +108,8 @@ def verify_no_nulls(self): LEFT JOIN {aggs_tbl} t2 USING(entity_id, as_of_date) """ cols_sql = ",\n".join( - [ - """SUM(CASE WHEN "{col}" IS NULL THEN 1 ELSE 0 END) AS "{col}" """.format( - col=column.name - ) - for column in table_columns(self.final_feature_table_name, self.db_engine) - ] + f"""SUM(CASE WHEN "{column.name}" IS NULL THEN 1 ELSE 0 END) AS "{column.name}" """ + for column in table_columns(self.final_feature_table_name, self.db_engine) ) results = self.db_engine.execute(query_template.format( diff --git a/src/triage/component/architect/feature_dictionary.py b/src/triage/component/architect/feature_dictionary.py index 5bc29bcbc..bb3e4ac34 100644 --- a/src/triage/component/architect/feature_dictionary.py +++ b/src/triage/component/architect/feature_dictionary.py @@ -1,26 +1,21 @@ from triage.component.architect.utils import remove_schema_from_table_name from triage.util.structs import FeatureNameList from collections.abc import Iterable -from collections import MutableMapping -class FeatureDictionary(MutableMapping): +class FeatureDictionary(dict): """A feature dictionary, consisting of table names as keys and column names as values If a list of feature_blocks is passed, will initialize the feature dictionary with their data. 
""" def __init__(self, feature_blocks=None, *args, **kwargs): - self.tables = dict() - self.update(dict(*args, **kwargs)) # use the free update to set keys + super().__init__(*args, **kwargs) for feature_block in feature_blocks: cleaned_table = remove_schema_from_table_name( feature_block.final_feature_table_name ) self[cleaned_table] = feature_block.feature_columns - def __getitem__(self, key): - return FeatureNameList(self.tables[key]) - def __setitem__(self, table, feature_names): if not isinstance(table, str): raise ValueError("key of FeatureDictionary objects represents a table " @@ -34,31 +29,7 @@ def __setitem__(self, table, feature_names): f"invalid type: {type(feature_name)} " "The value of FeatureDictionary objects represents a list of " "feature names, and therefore each item must be a string") - self.tables[table] = feature_names - - def __delitem__(self, key): - del self.tables[key] - - def __iter__(self): - return iter(self.tables) - - def __len__(self): - return len(self.tables) - - def __sub__(self, other): - not_in_other = FeatureDictionary() - for table_name, feature_list in self.items(): - if table_name not in other: - not_in_other[table_name] = feature_list - continue - missing_feature_names = [ - feature_name - for feature_name in feature_list - if feature_name not in other[table_name] - ] - if missing_feature_names: - not_in_other[table_name] = missing_feature_names - return not_in_other - - def __repr__(self): - return str(self.tables) + if isinstance(feature_names, FeatureNameList): + super().__setitem__(table, feature_names) + else: + super().__setitem__(table, FeatureNameList(feature_names)) From 3a3c71f4f7e844ef3fd2ffaa2898a241b2e29639 Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Tue, 5 Mar 2019 17:04:04 -0600 Subject: [PATCH 07/22] from review --- src/triage/component/architect/feature_dictionary.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/triage/component/architect/feature_dictionary.py b/src/triage/component/architect/feature_dictionary.py index bb3e4ac34..33cde7913 100644 --- a/src/triage/component/architect/feature_dictionary.py +++ b/src/triage/component/architect/feature_dictionary.py @@ -18,14 +18,14 @@ def __init__(self, feature_blocks=None, *args, **kwargs): def __setitem__(self, table, feature_names): if not isinstance(table, str): - raise ValueError("key of FeatureDictionary objects represents a table " + raise TypeError("key of FeatureDictionary objects represents a table " "name and must be a string") if not isinstance(feature_names, Iterable): - raise ValueError("value of FeatureDictionary objects represents a list of " + raise TypeError("value of FeatureDictionary objects represents a list of " "feature names, and therefore must be iterable") for feature_name in feature_names: if not isinstance(feature_name, str): - raise ValueError(f"invalid value: {feature_name}. " + raise TypeError(f"invalid value: {feature_name}. 
" f"invalid type: {type(feature_name)} " "The value of FeatureDictionary objects represents a list of " "feature names, and therefore each item must be a string") From 5a14339265e22c2f720039bf65530deefb451b00 Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Wed, 6 Mar 2019 11:37:11 -0600 Subject: [PATCH 08/22] WIP --- .../component/architect/feature_dictionary.py | 10 ++---- src/triage/component/collate/spacetime.py | 15 ++++----- src/triage/experiments/validate.py | 15 ++++----- src/triage/util/structs.py | 33 ++++++++++++++++++- 4 files changed, 47 insertions(+), 26 deletions(-) diff --git a/src/triage/component/architect/feature_dictionary.py b/src/triage/component/architect/feature_dictionary.py index 33cde7913..aeef02d46 100644 --- a/src/triage/component/architect/feature_dictionary.py +++ b/src/triage/component/architect/feature_dictionary.py @@ -19,16 +19,10 @@ def __init__(self, feature_blocks=None, *args, **kwargs): def __setitem__(self, table, feature_names): if not isinstance(table, str): raise TypeError("key of FeatureDictionary objects represents a table " - "name and must be a string") + "name and must be a string") if not isinstance(feature_names, Iterable): raise TypeError("value of FeatureDictionary objects represents a list of " - "feature names, and therefore must be iterable") - for feature_name in feature_names: - if not isinstance(feature_name, str): - raise TypeError(f"invalid value: {feature_name}. " - f"invalid type: {type(feature_name)} " - "The value of FeatureDictionary objects represents a list of " - "feature names, and therefore each item must be a string") + "feature names, and therefore must be iterable") if isinstance(feature_names, FeatureNameList): super().__setitem__(table, feature_names) else: diff --git a/src/triage/component/collate/spacetime.py b/src/triage/component/collate/spacetime.py index fdccd569a..e04e93191 100644 --- a/src/triage/component/collate/spacetime.py +++ b/src/triage/component/collate/spacetime.py @@ -588,12 +588,11 @@ def _get_impute_create(self, impute_cols, nonimpute_cols): @property def feature_columns_sans_impflags(self): - columns = [] - for group, groupby in self.groups.items(): - intervals = self.intervals[group] - columns += list( - chain( - *[self._get_aggregates_sql(i, "2016-01-01", group) for i in intervals] - ) + columns = chain.from_iterable( + chain.from_iterable( + self._get_aggregates_sql(interval, "2016-01-01", group) + for interval in self.intervals[group] ) - return set(label_obj.name for label_obj in columns) + for (group, groupby) in self.groups.items() + ) + return {label.name for label in columns} diff --git a/src/triage/experiments/validate.py b/src/triage/experiments/validate.py index a882e0d91..abb2bf4db 100644 --- a/src/triage/experiments/validate.py +++ b/src/triage/experiments/validate.py @@ -779,18 +779,15 @@ def _run(self, scoring_config): class FeatureValidator(Validator): def _run(self, feature_config): feature_lookup = architect.feature_block_generators.FEATURE_BLOCK_GENERATOR_LOOKUP - available_keys = set(feature_lookup.keys()) - given_keys = set(feature_config.keys()) - bad_keys = given_keys - available_keys + bad_keys = feature_config.keys() - feature_lookup.keys() if bad_keys: raise ValueError( dedent( - """Section: features - - The following given feature types '{}' are unavailable. - Available metrics are: '{}' - """.format( - bad_keys, available_keys - ) + f"""\ + Section: features - + The following given feature types '{bad_keys}' are unavailable. 
+ Available feature types are: '{feature_lookup.keys()}' + """ ) ) if 'spacetime_aggregations' in feature_config: diff --git a/src/triage/util/structs.py b/src/triage/util/structs.py index 677616eaf..b5c1aaaaa 100644 --- a/src/triage/util/structs.py +++ b/src/triage/util/structs.py @@ -13,4 +13,35 @@ class AsOfTimeList(TruncatedRepresentationList): class FeatureNameList(TruncatedRepresentationList): - pass + def check(self, value): + if not isinstance(value, str): + raise TypeError("A FeatureNameList represents a list of feature names, and therefore" + f"each item must be a string, not: {value!r}") + return value + + def insert(self, i, v): + raise ValueError("in insert!") + self.check(v) + super().insert(i, v) + + def __setitem__(self, i, v): + raise ValueError("in setitem!") + self.check(v) + super().__setitem__(i, v) + + def append(self, item): + raise ValueError("in append!") + self.check(item) + super().append(item) + + def extend(self, t): + raise ValueError("in extend") + return super().extend([ self.check(v) for v in t ]) + + def __add__(self, t): # This is for something
like `CheckedList(validator, [1, 2, 3]) + list([4, 5, 6])`... - raise ValueError("in add") - return super().__add__([ self.check(v) for v in t ]) - - def __iadd__(self, t): # This is for something like `l = CheckedList(validator); l += [1, 2, 3]` - raise ValueError("in iadd") - return super().__iadd__([ self.check(v) for v in t ]) + pass From 929a2e9d0ef0e1a4f83228b3b9d65c25cd16681a Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Mon, 11 Mar 2019 16:22:20 -0500 Subject: [PATCH 10/22] Update feature mock --- src/tests/test_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/test_cli.py b/src/tests/test_cli.py index 497059381..fd2453a1f 100644 --- a/src/tests/test_cli.py +++ b/src/tests/test_cli.py @@ -51,7 +51,7 @@ def test_cli_crosstabs(): def test_featuretest(): - with patch('triage.cli.FeatureGenerator', autospec=True) as featuremock: + with patch('triage.cli.feature_blocks_from_config', autospec=True) as featuremock: with patch('triage.cli.EntityDateTableGenerator', autospec=True) as cohortmock: try_command('featuretest', 'example/config/experiment.yaml', '2017-06-06') featuremock.assert_called_once() From 8733fcb89e2a7861d58479d8a9e84f8d32fde6b4 Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Thu, 21 Mar 2019 15:57:12 -0500 Subject: [PATCH 11/22] WIP --- example/config/experiment.yaml | 78 ++++----- .../test_feature_block_generators.py | 148 +++++++++--------- src/tests/collate_tests/test_spacetime.py | 5 +- .../component/architect/feature_block.py | 5 +- .../architect/feature_block_generators.py | 33 ++-- src/triage/component/collate/spacetime.py | 23 +-- src/triage/experiments/base.py | 39 ++--- 7 files changed, 157 insertions(+), 174 deletions(-) diff --git a/example/config/experiment.yaml b/example/config/experiment.yaml index 5bf5eec4d..19356d697 100644 --- a/example/config/experiment.yaml +++ b/example/config/experiment.yaml @@ -73,42 +73,46 @@ label_config: # FEATURE GENERATION features: - spacetime_aggregations: - # The aggregate features to generate for each train/test split + # Every entry in the features section contains: + # - a key that names the output feature table + # - a value that configures the feature table based on the options + # for one of the available feature generator types # - # Implemented by wrapping collate: https://github.com/dssg/collate - # Most terminology here is taken directly from collate - # - # Each entry describes a collate.SpacetimeAggregation object, and the - # arguments needed to create it. Generally, each of these entries controls - # the features from one source table, though in the case of multiple groups - # may result in multiple output tables - # - # Rules specifying how to handle imputation of null values must be explicitly - # defined in your config file. These can be specified in two places: either - # within each feature or overall for each type of feature (aggregates_imputation, - # categoricals_imputation, array_categoricals_imputation). In either case, a rule must be given for - # each aggregation function (e.g., sum, max, avg, etc) used, or a catch-all - # can be specified with `all`. Aggregation function-specific rules will take - # precedence over the `all` rule and feature-specific rules will take - # precedence over the higher-level rules. Several examples are provided below. - # - # Available Imputation Rules: - # * mean: The average value of the feature (for SpacetimeAggregation the - # mean is taken within-date). 
- # * constant: Fill with a constant value from a required `value` parameter. - # * zero: Fill with zero. - # * null_category: Only available for categorical features. Just flag null - # values with the null category column. - # * binary_mode: Only available for aggregate column types. Takes the modal - # value for a binary feature. - # * error: Raise an exception if any null values are encountered for this - # feature. - - - # prefix given to the resultant tables - prefix: 'prefix' - # from_obj is usually a source table but can be an expression, such as - # a join (ie 'cool_stuff join other_stuff using (stuff_id)') + my_feature_output_table: # the output from this will go into 'my_feature_output_table' + feature_generator_type: "spacetime_aggregation" + # available types: 'spacetime_aggregation' + # + # SPACETIME_AGGREGATION + # The aggregate features to generate for each train/test split + # + # Implemented by wrapping collate: https://github.com/dssg/collate + # Most terminology here is taken directly from collate + # + # Each entry describes a collate.SpacetimeAggregation object, and the + # arguments needed to create it. Generally, each of these entries controls + # the features from one source table, though in the case of multiple groups + # may result in multiple output tables + # + # Rules specifying how to handle imputation of null values must be explicitly + # defined in your config file. These can be specified in two places: either + # within each feature or overall for each type of feature (aggregates_imputation, + # categoricals_imputation, array_categoricals_imputation). In either case, a rule must be given for + # each aggregation function (e.g., sum, max, avg, etc) used, or a catch-all + # can be specified with `all`. Aggregation function-specific rules will take + # precedence over the `all` rule and feature-specific rules will take + # precedence over the higher-level rules. Several examples are provided below. + # + # Available Imputation Rules: + # * mean: The average value of the feature (for SpacetimeAggregation the + # mean is taken within-date). + # * constant: Fill with a constant value from a required `value` parameter. + # * zero: Fill with zero. + # * null_category: Only available for categorical features. Just flag null + # values with the null category column. + # * binary_mode: Only available for aggregate column types. Takes the modal + # value for a binary feature. + # * error: Raise an exception if any null values are encountered for this + # feature. from_obj: 'cool_stuff' # The date column to use for specifying which records to include # in temporal features. It is important that the column used specifies @@ -208,10 +212,10 @@ features: # feature_group_definition allows you to create groups/subset of your features # by different criteria. 
# for instance, -# - 'tables' allows you to send a list of collate feature tables (collate builds these by appending 'aggregation_imputed' to the prefix) +# - 'tables' allows you to send a list of feature tables # - 'prefix' allows you to specify a list of feature name prefixes feature_group_definition: - tables: ['prefix_aggregation_imputed'] + tables: ['my_feature_output_table'] # strategies for generating combinations of groups # available: all, leave-one-out, leave-one-in, all-combinations diff --git a/src/tests/architect_tests/test_feature_block_generators.py b/src/tests/architect_tests/test_feature_block_generators.py index 283ae0d8a..9078eb3aa 100644 --- a/src/tests/architect_tests/test_feature_block_generators.py +++ b/src/tests/architect_tests/test_feature_block_generators.py @@ -1,6 +1,6 @@ from datetime import datetime, date -from triage.component.architect.feature_block_generators import generate_spacetime_aggregations +from triage.component.architect.feature_block_generators import generate_spacetime_aggregation import triage.component.collate as collate import pytest @@ -8,45 +8,41 @@ def test_spacetime_generation(db_engine): - aggregation_config = [ - { - "prefix": "aprefix", - "aggregates": [ - { - "quantity": "quantity_one", - "metrics": ["sum", "count"], - "imputation": { - "sum": {"type": "constant", "value": 137}, - "count": {"type": "zero"}, - }, - } - ], - "categoricals_imputation": {"all": {"type": "null_category"}}, - "categoricals": [ - {"column": "cat_one", "choices": ["good", "bad"], "metrics": ["sum"]} - ], - "groups": ["entity_id", "zip_code"], - "intervals": ["all"], - "knowledge_date_column": "knowledge_date", - "from_obj": "data", - } - ] - aggregations = generate_spacetime_aggregations( + aggregation_config = { + "aggregates": [ + { + "quantity": "quantity_one", + "metrics": ["sum", "count"], + "imputation": { + "sum": {"type": "constant", "value": 137}, + "count": {"type": "zero"}, + }, + } + ], + "categoricals_imputation": {"all": {"type": "null_category"}}, + "categoricals": [ + {"column": "cat_one", "choices": ["good", "bad"], "metrics": ["sum"]} + ], + "groups": ["entity_id", "zip_code"], + "intervals": ["all"], + "knowledge_date_column": "knowledge_date", + "from_obj": "data", + } + aggregation = generate_spacetime_aggregation( feature_aggregation_config=aggregation_config, as_of_dates=["2017-01-02", "2017-02-02"], cohort_table="my_cohort", + feature_table_name="my_features", db_engine=db_engine, features_schema_name="features", feature_start_time="2011-01-01", ) - assert len(aggregations) == 1 - aggregation = aggregations[0] assert isinstance(aggregation, collate.SpacetimeAggregation) assert aggregation.as_of_dates == ["2017-01-02", "2017-02-02"] assert aggregation.feature_start_time == "2011-01-01" assert aggregation.groups == {"entity_id": "entity_id", "zip_code": "zip_code"} assert aggregation.intervals == {"entity_id": ["all"], "zip_code": ["all"]} - assert aggregation.from_obj == "data" + assert str(aggregation.from_obj) == "data" assert len(aggregation.aggregates) == 2 for aggregate in aggregation.aggregates: if isinstance(aggregate, collate.Categorical): @@ -118,32 +114,30 @@ def fixture_test_engine(db_engine): def test_choice_query(test_engine): - aggregation_config = [ - { - "prefix": "aprefix", - "categoricals": [ - { - "column": "cat_one", - "choice_query": "select distinct(cat_one) from data", - "metrics": ["sum"], - "imputation": {"all": {"type": "null_category"}}, - } - ], - "groups": ["entity_id"], - "intervals": ["all"], - 
"knowledge_date_column": "knowledge_date", - "from_obj": "data", - } - ] - aggregations = generate_spacetime_aggregations( + aggregation_config = { + "categoricals": [ + { + "column": "cat_one", + "choice_query": "select distinct(cat_one) from data", + "metrics": ["sum"], + "imputation": {"all": {"type": "null_category"}}, + } + ], + "groups": ["entity_id"], + "intervals": ["all"], + "knowledge_date_column": "knowledge_date", + "from_obj": "data", + } + aggregation = generate_spacetime_aggregation( feature_aggregation_config=aggregation_config, as_of_dates=["2017-01-02", "2017-02-02"], cohort_table="my_cohort", db_engine=test_engine, features_schema_name="features", feature_start_time="2011-01-01", + feature_table_name="aprefix", ) - assert aggregations[0].aggregates[0].quantities == { + assert aggregation.aggregates[0].quantities == { "cat_one__NULL": ('(cat_one is NULL)::INT',), "cat_one_bad": ("(cat_one = 'bad')::INT",), "cat_one_good": ("(cat_one = 'good')::INT",), @@ -151,42 +145,39 @@ def test_choice_query(test_engine): } def test_array_categoricals(test_engine): - aggregation_config = [ - { - "prefix": "aprefix", - "array_categoricals": [ - { - "column": "cat_one", - "choices": ["good", "bad", "inbetween"], - "metrics": ["sum"], - "imputation": {"all": {"type": "null_category"}}, - } - ], - "groups": ["entity_id"], - "intervals": ["all"], - "knowledge_date_column": "knowledge_date", - "from_obj": "data", - } - ] - aggregations = generate_spacetime_aggregations( + aggregation_config = { + "array_categoricals": [ + { + "column": "cat_one", + "choices": ["good", "bad", "inbetween"], + "metrics": ["sum"], + "imputation": {"all": {"type": "null_category"}}, + } + ], + "groups": ["entity_id"], + "intervals": ["all"], + "knowledge_date_column": "knowledge_date", + "from_obj": "data", + } + aggregation = generate_spacetime_aggregation( feature_aggregation_config=aggregation_config, as_of_dates=["2017-01-02", "2017-02-02"], cohort_table="my_cohort", db_engine=test_engine, features_schema_name="features", feature_start_time="2011-01-01", + feature_table_name="aprefix", ) - assert aggregations[0].aggregates[0].quantities == { + assert aggregation.aggregates[0].quantities == { "cat_one__NULL": ('(cat_one is NULL)::INT',), "cat_one_bad": ("(cat_one @> array['bad'::varchar])::INT",), "cat_one_good": ("(cat_one @> array['good'::varchar])::INT",), "cat_one_inbetween": ("(cat_one @> array['inbetween'::varchar])::INT",), } -def test_materialize_off(db_engine): - aggregation_config = [{ - "prefix": "aprefix", +def xtest_materialize_off(db_engine): + aggregation_config = { "categoricals": [ { "column": "cat_one", @@ -199,23 +190,23 @@ def test_materialize_off(db_engine): "intervals": ["all"], "knowledge_date_column": "knowledge_date", "from_obj": "data", - }] + } with patch("triage.component.architect.feature_block_generators.FromObj") as fromobj_mock: - feature_generator = generate_spacetime_aggregations( + feature_generator = generate_spacetime_aggregation( feature_aggregation_config=aggregation_config, as_of_dates=["2017-01-02", "2017-02-02"], cohort_table="my_cohort", db_engine=db_engine, features_schema_name="features", - materialize_subquery_fromobjs=False + materialize_subquery_fromobjs=False, + feature_table_name="aprefix", ) assert not fromobj_mock.called -def test_aggregations_materialize_on(db_engine): - aggregation_config = [{ - "prefix": "aprefix", +def xtest_aggregations_materialize_on(db_engine): + aggregation_config = { "categoricals": [ { "column": "cat_one", @@ -228,16 +219,17 @@ def 
test_aggregations_materialize_on(db_engine): "intervals": ["all"], "knowledge_date_column": "knowledge_date", "from_obj": "data", - }] + } with patch("triage.component.architect.feature_block_generators.FromObj") as fromobj_mock: - feature_generator = generate_spacetime_aggregations( + feature_generator = generate_spacetime_aggregation( feature_aggregation_config=aggregation_config, as_of_dates=["2017-01-02", "2017-02-02"], cohort_table="my_cohort", db_engine=db_engine, features_schema_name="features", - materialize_subquery_fromobjs=True + materialize_subquery_fromobjs=True, + feature_table_name="aprefix", ) fromobj_mock.assert_called_once_with( from_obj="data", diff --git a/src/tests/collate_tests/test_spacetime.py b/src/tests/collate_tests/test_spacetime.py index c7e975a02..860b410e8 100755 --- a/src/tests/collate_tests/test_spacetime.py +++ b/src/tests/collate_tests/test_spacetime.py @@ -71,6 +71,7 @@ def test_basic_spacetime(): intervals=["1y", "2y", "all"], as_of_dates=["2016-01-01", "2015-01-01"], features_schema_name="schema", + features_table_name="myfeaturetable", cohort_table="states", entity_column="entity_id", date_column="event_date", @@ -81,7 +82,7 @@ def test_basic_spacetime(): engine.execute(st.get_create_schema()) st.run_preimputation() r = engine.execute( - "select * from schema.events_entity_id order by entity_id, as_of_date" + "select * from schema.myfeaturetable_entity_id order by entity_id, as_of_date" ) rows = [x for x in r] assert rows[0]["entity_id"] == 1 @@ -149,7 +150,7 @@ def test_basic_spacetime(): st.run_imputation() # check some imputation results r = engine.execute( - "select * from schema.events_aggregation_imputed order by entity_id, as_of_date" + "select * from schema.myfeaturetable order by entity_id, as_of_date" ) rows = [x for x in r] assert rows[6]["entity_id"] == 4 diff --git a/src/triage/component/architect/feature_block.py b/src/triage/component/architect/feature_block.py index 0ac02a746..070954a44 100644 --- a/src/triage/component/architect/feature_block.py +++ b/src/triage/component/architect/feature_block.py @@ -12,6 +12,7 @@ def __init__( db_engine, cohort_table, as_of_dates, + features_table_name, features_schema_name=None, feature_start_time=None, features_ignore_cohort=False, @@ -19,15 +20,15 @@ def __init__( self.db_engine = db_engine self.cohort_table_name = cohort_table self.as_of_dates = as_of_dates + self.features_table_name_without_schema = features_table_name self.features_schema_name = features_schema_name self.feature_start_time = feature_start_time self.features_ignore_cohort = features_ignore_cohort @property - @abstractmethod def final_feature_table_name(self): "The name of the final table with all features filled in (no missing values)" - pass + return f"{self.features_schema_name}.{self.features_table_name_without_schema}" @property @abstractmethod diff --git a/src/triage/component/architect/feature_block_generators.py b/src/triage/component/architect/feature_block_generators.py index 0a7d905e5..b526aae40 100644 --- a/src/triage/component/architect/feature_block_generators.py +++ b/src/triage/component/architect/feature_block_generators.py @@ -8,8 +8,9 @@ ) -def generate_spacetime_aggregations( +def generate_spacetime_aggregation( feature_aggregation_config, + feature_table_name, as_of_dates, cohort_table, db_engine, @@ -23,6 +24,7 @@ def generate_spacetime_aggregations( Args: feature_aggregation_config (list) all values, except for feature date, necessary to instantiate a collate.SpacetimeAggregation + feature_table_name 
(string) the table in which to put output features as_of_dates (list) dates to generate features as of cohort_table (string) schema.table_name for state table with all entity/date pairs db_engine (sqlalchemy.db.engine) @@ -49,10 +51,11 @@ def generate_spacetime_aggregations( feature_start_time=feature_start_time, materialize_subquery_fromobjs=materialize_subquery_fromobjs, features_ignore_cohort=features_ignore_cohort, - ).aggregations( + ).aggregation( feature_aggregation_config, as_of_dates, - cohort_table + cohort_table, + feature_table_name ) @@ -158,7 +161,7 @@ def _build_array_categoricals(self, categorical_config, impute_rules): for categorical in categorical_config ] - def _aggregation(self, aggregation_config, feature_dates, state_table): + def aggregation(self, aggregation_config, feature_dates, state_table, feature_table_name): logging.info( "Building collate.SpacetimeAggregation for config %s and %s as_of_dates", aggregation_config, @@ -203,19 +206,13 @@ def _aggregation(self, aggregation_config, feature_dates, state_table): db_engine=self.db_engine, feature_start_time=self.feature_start_time, features_schema_name=self.features_schema_name, - prefix=aggregation_config["prefix"], + features_table_name=feature_table_name, features_ignore_cohort=self.features_ignore_cohort ) - def aggregations(self, feature_aggregation_config, feature_dates, state_table): - return [ - self._aggregation(aggregation_config, feature_dates, state_table) - for aggregation_config in feature_aggregation_config - ] - FEATURE_BLOCK_GENERATOR_LOOKUP = { - 'spacetime_aggregations': generate_spacetime_aggregations + 'spacetime_aggregations': generate_spacetime_aggregation } @@ -249,15 +246,17 @@ def feature_blocks_from_config( Returns: (list) of FeatureBlock objects """ feature_blocks = [] - for config_key, config_value in config.items(): - feature_block_generator = FEATURE_BLOCK_GENERATOR_LOOKUP.get(config_key, None) + for feature_table_name, feature_block_configuration in config.items(): + feature_generator_type = feature_block_configuration.pop("feature_generator_type") + feature_block_generator = FEATURE_BLOCK_GENERATOR_LOOKUP.get(feature_generator_type, None) if not feature_block_generator: - raise ValueError(f"feature config key {config_key} does not correspond to a recognized" - " feature generator. Recognized feature generator keys:" + raise ValueError(f"feature generator type {feature_generator_type} does not correspond to a recognized" + " feature generator. 
Recognized feature generator types:" f"{FEATURE_BLOCK_GENERATOR_LOOKUP.keys()}") for feature_block in feature_block_generator( - config_value, + feature_block_configuration, + feature_table_name=feature_table_name, as_of_dates=as_of_dates, cohort_table=cohort_table, db_engine=db_engine, diff --git a/src/triage/component/collate/spacetime.py b/src/triage/component/collate/spacetime.py index bea219bc3..520d7bcbb 100644 --- a/src/triage/component/collate/spacetime.py +++ b/src/triage/component/collate/spacetime.py @@ -82,21 +82,19 @@ def __init__( self.from_obj = make_sql_clause(from_obj, ex.text) self.entity_column = entity_column if entity_column else "entity_id" self.prefix = prefix if prefix else str(from_obj) - self.suffix = suffix if suffix else "aggregation" self.drop_interim_tables = drop_interim_tables def get_table_name(self, group=None, imputed=False): """ Returns name for table for the given group """ - if group is None and not imputed: - name = '"%s_%s"' % (self.prefix, self.suffix) - elif group is None and imputed: - name = '"%s_%s_%s"' % (self.prefix, self.suffix, "imputed") - elif imputed: - name = '"%s"' % to_sql_name("%s_%s_%s" % (self.prefix, group, "imputed")) + if imputed: + return self.final_feature_table_name + prefix = self.features_table_name_without_schema + if group is None: + name = '"%s_%s"' % (prefix, "aggregation") else: - name = '"%s"' % to_sql_name("%s_%s" % (self.prefix, group)) + name = '"%s"' % to_sql_name("%s_%s" % (prefix, group)) schema = '"%s".' % self.features_schema_name if self.features_schema_name else "" return "%s%s" % (schema, name) @@ -219,11 +217,6 @@ def feature_columns(self): columns.add(imp_flag_col) return columns - @property - def final_feature_table_name(self): - "The name of the final table with all features filled in (no missing values)" - return self.get_table_name(imputed=True) - @property def preinsert_queries(self): """ @@ -277,7 +270,7 @@ def imputation_queries(self): if not self.cohort_table_name: logging.warning( "No cohort table defined in feature_block, cannot create imputation table for %s", - self.get_table_name(imputed=True), + self.final_feature_table_name ) return [] @@ -285,7 +278,7 @@ def imputation_queries(self): logging.warning( "Cohort table %s does not exist, cannot create imputation table for %s", self.cohort_table_name, - self.get_table_name(imputed=True), + self.final_feature_table_name ) return [] diff --git a/src/triage/experiments/base.py b/src/triage/experiments/base.py index 89fcfd0c8..138d76b5f 100644 --- a/src/triage/experiments/base.py +++ b/src/triage/experiments/base.py @@ -202,6 +202,21 @@ def initialize_components(self): "you will not be able to make matrices." 
) + if "features" not in self.config: + logging.warning("No feature config is available") + return [] + logging.info("Creating feature blocks from config") + self.feature_blocks = feature_blocks_from_config( + config=self.config["features"], + as_of_dates=self.all_as_of_times, + cohort_table=self.cohort_table_name, + features_schema_name=self.features_schema_name, + db_engine=self.db_engine, + feature_start_time=self.config["temporal_config"]["feature_start_time"], + features_ignore_cohort=self.features_ignore_cohort, + materialize_subquery_fromobjs=self.materialize_subquery_fromobjs, + ) + self.feature_group_creator = FeatureGroupCreator( self.config.get("feature_group_definition", {"all": [True]}) ) @@ -241,7 +256,7 @@ def initialize_components(self): replace=self.replace, as_of_times=self.all_as_of_times ) - + self.trainer = ModelTrainer( experiment_hash=self.experiment_hash, model_storage_engine=self.model_storage_engine, @@ -374,28 +389,6 @@ def all_as_of_times(self): ) return distinct_as_of_times - @cachedproperty - def feature_blocks(self): - """Collation of ``Aggregation`` objects used by this experiment. - - Returns: (list) of ``collate.Aggregation`` objects - - """ - if "features" not in self.config: - logging.warning("No feature config is available") - return [] - logging.info("Creating feature blocks from config") - return feature_blocks_from_config( - config=self.config["features"], - as_of_dates=self.all_as_of_times, - cohort_table=self.cohort_table_name, - features_schema_name=self.features_schema_name, - db_engine=self.db_engine, - feature_start_time=self.config["temporal_config"]["feature_start_time"], - features_ignore_cohort=self.features_ignore_cohort, - materialize_subquery_fromobjs=self.materialize_subquery_fromobjs, - ) - @cachedproperty def master_feature_dictionary(self): """All possible features found in the database. 
Not all features From 3e40ae071e91590a9ef6166a47580c9e4ad9a71d Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Mon, 1 Apr 2019 10:43:59 -0500 Subject: [PATCH 12/22] converted one more spacetime test for now --- src/tests/collate_tests/test_spacetime.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/tests/collate_tests/test_spacetime.py b/src/tests/collate_tests/test_spacetime.py index 860b410e8..79ef9e00a 100755 --- a/src/tests/collate_tests/test_spacetime.py +++ b/src/tests/collate_tests/test_spacetime.py @@ -232,6 +232,7 @@ def test_feature_start_time(): groups=["entity_id"], intervals=["all"], as_of_dates=["2016-01-01"], + features_table_name="event_features", cohort_table="states", entity_column="entity_id", date_column='"date"', @@ -242,7 +243,7 @@ def test_feature_start_time(): st.run_preimputation() - r = engine.execute("select * from events_entity_id order by entity_id") + r = engine.execute("select * from event_features_entity_id order by entity_id") rows = [x for x in r] assert rows[0]["entity_id"] == 1 @@ -262,6 +263,7 @@ def test_feature_start_time(): groups=["entity_id"], intervals=["1y", "all"], as_of_dates=["2016-01-01", "2015-01-01"], + features_table_name="event_features", cohort_table="states", entity_column="entity_id", date_column='"date"', From cf748229c2e5bee03b7b29495b7d7615d1d94861 Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Mon, 1 Apr 2019 11:17:19 -0500 Subject: [PATCH 13/22] Finish converting spacetime tests --- src/tests/collate_tests/test_spacetime.py | 28 ++++++++++--------- .../component/architect/feature_block.py | 4 ++- src/triage/component/collate/spacetime.py | 2 +- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/src/tests/collate_tests/test_spacetime.py b/src/tests/collate_tests/test_spacetime.py index 79ef9e00a..8c4da8152 100755 --- a/src/tests/collate_tests/test_spacetime.py +++ b/src/tests/collate_tests/test_spacetime.py @@ -310,6 +310,7 @@ def test_features_ignore_cohort(db_engine): groups=["entity_id"], intervals=["all"], as_of_dates=["2016-01-01", "2015-01-01"], + features_table_name="event_features", cohort_table="cohort", entity_column="entity_id", date_column='"date"', @@ -320,7 +321,7 @@ def test_features_ignore_cohort(db_engine): st.run_preimputation() - r = db_engine.execute("select * from events_entity_id order by entity_id, date") + r = db_engine.execute("select * from event_features_entity_id order by entity_id, date") rows = [x for x in r] # these rows should be similar to the rows in the basic spacetime test, @@ -350,43 +351,43 @@ def test_aggregation_table_name_no_schema(): # no schema assert ( SpacetimeAggregation( - [], from_obj="source", groups=[], cohort_table="tbl", db_engine=None, as_of_dates=[], + [], from_obj="source", groups=[], cohort_table="tbl", features_table_name="source_features", db_engine=None, as_of_dates=[], ).get_table_name() - == '"source_aggregation"' + == '"source_features_aggregation"' ) assert ( - SpacetimeAggregation([], from_obj="source", groups=[], cohort_table="tbl", db_engine=None, as_of_dates=[]).get_table_name( + SpacetimeAggregation([], from_obj="source", groups=[], cohort_table="tbl", features_table_name="source_features", db_engine=None, as_of_dates=[]).get_table_name( imputed=True ) - == '"source_aggregation_imputed"' + == '"source_features"' ) # prefix assert ( SpacetimeAggregation( - [], from_obj="source", prefix="mysource", groups=[], cohort_table="tbl", db_engine=None, as_of_dates=[], + [], from_obj="source", prefix="mysource", groups=[], 
cohort_table="tbl", features_table_name="source_features", db_engine=None, as_of_dates=[], ).get_table_name() - == '"mysource_aggregation"' + == '"source_features_aggregation"' ) assert ( SpacetimeAggregation( - [], from_obj="source", prefix="mysource", groups=[], cohort_table="tbl", db_engine=None, as_of_dates=[], + [], from_obj="source", prefix="mysource", groups=[], cohort_table="tbl", features_table_name="source_features", db_engine=None, as_of_dates=[], ).get_table_name(imputed=True) - == '"mysource_aggregation_imputed"' + == '"source_features"' ) # schema assert ( SpacetimeAggregation( - [], from_obj="source", features_schema_name="schema", groups=[], cohort_table="tbl", db_engine=None, as_of_dates=[], + [], from_obj="source", features_schema_name="schema", groups=[], cohort_table="tbl", features_table_name="source_features", db_engine=None, as_of_dates=[], ).get_table_name() - == '"schema"."source_aggregation"' + == '"schema"."source_features_aggregation"' ) assert ( SpacetimeAggregation( - [], from_obj="source", features_schema_name="schema", groups=[], cohort_table="tbl", db_engine=None, as_of_dates=[], + [], from_obj="source", features_schema_name="schema", groups=[], cohort_table="tbl", features_table_name="source_features", db_engine=None, as_of_dates=[], ).get_table_name(imputed=True) - == '"schema"."source_aggregation_imputed"' + == '"schema"."source_features"' ) @@ -400,6 +401,7 @@ def test_get_feature_columns(): aggregates=[n, d, m], from_obj="source", features_schema_name="schema", + features_table_name="source_features", prefix="prefix", groups=["entity_id"], cohort_table="tbl", diff --git a/src/triage/component/architect/feature_block.py b/src/triage/component/architect/feature_block.py index 070954a44..1e2cd0b02 100644 --- a/src/triage/component/architect/feature_block.py +++ b/src/triage/component/architect/feature_block.py @@ -28,7 +28,9 @@ def __init__( @property def final_feature_table_name(self): "The name of the final table with all features filled in (no missing values)" - return f"{self.features_schema_name}.{self.features_table_name_without_schema}" + schema = '"%s".' % self.features_schema_name if self.features_schema_name else "" + name = f'"{self.features_table_name_without_schema}"' + return "%s%s" % (schema, name) @property @abstractmethod diff --git a/src/triage/component/collate/spacetime.py b/src/triage/component/collate/spacetime.py index 520d7bcbb..775b6b481 100644 --- a/src/triage/component/collate/spacetime.py +++ b/src/triage/component/collate/spacetime.py @@ -95,7 +95,7 @@ def get_table_name(self, group=None, imputed=False): name = '"%s_%s"' % (prefix, "aggregation") else: name = '"%s"' % to_sql_name("%s_%s" % (prefix, group)) - schema = '"%s".' % self.features_schema_name if self.features_schema_name else "" + schema = '"%s".' 
% to_sql_name(self.features_schema_name) if self.features_schema_name else "" return "%s%s" % (schema, name) def get_drops(self): From 0fd781b373e9854172a943725ca839634bcc6693 Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Tue, 2 Apr 2019 10:46:12 -0500 Subject: [PATCH 14/22] More fixes --- .../architect_tests/test_feature_blocks.py | 35 +++++++++-- src/tests/architect_tests/test_integration.py | 62 +++++++++---------- src/tests/test_experiments.py | 4 +- src/tests/utils.py | 14 ++--- .../architect/feature_block_generators.py | 8 +-- src/triage/component/collate/spacetime.py | 4 +- 6 files changed, 76 insertions(+), 51 deletions(-) diff --git a/src/tests/architect_tests/test_feature_blocks.py b/src/tests/architect_tests/test_feature_blocks.py index bdc4a6277..e8a0130ee 100644 --- a/src/tests/architect_tests/test_feature_blocks.py +++ b/src/tests/architect_tests/test_feature_blocks.py @@ -53,7 +53,12 @@ def populate_cohort(db_engine): def test_FeatureBlock_generate_preimpute_tasks(db_engine): - block = FeatureBlockExample(db_engine=db_engine, cohort_table="mycohort", as_of_dates=['2016-01-01', '2016-02-01']) + block = FeatureBlockExample( + db_engine=db_engine, + cohort_table="mycohort", + features_table_name="myfeaturetable", + as_of_dates=['2016-01-01', '2016-02-01'] + ) block.needs_features = lambda: True assert block.generate_preimpute_tasks(replace=False) == { "prepare": block.preinsert_queries, @@ -71,7 +76,12 @@ def test_FeatureBlock_generate_preimpute_tasks(db_engine): def test_FeatureBlock_generate_impute_tasks(db_engine): - block = FeatureBlockExample(db_engine=db_engine, cohort_table="mycohort", as_of_dates=['2016-01-01', '2016-02-01']) + block = FeatureBlockExample( + db_engine=db_engine, + cohort_table="mycohort", + features_table_name="myfeaturetable", + as_of_dates=['2016-01-01', '2016-02-01'] + ) block.needs_features = lambda: True assert block.generate_impute_tasks(replace=False) == { "prepare": block.imputation_queries, @@ -89,7 +99,12 @@ def test_FeatureBlock_generate_impute_tasks(db_engine): def test_FeatureBlock_log_verbose_task_info(db_engine): - block = FeatureBlockExample(db_engine=db_engine, cohort_table="mycohort", as_of_dates=['2016-01-01', '2016-02-01']) + block = FeatureBlockExample( + db_engine=db_engine, + cohort_table="mycohort", + features_table_name="myfeaturetable", + as_of_dates=['2016-01-01', '2016-02-01'] + ) task = block.generate_impute_tasks(replace=True) # just want to make sure that the logging doesn't error, no assertions block.log_verbose_task_info(task) @@ -99,7 +114,12 @@ def test_FeatureBlock_needs_features(db_engine): # needs_features should function as following: # if there are members of the cohort without features, needs_features should return true # 1. a freshly created table should definitely need features - block = FeatureBlockExample(db_engine=db_engine, cohort_table="mycohort", as_of_dates=['2016-01-01', '2016-02-01']) + block = FeatureBlockExample( + db_engine=db_engine, + cohort_table="mycohort", + features_table_name="myfeaturetable", + as_of_dates=['2016-01-01', '2016-02-01'] + ) populate_cohort(db_engine) assert block.needs_features() block.run_preimputation() @@ -116,7 +136,12 @@ def test_FeatureBlock_verify_nonulls(db_engine): # verify_no_nulls should function as following: # if there are members of the cohort without features, needs_features should return true # 1. 
a freshly created table should definitely need features - block = FeatureBlockExample(db_engine=db_engine, cohort_table="mycohort", as_of_dates=['2016-01-01', '2016-02-01']) + block = FeatureBlockExample( + db_engine=db_engine, + cohort_table="mycohort", + features_table_name="myfeaturetable", + as_of_dates=['2016-01-01', '2016-02-01'] + ) populate_cohort(db_engine) block.run_preimputation() with pytest.raises(ValueError): diff --git a/src/tests/architect_tests/test_integration.py b/src/tests/architect_tests/test_integration.py index acd545df6..c43fe2eeb 100644 --- a/src/tests/architect_tests/test_integration.py +++ b/src/tests/architect_tests/test_integration.py @@ -221,38 +221,38 @@ def basic_integration_test( feature_blocks = feature_blocks_from_config( { - 'spacetime_aggregations': [ - { - "prefix": "cat", - "from_obj": "cat_complaints", - "knowledge_date_column": "as_of_date", - "aggregates": [ - { - "quantity": "cat_sightings", - "metrics": ["count", "avg"], - "imputation": {"all": {"type": "mean"}}, - } - ], - "intervals": ["1y"], - "groups": ["entity_id"], + 'cat': { + "feature_generator_type": "spacetime_aggregation", + "prefix": "cat", + "from_obj": "cat_complaints", + "knowledge_date_column": "as_of_date", + "aggregates": [ + { + "quantity": "cat_sightings", + "metrics": ["count", "avg"], + "imputation": {"all": {"type": "mean"}}, + } + ], + "intervals": ["1y"], + "groups": ["entity_id"], + }, + 'dog': { + "feature_generator_type": "spacetime_aggregation", + "prefix": "dog", + "from_obj": "dog_complaints", + "knowledge_date_column": "as_of_date", + "aggregates_imputation": { + "count": {"type": "constant", "value": 7}, + "sum": {"type": "mean"}, + "avg": {"type": "zero"}, }, - { - "prefix": "dog", - "from_obj": "dog_complaints", - "knowledge_date_column": "as_of_date", - "aggregates_imputation": { - "count": {"type": "constant", "value": 7}, - "sum": {"type": "mean"}, - "avg": {"type": "zero"}, - }, - "aggregates": [ - {"quantity": "dog_sightings", "metrics": ["count", "avg"]} - - ], - "intervals": ["1y"], - "groups": ["entity_id"], - }, - ] + "aggregates": [ + {"quantity": "dog_sightings", "metrics": ["count", "avg"]} + + ], + "intervals": ["1y"], + "groups": ["entity_id"], + }, }, as_of_dates=all_as_of_times, cohort_table=entity_date_table_generator.entity_date_table_name, diff --git a/src/tests/test_experiments.py b/src/tests/test_experiments.py index 6c4f4744c..d4e2a39e9 100644 --- a/src/tests/test_experiments.py +++ b/src/tests/test_experiments.py @@ -405,8 +405,8 @@ def test_baselines_with_missing_features(experiment_class): } config["feature_group_definition"] = { "tables": [ - "entity_features_aggregation_imputed", - "zip_code_features_aggregation_imputed", + "entity_features", + "zip_code_features", ] } config["feature_group_strategies"] = ["leave-one-in"] diff --git a/src/tests/utils.py b/src/tests/utils.py index 20c595462..5ed8d90c6 100644 --- a/src/tests/utils.py +++ b/src/tests/utils.py @@ -356,9 +356,9 @@ def sample_config(): } } - spacetime_agg_config = [ - { - "prefix": "entity_features", + feature_config = { + "entity_features": { + "feature_generator_type": "spacetime_aggregation", "from_obj": "cat_complaints", "knowledge_date_column": "as_of_date", "aggregates_imputation": {"all": {"type": "constant", "value": 0}}, @@ -366,8 +366,8 @@ def sample_config(): "intervals": ["1year"], "groups": ["entity_id"], }, - { - "prefix": "zip_code_features", + "zip_code_features": { + "feature_generator_type": "spacetime_aggregation", "from_obj": "entity_zip_codes join 
zip_code_events using (zip_code)", "knowledge_date_column": "as_of_date", "aggregates_imputation": {"all": {"type": "constant", "value": 0}}, @@ -375,7 +375,7 @@ def sample_config(): "intervals": ["1year"], "groups": ["entity_id", "zip_code"], }, - ] + } cohort_config = { "query": "select distinct(entity_id) from events where '{as_of_date}'::date >= outcome_date", @@ -402,7 +402,7 @@ def sample_config(): "entity_column_name": "entity_id", "model_comment": "test2-final-final", "model_group_keys": ["label_name", "label_type", "custom_key"], - "features": {"spacetime_aggregations": spacetime_agg_config}, + "features": feature_config, "cohort_config": cohort_config, "temporal_config": temporal_config, "grid_config": grid_config, diff --git a/src/triage/component/architect/feature_block_generators.py b/src/triage/component/architect/feature_block_generators.py index b526aae40..6da49c18e 100644 --- a/src/triage/component/architect/feature_block_generators.py +++ b/src/triage/component/architect/feature_block_generators.py @@ -212,7 +212,7 @@ def aggregation(self, aggregation_config, feature_dates, state_table, feature_ta FEATURE_BLOCK_GENERATOR_LOOKUP = { - 'spacetime_aggregations': generate_spacetime_aggregation + 'spacetime_aggregation': generate_spacetime_aggregation } @@ -254,7 +254,7 @@ def feature_blocks_from_config( " feature generator. Recognized feature generator types:" f"{FEATURE_BLOCK_GENERATOR_LOOKUP.keys()}") - for feature_block in feature_block_generator( + feature_block = feature_block_generator( feature_block_configuration, feature_table_name=feature_table_name, as_of_dates=as_of_dates, @@ -264,6 +264,6 @@ def feature_blocks_from_config( feature_start_time=feature_start_time, features_ignore_cohort=features_ignore_cohort, **kwargs - ): - feature_blocks.append(feature_block) + ) + feature_blocks.append(feature_block) return feature_blocks diff --git a/src/triage/component/collate/spacetime.py b/src/triage/component/collate/spacetime.py index 775b6b481..c1ace89f7 100644 --- a/src/triage/component/collate/spacetime.py +++ b/src/triage/component/collate/spacetime.py @@ -81,7 +81,7 @@ def __init__( self.aggregates = aggregates self.from_obj = make_sql_clause(from_obj, ex.text) self.entity_column = entity_column if entity_column else "entity_id" - self.prefix = prefix if prefix else str(from_obj) + self.prefix = prefix if prefix else self.features_table_name_without_schema self.drop_interim_tables = drop_interim_tables def get_table_name(self, group=None, imputed=False): @@ -227,7 +227,7 @@ def preinsert_queries(self): Returns a list of queries/executable statements """ - return [self.get_drop()] + self.get_drops() + list(self.get_creates().values()) + return [self.get_create_schema(), self.get_drop()] + self.get_drops() + list(self.get_creates().values()) @property def insert_queries(self): From daf088de29483cca52aac9d5f710e5239147a0af Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Tue, 2 Apr 2019 11:38:04 -0500 Subject: [PATCH 15/22] Fix behavior when no feature config is given --- src/tests/test_partial_experiments.py | 15 +++-------- .../architect/feature_block_generators.py | 6 +++-- src/triage/experiments/base.py | 27 ++++++++++--------- 3 files changed, 21 insertions(+), 27 deletions(-) diff --git a/src/tests/test_partial_experiments.py b/src/tests/test_partial_experiments.py index 179de8c69..bb1e116f2 100644 --- a/src/tests/test_partial_experiments.py +++ b/src/tests/test_partial_experiments.py @@ -115,7 +115,7 @@ def test_run(self): if "_aggregation" in table ] - 
assert len(generated_tables) == len(sample_config()["features"]["spacetime_aggregations"]) + assert len(generated_tables) == len(sample_config()["features"]) for table in generated_tables: table_should_have_data(table, experiment.db_engine) @@ -140,17 +140,8 @@ class PostimputationFeatures(TestCase): def test_run(self): with prepare_experiment(self.config) as experiment: experiment.run() - generated_tables = [ - table - for table in schema_tables( - experiment.features_schema_name, experiment.db_engine - ).keys() - if "_aggregation_imputed" in table - ] - - assert len(generated_tables) == len(sample_config()["features"]["spacetime_aggregations"]) - for table in generated_tables: - table_should_have_data(table, experiment.db_engine) + for feature_table_name in self.config['features'].keys(): + table_should_have_data("{}.{}".format(experiment.features_schema_name, feature_table_name), experiment.db_engine) def test_validate_nonstrict(self): with prepare_experiment(self.config) as experiment: diff --git a/src/triage/component/architect/feature_block_generators.py b/src/triage/component/architect/feature_block_generators.py index 6da49c18e..c117c8ff9 100644 --- a/src/triage/component/architect/feature_block_generators.py +++ b/src/triage/component/architect/feature_block_generators.py @@ -1,3 +1,4 @@ +import copy import logging from triage.component.collate import ( @@ -247,7 +248,8 @@ def feature_blocks_from_config( """ feature_blocks = [] for feature_table_name, feature_block_configuration in config.items(): - feature_generator_type = feature_block_configuration.pop("feature_generator_type") + config_to_pass = copy.deepcopy(feature_block_configuration) + feature_generator_type = config_to_pass.pop("feature_generator_type") feature_block_generator = FEATURE_BLOCK_GENERATOR_LOOKUP.get(feature_generator_type, None) if not feature_block_generator: raise ValueError(f"feature generator type {feature_generator_type} does not correspond to a recognized" @@ -255,7 +257,7 @@ def feature_blocks_from_config( f"{FEATURE_BLOCK_GENERATOR_LOOKUP.keys()}") feature_block = feature_block_generator( - feature_block_configuration, + config_to_pass, feature_table_name=feature_table_name, as_of_dates=as_of_dates, cohort_table=cohort_table, diff --git a/src/triage/experiments/base.py b/src/triage/experiments/base.py index 138d76b5f..67b1e3a9d 100644 --- a/src/triage/experiments/base.py +++ b/src/triage/experiments/base.py @@ -202,20 +202,21 @@ def initialize_components(self): "you will not be able to make matrices." 
) - if "features" not in self.config: + if "features" in self.config: + logging.info("Creating feature blocks from config") + self.feature_blocks = feature_blocks_from_config( + config=self.config["features"], + as_of_dates=self.all_as_of_times, + cohort_table=self.cohort_table_name, + features_schema_name=self.features_schema_name, + db_engine=self.db_engine, + feature_start_time=self.config["temporal_config"]["feature_start_time"], + features_ignore_cohort=self.features_ignore_cohort, + materialize_subquery_fromobjs=self.materialize_subquery_fromobjs, + ) + else: logging.warning("No feature config is available") - return [] - logging.info("Creating feature blocks from config") - self.feature_blocks = feature_blocks_from_config( - config=self.config["features"], - as_of_dates=self.all_as_of_times, - cohort_table=self.cohort_table_name, - features_schema_name=self.features_schema_name, - db_engine=self.db_engine, - feature_start_time=self.config["temporal_config"]["feature_start_time"], - features_ignore_cohort=self.features_ignore_cohort, - materialize_subquery_fromobjs=self.materialize_subquery_fromobjs, - ) + self.feature_blocks = [] self.feature_group_creator = FeatureGroupCreator( self.config.get("feature_group_definition", {"all": [True]}) From 4a8443129784ca936f6e416451500aabd52f8c04 Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Tue, 2 Apr 2019 12:05:04 -0500 Subject: [PATCH 16/22] Fix validation --- src/triage/experiments/validate.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/triage/experiments/validate.py b/src/triage/experiments/validate.py index 02b35f91e..034848914 100644 --- a/src/triage/experiments/validate.py +++ b/src/triage/experiments/validate.py @@ -516,7 +516,7 @@ def _validate_prefixes(self, prefix_list): ) ) - def _run(self, feature_group_definition, feature_blocks): + def _run(self, feature_group_definition, feature_table_names): if not isinstance(feature_group_definition, dict): raise ValueError( dedent( @@ -557,10 +557,7 @@ def _run(self, feature_group_definition, feature_blocks): self._validate_prefixes(feature_group_definition["prefix"]) if "tables" in feature_group_definition: - available_tables = { - feature_block.final_feature_table_name - for feature_block in feature_blocks - } + available_tables = feature_table_names bad_tables = set(feature_group_definition["tables"]) - available_tables if bad_tables: raise ValueError( @@ -819,7 +816,8 @@ def _run(self, scoring_config): class FeatureValidator(Validator): def _run(self, feature_config): feature_lookup = architect.feature_block_generators.FEATURE_BLOCK_GENERATOR_LOOKUP - bad_keys = feature_config.keys() - feature_lookup.keys() + given_keys = set(feature_block['feature_generator_type'] for feature_block in feature_config.values()) + bad_keys = given_keys - feature_lookup.keys() if bad_keys: raise ValueError( dedent( @@ -838,7 +836,7 @@ def _run(self, feature_config): class ExperimentValidator(Validator): - def run(self, experiment_config, feature_blocks): + def run(self, experiment_config): TemporalValidator(strict=self.strict).run( experiment_config.get("temporal_config", {}) ) @@ -853,7 +851,7 @@ def run(self, experiment_config, feature_blocks): ) FeatureGroupDefinitionValidator(strict=self.strict).run( experiment_config.get("feature_group_definition", {}), - feature_blocks + set(experiment_config.get("features", {}).keys()) ) FeatureGroupStrategyValidator(strict=self.strict).run( experiment_config.get("feature_group_strategies", []) From 
b7c94268ca0c5aae4b19baa410a4ef1d4f2cfce8 Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Tue, 2 Apr 2019 12:36:18 -0500 Subject: [PATCH 17/22] Fix validate call --- src/triage/experiments/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/triage/experiments/base.py b/src/triage/experiments/base.py index 67b1e3a9d..3e26968c0 100644 --- a/src/triage/experiments/base.py +++ b/src/triage/experiments/base.py @@ -593,7 +593,7 @@ def train_and_test_models(self): self.process_train_test_batches(batches) def validate(self, strict=True): - ExperimentValidator(self.db_engine, strict=strict).run(self.config, self.feature_blocks) + ExperimentValidator(self.db_engine, strict=strict).run(self.config) def _run(self): try: From 403d0f56d24013b5cbb832ce59af9a05b0fde2a0 Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Tue, 2 Apr 2019 12:49:42 -0500 Subject: [PATCH 18/22] Use prefixes in spacetime tests --- src/tests/collate_tests/test_spacetime.py | 4 ++++ src/triage/component/collate/spacetime.py | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/tests/collate_tests/test_spacetime.py b/src/tests/collate_tests/test_spacetime.py index 8c4da8152..2fb163892 100755 --- a/src/tests/collate_tests/test_spacetime.py +++ b/src/tests/collate_tests/test_spacetime.py @@ -67,6 +67,7 @@ def test_basic_spacetime(): st = SpacetimeAggregation( aggregates=[agg], from_obj="events", + prefix="events", groups=["entity_id"], intervals=["1y", "2y", "all"], as_of_dates=["2016-01-01", "2015-01-01"], @@ -229,6 +230,7 @@ def test_feature_start_time(): st = SpacetimeAggregation( aggregates=[agg], from_obj="events", + prefix="events", groups=["entity_id"], intervals=["all"], as_of_dates=["2016-01-01"], @@ -260,6 +262,7 @@ def test_feature_start_time(): st = SpacetimeAggregation( aggregates=[agg], from_obj="events", + prefix="events", groups=["entity_id"], intervals=["1y", "all"], as_of_dates=["2016-01-01", "2015-01-01"], @@ -307,6 +310,7 @@ def test_features_ignore_cohort(db_engine): st = SpacetimeAggregation( aggregates=[agg], from_obj="events", + prefix="events", groups=["entity_id"], intervals=["all"], as_of_dates=["2016-01-01", "2015-01-01"], diff --git a/src/triage/component/collate/spacetime.py b/src/triage/component/collate/spacetime.py index c1ace89f7..5e63d8d80 100644 --- a/src/triage/component/collate/spacetime.py +++ b/src/triage/component/collate/spacetime.py @@ -227,7 +227,11 @@ def preinsert_queries(self): Returns a list of queries/executable statements """ - return [self.get_create_schema(), self.get_drop()] + self.get_drops() + list(self.get_creates().values()) + preinserts = [self.get_drop()] + self.get_drops() + list(self.get_creates().values()) + create_schema = self.get_create_schema() + if create_schema: + preinserts.insert(0, create_schema) + return preinserts @property def insert_queries(self): From a628bcd453c4e57d563d191aba64105ae958ea34 Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Tue, 2 Apr 2019 13:20:32 -0500 Subject: [PATCH 19/22] Postmodeling fixes, start to correct documentation --- docs/sources/experiments/extending-features.md | 4 ++-- src/tests/collate_tests/test_imputation_output.py | 7 ++++--- src/tests/conftest.py | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/sources/experiments/extending-features.md b/docs/sources/experiments/extending-features.md index 1ca5530db..4f75cff59 100644 --- a/docs/sources/experiments/extending-features.md +++ b/docs/sources/experiments/extending-features.md @@ -10,7 +10,7 
@@ A FeatureBlock represents a single feature table in the database and how to gene Class name | Experiment config key | Use ------------ | ------------- | ------------ -triage.component.collate.SpacetimeAggregation | spacetime_aggregations | Temporal aggregations of event-based data +triage.component.collate.SpacetimeAggregation | spacetime_aggregation | Temporal aggregations of event-based data ## Writing a new FeatureBlock class @@ -22,7 +22,6 @@ Any method here without parentheses afterwards is expected to be a property. Method | Task | Return Type ------------ | ------------- | ------------- -final_feature_table_name | The name of the final table with all features filled in (no missing values) | string feature_columns | The list of feature columns in the final, postimputation table. Should exclude any index columns (e.g. entity id, date) | list preinsert_queries | Return all queries that should be run before inserting any data. The creation of your feature table should happen here, and is expected to have `entity_id(integer)` and `as_of_date(timestamp)` columns. | list insert_queries | Return all inserts to populate this data. Each query in this list should be parallelizable, and should be valid after all `preinsert_queries` are run. | list @@ -39,6 +38,7 @@ Name | Type | Purpose ------------ | ------------- | ------------- as_of_dates | list | Features are created "as of" specific dates, and expects that each of these dates will be populated with a row for each member of the cohort on that date. cohort_table | string | The final shape of the feature table should at least include every entity id/date pair in this cohort table. +final_feature_table_name | string | The name of the final table with all features filled in (no missing values). This is provided by the user in feature config, as the key that corresponds to the configuration block that instantiates. db_engine | sqlalchemy.engine | The engine to use to access the database. Although these instances are mostly returning queries, the engine may be useful for implementing imputation. features_schema_name | string | The database schema where all feature tables should reside. Defaults to None, which ends up in the public schema. feature_start_time | string/datetime | A time before which no data should be considered for features. This is generally only applicable if your FeatureBlock is doing temporal aggregations. Defaults to None, which means no data will be excluded. 
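(Editor's aside, to make the constructor attributes documented above concrete: a minimal sketch, not part of the patch, of a do-nothing subclass. `NoopFeatureBlock` and the table and schema names are invented for illustration, and the set of abstract properties is assumed from the method table on this doc page. The expected `final_feature_table_name` follows the quoting behavior introduced in the commit above.)

```python
# A minimal sketch, assuming FeatureBlock's abstract properties are the ones
# listed in this doc page. NoopFeatureBlock and the names below are invented.
from triage.component.architect.feature_block import FeatureBlock


class NoopFeatureBlock(FeatureBlock):
    @property
    def feature_columns(self):
        return ["myfeature"]

    @property
    def preinsert_queries(self):
        return []

    @property
    def insert_queries(self):
        return []

    @property
    def imputation_queries(self):
        return []


block = NoopFeatureBlock(
    db_engine=None,  # pass a real sqlalchemy engine in practice
    cohort_table="my_cohort",
    as_of_dates=["2016-01-01", "2016-02-01"],
    features_table_name="my_features",
    features_schema_name="features",
)
# the config key becomes the quoted final table name within the schema
assert block.final_feature_table_name == '"features"."my_features"'
```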
diff --git a/src/tests/collate_tests/test_imputation_output.py b/src/tests/collate_tests/test_imputation_output.py index c0cdc45cc..2c8e222f9 100644 --- a/src/tests/collate_tests/test_imputation_output.py +++ b/src/tests/collate_tests/test_imputation_output.py @@ -122,7 +122,7 @@ def test_imputation_output(feat_list, exp_imp_cols, feat_table): ) engine.execute( - """create table prefix_aggregation ( + """create table myfeatures_aggregation ( entity_id int , as_of_date date %s @@ -130,7 +130,7 @@ def test_imputation_output(feat_list, exp_imp_cols, feat_table): % feat_sql ) ins_sql = ( - "insert into prefix_aggregation values (%s, %s" + "insert into myfeatures_aggregation values (%s, %s" + (", %s" * len(feat_list)) + ")" ) @@ -159,6 +159,7 @@ def test_imputation_output(feat_list, exp_imp_cols, feat_table): st = SpacetimeAggregation( aggregates=aggs, db_engine=engine, + features_table_name="myfeatures", from_obj="prefix_events", prefix="prefix", groups=["entity_id"], @@ -175,7 +176,7 @@ def test_imputation_output(feat_list, exp_imp_cols, feat_table): st.run_imputation() # check the results - df = pd.read_sql("SELECT * FROM prefix_aggregation_imputed", engine) + df = pd.read_sql("SELECT * FROM myfeatures", engine) # we should have a record for every entity/date combo assert df.shape[0] == len(states_table) diff --git a/src/tests/conftest.py b/src/tests/conftest.py index abbf72dad..c13ef4577 100644 --- a/src/tests/conftest.py +++ b/src/tests/conftest.py @@ -108,7 +108,7 @@ def crosstabs_config(): using (model_id, as_of_date)""", "features_query": """ select m.model_id, f1.* - from features.entity_features_aggregation_imputed f1 join + from features.entity_features f1 join models_dates_join_query m using (as_of_date)""", "predictions_query": """ select model_id, From a98fae44979b58c93005256eb41a432f70d0ceb5 Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Tue, 2 Apr 2019 13:52:16 -0500 Subject: [PATCH 20/22] Update docs some more --- .../sources/experiments/extending-features.md | 20 +++++---- docs/sources/experiments/feature-testing.md | 43 ++++++++++--------- docs/sources/experiments/upgrade-to-v7.md | 37 +++++++++------- src/triage/cli.py | 6 +-- 4 files changed, 58 insertions(+), 48 deletions(-) diff --git a/docs/sources/experiments/extending-features.md b/docs/sources/experiments/extending-features.md index 4f75cff59..a3c523ad8 100644 --- a/docs/sources/experiments/extending-features.md +++ b/docs/sources/experiments/extending-features.md @@ -38,7 +38,7 @@ Name | Type | Purpose ------------ | ------------- | ------------- as_of_dates | list | Features are created "as of" specific dates, and expects that each of these dates will be populated with a row for each member of the cohort on that date. cohort_table | string | The final shape of the feature table should at least include every entity id/date pair in this cohort table. -final_feature_table_name | string | The name of the final table with all features filled in (no missing values). This is provided by the user in feature config, as the key that corresponds to the configuration block that instantiates. +final_feature_table_name | string | The name of the final table with all features filled in (no missing values). This is provided by the user in feature config, as the key that corresponds to the configuration section that instantiates the feature block db_engine | sqlalchemy.engine | The engine to use to access the database. Although these instances are mostly returning queries, the engine may be useful for implementing imputation. 
features_schema_name | string | The database schema where all feature tables should reside. Defaults to None, which ends up in the public schema.
feature_start_time | string/datetime | A time before which no data should be considered for features. This is generally only applicable if your FeatureBlock is doing temporal aggregations. Defaults to None, which means no data will be excluded.
@@ -64,10 +64,6 @@ class SimpleQueryFeature(FeatureBlock):
         self.query = query
         super().__init__(*args, **kwargs)
 
-    @property
-    def final_feature_table_name(self):
-        return f"{self.features_schema_name}.mytable"
-
     @property
     def feature_columns(self):
         return ['myfeature']
@@ -105,6 +101,7 @@ This class would allow many different uses: basically any query a user can come
 feature_block = SimpleQueryFeature(
     query="select entity_id, as_of_date, quantity from source_table where date < '{as_of_date}'",
     as_of_dates=["2016-01-01"],
+    features_table_name="my_features",
     cohort_table="my_cohort_table",
     db_engine=triage.create_engine(<..mydbinfo..>)
 )
@@ -132,12 +129,17 @@ FEATURE_BLOCK_GENERATOR_LOOKUP = {
 }
 ```
-At this point, you could use it in an experiment configuration like this:
+At this point, you could use it in an experiment configuration by adding a feature table section and specifying the `feature_generator_type` key to be the name you just put in the lookup, `simple_query`. All other keys/values in that config block will be passed to the constructor of your class. Since the class you defined only takes in one extra keyword argument (the query), the only other key you need to specify in config is that query.
+
+An example:
 ```yaml
 features:
-  simple_query:
-    - query: "select entity_id, as_of_date, quantity from source_table where date < '{as_of_date}'"
-    - query: "select entity_id, as_of_date, other_quantity from other_source_table where date < '{as_of_date}'"
+  my_feature_table:
+    feature_generator_type: "simple_query"
+    query: "select entity_id, as_of_date, quantity from source_table where date < '{as_of_date}'"
+  my_other_feature_table:
+    feature_generator_type: "simple_query"
+    query: "select entity_id, as_of_date, other_quantity from other_source_table where date < '{as_of_date}'"
 ```
diff --git a/docs/sources/experiments/feature-testing.md b/docs/sources/experiments/feature-testing.md
index 800bbe34f..3e814d121 100644
--- a/docs/sources/experiments/feature-testing.md
+++ b/docs/sources/experiments/feature-testing.md
@@ -33,26 +33,29 @@ logging.basicConfig(level=logging.INFO)
 db_url = 'your db url here'
 db_engine = create_engine(db_url)
 
-feature_config = {'spacetime_aggregations': [{
-    'prefix': 'aprefix',
-    'aggregates': [
-        {
-            'quantity': 'quantity_one',
-            'metrics': ['sum', 'count'],
-        }
-    ],
-    'categoricals': [
-        {
-            'column': 'cat_one',
-            'choices': ['good', 'bad'],
-            'metrics': ['sum']
-        },
-    ],
-    'groups': ['entity_id', 'zip_code'],
-    'intervals': ['all'],
-    'knowledge_date_column': 'knowledge_date',
-    'from_obj': 'data'
-}]}
+feature_config = {
+    'myfeaturetable': {
+        'feature_generator_type': 'spacetime_aggregation',
+        'prefix': 'aprefix',
+        'aggregates': [
+            {
+                'quantity': 'quantity_one',
+                'metrics': ['sum', 'count'],
+            }
+        ],
+        'categoricals': [
+            {
+                'column': 'cat_one',
+                'choices': ['good', 'bad'],
+                'metrics': ['sum']
+            },
+        ],
+        'groups': ['entity_id', 'zip_code'],
+        'intervals': ['all'],
+        'knowledge_date_column': 'knowledge_date',
+        'from_obj': 'data'
+    }
+}
 
 feature_blocks = feature_blocks_from_config(
     feature_config,
diff --git a/docs/sources/experiments/upgrade-to-v7.md
b/docs/sources/experiments/upgrade-to-v7.md
index 89476e048..b6cd38176 100644
--- a/docs/sources/experiments/upgrade-to-v7.md
+++ b/docs/sources/experiments/upgrade-to-v7.md
@@ -4,7 +4,12 @@
 This document details the steps needed to update a triage v6 configuration to v7, mimicking the old behavior.
 
-Experiment configuration v7 includes only one change from v6: The features are given at a different key. Instead of `feature_aggregations`, to make space for non-collate features to be added in the future, there is now a more generic `features` key, under which collate features reside at `spacetime_aggregations`.
+Experiment configuration v7 includes only one change from v6: The features are given at a different key. Instead of `feature_aggregations`, to make space for non-collate features to be added in the future, there is now a more generic `features` key. The value of this key is a dictionary: each key is the desired output table name for a feature table, and each value is the same configuration a feature aggregation took before, with one addition. A new key called `feature_generator_type` specifies which method is being used to generate the feature table. Since non-collate features have not been added yet, there is only one valid value for it: `spacetime_aggregation`.
+
+Since the output feature table name is now configurable, there are two things to note:
+- Final tables won't necessarily be suffixed with `_aggregation_imputed` as they were before. If you would like to use the old naming system, for instance to avoid having to change postmodeling code that reads features from the database, you can add that suffix to your table name. The example below sets the table name to match the old naming, but you don't have to follow this convention; you can call the table whatever you want.
+- The `prefix` key is no longer used to construct the table name. It is still used to prefix column names, if present; if absent, the name of the feature table is used instead.
+
 
 Old:
 
@@ -31,21 +36,21 @@ New:
 ```
 features:
-  spacetime_aggregations:
-    -
-      prefix: 'prefix'
-      from_obj: 'cool_stuff'
-      knowledge_date_column: 'open_date'
-      aggregates_imputation:
-        all:
-          type: 'constant'
-          value: 0
-      aggregates:
-        -
-          quantity: 'homeless::INT'
-          metrics: ['count', 'sum']
-      intervals: ['1 year', '2 year']
-      groups: ['entity_id']
+  prefix_aggregation_imputed:
+    feature_generator_type: 'spacetime_aggregation'
+    prefix: 'prefix'
+    from_obj: 'cool_stuff'
+    knowledge_date_column: 'open_date'
+    aggregates_imputation:
+      all:
+        type: 'constant'
+        value: 0
+    aggregates:
+      -
+        quantity: 'homeless::INT'
+        metrics: ['count', 'sum']
+    intervals: ['1 year', '2 year']
+    groups: ['entity_id']
 ```
 
 ## Upgrading the experiment config version
diff --git a/src/triage/cli.py b/src/triage/cli.py
index fc2bbd8db..24195c469 100755
--- a/src/triage/cli.py
+++ b/src/triage/cli.py
@@ -111,13 +111,13 @@ def configversion(self, args):
 
 @Triage.register
 class FeatureTest(Command):
-    """Test a feature aggregation by running it for one date"""
+    """Test features by running them for one date"""
 
     def __init__(self, parser):
         parser.add_argument(
-            "feature_config_file",
+            "experiment_config_file",
             type=argparse.FileType("r"),
-            help="Feature config YAML file, containing a list of feature_aggregation objects",
+            help="Experiment config YAML file, containing at least one feature configuration block. Cohort config will be used if present to filter the results.
Any other keys will be ignored.", ) parser.add_argument( "as_of_date", From acb8c450cbccf9751842207d0c5b1cbcfd2e9b17 Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Tue, 2 Apr 2019 16:22:06 -0500 Subject: [PATCH 21/22] Fix reference to experiment config file in featuretest CLI --- src/triage/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/triage/cli.py b/src/triage/cli.py index 24195c469..3a2b3447b 100755 --- a/src/triage/cli.py +++ b/src/triage/cli.py @@ -128,7 +128,7 @@ def __init__(self, parser): def __call__(self, args): self.root.setup() # Loading configuration (if exists) db_engine = create_engine(self.root.db_url) - full_config = yaml.load(args.feature_config_file) + full_config = yaml.load(args.experiment_config_file) feature_config = full_config['features'] cohort_config = full_config.get('cohort_config', None) if cohort_config: From fd1887c8c338a3738d267ad84b2b3c0839146447 Mon Sep 17 00:00:00 2001 From: Tristan Crockett Date: Tue, 7 May 2019 14:19:04 -0500 Subject: [PATCH 22/22] Reimplement impflag squashing changes --- src/tests/collate_tests/test_spacetime.py | 15 +++-- src/triage/component/collate/imputations.py | 2 +- src/triage/component/collate/spacetime.py | 65 ++++++++++++--------- 3 files changed, 46 insertions(+), 36 deletions(-) diff --git a/src/tests/collate_tests/test_spacetime.py b/src/tests/collate_tests/test_spacetime.py index 10c4eebb6..7080c468d 100755 --- a/src/tests/collate_tests/test_spacetime.py +++ b/src/tests/collate_tests/test_spacetime.py @@ -199,17 +199,20 @@ def test_basic_spacetime(): assert st.feature_columns == { "events_entity_id_1y_outcome::int_sum", - "events_entity_id_1y_outcome::int_sum_imp", "events_entity_id_1y_outcome::int_avg", - "events_entity_id_1y_outcome::int_avg_imp", + "events_entity_id_1y_outcome::int_stddev", + "events_entity_id_1y_outcome::int_imp", + "events_entity_id_1y_outcome::int_stddev_imp", "events_entity_id_2y_outcome::int_sum", - "events_entity_id_2y_outcome::int_sum_imp", "events_entity_id_2y_outcome::int_avg", - "events_entity_id_2y_outcome::int_avg_imp", + "events_entity_id_2y_outcome::int_stddev", + "events_entity_id_2y_outcome::int_imp", + "events_entity_id_2y_outcome::int_stddev_imp", "events_entity_id_all_outcome::int_sum", - "events_entity_id_all_outcome::int_sum_imp", "events_entity_id_all_outcome::int_avg", - "events_entity_id_all_outcome::int_avg_imp" + "events_entity_id_all_outcome::int_stddev", + "events_entity_id_all_outcome::int_imp", + "events_entity_id_all_outcome::int_stddev_imp", } diff --git a/src/triage/component/collate/imputations.py b/src/triage/component/collate/imputations.py index 417b1a80e..24e64ee8d 100644 --- a/src/triage/component/collate/imputations.py +++ b/src/triage/component/collate/imputations.py @@ -29,7 +29,7 @@ def _base_sql(self): def imputed_flag_select_and_alias(self): if not self.noflag: template = """CASE WHEN "{col}" IS NULL THEN 1::SMALLINT ELSE 0::SMALLINT END""" - alias_template = "{base_for_impflag}_{suffix}" + alias_template = "{base_for_impflag}{suffix}" if self.column_base_for_impflag: return ( template.format(col=self.column), diff --git a/src/triage/component/collate/spacetime.py b/src/triage/component/collate/spacetime.py index 379393d4d..c6d76e9d6 100644 --- a/src/triage/component/collate/spacetime.py +++ b/src/triage/component/collate/spacetime.py @@ -142,13 +142,32 @@ def imputed_flag_column_names(self): schema=self.features_schema_name or 'public', suffix=IMPUTATION_COLNAME_SUFFIX ) - print(feature_names_query) feature_names = [ 
                row[0]
                for row in self.db_engine.execute(feature_names_query)
            ]
            return feature_names
 
+    def _basecol_of_impflag(self, impflag_col):
+        # we don't want to add redundant imputation flags. for a given source
+        # column and time interval, all of the functions will have identical
+        # sets of rows that needed imputation
+        # to reliably merge these, we look up the original aggregate that produced
+        # the function, and see its available functions. we expect exactly one of
+        # these functions to end the column name and remove it if so
+        if hasattr(self.colname_aggregate_lookup[impflag_col], 'functions'):
+            agg_functions = self.colname_aggregate_lookup[impflag_col].functions
+            used_function = next(funcname for funcname in agg_functions if impflag_col.endswith(funcname))
+            if used_function in AGGFUNCS_NEED_MULTIPLE_VALUES:
+                return impflag_col
+            else:
+                return impflag_col[:-len('_' + used_function)]
+        else:
+            logging.warning("Imputation flag merging is not implemented for "
+                            "AggregateExpression objects that don't define an aggregate "
+                            "function (e.g. composites)")
+            return impflag_col
+
     def _get_impute_select(self, impute_cols, nonimpute_cols, partitionby=None):
 
         imprules = self.get_imputation_rules()
@@ -172,26 +191,6 @@ def _get_impute_select(self, impute_cols, nonimpute_cols, partitionby=None):
             # for columns that do require imputation, include SQL to do the imputation work
             # and a flag for whether the value was imputed
             if col in impute_cols:
-                # we don't want to add redundant imputation flags. for a given source
-                # column and time interval, all of the functions will have identical
-                # sets of rows that needed imputation
-                # to reliably merge these, we lookup the original aggregate that produced
-                # the function, and see its available functions. we expect exactly one of
-                # these functions to end the column name and remove it if so
-                # this is passed to the imputer
-                if hasattr(self.colname_aggregate_lookup[col], 'functions'):
-                    agg_functions = self.colname_aggregate_lookup[col].functions
-                    used_function = next(funcname for funcname in agg_functions if col.endswith(funcname))
-                    if used_function in AGGFUNCS_NEED_MULTIPLE_VALUES:
-                        impflag_basecol = col
-                    else:
-                        impflag_basecol = col.rstrip('_' + used_function)
-                else:
-                    logging.warning("Imputation flag merging is not implemented for "
-                                    "AggregateExpression objects that don't define an aggregate "
-                                    "function (e.g. composites)")
-                    impflag_basecol = col
-
                 impute_rule = imprules[col]
 
                 try:
@@ -202,6 +201,7 @@ def _get_impute_select(self, impute_cols, nonimpute_cols, partitionby=None):
                         % (impute_rule.get("type", ""), col)
                     ) from err
 
+                impflag_basecol = self._basecol_of_impflag(col)
                 imputer = imputer(column=col, column_base_for_impflag=impflag_basecol,
                                   partitionby=partitionby, **impute_rule)
                 query += "\n,%s" % imputer.to_sql()
@@ -236,13 +236,21 @@ def feature_columns(self):
 
         Should exclude any index columns (e.g. entity id, date)
         """
-        columns = self.feature_columns_sans_impflags
+        # start with all columns defined in the feature block.
+        # this is important as we don't want to return columns in the final feature table that
+        # aren't defined in the feature block (e.g.
from an earlier run with more features); + # this will exclude impflag columns as they are decided after initial features are written + feature_columns = self.feature_columns_sans_impflags + impflag_columns = set() + + # our list of imputation flag columns comes from the database, + # but it may contain columns from prior runs that we didn't specify imputation_flag_feature_cols = self.imputed_flag_column_names() - print(imputation_flag_feature_cols) - for imp_flag_col in imputation_flag_feature_cols: - if imp_flag_col[:-len(IMPUTATION_COLNAME_SUFFIX)] in columns: - columns.add(imp_flag_col) - return columns + for feature_column in feature_columns: + impflag_name = self._basecol_of_impflag(feature_column) + IMPUTATION_COLNAME_SUFFIX + if impflag_name in imputation_flag_feature_cols: + impflag_columns.add(impflag_name) + return feature_columns | impflag_columns @property def preinsert_queries(self): @@ -354,9 +362,8 @@ def colname_aggregate_lookup(self): for group, groupby in self.groups.items(): intervals = self.intervals[group] for interval in intervals: - date = self.dates[0] for agg in self.aggregates: - for col in self._cols_for_aggregate(agg, group, interval, date): + for col in self._cols_for_aggregate(agg, group, interval, None): if col.name in lookup: raise ValueError("Duplicate feature column name found: ", col.name) lookup[col.name] = agg
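
(Editor's aside, as a closing illustration of the imputation-flag squashing reintroduced in this last commit: for a given source column and time interval, every aggregate function shares the same set of imputed rows, so their flags collapse into one shared flag column, while functions that need multiple values keep their own; the updated test expectations above imply `stddev` is one of those. The following standalone sketch of that suffix logic uses invented inputs, and the patch's `_basecol_of_impflag` does the equivalent lookup through `self.colname_aggregate_lookup` instead of taking arguments.)

```python
# Standalone sketch of the flag-merging suffix logic; the inputs are invented,
# and stddev's membership in AGGFUNCS_NEED_MULTIPLE_VALUES is inferred from
# the test expectations in this commit.
AGGFUNCS_NEED_MULTIPLE_VALUES = {"stddev"}


def basecol_of_impflag(col, agg_functions):
    # exactly one of the aggregate's functions should end the column name
    used_function = next(f for f in agg_functions if col.endswith(f))
    if used_function in AGGFUNCS_NEED_MULTIPLE_VALUES:
        return col  # keeps its own flag column, e.g. ..._stddev_imp
    # slice off the "_<function>" suffix so sum/avg/etc. share one flag column
    return col[:-len("_" + used_function)]


funcs = ["sum", "avg", "stddev"]
assert basecol_of_impflag("events_entity_id_1y_outcome::int_sum", funcs) == \
    basecol_of_impflag("events_entity_id_1y_outcome::int_avg", funcs) == \
    "events_entity_id_1y_outcome::int"  # -> one shared ..._imp flag
assert basecol_of_impflag("events_entity_id_1y_outcome::int_stddev", funcs) == \
    "events_entity_id_1y_outcome::int_stddev"  # -> its own ..._stddev_imp flag
```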