Risklist module for production #631

Merged
merged 55 commits into master from list_making
Aug 27, 2021
Changes from 46 commits
55 commits
a563694
listmaking WIP
thcrock Feb 20, 2019
9750c3e
forgot migraton
thcrock Feb 20, 2019
360f8f9
WIP
tweddielin Feb 21, 2019
999a46f
alembic add label_value to list_predictions table
tweddielin Feb 26, 2019
372d9c8
add docstrings
tweddielin Feb 28, 2019
16645bc
move risklist a layer above
tweddielin Mar 13, 2019
914ad76
create risklist module
tweddielin Mar 13, 2019
c92bd8b
__init__lpy
tweddielin Mar 13, 2019
d3c3ba9
fix alembic reversion and replace metta.generate_uuid with filename_f…
tweddielin Mar 13, 2019
0e92fb0
Fix down revision of production schema migration
thcrock Apr 12, 2019
dbd4578
Fix alembic revisions
thcrock Jan 6, 2021
f7d49e5
Enable github checks on this branch too
thcrock Jan 6, 2021
dee930f
Closer to getting tests to run
thcrock Jan 6, 2021
1769b00
Add CLI for risklist
thcrock Jan 8, 2021
52c9ff0
Risklist docs stub
thcrock Jan 8, 2021
173167a
Break up data gathering into experiment and matrix, use pytest fixtur…
thcrock Jan 9, 2021
f6b2d02
Modify schema for list prediction metadata
thcrock Jan 9, 2021
acffa67
fix conflicts and add helper functions for getting imputed features
tweddielin Jan 9, 2021
43c1919
Handle other imputation flag cases, fix tracking indentation error
thcrock Jan 10, 2021
7dfb7e1
Add more tests, fill out doc page
thcrock Jan 11, 2021
cc9fe4a
Fix exception name typo
thcrock Jan 11, 2021
5951565
use timechop and planner to create matrix_metadata for production
tweddielin Jan 15, 2021
537f6c8
retrain and predict forward
tweddielin Apr 15, 2021
b429540
rename to retrain_definition
tweddielin Apr 15, 2021
0045aa5
reusing random seeds from existing models
shaycrk May 8, 2021
9dc3697
fix tests (write experiment to test db)
shaycrk May 10, 2021
da870d5
unit test for reusing model random seeds
shaycrk May 10, 2021
6768ee5
add docstring
shaycrk May 10, 2021
7d6a420
only store random seed in experiment runs
shaycrk May 20, 2021
b8fe6d8
DB migration to remove random seed from experiments table
shaycrk May 20, 2021
8207fcd
debugging
shaycrk May 20, 2021
45c9d68
debug model trainer tests
shaycrk May 21, 2021
a665e7e
debug catwalk utils tests
shaycrk May 21, 2021
ead882b
debug catwalk integration test
shaycrk May 21, 2021
de85f10
use public method
tweddielin May 30, 2021
ad860cd
Merge remote-tracking branch 'origin/kit_rand_seed' into list_making
tweddielin May 31, 2021
40466d5
alembic merge
tweddielin May 31, 2021
83c7385
reuse random seed
tweddielin May 31, 2021
f97089b
use timechop for getting retrain information
tweddielin Jun 30, 2021
6f0af1c
create retrain model hash in retrain level instead of model_trainer l…
tweddielin Jun 30, 2021
42bccaa
move util functions to utils
tweddielin Jun 30, 2021
3ec377f
fix cli and docs
tweddielin Jul 1, 2021
1c4da24
update docs
tweddielin Jul 1, 2021
35bd978
use reconstructed feature dict
tweddielin Jul 1, 2021
9f5a099
add RetrainModel and Retrain
tweddielin Jul 29, 2021
ba84822
remove break point
tweddielin Jul 29, 2021
83e0f66
change experiment_runs to triage_runs
tweddielin Aug 21, 2021
d6f14f5
get retrain_config
tweddielin Aug 22, 2021
d76359b
explicitly include run_type in joins to triage_runs
shaycrk Aug 26, 2021
9698500
DB migration updates
shaycrk Aug 26, 2021
a8a29f1
update argument name in docs
shaycrk Aug 26, 2021
694edcc
ensure correct temporal config is used for predicting forward
shaycrk Aug 27, 2021
583e9bd
debug
shaycrk Aug 27, 2021
815a258
debug
shaycrk Aug 27, 2021
5e183fe
Merge branch 'master' into list_making
shaycrk Aug 27, 2021
1 change: 1 addition & 0 deletions docs/mkdocs.yml
@@ -120,6 +120,7 @@ nav:
- Using Postmodeling: postmodeling/index.md
- Postmodeling & Crosstabs Configuration: postmodeling/postmodeling-config.md
- Model governance: dirtyduck/ml_governance.md
-Predictlist: predictlist/index.md
Reviewer comment (Contributor): Should this be `- Predictlist:`? (I can't remember if YAML cares about the whitespace between the `-` and the list item.)

- Scaling up: dirtyduck/aws_batch.md
- API Reference:
- Audition:
87 changes: 87 additions & 0 deletions docs/sources/predictlist/index.md
@@ -0,0 +1,87 @@
# Retrain and Predict
Use an existing model group to retrain a new model on all the data up to the current date and then predict forward into the future.

## Examples
Both examples assume you have already run a Triage Experiment in the past, and know these two pieces of information:
1. A `model_group_id` from a Triage model group that you want to use to retrain a model and generate predictions
2. A `today` to generate your predictions on.
Reviewer comment (Contributor): I think we changed the argument to `prediction_date` rather than `today`, didn't we?


### CLI
`triage retrainpredict <model_group_id> <today>`

Example:
`triage retrainpredict 30 2021-04-04`

The `retrainpredict` command assumes the current path is the 'project path' where models are trained and matrices are written, but this can be overridden with the `--project-path` option.
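
For example, pointing at a different project directory (the path shown here is illustrative):
`triage retrainpredict 30 2021-04-04 --project-path /home/you/triage/project2`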

### Python
The `Retrainer` class from the `triage.predictlist` module can be used to retrain a model and predict forward.

```python
from triage.predictlist import Retrainer
from triage import create_engine

retrainer = Retrainer(
    db_engine=create_engine(<your-db-info>),
    project_path='/home/you/triage/project2',
    model_group_id=36,
)
retrainer.retrain(today='2021-04-04')
retrainer.predict(today='2021-04-04')

```

## Output
The retrained model is stored similarly to the matrices created during an Experiment:
- Raw Matrix saved to the matrices directory in project storage
- Raw Model saved to the trained_model directory in project storage
- Retrained Model info saved in a table (triage_metadata.models) where model_comment = 'retrain_2021-04-04 21:19:09.975112'
- Predictions saved in a table (triage_production.predictions)
- Prediction metadata (tiebreaking, random seed) saved in a table (triage_production.prediction_metadata)
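
These outputs can be spot-checked directly from the database. A minimal sketch, assuming the standard Triage production-schema column names (`model_id`, `entity_id`, `as_of_date`, `score`); verify them against your own `triage_production.predictions` table:

```python
from triage import create_engine

# Sketch only: list the top-scoring predictions written by retrain-and-predict.
# The column names here are assumptions from Triage's production schema.
engine = create_engine("<your-db-info>")  # same placeholder as in the example above
rows = engine.execute(
    """
    SELECT model_id, entity_id, as_of_date, score
    FROM triage_production.predictions
    ORDER BY score DESC
    LIMIT 10
    """
)
for row in rows:
    print(row)
```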


# Predictlist
If you would like to generate a list of predictions from an already-trained Triage model with new data, you can use the 'Predictlist' module.

# Predict Forward with an Existing Model
Use an existing model object to generate predictions on new data.

## Examples
Both examples assume you have already run a Triage Experiment in the past, and know these two pieces of information:
1. A `model_id` from a Triage model that you want to use to generate predictions
2. An `as_of_date` to generate your predictions on.

### CLI
`triage predictlist <model_id> <as_of_date>`

Example:
`triage predictlist 46 2019-05-06`

The `predictlist` command assumes the current path is the 'project path' where models are found and matrices are written, but this can be overridden with the `--project-path` option.
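
For example, pointing at a different project directory (the path shown here is illustrative):
`triage predictlist 46 2019-05-06 --project-path /home/you/triage/project2`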

### Python

The `predict_forward_with_existed_model` function from the `triage.predictlist` module can be used similarly to the CLI, with the addition of the database engine and project storage as inputs.
```python
from triage.predictlist import predict_forward_with_existed_model
from triage import create_engine

predict_forward_with_existed_model(
    db_engine=create_engine(<your-db-info>),
    project_path='/home/you/triage/project2',
    model_id=46,
    as_of_date='2019-05-06'
)
```

## Output
The Predictlist is stored similarly to the matrices created during an Experiment:
- Raw Matrix saved to the matrices directory in project storage
- Predictions saved in a table (triage_production.predictions)
- Prediction metadata (tiebreaking, random seed) saved in a table (triage_production.prediction_metadata)

## Notes
- The cohort and features for the Predictlist are all inferred from the Experiment that trained the given model_id (as defined by the experiment_models table; a lookup sketch follows these notes).
- The feature list ensures that imputation flag columns are present for any columns that either needed to be imputed in the training process, or that needed to be imputed in the predictlist dataset.
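
The `experiment_models` linkage mentioned above can be queried directly. A minimal sketch, assuming the `triage_metadata.models` and `triage_metadata.experiment_models` tables and the column names used in this PR's tests (`model_id`, `model_hash`, `experiment_hash`):

```python
from sqlalchemy import text
from triage import create_engine

# Sketch only: find which experiment trained a given model_id via the
# experiment_models association table (column names assumed as noted above).
engine = create_engine("<your-db-info>")
experiment_hash = engine.execute(
    text(
        """
        SELECT em.experiment_hash
        FROM triage_metadata.models m
        JOIN triage_metadata.experiment_models em ON em.model_hash = m.model_hash
        WHERE m.model_id = :model_id
        """
    ),
    model_id=46,
).scalar()
print(experiment_hash)
```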


11 changes: 10 additions & 1 deletion src/tests/catwalk_tests/test_integration.py
@@ -6,6 +6,7 @@
MatrixStore,
MatrixStorageEngine,
)
from triage.tracking import initialize_tracking_and_get_run_id
from tests.utils import (
get_matrix_store,
matrix_metadata_creator,
@@ -19,12 +20,20 @@ def test_ModelTrainTester_generate_tasks(db_engine_with_results_schema, project_
model_storage_engine = ModelStorageEngine(project_storage)
matrix_storage_engine = MatrixStorageEngine(project_storage)
sample_matrix_store = get_matrix_store(project_storage)
experiment_hash = save_experiment_and_get_hash({}, 1234, db_engine)
experiment_hash = save_experiment_and_get_hash({}, db_engine)
run_id = initialize_tracking_and_get_run_id(
experiment_hash,
experiment_class_path="",
random_seed=5,
experiment_kwargs={},
db_engine=db_engine_with_results_schema
)
# instantiate pipeline objects
trainer = ModelTrainer(
experiment_hash=experiment_hash,
model_storage_engine=model_storage_engine,
db_engine=db_engine,
run_id=run_id,
)
train_tester = ModelTrainTester(
matrix_storage_engine=matrix_storage_engine,
145 changes: 142 additions & 3 deletions src/tests/catwalk_tests/test_model_trainers.py
@@ -5,6 +5,8 @@

from triage.component.catwalk.model_grouping import ModelGrouper
from triage.component.catwalk.model_trainers import ModelTrainer
from triage.component.catwalk.utils import save_experiment_and_get_hash
from triage.tracking import initialize_tracking_and_get_run_id
from tests.utils import get_matrix_store


@@ -22,11 +24,24 @@ def grid_config():
@pytest.fixture(scope="function")
def default_model_trainer(db_engine_with_results_schema, project_storage):
model_storage_engine = project_storage.model_storage_engine()
experiment_hash = save_experiment_and_get_hash(
config={'foo': 'bar'},
db_engine=db_engine_with_results_schema
)
run_id = initialize_tracking_and_get_run_id(
experiment_hash,
experiment_class_path="",
random_seed=5,
experiment_kwargs={},
db_engine=db_engine_with_results_schema
)
# import pdb; pdb.set_trace()
trainer = ModelTrainer(
experiment_hash=None,
experiment_hash=experiment_hash,
model_storage_engine=model_storage_engine,
db_engine=db_engine_with_results_schema,
model_grouper=ModelGrouper(),
run_id=run_id,
)
yield trainer

@@ -132,14 +147,26 @@ def set_test_seed():
"select max(batch_run_time) from triage_metadata.models"
)
][0]
experiment_hash = save_experiment_and_get_hash(
config={'foo': 'bar'},
db_engine=db_engine
)
run_id = initialize_tracking_and_get_run_id(
experiment_hash,
experiment_class_path="",
random_seed=5,
experiment_kwargs={},
db_engine=db_engine
)
trainer = ModelTrainer(
experiment_hash=None,
experiment_hash=experiment_hash,
model_storage_engine=model_storage_engine,
model_grouper=ModelGrouper(
model_group_keys=["label_name", "label_timespan"]
),
db_engine=db_engine,
replace=True,
run_id=run_id,
)
set_test_seed()
new_model_ids = trainer.train_models(
@@ -212,11 +239,23 @@ def test_baseline_exception_handling(default_model_trainer):

def test_custom_groups(grid_config, db_engine_with_results_schema, project_storage):
model_storage_engine = project_storage.model_storage_engine()
experiment_hash = save_experiment_and_get_hash(
config={'foo': 'bar'},
db_engine=db_engine_with_results_schema
)
run_id = initialize_tracking_and_get_run_id(
experiment_hash,
experiment_class_path="",
random_seed=5,
experiment_kwargs={},
db_engine=db_engine_with_results_schema
)
trainer = ModelTrainer(
experiment_hash=None,
experiment_hash=experiment_hash,
model_storage_engine=model_storage_engine,
model_grouper=ModelGrouper(["class_path"]),
db_engine=db_engine_with_results_schema,
run_id=run_id,
)
# create training set
model_ids = trainer.train_models(
@@ -235,6 +274,106 @@ def test_custom_groups(grid_config, db_engine_with_results_schema, project_stora
assert records[0] == model_ids[0]


def test_reuse_model_random_seeds(grid_config, default_model_trainer):
trainer = default_model_trainer
db_engine = trainer.db_engine
project_storage = trainer.model_storage_engine.project_storage
model_storage_engine = trainer.model_storage_engine

# re-using the random seeds requires the association between experiments and models
# to exist, which we're not getting in these tests since we aren't using the experiment
# architecture, so back-fill these associations after each train_models() run
def update_experiment_models(db_engine):
sql = """
INSERT INTO triage_metadata.experiment_models(experiment_hash,model_hash)
SELECT m.built_by_experiment, m.model_hash
FROM triage_metadata.models m
LEFT JOIN triage_metadata.experiment_models em
ON m.model_hash = em.model_hash
AND m.built_by_experiment = em.experiment_hash
WHERE em.experiment_hash IS NULL
"""
db_engine.execute(sql)
db_engine.execute('COMMIT;')

random.seed(5)
model_ids = trainer.train_models(
grid_config=grid_config,
misc_db_parameters=dict(),
matrix_store=get_matrix_store(project_storage),
)
update_experiment_models(db_engine)

# simulate running a new experiment where the experiment hash has changed
# (e.g. because the model grid is different), but experiment seed is the
# same, so previously-trained models should not get new seeds
experiment_hash = save_experiment_and_get_hash(
config={'baz': 'qux'},
db_engine=db_engine
)
run_id = initialize_tracking_and_get_run_id(
experiment_hash,
experiment_class_path="",
random_seed=5,
experiment_kwargs={},
db_engine=db_engine
)
trainer = ModelTrainer(
experiment_hash=experiment_hash,
model_storage_engine=model_storage_engine,
db_engine=db_engine,
model_grouper=ModelGrouper(),
run_id=run_id,
)
new_grid = grid_config.copy()
new_grid['sklearn.tree.DecisionTreeClassifier']['min_samples_split'] = [3,10,100]
random.seed(5)
new_model_ids = trainer.train_models(
grid_config=new_grid,
misc_db_parameters=dict(),
matrix_store=get_matrix_store(project_storage),
)
update_experiment_models(db_engine)

# should have received 6 models
assert len(new_model_ids) == 6

# all the original model ids should be in the new set
assert len(set(new_model_ids) & set(model_ids)) == len(model_ids)

# however, we should NOT re-use the random seeds (and so get new model_ids)
# if the experiment-level seed is different
experiment_hash = save_experiment_and_get_hash(
config={'lorem': 'ipsum'},
db_engine=db_engine
)
run_id = initialize_tracking_and_get_run_id(
experiment_hash,
experiment_class_path="",
random_seed=42,
experiment_kwargs={},
db_engine=db_engine
)
trainer = ModelTrainer(
experiment_hash=experiment_hash,
model_storage_engine=model_storage_engine,
db_engine=db_engine,
model_grouper=ModelGrouper(),
run_id=run_id,
)
random.seed(42) # different from above
newer_model_ids = trainer.train_models(
grid_config=new_grid,
misc_db_parameters=dict(),
matrix_store=get_matrix_store(project_storage),
)
update_experiment_models(db_engine)

# should get entirely new models now (different IDs)
assert len(newer_model_ids) == 6
assert len(set(new_model_ids) & set(newer_model_ids)) == 0


def test_n_jobs_not_new_model(default_model_trainer):
grid_config = {
"sklearn.ensemble.AdaBoostClassifier": {"n_estimators": [10, 100, 1000]},
8 changes: 4 additions & 4 deletions src/tests/catwalk_tests/test_utils.py
@@ -64,9 +64,9 @@ def test_save_experiment_and_get_hash():
with testing.postgresql.Postgresql() as postgresql:
engine = create_engine(postgresql.url())
ensure_db(engine)
exp_hash = save_experiment_and_get_hash(experiment_config, 1234, engine)
exp_hash = save_experiment_and_get_hash(experiment_config, engine)
assert isinstance(exp_hash, str)
new_hash = save_experiment_and_get_hash(experiment_config, 1234, engine)
new_hash = save_experiment_and_get_hash(experiment_config, engine)
assert new_hash == exp_hash


Expand All @@ -75,7 +75,7 @@ def test_missing_model_hashes():
db_engine = create_engine(postgresql.url())
ensure_db(db_engine)

experiment_hash = save_experiment_and_get_hash({}, 1234, db_engine)
experiment_hash = save_experiment_and_get_hash({}, db_engine)
model_hashes = ['abcd', 'bcde', 'cdef']

# if we associate model hashes with an experiment but don't actually train the models
Expand All @@ -96,7 +96,7 @@ def test_missing_matrix_uuids():
db_engine = create_engine(postgresql.url())
ensure_db(db_engine)

experiment_hash = save_experiment_and_get_hash({}, 1234, db_engine)
experiment_hash = save_experiment_and_get_hash({}, db_engine)
matrix_uuids = ['abcd', 'bcde', 'cdef']

# if we associate matrix uuids with an experiment but don't actually build the matrices