Merge pull request #274 from microsoft/docstr

update docstr
microsoft · Nov 8, 2021 · 3f09c69 · 3f09c69
2 parents 62a3170 + 5b68f55
commit 3f09c69
Show file tree

Hide file tree

Showing 24 changed files with 1,250 additions and 1,108 deletions.
diff --git a/flaml/automl.py b/flaml/automl.py
@@ -1,8 +1,7 @@
-"""!
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT License. See LICENSE file in the
- * project root for license information.
-"""
+# !
+#  * Copyright (c) Microsoft Corporation. All rights reserved.
+#  * Licensed under the MIT License. See LICENSE file in the
+#  * project root for license information.
 import time
 from typing import Callable, Optional
 from functools import partial
@@ -311,7 +310,7 @@ def size(state: AutoMLState, config: dict) -> float:
 
 
 class AutoML:
-    """The AutoML class
+    """The AutoML class.
 
     Example:
 
@@ -359,10 +358,10 @@ def model(self):
         return self.__dict__.get("_trained_estimator")
 
     def best_model_for_estimator(self, estimator_name):
-        """Return the best model found for a particular estimator
+        """Return the best model found for a particular estimator.
 
         Args:
-            estimator_name: a str of the estimator's name
+            estimator_name: a str of the estimator's name.
 
         Returns:
             An object with `predict()` and `predict_proba()` method (for
@@ -398,7 +397,7 @@ def best_config_per_estimator(self):
 
     @property
     def best_loss(self):
-        """A float of the best loss found"""
+        """A float of the best loss found."""
         return self._state.best_loss
 
     @property
@@ -421,7 +420,7 @@ def classes_(self):
 
     @property
     def time_to_find_best_model(self) -> float:
-        """Time taken to find best model in seconds"""
+        """Time taken to find best model in seconds."""
         return self.__dict__.get("_time_taken_best_iter")
 
     def predict(self, X_test):
@@ -490,7 +489,7 @@ def _preprocess(self, X):
         if issparse(X):
             X = X.tocsr()
         if self._transformer:
-            X = self._transformer.transform(X, self._state.task)
+            X = self._transformer.transform(X)
         return X
 
     def _validate_data(
@@ -583,13 +582,11 @@ def _validate_data(
                 X_val.shape[0] == y_val.shape[0]
             ), "# rows in X_val must match length of y_val."
             if self._transformer:
-                self._state.X_val = self._transformer.transform(X_val, self._state.task)
+                self._state.X_val = self._transformer.transform(X_val)
             else:
                 self._state.X_val = X_val
             if self._label_transformer:
-                self._state.y_val = self._label_transformer.transform(
-                    y_val, self._state.task
-                )
+                self._state.y_val = self._label_transformer.transform(y_val)
             else:
                 self._state.y_val = y_val
         else:
@@ -852,26 +849,26 @@ def _prepare_data(self, eval_method, split_ratio, n_splits):
             )
 
     def add_learner(self, learner_name, learner_class):
-        """Add a customized learner
+        """Add a customized learner.
 
         Args:
-            learner_name: A string of the learner's name
-            learner_class: A subclass of flaml.model.BaseEstimator
+            learner_name: A string of the learner's name.
+            learner_class: A subclass of flaml.model.BaseEstimator.
         """
         self._state.learner_classes[learner_name] = learner_class
 
     def get_estimator_from_log(self, log_file_name, record_id, task):
-        """Get the estimator from log file
+        """Get the estimator from log file.
 
         Args:
-            log_file_name: A string of the log file name
+            log_file_name: A string of the log file name.
             record_id: An integer of the record ID in the file,
-                0 corresponds to the first trial
+                0 corresponds to the first trial.
             task: A string of the task type,
-                'binary', 'multi', 'regression', 'ts_forecast', 'rank'
+                'binary', 'multi', 'regression', 'ts_forecast', 'rank'.
 
         Returns:
-            An estimator object for the given configuration
+            An estimator object for the given configuration.
         """
 
         with training_log_reader(log_file_name) as reader:
@@ -910,16 +907,16 @@ def retrain_from_log(
         auto_augment=True,
         **fit_kwargs,
     ):
-        """Retrain from log file
+        """Retrain from log file.
 
         Args:
-            log_file_name: A string of the log file name
-            X_train: A numpy array of training data in shape n*m
+            log_file_name: A string of the log file name.
+            X_train: A numpy array or dataframe of training data in shape n*m.
                 For 'ts_forecast' task, the first column of X_train
                 must be the timestamp column (datetime type). Other
                 columns in the dataframe are assumed to be exogenous
                 variables (categorical or numeric).
-            y_train: A numpy array of labels in shape n*1
+            y_train: A numpy array or series of labels in shape n*1.
             dataframe: A dataframe of training data including label column.
                 For 'ts_forecast' task, dataframe must be specified and should
                 have at least two columns: timestamp and label, where the first
@@ -1080,11 +1077,13 @@ def _decide_eval_method(self, time_budget):
 
     @property
     def search_space(self) -> dict:
-        """Search space
-        Must be called after fit(...) (use max_iter=0 to prevent actual fitting)
+        """Search space.
+
+        Must be called after fit(...)
+        (use max_iter=0 and retrain_full=False to prevent actual fitting).
 
         Returns:
-            A dict of the search space
+            A dict of the search space.
         """
         estimator_list = self.estimator_list
         if len(estimator_list) == 1:
@@ -1101,7 +1100,7 @@ def search_space(self) -> dict:
 
     @property
     def low_cost_partial_config(self) -> dict:
-        """Low cost partial config
+        """Low cost partial config.
 
         Returns:
             A dict.
@@ -1112,7 +1111,6 @@ def low_cost_partial_config(self) -> dict:
             to each learner's low_cost_partial_config; the estimator index as
             an integer corresponding to the cheapest learner is appended to the
             list at the end.
-
         """
         if len(self.estimator_list) == 1:
             estimator = self.estimator_list[0]
@@ -1146,7 +1144,6 @@ def cat_hp_cost(self) -> dict:
             a list of the cat_hp_cost's as the value, corresponding
             to each learner's cat_hp_cost; the cost relative to lgbm for each
             learner (as a list itself) is appended to the list at the end.
-
         """
         if len(self.estimator_list) == 1:
             estimator = self.estimator_list[0]
@@ -1198,28 +1195,28 @@ def prune_attr(self) -> Optional[str]:
 
     @property
     def min_resource(self) -> Optional[float]:
-        """Attribute for pruning
+        """Attribute for pruning.
 
         Returns:
-            A float for the minimal sample size or None
+            A float for the minimal sample size or None.
         """
         return self._min_sample_size if self._sample else None
 
     @property
     def max_resource(self) -> Optional[float]:
-        """Attribute for pruning
+        """Attribute for pruning.
 
         Returns:
-            A float for the maximal sample size or None
+            A float for the maximal sample size or None.
         """
         return self._state.data_size if self._sample else None
 
     @property
     def trainable(self) -> Callable[[dict], Optional[float]]:
-        """Training function
+        """Training function.
 
         Returns:
-            A function that evaluates each config and returns the loss
+            A function that evaluates each config and returns the loss.
         """
         self._state.time_from_start = 0
         for estimator in self.estimator_list:
@@ -1255,10 +1252,10 @@ def train(config: dict):
 
     @property
     def metric_constraints(self) -> list:
-        """Metric constraints
+        """Metric constraints.
 
         Returns:
-            A list of the metric constraints
+            A list of the metric constraints.
         """
         constraints = []
         if np.isfinite(self._pred_time_limit):
@@ -1310,7 +1307,7 @@ def fit(
         use_ray=False,
         **fit_kwargs,
     ):
-        """Find a model for a given task
+        """Find a model for a given task.
 
         Args:
             X_train: A numpy array or a pandas dataframe of training data in
@@ -1499,6 +1496,7 @@ def custom_metric(
             and eval_method == "holdout"
             and self._state.X_val is None
             or eval_method == "cv"
+            and (max_iter > 0 or retrain_full is True)
             or max_iter == 1
         )
         self._auto_augment = auto_augment

diff --git a/flaml/data.py b/flaml/data.py
@@ -1,8 +1,7 @@
-"""!
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT License.
-"""
-
+# !
+#  * Copyright (c) Microsoft Corporation. All rights reserved.
+#  * Licensed under the MIT License. See LICENSE file in the
+#  * project root for license information.
 import numpy as np
 from scipy.sparse import vstack, issparse
 import pandas as pd
@@ -130,17 +129,15 @@ def get_output_from_log(filename, time_budget):
     """Get output from log file
 
     Args:
-        filename: A string of the log file name
-        time_budget: A float of the time budget in seconds
+        filename: A string of the log file name.
+        time_budget: A float of the time budget in seconds.
 
     Returns:
-        search_time_list: A list of the finished time of each logged iter
-        best_error_list:
-            A list of the best validation error after each logged iter
-        error_list: A list of the validation error of each logged iter
-        config_list:
-            A list of the estimator, sample size and config of each logged iter
-        logged_metric_list: A list of the logged metric of each logged iter
+        search_time_list: A list of the finished time of each logged iter.
+        best_error_list: A list of the best validation error after each logged iter.
+        error_list: A list of the validation error of each logged iter.
+        config_list: A list of the estimator, sample size and config of each logged iter.
+        logged_metric_list: A list of the logged metric of each logged iter.
     """
 
     best_config = None
@@ -208,9 +205,21 @@ def concat(X1, X2):
 
 
 class DataTransformer:
-    """transform X, y"""
+    """Transform input training data."""
 
     def fit_transform(self, X, y, task):
+        """Fit transformer and process the input training data according to the task type.
+
+        Args:
+            X: A numpy array or a pandas dataframe of training data.
+            y: A numpy array or a pandas series of labels.
+            task: A string of the task type, e.g.,
+                'classification', 'regression', 'ts_forecast', 'rank'.
+
+        Returns:
+            X: Processed numpy array or pandas dataframe of training data.
+            y: Processed numpy array or pandas series of labels.
+        """
         if isinstance(X, pd.DataFrame):
             X = X.copy()
             n = X.shape[0]
@@ -320,17 +329,30 @@ def fit_transform(self, X, y, task):
             y = self.label_transformer.fit_transform(y)
         else:
             self.label_transformer = None
+        self._task = task
         return X, y
 
-    def transform(self, X, task):
+    def transform(self, X):
+        """Process data using fit transformer.
+
+        The task type recorded by the preceding `fit_transform` call is
+        reused, so it is not passed again here.
+
+        Args:
+            X: A numpy array or a pandas dataframe of the data to process,
+                e.g., validation or test data.
+
+        Returns:
+            X: Processed numpy array or pandas dataframe of the data.
+        """
         X = X.copy()
         if isinstance(X, pd.DataFrame):
             cat_columns, num_columns, datetime_columns = (
                 self._cat_columns,
                 self._num_columns,
                 self._datetime_columns,
             )
-            if task == TS_FORECAST:
+            if self._task == TS_FORECAST:
                 X = X.rename(columns={X.columns[0]: TS_TIMESTAMP_COL})
                 ds_col = X.pop(TS_TIMESTAMP_COL)
             if datetime_columns:
@@ -357,7 +379,7 @@ def transform(self, X, task):
                     X[column] = X[column].map(datetime.toordinal)
                     del tmp_dt
             X = X[cat_columns + num_columns].copy()
-            if task == TS_FORECAST:
+            if self._task == TS_FORECAST:
                 X.insert(0, TS_TIMESTAMP_COL, ds_col)
             for column in cat_columns:
                 if X[column].dtype.name == "object":