Merge pull request #5 from perib/dev

Dev
EpistasisLab · Apr 19, 2023 · d0129f0 · d0129f0
2 parents 9dd57ca + 4a49692
commit d0129f0
Show file tree

Hide file tree

Showing 28 changed files with 1,618 additions and 1,053 deletions.
diff --git a/README.md b/README.md
@@ -50,9 +50,81 @@ If you downloaded with git pull, then the repository folder will be named TPOT2.
 If you downloaded as a zip, the folder may be called tpot2-main. 
 
 
-### Usage 
+## Usage 
+
+See the Tutorials Folder for more instructions and examples.
+
+### Best Practices
+
+#### 1 
+TPOT2 uses dask for parallel processing. When Python is parallelized, each module is imported within each process. Therefore, it is important to protect all code within an `if __name__ == "__main__":` block when running TPOT2 from a script. This is not required when running TPOT2 from a notebook.
+
+For example:
+
+```
+#my_analysis.py
+
+import tpot2
+if __name__ == "__main__":
+    X, y = load_my_data()
+    est = tpot2.TPOTClassifier()
+    est.fit(X,y)
+    #rest of analysis
+```
+
+#### 2
+
+When designing custom objective functions, avoid the use of global variables.
+
+Don't Do:
+```
+global_a = 10
+def foo_objective(est, X, y):
+    return my_scorer(est, X, y, a=global_a)
+```
+
+Instead use a partial
+
+
+```
+from functools import partial
+
+def foo_scorer(est, X, y, a):
+    return my_scorer(est, X, y, a=a)
+
+final_scorer = partial(foo_scorer, a=10)
+```
+
+Similarly when using lambda functions.
+
+Don't Do:
+
+```
+def new_objective(est, a, b):
+    #definition
+
+a = 100
+b = 20
+bad_function = lambda est :  new_objective(est=est, a=a, b=b)
+```
+
+Do:
+```
+def new_objective(est, a, b):
+    #definition
+
+a = 100
+b = 20
+good_function = lambda est, a=a, b=b : new_objective(est=est, a=a, b=b)
+```
+
+### Tips
+
+TPOT2 will not check if your data is correctly formatted. It will assume that you have passed in operators that can handle the type of data that was passed in. For instance, if you pass in a pandas dataframe with categorical features and missing data, then you should also include in your configuration operators that can handle those features of the data. Alternatively, if you pass in `preprocessing = True`, TPOT2 will impute missing values, one hot encode categorical features, then standardize the data. (Note that this is currently fitted and transformed on the entire training set before splitting for CV. Later there will be an option to apply per fold, and have the parameters be learnable.)
+
+
+Setting `verbose` to 5 can be helpful during debugging as it will print out the error generated by failing pipelines. 
 
-See the Tutorials Folder for instructions and examples.
 
 ## Contributing to TPOT2
 

diff --git a/Tutorial/6_SH_and_early_termination.ipynb b/Tutorial/6_SH_and_early_termination.ipynb
diff --git a/Tutorial/7_dask_parallelization.ipynb b/Tutorial/7_dask_parallelization.ipynb
@@ -7,8 +7,47 @@
    "source": [
     "# Parallelization\n",
     "\n",
-    "TPOT2 uses the Dask package for parallelization either locally (dask.destributed.LocalCluster) or multi-node via a job schedule (dask-jobqueue). \n",
+    "TPOT2 uses the Dask package for parallelization either locally (dask.distributed.LocalCluster) or multi-node via a job scheduler (dask-jobqueue). \n"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Best Practices\n",
+    "\n",
+    "When running TPOT2 from a .py script, it is important to protect code with `if __name__==\"__main__\":`\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#my_analysis.py\n",
     "\n",
+    "from dask.distributed import Client, LocalCluster\n",
+    "import tpot2\n",
+    "import sklearn\n",
+    "import sklearn.datasets\n",
+    "import numpy as np\n",
+    "\n",
+    "if __name__==\"__main__\":\n",
+    "    scorer = sklearn.metrics.get_scorer('roc_auc_ovr')\n",
+    "    X, y = sklearn.datasets.load_digits(return_X_y=True)\n",
+    "    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)\n",
+    "    est = tpot2.TPOTClassifier(population_size= 8, generations=5,)\n",
+    "    est.fit(X_train, y_train)\n",
+    "    print(scorer(est, X_test, y_test))"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "## Local Machine Parallelization\n",
     "\n",
     "TPOT2 can be easily parallelized on a local computer by setting the n_jobs and memory_limit parameters.\n",
@@ -93,7 +132,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    " client.dashboard_link"
+    "client.dashboard_link"
    ]
   },
   {
@@ -112,7 +151,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "est = tpot2.TPOTClassifier(population_size= 8, generations=5, client=client verbose=1)\n",
+    "est = tpot2.TPOTClassifier(population_size= 8, generations=5, client=client, verbose=1)\n",
     "# this is equivalent to: \n",
     "# est = tpot2.TPOTClassifier(population_size= 8, generations=5, n_jobs=4, memory_limit=\"4GB\", verbose=1)\n",
     "est.fit(X_train, y_train)\n",

diff --git a/Tutorial/0_Genetic_Algorithm_Overview.ipynb → Tutorial/8_Genetic_Algorithm_Overview.ipynb b/Tutorial/0_Genetic_Algorithm_Overview.ipynb → Tutorial/8_Genetic_Algorithm_Overview.ipynb
@@ -7,7 +7,7 @@
    "source": [
     "Objective functions can optionally take in step, budget, and generations.\n",
     "\n",
-    "step - The same objective function will be run for #evalutation_early_stop_steps, the current step will be passed into the function as an interger. (This is useful for getting a single fold of cross validation for example).\n",
+    "step - The same objective function will be run for #evaluation_early_stop_steps, the current step will be passed into the function as an integer. (This is useful for getting a single fold of cross validation for example).\n",
     "\n",
     "budget - A parameter that varies over the course of the generations. Gets passed into the objective function as a float between 0 and 1. If the budget of the previous evaluation is less than the current budget, it will get re-evaluated. Useful for using smaller datasets earlier in training.\n",
     "\n",
@@ -16,13 +16,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "1e1495a724f34ba7a6b99c45737a406e",
+       "model_id": "b45777d6dffe4af3892ecc716ce12a64",
        "version_major": 2,
        "version_minor": 0
       },
@@ -40,6 +40,7 @@
     "import tpot2\n",
     "import random\n",
     "import matplotlib.pyplot as plt\n",
+    "from dask.distributed import Client, LocalCluster\n",
     "\n",
     "class SubsetSelector(tpot2.BaseIndividual):\n",
     "    def __init__(   self,\n",
@@ -64,6 +65,23 @@
     "        self.mutation_list = [self._mutate_add, self._mutate_remove]\n",
     "        self.crossover_list = [self._crossover_swap]\n",
     "        \n",
+    "\n",
+    "    def mutate(self,):\n",
+    "        mutation_list_copy = self.mutation_list.copy()\n",
+    "        random.shuffle(mutation_list_copy)\n",
+    "        for func in mutation_list_copy:\n",
+    "            if func():\n",
+    "                return True\n",
+    "        return False\n",
+    "\n",
+    "    def crossover(self, ind2):\n",
+    "        crossover_list_copy = self.crossover_list.copy()\n",
+    "        random.shuffle(crossover_list_copy)\n",
+    "        for func in crossover_list_copy:\n",
+    "            if func(ind2):\n",
+    "                return True\n",
+    "        return False\n",
+    "\n",
     "    def _mutate_add(self,):\n",
     "        not_included = list(self.values.difference(self.subsets))\n",
     "        if len(not_included) > 1:\n",
@@ -116,7 +134,9 @@
     "objective_names = [\"Value\", \"Weight\"]\n",
     "objective_function_weights = [1,-1]\n",
     "\n",
-    "evolver = tpot2.evolutionary_algorithms.eaNSGA2.eaNSGA2_Evolver(   individual_generator=individual_generator(), \n",
+    "\n",
+    "\n",
+    "evolver = tpot2.BaseEvolver(   individual_generator=individual_generator(), \n",
     "                                objective_functions=[simple_objective],\n",
     "                                objective_function_weights = objective_function_weights,\n",
     "                                bigger_is_better = True,\n",
@@ -412,7 +432,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.10.10"
   },
   "orig_nbformat": 4,
   "vscode": {

diff --git a/tpot2/__init__.py b/tpot2/__init__.py
@@ -13,3 +13,4 @@
 from .builtin_modules import *
 from .config import *
 from .representations import *
+from .parent_selectors import *