helper added

wri · Mar 12, 2020 · afcc192 · afcc192
1 parent 5570af1
commit afcc192
Showing 1 changed file with 325 additions and 0 deletions.
diff --git a/archive/phase_iv/final/helper_sum-confusion-matrices.ipynb b/archive/phase_iv/final/helper_sum-confusion-matrices.ipynb
@@ -0,0 +1,325 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Helper: Sum Confusion Matrices\n",
+    "Select an arbitrary set of rows in a scorecard, and sum the confusion matrices to generate a single \"total\" confusion matrix. This allows for direct calculation of statistics across the full set, rather than merely averaging statistics derived from the various individual entries.  \n",
+    "\n",
+    "Date: 3 March 2020  \n",
+    "Author: Peter Kerins  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Load and inspect scorecard data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "scorecard_path = \"C:/Users/Peter.Kerins/World Resources Institute/Urban Land Use - Documents/WRI Results/phase_iv/scorecards_analysis/single-sheet_composite_validation.csv\"\n",
+    "df = pd.read_csv(scorecard_path,sep=',')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.confusion"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Separate 3-category and 6-category entries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df6 = df[df.notes.str.contains('full') & df.notes.str.contains('2019')]\n",
+    "df3 = df[df.notes.str#### Confusion matrix scoring method.contains('reduced') & df.notes.str.contains('2019')]\n",
+    "print (len(df6), len(df3))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Define confusion matrix scoring method\n",
+    "Copied from `util_scoring.py` for simplicity (this notebook can be executed with just the scorecard file, not needing any other project code; helpful when VM is not currently constituted)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# from util_scoring.py\n",
+    "def calc_confusion_details(confusion):\n",
+    "    n_categories = confusion.shape[0]\n",
+    "    # out of samples in category, how many assigned to that category\n",
+    "    # (true positives) / (true positives + false negatives)\n",
+    "    # (correct) / (samples from category)\n",
+    "    recalls = np.zeros(n_categories, dtype='float32')\n",
+    "    # out of samples assigned to category, how many belong to that category\n",
+    "    # (true positives) / (true positives + false positives)\n",
+    "    # (correct) / (samples assigned to category)\n",
+    "    precisions = np.zeros(n_categories, dtype='float32')\n",
+    "\n",
+    "    for j in range(n_categories):\n",
+    "        ascribed = np.sum(confusion[:,j])\n",
+    "        actual = np.sum(confusion[j,:])\n",
+    "        correct = confusion[j,j]\n",
+    "        if actual:\n",
+    "            recalls[j] = float(correct)/float(actual)\n",
+    "        else:\n",
+    "            recalls[j] = 1e8\n",
+    "        if ascribed:\n",
+    "            precisions[j] = float(correct)/float(ascribed)\n",
+    "        else:\n",
+    "            precisions[j] = 1e8\n",
+    "    # what percentage of total samples were assigned to the correct category\n",
+    "    accuracy = confusion.trace()/float(confusion.sum())\n",
+    "\n",
+    "    return recalls, precisions, accuracy"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 6-category scoring"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df6.confusion"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result = df6.confusion.apply(lambda x: \n",
+    "                           np.fromstring(\n",
+    "                               x.replace('\\n','')\n",
+    "                                .replace('[','')\n",
+    "                                .replace(']','')\n",
+    "                                .replace('  ',' '), sep=' '))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "np_sum = (result.sum())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "confusion = np_sum.reshape((6,6)).astype('uint')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "confusion"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "recalls, precisions, accuracy = calc_confusion_details(confusion)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(recalls)\n",
+    "print(precisions)\n",
+    "print(accuracy)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Calculate f-score\n",
+    "beta = 2\n",
+    "f_scores = (beta**2 + 1) * precisions * recalls / ( (beta**2 * precisions) + recalls )\n",
+    "f_score_average = np.mean(f_scores)\n",
+    "print (f_scores)\n",
+    "print (f_score_average)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3-category scoring"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df3.confusion"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result = df3.confusion.apply(lambda x: \n",
+    "                           np.fromstring(\n",
+    "                               x.replace('\\n','')\n",
+    "                                .replace('[','')\n",
+    "                                .replace(']','')\n",
+    "                                .replace('  ',' '), sep=' '))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "np_sum = (result.sum())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "confusion = np_sum.reshape((3,3)).astype('uint')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "confusion"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "recalls, precisions, accuracy = calc_confusion_details(confusion)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(recalls)\n",
+    "print(precisions)\n",
+    "print(accuracy)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Calculate f-score\n",
+    "beta = 2\n",
+    "f_scores = (beta**2 + 1) * precisions * recalls / ( (beta**2 * precisions) + recalls )\n",
+    "f_score_average = np.mean(f_scores)\n",
+    "print (f_scores)\n",
+    "print (f_score_average)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}