Skip to content

Commit

Permalink
Results validation.
Browse files Browse the repository at this point in the history
  • Loading branch information
fferegrino committed Feb 11, 2018
1 parent f93b968 commit 92b1c44
Show file tree
Hide file tree
Showing 3 changed files with 355 additions and 96 deletions.
2 changes: 2 additions & 0 deletions notebooks/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*.txt
*.json
294 changes: 229 additions & 65 deletions notebooks/results-validation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,67 +2,270 @@
"cells": [
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 65,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from glob import glob as g\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 82,
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"PageRankResults = pd.read_table(\"PageRankResults.txt\", sep=\" \", \n",
" quoting=3, na_values='', na_filter=False,\n",
" header=None, names=[\"article\", \"pr\"])\n",
"PageRankResults.set_index(\"article\", inplace=True)"
"PageRankResults_list = []\n",
"names = []\n",
"for results_file in sorted(g(\"PageRankResults_round*.txt\")):\n",
" name = \"pr_\" + results_file[len(\"PageRankResults_\"):-4]\n",
" PageRankResults = pd.read_table(results_file, sep=\" \", \n",
" quoting=3, na_values='', na_filter=False,\n",
" header=None, names=[\"article\", name])\n",
" PageRankResults.set_index(\"article\", inplace=True)\n",
" PageRankResults_list.append(PageRankResults)"
]
},
{
"cell_type": "code",
"execution_count": 83,
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>pr_round0</th>\n",
" <th>pr_round1</th>\n",
" <th>pr_round2</th>\n",
" <th>pr_round3</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>903556.000000</td>\n",
" <td>903556.000000</td>\n",
" <td>903556.000000</td>\n",
" <td>903556.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>0.155969</td>\n",
" <td>0.155968</td>\n",
" <td>0.155968</td>\n",
" <td>0.155969</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.067821</td>\n",
" <td>0.067881</td>\n",
" <td>0.067881</td>\n",
" <td>0.067820</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>0.150000</td>\n",
" <td>0.150000</td>\n",
" <td>0.150000</td>\n",
" <td>0.150000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>0.150465</td>\n",
" <td>0.150466</td>\n",
" <td>0.150466</td>\n",
" <td>0.150465</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>0.151263</td>\n",
" <td>0.151263</td>\n",
" <td>0.151263</td>\n",
" <td>0.151263</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>0.153601</td>\n",
" <td>0.153602</td>\n",
" <td>0.153602</td>\n",
" <td>0.153601</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>25.471842</td>\n",
" <td>25.487890</td>\n",
" <td>25.487890</td>\n",
" <td>25.471392</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" pr_round0 pr_round1 pr_round2 pr_round3\n",
"count 903556.000000 903556.000000 903556.000000 903556.000000\n",
"mean 0.155969 0.155968 0.155968 0.155969\n",
"std 0.067821 0.067881 0.067881 0.067820\n",
"min 0.150000 0.150000 0.150000 0.150000\n",
"25% 0.150465 0.150466 0.150466 0.150465\n",
"50% 0.151263 0.151263 0.151263 0.151263\n",
"75% 0.153601 0.153602 0.153602 0.153601\n",
"max 25.471842 25.487890 25.487890 25.471392"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"PageRankResults = PageRankResults_list[0]\n",
"for df_ in PageRankResults_list[1:]:\n",
" PageRankResults = PageRankResults.join(df_, how='outer')\n",
"\n",
"PageRankResults.describe()"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>pr_round0</th>\n",
" <th>pr_round1</th>\n",
" <th>pr_round2</th>\n",
" <th>pr_round3</th>\n",
" </tr>\n",
" <tr>\n",
" <th>article</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [pr_round0, pr_round1, pr_round2, pr_round3]\n",
"Index: []"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"null_values = PageRankResults[PageRankResults[\"pr_round0\"].isnull() |\n",
" PageRankResults[\"pr_round1\"].isnull() |\n",
" PageRankResults[\"pr_round2\"].isnull() |\n",
" PageRankResults[\"pr_round3\"].isnull()]\n",
"null_values.head()"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"pr_round0 0.150058\n",
"pr_round1 0.150058\n",
"pr_round2 0.150058\n",
"pr_round3 0.150058\n",
"Name: Dejiko_No_Maibura__(でじこ の まいブラ), dtype: float64"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"PageRankResults.describe()\n",
"PageRankResults.loc[\"Dejiko_No_Maibura__(でじこ の まいブラ)\"]"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" pr\n",
"count 903556.000000\n",
"mean 0.155968\n",
"std 0.067881\n",
"min 0.150000\n",
"25% 0.150466\n",
"50% 0.151263\n",
"75% 0.153602\n",
"max 25.487890\n",
"\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 903556 entries, \"Battling\"_Green to ニ\n",
"Data columns (total 1 columns):\n",
"pr 903556 non-null float64\n",
"dtypes: float64(1)\n",
"memory usage: 13.8+ MB\n",
"Index: 903556 entries, !!! to 애국가,_pronounced_as_Aegug-ga\n",
"Data columns (total 4 columns):\n",
"pr_round0 903556 non-null float64\n",
"pr_round1 903556 non-null float64\n",
"pr_round2 903556 non-null float64\n",
"pr_round3 903556 non-null float64\n",
"dtypes: float64(4)\n",
"memory usage: 74.5+ MB\n",
"None\n"
]
}
],
"source": [
"print(PageRankResults.describe())\n",
"print()\n",
"print(PageRankResults.info())"
]
},
{
"cell_type": "code",
"execution_count": 84,
"execution_count": 71,
"metadata": {
"scrolled": true
},
Expand All @@ -72,7 +275,7 @@
"output_type": "stream",
"text": [
"Empty DataFrame\n",
"Columns: [pr]\n",
"Columns: [pr_round0, pr_round1, pr_round2, pr_round3]\n",
"Index: []\n"
]
}
Expand All @@ -82,45 +285,6 @@
"len(duplicated)\n",
"print(duplicated)"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "'the label [\"] is not in the [index]'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/Users/fferegrino/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_has_valid_type\u001b[0;34m(self, key, axis)\u001b[0m\n\u001b[1;32m 1433\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0max\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontains\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1434\u001b[0;31m \u001b[0merror\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1435\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mTypeError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/fferegrino/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36merror\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1428\u001b[0m raise KeyError(\"the label [%s] is not in the [%s]\" %\n\u001b[0;32m-> 1429\u001b[0;31m (key, self.obj._get_axis_name(axis)))\n\u001b[0m\u001b[1;32m 1430\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyError\u001b[0m: 'the label [\"] is not in the [index]'",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-87-ba74a000b88a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mPageRankResults\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"\\\"\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/Users/fferegrino/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 1326\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1327\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply_if_callable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1328\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_axis\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1329\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1330\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_is_scalar_access\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/fferegrino/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_getitem_axis\u001b[0;34m(self, key, axis)\u001b[0m\n\u001b[1;32m 1549\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1550\u001b[0m \u001b[0;31m# fall thru to straight lookup\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1551\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_has_valid_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1552\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_label\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1553\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/fferegrino/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_has_valid_type\u001b[0;34m(self, key, axis)\u001b[0m\n\u001b[1;32m 1440\u001b[0m \u001b[0;32mraise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1441\u001b[0m \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1442\u001b[0;31m \u001b[0merror\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1443\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1444\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/fferegrino/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36merror\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1427\u001b[0m \"key\")\n\u001b[1;32m 1428\u001b[0m raise KeyError(\"the label [%s] is not in the [%s]\" %\n\u001b[0;32m-> 1429\u001b[0;31m (key, self.obj._get_axis_name(axis)))\n\u001b[0m\u001b[1;32m 1430\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1431\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyError\u001b[0m: 'the label [\"] is not in the [index]'"
]
}
],
"source": [
"PageRankResults.loc[\" \"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
Loading

0 comments on commit 92b1c44

Please sign in to comment.