Skip to content

Commit

Permalink
fix pandas
Browse files Browse the repository at this point in the history
  • Loading branch information
luweizheng committed Sep 23, 2023
1 parent 572f674 commit 3405de6
Show file tree
Hide file tree
Showing 28 changed files with 90 additions and 4,651 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
**/.ipynb_checkpoints
**/__pycache__
data/
datasets/
*.json
*.params
*.DS_Store
Expand Down
2 changes: 1 addition & 1 deletion build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ jupyter-book build ./
rm -rf docs
mkdir docs
touch docs/.nojekyll

cp -r datasets ./docs/
rsync -a _build/html/* ./docs/
10 changes: 6 additions & 4 deletions ch-pandas/data-preprocessing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,11 @@
"iopub.status.busy": "2023-09-11T14:28:36.287206Z",
"iopub.status.idle": "2023-09-11T14:28:36.942807Z",
"shell.execute_reply": "2023-09-11T14:28:36.941596Z"
}
},
"tags": [
"hide code",
"hide-cell"
]
},
"outputs": [
{
Expand All @@ -32,14 +36,12 @@
}
],
"source": [
"# Hide outputs\n",
"# Hide code\n",
"import os\n",
"import urllib.request\n",
"import zipfile\n",
"import pandas as pd\n",
"\n",
"folder_path = os.path.join(os.getcwd(), \"./data/pwt\")\n",
"folder_path = os.path.join(os.getcwd(), \"../data/pwt\")\n",
"download_url = \"https://www.rug.nl/ggdc/docs/pwt70_06032011version.zip\"\n",
"file_name = download_url.split(\"/\")[-1]\n",
"if not os.path.exists(folder_path):\n",
Expand Down
7 changes: 4 additions & 3 deletions ch-pandas/dataframe-groupby.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,10 @@
"iopub.status.busy": "2023-09-18T00:08:11.016110Z",
"iopub.status.idle": "2023-09-18T00:08:11.605146Z",
"shell.execute_reply": "2023-09-18T00:08:11.604394Z"
}
},
"tags": [
"hide-cell"
]
},
"outputs": [
{
Expand All @@ -41,8 +44,6 @@
}
],
"source": [
"# Hide outputs\n",
"# Hide code\n",
"import os\n",
"import urllib.request\n",
"import zipfile\n",
Expand Down
8 changes: 5 additions & 3 deletions ch-pandas/dataframe-merge-concat.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1063,7 +1063,10 @@
"iopub.status.busy": "2023-09-18T00:21:29.110565Z",
"iopub.status.idle": "2023-09-18T00:21:29.123036Z",
"shell.execute_reply": "2023-09-18T00:21:29.120657Z"
}
},
"tags": [
"hide-cell"
]
},
"outputs": [
{
Expand All @@ -1075,12 +1078,11 @@
}
],
"source": [
"# Hide outputs\n",
"import urllib.request\n",
"import os\n",
"import pandas as pd\n",
"\n",
"folder_path = os.path.join(os.getcwd(), \"./data/student-score\")\n",
"folder_path = os.path.join(os.getcwd(), \"../data/student-score\")\n",
"score_download_url = \"score.csv\"\n",
"student_attr_download_url = \"student.csv\"\n",
"\n",
Expand Down
9 changes: 5 additions & 4 deletions ch-pandas/dataframe-slicing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@
"iopub.status.busy": "2023-09-11T14:28:34.456141Z",
"iopub.status.idle": "2023-09-11T14:28:35.515803Z",
"shell.execute_reply": "2023-09-11T14:28:35.515056Z"
}
},
"tags": [
"hide-cell"
]
},
"outputs": [
{
Expand All @@ -32,14 +35,12 @@
}
],
"source": [
"# Hide outputs\n",
"# Hide code\n",
"import os\n",
"import urllib.request\n",
"import zipfile\n",
"import pandas as pd\n",
"\n",
"folder_path = os.path.join(os.getcwd(), \"./data/pwt\")\n",
"folder_path = os.path.join(os.getcwd(), \"../data/pwt\")\n",
"download_url = \"https://www.rug.nl/ggdc/docs/pwt70_06032011version.zip\"\n",
"file_name = download_url.split(\"/\")[-1]\n",
"if not os.path.exists(folder_path):\n",
Expand Down
9 changes: 5 additions & 4 deletions ch-pandas/series-dataframe.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@
},
"outputs": [],
"source": [
"# Hide outputs\n",
"# Hide code\n",
"import pandas as pd"
]
},
Expand Down Expand Up @@ -424,7 +422,10 @@
"iopub.status.busy": "2023-09-11T14:28:40.769708Z",
"iopub.status.idle": "2023-09-11T14:28:40.775507Z",
"shell.execute_reply": "2023-09-11T14:28:40.774814Z"
}
},
"tags": [
"hide-cell"
]
},
"outputs": [
{
Expand All @@ -440,7 +441,7 @@
"import urllib.request\n",
"import zipfile\n",
"\n",
"folder_path = os.path.join(os.getcwd(), \"./data/pwt\")\n",
"folder_path = os.path.join(os.getcwd(), \"../data/pwt\")\n",
"download_url = \"https://www.rug.nl/ggdc/docs/pwt70_06032011version.zip\"\n",
"file_name = download_url.split(\"/\")[-1]\n",
"if not os.path.exists(folder_path):\n",
Expand Down
13 changes: 7 additions & 6 deletions docs/_sources/ch-pandas/data-preprocessing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
"metadata": {},
"source": [
"# 数据处理\n",
":label:`data-preprocessing`\n",
"\n",
"数据处理工作包括处理重复值、缺失值和异常值,生成新的列或者行等。"
]
Expand All @@ -21,7 +20,11 @@
"iopub.status.busy": "2023-09-11T14:28:36.287206Z",
"iopub.status.idle": "2023-09-11T14:28:36.942807Z",
"shell.execute_reply": "2023-09-11T14:28:36.941596Z"
}
},
"tags": [
"hide code",
"hide-cell"
]
},
"outputs": [
{
Expand All @@ -33,14 +36,12 @@
}
],
"source": [
"# Hide outputs\n",
"# Hide code\n",
"import os\n",
"import urllib.request\n",
"import zipfile\n",
"import pandas as pd\n",
"\n",
"folder_path = os.path.join(os.getcwd(), \"./data/pwt\")\n",
"folder_path = os.path.join(os.getcwd(), \"../data/pwt\")\n",
"download_url = \"https://www.rug.nl/ggdc/docs/pwt70_06032011version.zip\"\n",
"file_name = download_url.split(\"/\")[-1]\n",
"if not os.path.exists(folder_path):\n",
Expand Down Expand Up @@ -3423,4 +3424,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}
1 change: 0 additions & 1 deletion docs/_sources/ch-pandas/dataframe-groupby.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
"metadata": {},
"source": [
"# 分组汇总\n",
":label:`dataframe-groupby`\n",
"\n",
"实际的数据分析中,经常需要对某一类数据进行统计分析。比如,假如我们拥有全国所有人的身高和体重数据,我们想按照省份分组,统计每个省的平均身高和平均体重,这时候就需要使用分组操作。pandas 提供了 `groupby` 函数进行类似的分组汇总操作。:numref:`groupby-img` 展示了计算平均身高的分组汇总流程,主要包括两部分:分组与汇总。其中分组阶段将同一类的内容归结到相同的组中;汇总阶段将所关心的数据进行计算,比如求和、求平均等。\n",
"\n",
Expand Down
12 changes: 5 additions & 7 deletions docs/_sources/ch-pandas/dataframe-merge-concat.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,13 @@
"metadata": {},
"source": [
"# 多表操作\n",
":label:`dataframe-merge-concat`\n",
"\n",
"之前的操作主要在单个 `DataFrame` 上进行,实际上,我们经常需要将多个 `DataFrame` 联合起来进行分析。pandas 提供了多个 `DataFrame` 之间的合并和连接操作,分别是 `merge()` 和 `concat()` 函数。比如,我们可以将两个 `DataFrame` 合并成一个,且保留所有的列。\n",
"\n",
"### merge\n",
"\n",
"pandas 的 `merge` 操作可以合并两个 `DataFrame`(或者称为表) ,类似于 SQL 中的 JOIN 操作。 我们可以想象成:一个大表被拆分成两个小表,两个小表都包含一些同样的数据。现在我们需要把两个小表合并,生成一个大表,大表包含了两个小表的字段。\n",
"\n",
"![分组与汇总](../img/ch-pandas/merge.svg)\n",
":width:`800px`\n",
":label:`merge-img`\n",
"```{figure} ../img/ch-pandas/merge.svg\n",
"---\n",
"name: merge-img\n",
Expand Down Expand Up @@ -1067,7 +1063,10 @@
"iopub.status.busy": "2023-09-18T00:21:29.110565Z",
"iopub.status.idle": "2023-09-18T00:21:29.123036Z",
"shell.execute_reply": "2023-09-18T00:21:29.120657Z"
}
},
"tags": [
"hide-cell"
]
},
"outputs": [
{
Expand All @@ -1079,12 +1078,11 @@
}
],
"source": [
"# Hide outputs\n",
"import urllib.request\n",
"import os\n",
"import pandas as pd\n",
"\n",
"folder_path = os.path.join(os.getcwd(), \"./data/student-score\")\n",
"folder_path = os.path.join(os.getcwd(), \"../data/student-score\")\n",
"score_download_url = \"score.csv\"\n",
"student_attr_download_url = \"student.csv\"\n",
"\n",
Expand Down
12 changes: 6 additions & 6 deletions docs/_sources/ch-pandas/dataframe-slicing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
"metadata": {},
"source": [
"# 数据切片\n",
":label:`dataframe-slicing`\n",
"\n",
"实际中,我们常常不是分析整个数据,而是数据中的部分子集。如何根据特定的条件获得所需要的数据是本节的主要内容。"
]
Expand All @@ -21,7 +20,10 @@
"iopub.status.busy": "2023-09-11T14:28:34.456141Z",
"iopub.status.idle": "2023-09-11T14:28:35.515803Z",
"shell.execute_reply": "2023-09-11T14:28:35.515056Z"
}
},
"tags": [
"hide-cell"
]
},
"outputs": [
{
Expand All @@ -33,14 +35,12 @@
}
],
"source": [
"# Hide outputs\n",
"# Hide code\n",
"import os\n",
"import urllib.request\n",
"import zipfile\n",
"import pandas as pd\n",
"\n",
"folder_path = os.path.join(os.getcwd(), \"./data/pwt\")\n",
"folder_path = os.path.join(os.getcwd(), \"../data/pwt\")\n",
"download_url = \"https://www.rug.nl/ggdc/docs/pwt70_06032011version.zip\"\n",
"file_name = download_url.split(\"/\")[-1]\n",
"if not os.path.exists(folder_path):\n",
Expand Down Expand Up @@ -2453,4 +2453,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}
12 changes: 6 additions & 6 deletions docs/_sources/ch-pandas/series-dataframe.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
"metadata": {},
"source": [
"# Series 与 DataFrame\n",
":label:`series-dataframe`\n",
"\n",
"pandas 的核心数据结构有两个: Series 和 DataFrame。"
]
Expand All @@ -25,8 +24,6 @@
},
"outputs": [],
"source": [
"# Hide outputs\n",
"# Hide code\n",
"import pandas as pd"
]
},
Expand Down Expand Up @@ -425,7 +422,10 @@
"iopub.status.busy": "2023-09-11T14:28:40.769708Z",
"iopub.status.idle": "2023-09-11T14:28:40.775507Z",
"shell.execute_reply": "2023-09-11T14:28:40.774814Z"
}
},
"tags": [
"hide-cell"
]
},
"outputs": [
{
Expand All @@ -441,7 +441,7 @@
"import urllib.request\n",
"import zipfile\n",
"\n",
"folder_path = os.path.join(os.getcwd(), \"./data/pwt\")\n",
"folder_path = os.path.join(os.getcwd(), \"../data/pwt\")\n",
"download_url = \"https://www.rug.nl/ggdc/docs/pwt70_06032011version.zip\"\n",
"file_name = download_url.split(\"/\")[-1]\n",
"if not os.path.exists(folder_path):\n",
Expand Down Expand Up @@ -1218,4 +1218,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}
Loading

0 comments on commit 3405de6

Please sign in to comment.