From 002c0158f597fbafd745133d1e900672d6557233 Mon Sep 17 00:00:00 2001 From: Petra Vidnerova Date: Sun, 10 Nov 2024 11:02:06 +0100 Subject: [PATCH 1/5] zprehledneni transformaci sloupcu oprava testovaciho prikladu --- lessons/pydata/scikitlearn_api/index.ipynb | 1574 +++++++++++++++----- 1 file changed, 1204 insertions(+), 370 deletions(-) diff --git a/lessons/pydata/scikitlearn_api/index.ipynb b/lessons/pydata/scikitlearn_api/index.ipynb index b40fad0..98663e3 100644 --- a/lessons/pydata/scikitlearn_api/index.ipynb +++ b/lessons/pydata/scikitlearn_api/index.ipynb @@ -473,61 +473,61 @@ " 165\n", " 1\n", " 0\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 1\n", - " 0\n", - " 1\n", + " False\n", + " True\n", + " False\n", + " False\n", + " True\n", + " False\n", + " True\n", " \n", " \n", " 197\n", " 4\n", " 4\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 1\n", - " 0\n", - " 1\n", + " False\n", + " True\n", + " False\n", + " False\n", + " True\n", + " False\n", + " True\n", " \n", " \n", " 78\n", " 26\n", " 19\n", - " 0\n", - " 0\n", - " 1\n", - " 0\n", - " 1\n", - " 0\n", - " 1\n", + " False\n", + " False\n", + " True\n", + " False\n", + " True\n", + " False\n", + " True\n", " \n", " \n", " 64\n", " 11\n", " 11\n", - " 1\n", - " 0\n", - " 0\n", - " 0\n", - " 1\n", - " 1\n", - " 0\n", + " True\n", + " False\n", + " False\n", + " False\n", + " True\n", + " True\n", + " False\n", " \n", " \n", " 166\n", " 21\n", " 8\n", - " 0\n", - " 0\n", - " 1\n", - " 0\n", - " 1\n", - " 0\n", - " 1\n", + " False\n", + " False\n", + " True\n", + " False\n", + " True\n", + " False\n", + " True\n", " \n", " \n", "\n", @@ -535,18 +535,18 @@ ], "text/plain": [ " yrs.since.phd yrs.service rank_AssocProf rank_AsstProf rank_Prof \\\n", - "165 1 0 0 1 0 \n", - "197 4 4 0 1 0 \n", - "78 26 19 0 0 1 \n", - "64 11 11 1 0 0 \n", - "166 21 8 0 0 1 \n", + "165 1 0 False True False \n", + "197 4 4 False True False \n", + "78 26 19 False False True \n", + "64 11 11 True False False \n", + "166 21 
8 False False True \n", "\n", " discipline_A discipline_B sex_Female sex_Male \n", - "165 0 1 0 1 \n", - "197 0 1 0 1 \n", - "78 0 1 0 1 \n", - "64 0 1 1 0 \n", - "166 0 1 0 1 " + "165 False True False True \n", + "197 False True False True \n", + "78 False True False True \n", + "64 False True True False \n", + "166 False True False True " ] }, "execution_count": 7, @@ -634,6 +634,342 @@ "cell_type": "code", "execution_count": 12, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<138x5 sparse matrix of type ''\n", + "\twith 276 stored elements in Compressed Sparse Row format>" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "encoder.transform(X_train_raw[categorical_columns])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0., 1., 0., 0., 1.],\n", + " [0., 1., 0., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [1., 0., 0., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 1., 0.],\n", + " [0., 0., 1., 1., 0.],\n", + " [0., 1., 0., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 1., 0., 1., 0.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 1., 0.],\n", + " [1., 0., 0., 0., 1.],\n", + " [1., 0., 0., 0., 1.],\n", + " [1., 0., 0., 1., 0.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 1., 0.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [1., 0., 0., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 1., 0., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [1., 0., 0., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [1., 0., 0., 0., 1.],\n", + " [0., 1., 0., 1., 0.],\n", + " [0., 0., 1., 1., 0.],\n", + " [0., 1., 0., 
0., 1.],\n", + " [0., 1., 0., 0., 1.],\n", + " [0., 1., 0., 0., 1.],\n", + " [1., 0., 0., 1., 0.],\n", + " [0., 1., 0., 0., 1.],\n", + " [0., 1., 0., 0., 1.],\n", + " [0., 1., 0., 0., 1.],\n", + " [1., 0., 0., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 1., 0., 0., 1.],\n", + " [0., 0., 1., 1., 0.],\n", + " [1., 0., 0., 1., 0.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 1., 0.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 1., 0., 0., 1.],\n", + " [0., 0., 1., 1., 0.],\n", + " [0., 0., 1., 0., 1.],\n", + " [1., 0., 0., 0., 1.],\n", + " [0., 1., 0., 0., 1.],\n", + " [0., 1., 0., 1., 0.],\n", + " [0., 0., 1., 1., 0.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 1., 0., 0., 1.],\n", + " [1., 0., 0., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [1., 0., 0., 1., 0.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [1., 0., 0., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [1., 0., 0., 1., 0.],\n", + " [0., 1., 0., 0., 1.],\n", + " [0., 1., 0., 0., 1.],\n", + " [0., 1., 0., 0., 1.],\n", + " [1., 0., 0., 0., 1.],\n", + " [0., 0., 1., 1., 0.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [1., 0., 0., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 1., 0.],\n", + " [0., 0., 1., 1., 0.],\n", + " [0., 1., 0., 0., 1.],\n", + " [0., 1., 0., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 1., 0.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 1., 0.],\n", + " [0., 0., 1., 1., 0.],\n", + " [0., 0., 1., 1., 0.],\n", + " [0., 0., 1., 1., 0.],\n", + " [0., 0., 1., 1., 0.],\n", + " [0., 1., 0., 0., 1.],\n", + " [0., 0., 1., 1., 0.],\n", + " [1., 0., 0., 0., 1.],\n", + " [1., 0., 0., 0., 1.],\n", + " [1., 0., 0., 0., 1.],\n", + " 
[0., 0., 1., 1., 0.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 1., 0., 1., 0.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 1., 0., 0., 1.],\n", + " [0., 1., 0., 0., 1.],\n", + " [0., 0., 1., 1., 0.],\n", + " [1., 0., 0., 1., 0.],\n", + " [0., 0., 1., 1., 0.],\n", + " [0., 1., 0., 0., 1.],\n", + " [0., 1., 0., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [1., 0., 0., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 1., 0.],\n", + " [1., 0., 0., 1., 0.],\n", + " [1., 0., 0., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [1., 0., 0., 1., 0.],\n", + " [0., 1., 0., 0., 1.],\n", + " [0., 1., 0., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 1., 0., 0., 1.],\n", + " [0., 0., 1., 0., 1.],\n", + " [0., 0., 1., 0., 1.]])" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "encoder1 = OneHotEncoder(sparse_output=False)\n", + "encoder1.fit(X_train_raw[categorical_columns])\n", + "encoder1.transform(X_train_raw[categorical_columns])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.compose import make_column_transformer" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "transformer = make_column_transformer(\n", + " (OneHotEncoder(sparse_output=False, handle_unknown=\"ignore\"),categorical_columns),\n", + " remainder=\"passthrough\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0.0, 1.0, 0.0, ..., 1, 0, 'Male'],\n", + " [0.0, 1.0, 0.0, ..., 4, 4, 'Male'],\n", + " 
[0.0, 0.0, 1.0, ..., 26, 19, 'Male'],\n", + " ...,\n", + " [0.0, 1.0, 0.0, ..., 3, 3, 'Female'],\n", + " [0.0, 0.0, 1.0, ..., 25, 25, 'Female'],\n", + " [0.0, 0.0, 1.0, ..., 15, 14, 'Male']], dtype=object)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transformer.fit(X_train_raw)\n", + "transformer.transform(X_train_raw)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['onehotencoder__rank_AssocProf', 'onehotencoder__rank_AsstProf',\n", + " 'onehotencoder__rank_Prof', 'onehotencoder__discipline_A',\n", + " 'onehotencoder__discipline_B', 'remainder__yrs.since.phd',\n", + " 'remainder__yrs.service', 'remainder__sex'], dtype=object)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transformer.get_feature_names_out()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import OrdinalEncoder" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "transformer = make_column_transformer(\n", + " (OneHotEncoder(sparse_output=False, handle_unknown=\"ignore\"), categorical_columns),\n", + " (OrdinalEncoder(), [\"sex\"]),\n", + " remainder=\"passthrough\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "X_train_transformed = transformer.fit_transform(X_train_raw)\n", + "X_test_transformed = transformer.transform(X_test_raw)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array(['onehotencoder__rank_AssocProf', 'onehotencoder__rank_AsstProf',\n", + " 'onehotencoder__rank_Prof', 'onehotencoder__discipline_A',\n", + " 'onehotencoder__discipline_B', 
'ordinalencoder__sex',\n", + " 'remainder__yrs.since.phd', 'remainder__yrs.service'], dtype=object),\n", + " 0 1 2 3 4 5 6 7\n", + " 0 1.0 0.0 0.0 1.0 0.0 1.0 19.0 16.0\n", + " 1 0.0 0.0 1.0 1.0 0.0 1.0 35.0 23.0\n", + " 2 0.0 0.0 1.0 0.0 1.0 1.0 17.0 3.0\n", + " 3 0.0 1.0 0.0 0.0 1.0 1.0 3.0 1.0\n", + " 4 1.0 0.0 0.0 0.0 1.0 1.0 8.0 8.0\n", + " 5 0.0 1.0 0.0 0.0 1.0 1.0 5.0 5.0\n", + " 6 1.0 0.0 0.0 0.0 1.0 1.0 12.0 8.0\n", + " 7 0.0 0.0 1.0 1.0 0.0 1.0 56.0 57.0\n", + " 8 0.0 1.0 0.0 1.0 0.0 0.0 3.0 1.0\n", + " 9 0.0 0.0 1.0 0.0 1.0 1.0 37.0 37.0)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transformer.get_feature_names_out(), pd.DataFrame(X_test_transformed).head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Škálování" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Přeškálování není vždy nutné, ale některým modelům to může pomoci. Řiďte se tedy pravidlem, že rozhodně neuškodí.\n", + "Využijeme [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler).\n", + "\n", + "StandardScaler nám hodnoty přeškáluje tak, aby měl každý příznak nulový průměr a jednotkový rozptyl, tedy aby měřítkem zhruba odpovídal standardnímu normálnímu rozdělení. Některé algoritmy to předpokládají. Pokud bychom neškálovali, mohlo by se stát, že příznak (sloupeček), který má výrazně větší rozptyl než ostatní, je brán jako významnější. \n", + "\n", + "Nejprve si ukažme jednoduchý příklad." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, "outputs": [ { "data": { @@ -656,143 +992,258 @@ " \n", " \n", " \n", - " yrs.since.phd\n", - " yrs.service\n", - " sex\n", - " rank_AssocProf\n", - " rank_AsstProf\n", - " rank_Prof\n", - " discipline_A\n", - " discipline_B\n", + " a\n", + " b\n", " \n", " \n", " \n", " \n", - " 165\n", - " 1\n", - " 0\n", - " 0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", + " 0\n", + " 100.888984\n", + " -260.878801\n", " \n", " \n", - " 197\n", - " 4\n", - " 4\n", - " 0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", + " 1\n", + " 100.496064\n", + " -65.508528\n", " \n", " \n", - " 78\n", - " 26\n", - " 19\n", - " 0\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " 1.0\n", + " 2\n", + " 100.055572\n", + " 59.374218\n", + " \n", + " \n", + " 3\n", + " 100.060233\n", + " 121.281507\n", + " \n", + " \n", + " 4\n", + " 98.560538\n", + " 20.311355\n", + " \n", + " \n", + " 5\n", + " 99.616512\n", + " 4.957273\n", + " \n", + " \n", + " 6\n", + " 100.886202\n", + " 161.057846\n", + " \n", + " \n", + " 7\n", + " 99.766297\n", + " 127.399339\n", + " \n", + " \n", + " 8\n", + " 100.044351\n", + " -86.858194\n", + " \n", + " \n", + " 9\n", + " 98.823873\n", + " 11.641098\n", + " \n", + " \n", + " 10\n", + " 100.445507\n", + " 36.016900\n", + " \n", + " \n", + " 11\n", + " 98.538243\n", + " 171.859937\n", + " \n", + " \n", + " 12\n", + " 100.289883\n", + " 86.607052\n", + " \n", + " \n", + " 13\n", + " 99.812775\n", + " -10.357610\n", + " \n", + " \n", + " 14\n", + " 98.256715\n", + " 22.007433\n", + " \n", + " \n", + " 15\n", + " 101.069478\n", + " -188.388349\n", + " \n", + " \n", + " 16\n", + " 99.836135\n", + " 96.862848\n", + " \n", + " \n", + " 17\n", + " 101.860778\n", + " 64.573279\n", + " \n", + " \n", + " 18\n", + " 101.478252\n", + " 121.215643\n", + " \n", + " \n", + " 19\n", + " 99.120532\n", + " -14.404930\n", + " \n", + " \n", + "\n", + "" + ], + 
"text/plain": [ + " a b\n", + "0 100.888984 -260.878801\n", + "1 100.496064 -65.508528\n", + "2 100.055572 59.374218\n", + "3 100.060233 121.281507\n", + "4 98.560538 20.311355\n", + "5 99.616512 4.957273\n", + "6 100.886202 161.057846\n", + "7 99.766297 127.399339\n", + "8 100.044351 -86.858194\n", + "9 98.823873 11.641098\n", + "10 100.445507 36.016900\n", + "11 98.538243 171.859937\n", + "12 100.289883 86.607052\n", + "13 99.812775 -10.357610\n", + "14 98.256715 22.007433\n", + "15 101.069478 -188.388349\n", + "16 99.836135 96.862848\n", + "17 101.860778 64.573279\n", + "18 101.478252 121.215643\n", + "19 99.120532 -14.404930" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.preprocessing import StandardScaler \n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns \n", + "\n", + "# vygeneruje 20 náhodných bodů\n", + "example = pd.DataFrame({\"a\": 100+np.random.randn(100), \"b\": 100*np.random.randn(100)})\n", + "example.head(20)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
ab
count100.000000100.000000
mean99.914380-1.182795
std0.967421107.772840
min97.540807-260.878801
25%99.252158-75.710666
64111111.00.00.00.01.050%99.8464265.370076
16621800.00.01.00.01.075%100.74027366.216316
max101.860778217.284230
\n", "
" ], "text/plain": [ - " yrs.since.phd yrs.service sex rank_AssocProf rank_AsstProf \\\n", - "165 1 0 0 0.0 1.0 \n", - "197 4 4 0 0.0 1.0 \n", - "78 26 19 0 0.0 0.0 \n", - "64 11 11 1 1.0 0.0 \n", - "166 21 8 0 0.0 0.0 \n", - "\n", - " rank_Prof discipline_A discipline_B \n", - "165 0.0 0.0 1.0 \n", - "197 0.0 0.0 1.0 \n", - "78 1.0 0.0 1.0 \n", - "64 0.0 0.0 1.0 \n", - "166 1.0 0.0 1.0 " + " a b\n", + "count 100.000000 100.000000\n", + "mean 99.914380 -1.182795\n", + "std 0.967421 107.772840\n", + "min 97.540807 -260.878801\n", + "25% 99.252158 -75.710666\n", + "50% 99.846426 5.370076\n", + "75% 100.740273 66.216316\n", + "max 101.860778 217.284230" ] }, - "execution_count": 12, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "def transform_data(X_raw):\n", - " # vytvoříme nový dataframe obsahující z původního všechny číselné sloupce\n", - " transformed = X_raw[numerical_columns].copy()\n", - " # přidáme sloupec \"sex\" jednoduchým překódováním původního sloupce \n", - " transformed[\"sex\"] = X_raw[\"sex\"].replace({\"Male\": 0, \"Female\": 1})\n", - " # z OneHotEncoderu dostaneme pouze matici hodnot, připravíme si pro ni v dataframe volné sloupce,\n", - " # t.j. přidáme sloupce ze seznamu column_names\n", - " transformed = transformed.reindex(columns=list(transformed.columns)+list(column_names))\n", - " # do těchto sloupců nasypeme výstup OneHotEncoderu aplikovaného na kategorické sloupce\n", - " transformed[column_names] = encoder.transform(X_raw[categorical_columns]).toarray() \n", - " return transformed\n", - "\n", - "X_train_transformed = transform_data(X_train_raw)\n", - "X_test_transformed = transform_data(X_test_raw)\n", - "\n", - "X_train_transformed.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Škálování" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Přeškálování není vždy nutné, ale některým modelům to může pomoci. 
Řiďte se tedy pravidlem, že rozhodně neuškodí.\n", - "Využijeme [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler).\n", - "\n", - "StandardScaler nám hodnoty přeškáluje, aby zhruba odpovídaly normálnímu rozdělení. Některé algoritmy to předpokládají. Může se pak např. stát, že příznak (sloupeček), která má výrazně větší rozptyl než ostatní, je brán jako významnější. \n", - "\n", - "Nejprve si ukažme jednoduchý příklad." + "example.describe()" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 24, "metadata": {}, "outputs": [ { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAkoAAAG0CAYAAADNUwhtAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy89olMNAAAACXBIWXMAAA9hAAAPYQGoP6dpAABUSElEQVR4nO3deVhUZf8G8HtAdhRkFwFFQiD3cElNUyPRzNxySS3cfRVXcskKtyxSy0hDyUrN1NxyySVNcXtzTdRMBdxQRAEdlF325/eHP87rCEdZBmbh/lzXXDpneeY7h5ln7jnznHMUQggBIiIiIirGQNMFEBEREWkrBiUiIiIiGQxKRERERDIYlIiIiIhkMCgRERERyWBQIiIiIpLBoEREREQkg0GJiIiISAaDEumNwsJC5ObmaroMIiLSIwxKpBfWr18PGxsbWFhYYOrUqZoup0KUSiXmzp2L06dPa7oUIo17/Pgx5s+fj5SUFABAdHQ0wsPDy9RGeno6FixYgMePHwMAjh8/jk2bNlW4tuvXr2Pu3Lm4evVqhdsi7cWgVInq16+PYcOGaboMvZeTk4P//Oc/+Oyzz7Bu3Tps3boVx44d01g9CoUCc+fOLff6kyZNwm+//YamTZuWavkjR45AoVBg69at5X7MksydOxcKhUKtbeobvscrn5mZGW7evImPPvoIhYWFGD58OGrUqFGmNmrWrIlbt25h8uTJSExMRP/+/eHh4aGyjEKhwIQJE0rdZk5ODvr3748bN26gYcOGKvOK3pNHjhwpU53aJC8vD82aNUOPHj1Q1iud3bp1CwqFAl999ZVaa1qzZg0UCgVu3bql1nZfhEGplIr+QGfPni1xfqdOndC4ceMKP87evXsr9CFbHeXm5kIIgZdffhkvv/wybGxskJGRoemyymX37t3YtWsXtm7dCjMzM02XU27Lly/HmjVrNF1GmfA9rr2+/PJL/Prrrxg1ahQyMzMxYsSIMrfx9ddfY+/evXj99dfxwQcfoGXLlhWqKSgoCLVr18ZPP/1UoXa01ZdffolHjx5h7dq1evGFacOGDQgNDS3XugxKlSgmJgY//PBDmdbZu3cv5s2bV0kV6aeaNWti6tSp8PPzQ9OmTWFnZ4c333xT02WVWXp6OsaNG4eVK1fCy8tL0+VUiC4GpfLge7xqODk54ZNPPsHq1auxePFiGBiU/aPLysoKmzZtwtChQyscVB8+fAgnJyds374dxsbGFWpLG0VFRWHRokXYtGkTbG1tNV2OWlQkKJVt/yWViYmJiaZLKLPMzExYWFhouowy++yzzxAQEIC0tDQ0b968XB2
pptWsWRN37tzRdBlUBnyPV50ZM2ZgxowZAJ787FWebd++fXu0b9++wrXY2NggODi4wu1oq6tXr+Lnn39G27ZtNV2KVtC9TxMd8uz4hby8PMybNw+enp4wNTWFra0tXnvtNRw4cAAAMGzYMISFhQF48nt50a1IZmYmPvzwQ7i6usLExAReXl746quviv1+/PjxY0yaNAl2dnaoWbMm3nnnHdy9e7fY2JmiMShXrlzB4MGDUbt2bbz22msAgIsXL2LYsGFo0KABTE1N4eTkhBEjRiA5OVnlsYrauHr1KoYOHQorKyvY29sjODgYQgjcuXMHvXr1Qq1ateDk5ISvv/66VNuuaLzA+vXr4eXlBVNTU/j6+hYbezRs2DDUr18fAPDSSy/hlVdegYGBQbHxNY0bN0bnzp2LPU5hYSHq1q2Ld999t8zbOScnB1OnToW9vb20nePj44s9RlEt169fx7Bhw2BtbQ0rKysMHz4cWVlZKsuWNOYlJSUFU6dORf369WFiYgIXFxd88MEHUCqVxZ7L559/DhcXF5iamuKNN97A9evX5TfyU/766y+0atUKpqam8PDwwPfff1/icqtXr0aXLl3g4OAAExMTvPzyy1ixYkWx53D58mUcPXpUeg136tSpVHXoGr7HK/89XtJzePXVV6X569atg6+vL8zMzGBjY4NBgwYV+8Jx7do19OvXD05OTjA1NYWLiwsGDRqE1NTU59a4YMECGBgYYNmyZQCA27dvY/z48fDy8oKZmRlsbW3Rv3//Uo+Z2bJli1SrnZ0dhg4dirt370rzv/rqKygUCty+fbvYurNmzYKxsTEePXoEAPjvf/+L/v37w83NDSYmJnB1dcXUqVOlAetFhg0bBktLS9y9exe9e/eGpaUl7O3tMW3aNBQUFAAAhBCoX78+evXqBQDo1asX+vbtCwDIzs6GlZUVxo4dK7WZnZ2NuXPnomHDhjA1NUWdOnXQt29f3Lhxo1jdK1euhIeHB0xMTNCqVSv8/fffpdpWly9fRpcuXWBmZgYXFxcsWLAAhYWFxZbbuXMnevToAWdnZ5iYmMDDwwOfffaZ9NyAJz+b79mzB7dv35bec0WfG6XBPUpllJqaWuwDCnjSQb7I3LlzERISglGjRqF169ZIS0vD2bNnce7cObz55psYO3Ys7t27hwMHDuCXX35RWVcIgXfeeQeHDx/GyJEj0bx5c+zfvx/Tp0/H3bt38c0330jLDhs2DJs3b8b777+PV199FUePHkWPHj1k6+rfvz88PT3xxRdfSB3ygQMHcPPmTQwfPhxOTk64fPkyVq5cicuXL+PUqVPFfrMeOHAgfHx88OWXX2LPnj1YsGABbGxs8P3336NLly5YuHAh1q9fj2nTpqFVq1bo2LHjC7fX0aNHsWnTJkyaNAkmJiZYvnw5unXrhjNnzpR5rMjAgQMxd+5cJCYmwsnJSZr+119/4d69exg0aFCZt/OoUaOwbt06DB48GO3atcOhQ4eeu50HDBgAd3d3hISE4Ny5c/jxxx/h4OCAhQsXyq6TkZGBDh06ICoqCiNGjMArr7wCpVKJ33//HfHx8bCzs5OW/fLLL2FgYIBp06YhNTUVixYtwpAhQ1549Ny///6Lrl27wt7eHnPnzkV+fj7mzJkDR0fHYsuuWLECjRo1wjvvvIMaNWpg165dGD9+PAoLCxEYGAgACA0NxcSJE2FpaYlPPvkEAEpsS1vxPa6d7/GSnsPnn3+O4OBgDBgwAKNGjcKDBw+wbNkydOzYEefPn4e1tTVyc3Ph7++PnJwcTJw4EU5OTrh79y52796NlJQUWFlZlVjbp59+ii+++ALff/89Ro8eDQD4+++/cfz4cQwaNAguLi6IjY3F8uXL0alTJ1y5cgXm5uayz3XNmjUYPnw4WrVqhZCQECQlJeHbb7/F8ePHpVoHDBiAGTNmYPPmzZg+fbrK+ps3b0bXrl1Ru3ZtAE9CV1ZWFsaNGwdbW1ucOXMGy5YtQ3x
8PLZs2aKybkFBAfz9/dGmTRt89dVXOHjwIL7++mt4eHhg3LhxUCgUGDp0KBYtWoSHDx/CxsZGWnfXrl1IS0vD0KFDpbbefvttREREYNCgQZg8eTLS09Nx4MABXLp0SWWQ/IYNG5Ceno6xY8dCoVBg0aJF6Nu3L27evAkjIyPZbZWYmIjOnTsjPz8fH330ESwsLLBy5coSx22uWbMGlpaWCAoKgqWlJQ4dOoTZs2cjLS0NixcvBgB88sknSE1NRXx8vPQ+srS0lH38YgSVyurVqwWA594aNWqksk69evVEQECAdL9Zs2aiR48ez32cwMBAUdKfZceOHQKAWLBggcr0d999VygUCnH9+nUhhBCRkZECgJgyZYrKcsOGDRMAxJw5c6Rpc+bMEQDEe++9V+zxsrKyik379ddfBQBx7NixYm2MGTNGmpafny9cXFyEQqEQX375pTT90aNHwszMTGWbyCnapmfPnpWm3b59W5iamoo+ffpI0wICAkS9evWKrV9UV5GYmBgBQCxbtkxlufHjxwtLS0vp+ZZ2O1+4cEEAEOPHj1dZbvDgwbLbecSIESrL9unTR9ja2qpMe/Y1M3v2bAFAbNu2rdhzLCwsFEIIcfjwYQFA+Pj4iJycHGn+t99+KwCIf//9t9i6T+vdu7cwNTUVt2/flqZduXJFGBoaFnstlvS68Pf3Fw0aNFCZ1qhRI/H6668/93G1Dd/j2vkel3sOt27dEoaGhuLzzz9Xmf7vv/+KGjVqSNPPnz8vAIgtW7a8sJ7AwEAhhBAffvihMDAwEGvWrFFZJjMzs9h6f/31lwAg1q5dK00rek8ePnxYCCFEbm6ucHBwEI0bNxaPHz+Wltu9e7cAIGbPni1Na9u2rfD19VV5jDNnzhR7jJL+fiEhIUKhUKi8lwMCAgQAMX/+fJVlW7RoofI4RX3kihUrVJZ75513RP369aX+ZtWqVQKAWLJkSbHHL1omNjZWABC2trbi4cOH0vydO3cKAGLXrl3F1n3alClTBABx+vRpadr9+/eFlZWVACBiY2Ofux3Gjh0rzM3NRXZ2tjStR48eJX5WlAZ/eiujsLAwHDhwoNitNIdyW1tb4/Lly7h27VqZH3fv3r0wNDTEpEmTVKZ/+OGHEELgjz/+AADs27cPADB+/HiV5SZOnCjb9n/+859i055O7tnZ2VAqldLu7nPnzhVbftSoUdL/DQ0N0bJlSwghMHLkSGm6tbU1vLy8cPPmTdlanta2bVv4+vpK993c3NCrVy/s379fZbdqaTRs2BDNmzdXOXdKQUEBtm7dip49e0rPt7Tbee/evQBQbLkpU6bI1vDsdu7QoQOSk5ORlpYmu85vv/2GZs2aoU+fPsXmPfuNf/jw4SoDSzt06AAAz93eBQUF2L9/P3r37g03Nzdpuo+PD/z9/Yst//TromjPy+uvv46bN2++8GcMXcH3uHa+x599Dtu2bUNhYSEGDBgApVIp3ZycnODp6YnDhw8DgLTHaP/+/cV+6n6WEAITJkzAt99+i3Xr1iEgIEBl/rN7jHJycuDr64vatWuXuM2KnD17Fvfv38f48eNhamoqTe/Rowe8vb2xZ88eadrAgQMRGRmp8jPWpk2bYGJiIv00Bqj+/TIzM6FUKtGuXTsIIXD+/PliNZTU/zz9d2rYsCHatGmD9evXS9MePnyIP/74A0OGDJH6m99++w12dnYlvt5K2gtZtAes6DGB5/dJwJP3wquvvorWrVtL0+zt7TFkyJBiyz69HdLT06FUKtGhQwdkZWUhOjr6uY9TWgxKZdS6dWv4+fkVuz39YpBTdNK0hg0bokmTJpg+fTouXrxYqse9ffs2nJ2dUbNmTZXpPj4+0vyifw0MDODu7q6y3EsvvSTb9rPLAk/eIJMnT4ajoyPMzMxgb28vLVfSB+LTH7LAk87J1NRU5aehoulFv7G/iKenZ7FpDRs2RFZWFh48eFCqNp42cOBAHD9+XBoTcOTIEdy/fx8DBw6
Ulinrdn72XCzPO1rt2W1U9Jp53va4ceNGqX9mLE/7Dx48wOPHj0vc1iU9l+PHj8PPzw8WFhawtraGvb09Pv74YwAlvy50Ed/j2vkef/Y5XLt2DUIIeHp6wt7eXuUWFRWF+/fvS+sFBQXhxx9/hJ2dHfz9/REWFlbic1y7di3CwsKwbNkyvPfee8Xm5+TkICQkBN7e3jAzM4OpqSnMzMzw6NGj577+i/52Jb2nvL29VcYk9e/fHwYGBtKXOiEEtmzZgu7du6NWrVrScnFxcRg2bBhsbGykcUevv/46gOJ/P1NTU9jb26tMq127drG/0wcffIDjx49L9WzZsgV5eXl4//33pWVu3LgBLy+vUp3Lqjx9EvBke5W2T7p8+TL69OkDKysr1KpVC/b29tLPhOrqkxiUqlDHjh1x48YNrFq1Co0bN8aPP/6IV155BT/++KNG6yrpd98BAwbghx9+wH/+8x9s27YNf/75p/RNtqQBdYaGhqWaBqDMJy97Hrnze5S0x2ngwIFSpwM8+c3fysoK3bp1U1s9z1PZ26Oy279x4wbeeOMNKJVKLFmyBHv27MGBAwekM6GX9Lqobvgef0Kd7/Eizz6HwsJCKBQK7Nu3r8Q9gE8fkPD111/j4sWL+Pjjj6WB8I0aNSp28EX79u3h6OiI7777Dg8fPixWw+TJkzF79my8++672L59O06cOIGTJ0/Czs5Oba9/Z2dndOjQAZs3bwYAnDp1CnFxcSpf6AoKCvDmm29iz549mDlzJnbs2IEDBw5Ip+R4tha5v9OzBg0aBCMjI2mv0rp169CyZctyn66ksl8fKSkpeP311/HPP/9g/vz52LVrFw4cOCCN+1TX34SDuauYjY0Nhg8fjuHDhyMjIwMdO3bE3Llzpd3ach/89erVw8GDB5Genq7yjbNo12K9evWkfwsLCxEbG6uSyEt79BPwJO1HRERg3rx5mD17tjS9PD8nVERJj3f16lWYm5tL345q164tXdrgaSUdNeLu7o7WrVtj06ZNmDBhArZt24bevXurHGZc1u1c9O2qSExMTPmerAwPDw9cunRJrW0+zd7eHmZmZiVu62efy65du5CTk4Pff/9d5Zti0U8cT9OHE9SVF9/jpVea97gcDw8PCCHg7u5e7MzYJWnSpAmaNGmCTz/9FCdOnED79u0RHh6OBQsWSMu89NJLWLRoETp16oRu3bohIiJC5W+xadMmDBs2TGWdx48flxiqnlb0t4uJiUGXLl1U5sXExEjziwwcOBDjx49HTEwMNm3aBHNzc/Ts2VOa/++//0qH8H/wwQfS9KKjK8vLxsYGPXr0wPr16zFkyBAcP3682LmHPDw8cPr0aeTl5T13QHZF1KtXr1R90pEjR5CcnIxt27apHDwQGxtbbN2K9Enco1SFnj3s1tLSEi+99BJycnKkaUXnN3n2w/+tt95CQUEBvvvuO5Xp33zzDRQKBbp37w4A0riS5cuXqyxXdHhraRR9C3g29Zf3ZF3ldfLkSZXf/e/cuYOdO3eia9euUo0eHh5ITU1V+XkjISEB27dvL7HNgQMH4tSpU1i1ahWUSqXKtzSg9Nu56N+lS5eqLKfubdSvXz/8888/JT4fdXwrMzQ0hL+/P3bs2IG4uDhpelRUFPbv319s2WcfNzU1FatXry7WroWFRYkBVt/xPV42pXmPy+nbty8MDQ0xb968Ys9DCCH9LdLS0pCfn68yv0mTJjAwMFD5uxRp2rQp9u7di6ioKPTs2VPlcHuFQlHs6MfQ0NAX7rlo2bIlHBwcEB4ervKYf/zxB6KiooodsdivXz8YGhri119/xZYtW/D222+rnPuqpL+fEALffvvtc+sojffffx9XrlzB9OnTYWhoKB0R/HRtSqWy2Ov02Xoq4q233sKpU6dw5swZadqDBw9Uxk8BJW+H3NzcYu8N4Mn7rrw/xXGPUhV6+eWX0alTJ/j6+sLGxgZnz57F1q1bVa4vVDSwcdKkSfD395deqD1
79kTnzp3xySef4NatW2jWrBn+/PNP7Ny5E1OmTJHGyvj6+qJfv34IDQ1FcnKydOhw0UUbS5Oqa9WqhY4dO2LRokXIy8tD3bp18eeff5aY0itT48aN4e/vr3LoMACVsxoPGjQIM2fORJ8+fTBp0iRkZWVhxYoVaNiwYYmDKwcMGIBp06Zh2rRpsLGxgZ+fn8r80m7n5s2b47333sPy5cuRmpqKdu3aISIiokzf6ktj+vTp2Lp1K/r3748RI0bA19cXDx8+xO+//47w8HA0a9aswo8xb9487Nu3Dx06dMD48eORn5+PZcuWoVGjRioBtGvXrjA2NkbPnj0xduxYZGRk4IcffoCDgwMSEhJU2vT19cWKFSuwYMECvPTSS3BwcCj2TVof8T1eNqV5j8vx8PDAggULMGvWLNy6dQu9e/dGzZo1ERsbi+3bt2PMmDGYNm0aDh06hAkTJqB///5o2LAh8vPz8csvv8DQ0BD9+vUrse1XX30VO3fuxFtvvYV3330XO3bsgJGREXr06IF169bB2toaPj4+OHHiBA4fPlxsnNazjIyMsHDhQgwfPhyvv/463nvvPen0APXr1y92IW8HBwd07twZS5YsQXp6erEvdN7e3vDw8MC0adNw9+5d1KpVC7/99lupx4Y9T48ePWBrayuNi3JwcFCZ/8EHH2Dt2rUICgrCmTNn0KFDB2RmZuLgwYMYP368yoDz8poxYwZ++eUXdOvWDZMnT5ZOD1CvXj2VPqldu3aoXbs2AgICMGnSJCgUCvzyyy8lBjZfX19s2rQJQUFBaNWqFSwtLVX20j1XuY6Vq4aKDh3++++/S5z/+uuvv/DQ4QULFojWrVsLa2trYWZmJry9vcXnn38ucnNzpWXy8/PFxIkThb29vVAoFCqHEaenp4upU6cKZ2dnYWRkJDw9PcXixYulQzKLZGZmisDAQGFjYyMsLS1F7969pUM/nz6Ut+iw2wcPHhR7PvHx8aJPnz7C2tpaWFlZif79+4t79+7JHn78bBsBAQHCwsKiVNupJPj/Q3XXrVsnPD09hYmJiWjRooV0uO3T/vzzT9G4cWNhbGwsvLy8xLp164qdHuBp7du3FwDEqFGjSpxf2u38+PFjMWnSJGFrayssLCxEz549xZ07d0q9jYpeU08f6vrsa0YIIZKTk8WECRNE3bp1hbGxsXBxcREBAQFCqVQKIf53KPKzhz8XHaK7evXqEp/n044ePSp8fX2FsbGxaNCggQgPDy9xG/7++++iadOmwtTUVNSvX18sXLhQOlz46eeRmJgoevToIWrWrCkA6MSpAvge1873+POegxBC/Pbbb+K1114TFhYWwsLCQnh7e4vAwEARExMjhBDi5s2bYsSIEcLDw0OYmpoKGxsb0blzZ3Hw4MES63nazp07RY0aNcTAgQNFQUGBePjwoQgICBB2dnbC0tJSvPXWW+Lq1avFXgfPnh6gyKZNm0SLFi2EiYmJsLGxEUOGDBHx8fElPq8ffvhBABA1a9ZUOaVAkStXrgg/Pz9haWkp7OzsxOjRo8U///xT7D0v93d6Xh85fvx4AUBs2LChxPlZWVnik08+Ee7u7sLIyEg4OTmJd999V9y4cUMI8b++Z/HixcXWffb1JefixYvi9ddfF6ampqJu3bris88+Ez/99FOxvub48ePi1VdfFWZmZsLZ2VnMmDFD7N+/v9j2z8jIEIMHDxbW1tYCQJlOFaD4/8JJz124cAEtWrTAunXrSjzEUtsoFAoEBgaWuHuXiIrje5zUZerUqfjpp5+QmJj43JNoVhcco6SHnj2FPfDkd3QDA4NSnS2XiLQb3+NUWbKzs7Fu3Tr069ePIen/cYySHlq0aBEiIyPRuXNn1KhRA3/88Qf++OMPjBkzBq6urpouj4gqiO9xUrf79+/j4MGD2Lp1K5KTkzF58mRNl6Q1GJT0ULt27XDgwAF89tlnyMjIgJubG+bOnStdd4uIdBvf46RuV65cwZAhQ+Dg4IC
lS5eiefPmmi5Ja3CMEhEREZEMjlEiIiIiksGgRERERCSDY5Tw5How9+7dQ82aNav1pReINEUIgfT0dDg7O8PAoOLf30JCQrBt2zZER0fDzMwM7dq1w8KFC1UuN9OpUyccPXpUZb2xY8ciPDy8VI/BfoNI89Tdd5SEY5QAxMfH80gRIi1w584duLi4VLidbt26YdCgQWjVqhXy8/Px8ccf49KlS7hy5Yp0KYhOnTqhYcOGmD9/vrSeubm5yhXan4f9BpH2UFffURLuUQKkix7euXOn1J0kEalPWloaXF1dVS5AWhH79u1Tub9mzRo4ODggMjJS5TxD5ubmcHJyKtdjsN8g0jx19x0lYVDC/66NVKtWLXZ4RBpUWT9hFV0M08bGRmX6+vXrsW7dOjg5OaFnz54IDg6WPcleTk6OygVN09PTAbDfINIGlfnzN4MSEem1wsJCTJkyBe3bt0fjxo2l6YMHD0a9evXg7OyMixcvYubMmYiJicG2bdtKbCckJKRUF2slIv3CMUp4suvOysoKqamp/GZIpAGV+R4cN24c/vjjD/z111/PHcNw6NAhvPHGG7h+/To8PDyKzX92j1LRLn/2G0SaUxWf39yjRER6a8KECdi9ezeOHTv2woGebdq0AQDZoGRiYgITE5NKqZOItBeDUikVFBQgLy9P02VUCkNDQ9SoUYOHOJPeEEJg4sSJ2L59O44cOQJ3d/cXrnPhwgUAQJ06dSq5OiL1EUIgPz8fBQUFmi6lUmjD5xODUilkZGQgPj4e+vwrpbm5OerUqQNjY2NNl0JUYYGBgdiwYQN27tyJmjVrIjExEQBgZWUFMzMz3LhxAxs2bMBbb70FW1tbXLx4EVOnTkXHjh3RtGlTDVdPVDq5ublISEhAVlaWpkupVJr+fGJQeoGCggLEx8fD3Nwc9vb2erfXRQiB3NxcPHjwALGxsfD09Ky0k3YRVZUVK1YAeHKupKetXr0aw4YNg7GxMQ4ePIjQ0FBkZmbC1dUV/fr1w6effqqBaonKrrCwELGxsTA0NISzszOMjY35+VRJGJReIC8vD0II2Nvbw8zMTNPlVAozMzMYGRnh9u3byM3NhampqaZLIqqQF+39dXV1LXZWbiJdkpubi8LCQri6usqe0kIfaMPnE3cdlJK+JfVncS8SEZHuqQ59t6afo/5vYSIiIqJy4k9v5RQXFwelUlllj2dnZwc3N7cqezwiItI9/GxSPwalcoiLi4O3jw8eV+GRBmbm5oiOitL7FyQREZUPP5sqh0aD0rFjx7B48WJERkYiISEB27dvR+/evaX5QgjMmTMHP/zwA1JSUtC+fXusWLECnp6e0jIPHz7ExIkTsWvXLhgYGKBfv3749ttvYWlpWWl1K5VKPM7KwpCZi+HoVvzEdOqWFHcD6xdOh1Kp1OsXIxERlZ8ufTaFhYVh8eLFSExMRLNmzbBs2TK0bt26kiqtGI0GpczMTDRr1gwjRoxA3759i81ftGgRli5dip9//hnu7u4IDg6Gv78/rly5Io18HzJkCBISEnDgwAHk5eVh+PDhGDNmDDZs2FDp9Tu6ecDFs1GlPw4REVFpaftn06ZNmxAUFITw8HC0adMGoaGh8Pf3R0xMDBwcHDRdXjEaDUrdu3dH9+7dS5wnhEBoaCg+/fRT9OrVCwCwdu1aODo6YseOHRg0aBCioqKwb98+/P3332jZsiUAYNmyZXjrrbfw1VdfwdnZucS2S7pmkz558OABmjRpgkmTJuHjjz8GAJw4cQKdOnXCH3/8gTfeeEPDFdKz4uPjkZycXOb1bG1tX3hpDiKSp+4xPdVhzE5FLVmyBKNHj8bw4cMBAOHh4dizZw9WrVqFjz76SMPVFae1Y5RiY2ORmJgIPz8/aZqVlRXatGmDkydPYtCgQTh58iSsra2lkAQAfn5+MDAwwOnTp9GnT58S29b3q4Db29tj1apV6N27N7p27QovLy+8//77mDBhAkOSFoqPj4eXtzeyMjP
LvK65hQVioqMZlojKIS4uDj4+3sjKeqy2Ns3NzRAVFc2wJCM3NxeRkZGYNWuWNM3AwAB+fn44efKkBiuTp7VBqeiSA46OjirTHR0dpXmJiYnFdtPVqFEDNjY20jIlmTVrFoKCgqT7RVcB1ydvvfUWRo8ejSFDhqBly5awsLBASEiIpsuiEiQnJyMrMxMffPIN7F0alHq9B/E3sfbzqUhOTmZQIioHpVKJrKzHWPfxAPi42Ve4vai4Bxj6xWaOJ30OpVKJgoKCEj/bo6OjNVTV82ltUKpM1eUq4F999RUaN26MLVu2IDIyslo8Z11m79IAdT28NV0GUbXj42aPVxrW1XQZpKW09oSTTk5OAICkpCSV6UlJSdI8Jycn3L9/X2V+fn4+Hj58KC1Tnd24cQP37t1DYWEhbt26pelyiIiomrOzs4OhoeFzP9u1jdYGJXd3dzg5OSEiIkKalpaWhtOnT6Nt27YAgLZt2yIlJQWRkZHSMocOHUJhYSHatGlT5TVrk9zcXAwdOhQDBw7EZ599hlGjRhULlURERFXJ2NgYvr6+Kp/thYWFiIiIkD7btY1Gf3rLyMjA9evXpfuxsbG4cOECbGxs4ObmhilTpmDBggXw9PSUTg/g7OwsnWvJx8cH3bp1w+jRoxEeHo68vDxMmDABgwYNkj3iTZ2S4m5U+mOU93E++eQTpKamYunSpbC0tMTevXsxYsQI7N69uxIqJCIibaHNn00AEBQUhICAALRs2RKtW7dGaGgoMjMzpaPgtI1Gg9LZs2fRuXNn6X7RAOuAgACsWbMGM2bMQGZmJsaMGYOUlBS89tpr2Ldvn8rVg9evXy8dzVV0wsmlS5dWat12dnYwMzfH+oXTK/VxnmZmbg47O7tSLXvkyBGEhobi8OHDqFWrFgDgl19+QbNmzbBixQqMGzeuMkslIiIN0PbPpiIDBw7EgwcPMHv2bCQmJqJ58+bYt29fsQHe2kKjQalTp04QQsjOVygUmD9/PubPny+7jI2NTZWcXPJpbm5uiI6K0trr6XTq1Al5eXkq0+rXr4/U1NTKKI2IiLSAtn82PW3ChAmYMGFCJVSkftXyqDd1cHNz4+GfRESkVfjZpH5aO5ibiIiISNMYlIiIiIhkMCgRERERyWBQIiIiIpLBoEREREQkg0GJiIiISAaDEhEREZEMnkepnOLi4nTipF5ERFR98LNJ/RiUyiEuLg4+Pt7IynpcZY9pbm6GqKhovX9BEhFR+fCzqXIwKJWDUqlEVtZjrPt4AHzc7Cv98aLiHmDoF5uhVCr1+sVIRETlpyufTceOHcPixYsRGRmJhIQEbN++XbrYvTZiUKoAHzd7vNKwrqbLICIikmj7Z1NmZiaaNWuGESNGoG/fvpou54U4mFsPrV27Fra2tsjJyVGZ3rt3b7z//vsaqoqIiAjo3r07FixYgD59+mi6lFJhUNJD/fv3R0FBAX7//Xdp2v3797Fnzx6MGDFCg5URERHpFgYlPWRmZobBgwdj9erV0rR169bBzc0NnTp10lxhREREOoZBSU+NHj0af/75J+7evQsAWLNmDYYNGwaFQqHhyoiIiHQHB3PrqRYtWqBZs2ZYu3YtunbtisuXL2PPnj2aLouIiEinMCjpsVGjRiE0NBR3796Fn58fXF1dNV0SERGRTmFQqoCouAda/TiDBw/GtGnT8MMPP2Dt2rVqroqIiLSRtn82ZWRk4Pr169L92NhYXLhwATY2Nlp5rkAGpXKws7ODubkZhn6xucoe09zcDHZ2dmVax8rKCv369cOePXu0+mReRERUcbry2XT27Fl07txZuh8UFAQACAgIwJo1a9RZnlowKJWDm5sboqKideJ6Onfv3sWQIUNgYmJSCVUREZG20JXPpk6dOkEIUUkVqR+DUjm5ublp5S7CIo8ePcKRI0dw5MgRLF++XNPlEBFRFdD2zyZdxKCkp1q0aIFHjx5h4cKF8PLy0nQ5REREOolBSU/dunVL0yUQERH
pPJ5wkoiIiEgGg1Ip6dLAs/LQ9+dHRKSPqkPfrennyKD0AoaGhgCA3NxcDVdSubKysgAARkZGGq6EiIhepKivLuq79ZmmP584RukFatSoAXNzczx48ABGRkYwMNCvbCmEQFZWFu7fvw9ra2spGBIRkfYyNDSEtbU17t+/DwAwNzfXu2t5asvnE4PSCygUCtSpUwexsbG4ffu2psupNNbW1nByctJ0GUREVEpFfXZRWNJXmv58YlAqBWNjY3h6eurtz29GRkbck0REpGOKvsg7ODggLy9P0+VUCm34fGJQKiUDAwOYmppqugwiIiIVhoaGGg8T+ky/BtwQERERqRGDEhEREZEMBiUiIiIiGQxKRERERDIYlIiIiIhkMCgRERERyWBQIiIiIpLBoEREREQkg0GJiIiISAaDEhEREZEMBiUiIiIiGQxKRERERDIYlIiIiIhkMCgRERERyWBQIiIiIpLBoEREREQkg0GJiPROSEgIWrVqhZo1a8LBwQG9e/dGTEyMyjLZ2dkIDAyEra0tLC0t0a9fPyQlJWmoYiLSVgxKRKR3jh49isDAQJw6dQoHDhxAXl4eunbtiszMTGmZqVOnYteuXdiyZQuOHj2Ke/fuoW/fvhqsmoi0UQ1NF0BEpG779u1Tub9mzRo4ODggMjISHTt2RGpqKn766Sds2LABXbp0AQCsXr0aPj4+OHXqFF599VVNlE1EWoh7lIhI76WmpgIAbGxsAACRkZHIy8uDn5+ftIy3tzfc3Nxw8uTJEtvIyclBWlqayo2I9B+DEhHptcLCQkyZMgXt27dH48aNAQCJiYkwNjaGtbW1yrKOjo5ITEwssZ2QkBBYWVlJN1dX18ounYi0AIMSEem1wMBAXLp0CRs3bqxQO7NmzUJqaqp0u3PnjpoqJCJtxjFKRKS3JkyYgN27d+PYsWNwcXGRpjs5OSE3NxcpKSkqe5WSkpLg5ORUYlsmJiYwMTGp7JKJSMtwjxIR6R0hBCZMmIDt27fj0KFDcHd3V5nv6+sLIyMjRERESNNiYmIQFxeHtm3bVnW5RKTFuEeJiPROYGAgNmzYgJ07d6JmzZrSuCMrKyuYmZnBysoKI0eORFBQEGxsbFCrVi1MnDgRbdu25RFvRKSCQYmI9M6KFSsAAJ06dVKZvnr1agwbNgwA8M0338DAwAD9+vVDTk4O/P39sXz58iqulIi0HYMSEekdIcQLlzE1NUVYWBjCwsKqoCIi0lUco0REREQkQ6uDUkFBAYKDg+Hu7g4zMzN4eHjgs88+U/m2KITA7NmzUadOHZiZmcHPzw/Xrl3TYNVERESkL7Q6KC1cuBArVqzAd999h6ioKCxcuBCLFi3CsmXLpGUWLVqEpUuXIjw8HKdPn4aFhQX8/f2RnZ2twcqJiIhIH2j1GKUTJ06gV69e6NGjBwCgfv36+PXXX3HmzBkAT/YmhYaG4tNPP0WvXr0AAGvXroWjoyN27NiBQYMGaax2IiIi0n1avUepXbt2iIiIwNWrVwEA//zzD/766y90794dABAbG4vExESV6zVZWVmhTZs2stdrAnjNJiIiIiodrd6j9NFHHyEtLQ3e3t4wNDREQUEBPv/8cwwZMgQApHOjODo6qqz3vOs1AU+u2TRv3rzKK5yIiIj0glbvUdq8eTPWr1+PDRs24Ny5c/j555/x1Vdf4eeff65Qu7xmExEREZWGVu9Rmj59Oj766CNprFGTJk1w+/ZthISEICAgQLomU1JSEurUqSOtl5SUhObNm8u2y2s2ERHpnri4OCiVSrW1FxUVpba2SH9pdVDKysqCgYHqTi9DQ0MUFhYCANzd3eHk5ISIiAgpGKWlpeH06dMYN25cVZdLRESVJC4uDt4+PniclaX2thMepqu9TdIfWh2Uevbsic8//xxubm5o1KgRzp8/jyVLlmDEiBEAAIVCgSlTpmDBggXw9PSEu7s7goOD4ezsjN69e2u2eCIiUhulUonHWVkYMnMxHN081NJ
mzMVI7Pn+c6Rk8HQyJE+rg9KyZcsQHByM8ePH4/79+3B2dsbYsWMxe/ZsaZkZM2YgMzMTY8aMQUpKCl577TXs27cPpqamGqyciIgqg6ObB1w8G6mlLXX+jEf6S6uDUs2aNREaGorQ0FDZZRQKBebPn4/58+dXXWFERERULWj1UW9EREREmsSgRERERCSDQYmIiIhIBoMSERERkQwGJSIiIiIZDEpEREREMhiUiIiIiGQwKBERERHJYFAiIiIiksGgRERERCSDQYmIiIhIBoMSERERkQwGJSIiIiIZDEpEREREMhiUiIiIiGQwKBERERHJYFAiIiIiksGgRERERCSDQYmIiIhIBoMSERERkQwGJSIiIiIZDEpEREREMhiUiIiIiGQwKBERERHJYFAiIiIiksGgRERERCSDQYmIiIhIBoMSERERkQwGJSIiIiIZDEpEREREMhiUiIiIiGQwKBERERHJYFAiIiIiksGgRERERCSDQYmIiIhIBoMSERERkQwGJSIiIiIZDEpEREREMhiUiIiIiGQwKBERERHJYFAiIiIiksGgRERERCSDQYmIiIhIBoMSERERkQwGJSIiIiIZDEpEREREMhiUiIiIiGQwKBERERHJYFAiIiIiksGgRERERCSDQYmIiIhIBoMSEemdY8eOoWfPnnB2doZCocCOHTtU5g8bNgwKhULl1q1bN80US0RajUGJiPROZmYmmjVrhrCwMNllunXrhoSEBOn266+/VmGFRKQrami6ACIidevevTu6d+/+3GVMTEzg5ORU6jZzcnKQk5Mj3U9LSyt3faRdYhMf4dzVuxVuJyrugRqqIW3DoERE1dKRI0fg4OCA2rVro0uXLliwYAFsbW1llw8JCcG8efOqsEKqbBkpyQCA4FUHELzqgNraTUhIUFtbpHkMSkRU7XTr1g19+/aFu7s7bty4gY8//hjdu3fHyZMnYWhoWOI6s2bNQlBQkHQ/LS0Nrq6uVVUyVYKcrAwAQMee/dGyqU+F24u5dhN7Nq9DSkpKhdsi7cGgRETVzqBBg6T/N2nSBE2bNoWHhweOHDmCN954o8R1TExMYGJiUlUlUhWysrWHS716FW5HmZqhhmpI23AwNxFVew0aNICdnR2uX7+u6VKISMswKBFRtRcfH4/k5GTUqVNH06UQkZbR+qB09+5dDB06FLa2tjAzM0OTJk1w9uxZab4QArNnz0adOnVgZmYGPz8/XLt2TYMVE5GmZWRk4MKFC7hw4QIAIDY2FhcuXEBcXBwyMjIwffp0nDp1Crdu3UJERAR69eqFl156Cf7+/potnIi0jlYHpUePHqF9+/YwMjLCH3/8gStXruDrr79G7dq1pWUWLVqEpUuXIjw8HKdPn4aFhQX8/f2RnZ2twcqJSJPOnj2LFi1aoEWLFgCAoKAgtGjRArNnz4ahoSEuXryId955Bw0bNsTIkSPh6+uL//73vxyDRETFaPVg7oULF8LV1RWrV6+Wprm7u0v/F0IgNDQUn376KXr16gUAWLt2LRwdHbFjxw6VAZtEVH106tQJQgjZ+fv376/CaohIl2n1HqXff/8dLVu2RP/+/eHg4IAWLVrghx9+kObHxsYiMTERfn5+0jQrKyu0adMGJ0+elG03JycHaWlpKjciIiKiZ2l1ULp58yZWrFgBT09P7N+/H+PGjcOkSZPw888/AwASExMBAI6OjirrOTo6SvNKEhISAisrK+nGc6EQERFRSbQ6KBUWFuKVV17BF198gRYtWmDMmDEYPXo0wsPDK9TurFmzkJqaKt3u3LmjpoqJiIhIn2h1UKpTpw5efvlllWk+Pj6Ii4sDAOk6TUlJSSrLJCUlPfcaTiYmJqhVq5bKjYiIiOhZWh2U2rdvj5iYGJVpV69eRb3/P4Oqu7s7nJycEBERIc1PS0vD6dOn0bZt2yqtlYiIiPSPVh/1NnXqVLRr1w5ffPEFBgwYgDNnzmDlypVYuXIlAEChUGDKlClYsGABPD0
94e7ujuDgYDg7O6N3796aLZ6IiIh0nlYHpVatWmH79u2YNWsW5s+fD3d3d4SGhmLIkCHSMjNmzEBmZibGjBmDlJQUvPbaa9i3bx9MTU01WDkREVVXsbGxOHfunNras7Ozg5ubm9rao7LR6qAEAG+//Tbefvtt2fkKhQLz58/H/Pnzq7AqIiIiVRnpT041ExwcjODgYLW1a2ZujuioKIYlDdH6oERERKQLch4/BgB0HDwJLdt3UkubSXE3sH7hdCiVSgYlDWFQIiIiUiMrRxe4eDbSdBmkJuU66q1BgwZITk4uNj0lJQUNGjSocFFEVD01bdqUfQsRaZVyBaVbt26hoKCg2PScnBzcvXu3wkURUfUUFxfHvoWItEqZfnr7/fffpf/v378fVlZW0v2CggJERESgfv36aiuOiKqHvXv3Sv9n30JE2qRMQano3EQKhQIBAQEq84yMjFC/fn18/fXXaiuOiKqHwYMHA2DfQkTap0xBqbCwEMCTM2L//fffsLOzq5SiiKh6SUlJgZWVFVxcXBAZGcm+hYi0RrmOeouNjVV3HURE+Pfff3ntRSLSKuU+PUBERAQiIiJw//59aU9TkVWrVlW4MCKqnti3EJE2KVdQmjdvHubPn4+WLVuiTp06UCgU6q6LiKqhL7/8EgsXLmTfQkRao1xBKTw8HGvWrMH777+v7nqIqBpbtWoV+xYi0irlOo9Sbm4u2rVrp+5aiKiaY99CRNqmXEFp1KhR2LBhg7prIaJq7oMPPmDfQkRapVw/vWVnZ2PlypU4ePAgmjZtCiMjI5X5S5YsUUtxRFS95OTkYMmSJexbSJZSqYShZYJa2kpLT1dLO89KT0tDQoJ6alQqlWpph8qvXEHp4sWLaN68OQDg0qVLKvM4+JKIyuvy5cvsW6hERcFj27ZtMLS0UUubufefnOomPz9fLe1l5z25/M7Zs2dxPvqmWtosyHgIAGoLXlR25QpKhw8fVncdRETYvXs3z6NEJUpJSQEAdG7RAN5enmpp89ChTEReAwoKi19fsDxy85+008zDAe1atVBLm9Ex17Drn/89f6p65T6PEhERUVWrbWmKOrbqCdPmpsZqaedZFqZGaqsxydJULe1Q+ZUrKHXu3Pm5u8EPHTpU7oKIqPp6++23UaOGfLfEvoWIqlq5glLRGIIieXl5uHDhAi5dulTsgpZERKXVpEkTGBv/71s++xYi0rRyBaVvvvmmxOlz585FRkZGhQoiouorJCSkxDFK7FuISFPKdR4lOUOHDuW1mIhI7di3EJGmqDUonTx5EqamHHhGROrFvoWINKVcP7317dtX5b4QAgkJCTh79iyCg4PVUhgRVT9DhgxROckk+xYi0rRyBSUrKyuV+wYGBvDy8sL8+fPRtWtXtRRGRNWPlZWVSlBi30JEmlauoLR69Wp110FEhOXLl/OEk0SkVSp0wsnIyEhERUUBABo1aoQWLdRzJlIiqt7YtxCRtihXULp//z4GDRqEI0eOwNraGsCT06t37twZGzduhL29vTprJKJq4sGDB+jduzf7FiLSGuUKShMnTkR6ejouX74MHx8fAMCVK1cQEBCASZMm4ddff1VrkURUPUyfPp19ix6Ji4uDUqlUS1uxsbFqaYeorMoVlPbt24eDBw9KHRkAvPzyywgLC+OASyIqt4iICPYteiIuLg4+Pt7Iynqs1nbz8vLV2h7Ri5QrKBUWFqocmVLEyMgIhYWFFS6KiKon9i36Q6lUIivrMdZ9PAA+bhX/yfTHvWex4vfTKChgUKKqVa6g1KVLF0yePBm//vornJ2dAQB3797F1KlT8cYbb6i1QCKqPjp27Mi+Rc/4uNnjlYZ1K9yO85mraqiGqOzKdWbu7777Dmlpaahfvz48PDzg4eEBd3d3pKWlYdmyZequkYiqicWLF7NvISKtUq49Sq6urjh37hwOHjyI6OhoAICPjw/8/PzUWhwRVS8uLi7sW4hIq5QpKB06dAgTJkzAqVOnUKt
WLbz55pt48803AQCpqalo1KgRwsPD0aFDh0oploj009GjRwEAaWlp7FuISKuU6ae30NBQjB49usQz51pZWWHs2LFYsmSJ2oojouphxYoVAMC+hYi0TpmC0j///INu3brJzu/atSsiIyMrXBQRVS+XLl167nz2LUSkKWUKSklJSSUeulukRo0aePDgQYWLIqLq5f79+8+dz76FiDSlTEGpbt26z/3md/HiRdSpU6fCRRFR9fKifoN9CxFpSpmC0ltvvYXg4GBkZ2cXm/f48WPMmTMHb7/9ttqKI6Lqoeis2+xbiEjblOmot08//RTbtm1Dw4YNMWHCBHh5eQEAoqOjERYWhoKCAnzyySeVUigR6a/p06dj5cqV8PX1xcSJE9m3EJHWKFNQcnR0xIkTJzBu3DjMmjULQggAgEKhgL+/P8LCwuDo6FgphRKR/nJwcADw5JxJ7FuISJuU+YST9erVw969e/Ho0SNcv34dQgh4enqidu3alVEfEVUjW7duRUFBAfsWItIa5TozNwDUrl0brVq1UmctRETsW/RMVJx6jla8p0xTSzu6KjY2FufOnVNLW3Z2dnBzc1NLW9VBuYMSERGRnISEBADA0C82q7Xdx7n5am1P22WkPwmIwcHBCA4OVkubZubmiI6KYlgqJQYlIiJSu5SUFABAjwFD4eXZoMLtHT52Auf/exA5+QUVbkuX5Dx+DADoOHgSWrbvVOH2kuJuYP3C6VAqlQxKpcSgRERElcbWwQEu9epVuB1L6yg1VKO7rBxd4OLZSNNlVEtlOo8SERERUXXCoEREREQkg0GJiIiISAaDEhEREZEMBiUiIiIiGQxKRERERDIYlIiIiIhkMCgRERERyWBQIiK9c+zYMfTs2RPOzs5QKBTYsWOHynwhBGbPno06derAzMwMfn5+uHbtmmaKJSKtxqBERHonMzMTzZo1Q1hYWInzFy1ahKVLlyI8PBynT5+GhYUF/P39kZ2dXcWVEpG24yVMiEjvdO/eHd27dy9xnhACoaGh+PTTT9GrVy8AwNq1a+Ho6IgdO3Zg0KBBVVkqEWk5BiUiqlZiY2ORmJgIPz8/aZqVlRXatGmDkydPygalnJwc5OTkSPfT0tIqvdaqFBcXB6VSqbb2YmNj1dYWAelpaUhISKhwO+r8G1cXDEpEVK0kJiYCABwdHVWmOzo6SvNKEhISgnnz5lVqbZoSFxcHbx8fPM7KUnvbWTn5am+zOsnOKwAAnD17Fuejb1a4vYKMhwCgltBVXTAoERGVwqxZsxAUFCTdT0tLg6urqwYrUh+lUonHWVkYMnMxHN081NLmmYjd+GvbKuTkMShVRG7+k6DUzMMB7Vq1qHB70THXsOsfICUlpcJtVRc6FZS+/PJLzJo1C5MnT0ZoaCgAIDs7Gx9++CE2btyInJwc+Pv7Y/ny5cW+LRIRAYCTkxMAICkpCXXq1JGmJyUloXnz5rLrmZiYwMTEpLLL0yhHNw+4eDZSS1sxFyPV0g49YWFqhDq2tSrcTpKlqRqqqV505qi3v//+G99//z2aNm2qMn3q1KnYtWsXtmzZgqNHj+LevXvo27evhqokIm3n7u4OJycnRERESNPS0tJw+vRptG3bVoOVEZE20ok9ShkZGRgyZAh++OEHLFiwQJqempqKn376CRs2bECXLl0AAKtXr4aPjw9OnTqFV199tcT29H1QJlF1l5GRgevXr0v3Y2NjceHCBdjY2MDNzQ1TpkzBggUL4OnpCXd3dwQHB8PZ2Rm9e/fWXNFEpJV0Yo9SYGAgevTooXKUCgBERkYiLy9PZbq3tzfc3Nxw8uRJ2fZCQkJgZWUl3fRlnAERPXH27Fm0aNECLVo8GdMRFBSEFi1aYPbs2QCAGTNmYOLEiRgzZgxatWqFjIwM7Nu3D6am/FmCiFRp/R6ljRs34ty5c/j777+LzUtMTISxsTGsra1Vpr/o6BV9HpRJRECnTp0ghJCdr1AoMH/+fMyfP78KqyIiXaT
VQenOnTuYPHkyDhw4oNZvetVhUCYRERFVnFb/9BYZGYn79+/jlVdeQY0aNVCjRg0cPXoUS5cuRY0aNeDo6Ijc3NxihzkmJSVJR7YQERERlZdW71F644038O+//6pMGz58OLy9vTFz5ky4urrCyMgIERER6NevHwAgJiYGcXFxPHqFiIiIKkyrg1LNmjXRuHFjlWkWFhawtbWVpo8cORJBQUGwsbFBrVq1MHHiRLRt21b2iDciIiKi0tLqoFQa33zzDQwMDNCvXz+VE04SERERVZTOBaUjR46o3Dc1NUVYWBjCwsI0UxARERHpLZ0LSkRERFQxsbGxOHfunNras7Ozg5ubm9ra0yYMSkRERNVEVs6TixQHBwcjODhYbe2am5shKipaL8MSgxIREVE1kZP3JCjN7P8qBrzhq5Y2o+IeYOgXm6FUKhmUiIiISPe52NfEKw3raroMnaDVJ5wkIiIi0iQGJSIiIiIZDEpEREREMhiUiIiIiGQwKBERERHJYFAiIiIiksGgRERERCSDQYmIiIhIBoMSERERkQwGJSIiIiIZDEpEREREMnitNyIiomrmzoM0nLt6Vy1tRcU9UEs72opBiYiIqJrIykgHACzachqLtpxWa9sJCQlqbU9bMCgRERFVE7k52QCAV7v1QjvfZmppM+baTezZvA4pKSlqaU/bMCgRERFVM7Vs7OBSr55a2lKmZqilHW3FwdxEREREMhiUiIiIiGQwKBERERHJYFAiIiIiksGgRERERCSDQYmIiIhIBoMSERERkQwGJSIiIiIZDEpEREREMhiUiIiIiGQwKBERERHJ4LXeiIiIqMJiY2Nx7tw5tbRlZ2cHNzc3tbRVUQxKREREVG4Z6WkAgODgYAQHB6ulTTNzc0RHRWlFWGJQIiIionLLefwYANBx8CS0bN+pwu0lxd3A+oXToVQqGZSIiIhIP1g5usDFs5Gmy1A7DuYmIiIiksGgRERERCSDQYmIiIhIBoMSERERkQwGJSIiIiIZDEpEREREMhiUiIiIiGQwKBERERHJYFAiIiIiksGgRERERCSDlzAhItJBcXFxUCqVamkrKipKLe0Q6SMGJSIiHRMXFwcfH29kZT1Wa7sZGRlqbY9IHzAoERHpGKVSiaysx1j38QD4uNlXuL29Z64ieNUBZGdnq6E6Iv3CoEREpKN83OzxSsO6FW4nKu6BGqoh0k8czE1EREQkg0GJiIiISAaDEhEREZEMBiUiIiIiGQxKRERERDIYlIiIiIhkMCgRERERyWBQIiIiIpLBoERE1dLcuXOhUChUbt7e3poui4i0DM/MTUTVVqNGjXDw4EHpfo0a7BKJSBV7BdJ78fHxSE5OLvN6tra2cHFxqYSKSFvUqFEDTk5Omi6DiLSYVgelkJAQbNu2DdHR0TAzM0O7du2wcOFCeHl5SctkZ2fjww8/xMaNG5GTkwN/f38sX74cjo6OGqyctEV8fDy8vb2QmZlV5nUtLMwRHR3DsKTHrl27BmdnZ5iamqJt27YICQmBm5tbicvm5OQgJydHup+WllZVZVaZRykpSEhIUEtbaenpammHdEd6WppaXj9KpVIN1aiPVgelo0ePIjAwEK1atUJ+fj4+/vhjdO3aFVeuXIGFhQUAYOrUqdizZw+2bNkCKysrTJgwAX379sXx48c1XD1pg+TkZGRmZmHT7EHwci39VdZj7jzAwPkbkZyczKCkp9q0aYM1a9bAy8sLCQkJmDdvHjp06IBLly6hZs2axZYPCQnBvHnzNFBp5VOmZgIADh86hGNnLqilzdz7sQCA/Px8tbRH2is7rwAAcPbsWZyPvlnh9goyHgKA2kJ7RWl1UNq3b5/K/TVr1sDBwQGRkZHo2LEjUlNT8dNPP2HDhg3o0qULAGD16tXw8fHBqVOn8Oqrr2qibNJCXq72aOZRR9NlkBbp3r279P+mTZuiTZs2qFevHjZv3oyRI0cWW37WrFkICgqS7qelpcHV1bVKaq1s6Y9
zAQDtG7mgaZNGamnz0KFMRF4DCgoL1NIeaa/c/Cd/42YeDmjXqkWF24uOuYZd/wApKSkVbksdtDooPSs1NRUAYGNjAwCIjIxEXl4e/Pz8pGW8vb3h5uaGkydPygal6rALnYjKxtraGg0bNsT169dLnG9iYgITE5MqrqpqWZkbo45tLbW0ZW5qrJZ2SHdYmBqp5fWTZGmqhmrUR2dOD1BYWIgpU6agffv2aNy4MQAgMTERxsbGsLa2VlnW0dERiYmJsm2FhITAyspKuunLt0IiKr+MjAzcuHEDdepwzyMR/Y/OBKXAwEBcunQJGzdurHBbs2bNQmpqqnS7c+eOGiokIl0ybdo0HD16FLdu3cKJEyfQp08fGBoa4r333tN0aUSkRXTip7cJEyZg9+7dOHbsmMrAWicnJ+Tm5iIlJUVlr1JSUtJzD/mtDrvQiej54uPj8d577yE5ORn29vZ47bXXcOrUKdjbl37QPxHpP60OSkIITJw4Edu3b8eRI0fg7u6uMt/X1xdGRkaIiIhAv379AAAxMTGIi4tD27ZtNVEyEekIdeydJiL9p9VBKTAwEBs2bMDOnTtRs2ZNadyRlZUVzMzMYGVlhZEjRyIoKAg2NjaoVasWJk6ciLZt2/KINyIiIqowrQ5KK1asAAB06tRJZfrq1asxbNgwAMA333wDAwMD9OvXT+WEk0REREQVpdVBSQjxwmVMTU0RFhaGsLCwKqiIiIiIqhOdOeqNiIiIqKoxKBERERHJYFAiIiIiksGgRERERCRDqwdzExGRvKi4B2pp556S17skksOgRESkYxISEgAAQ7/YrNZ2H+fmq7U9In3AoEREpGNSUlIAAD0GDIWXZ4MKt3f42Amc/+9B5OQXVLgtIn3DoEREpKNsHRzgUq9ehduxtI5SQzVE+omDuYmIiIhkMCgRERERyWBQIiIiIpLBoEREREQkg0GJiIiISAaDEhEREZEMBiUiIiIiGQxKRERERDIYlIiIiIhkMCgRERERyeAlTIiIKllcXByUSqXa2ouNjVVbW0T0fAxKRESVKC4uDj4+3sjKeqz2tvPy8tXeJhGpYlAiIqpESqUSWVmPse7jAfBxs1dLmz/uPYsVv59GQQGDElFlY1AiIqoCPm72eKVhXbW05XzmqlraIaIX42BuIiIiIhkMSkREREQyGJSIiIiIZDAoEREREclgUCIiIiKSwaPeiLTEw2Qlalgmlml5IiKqXAxKRBqWlJQEANiydSsMLWqXer2CzEcq6xMRkfoxKBFpWEpKCgDAz/cleHl6lHq9mGs3sOPC/9YnIiL1Y1Ai0hK1LUzhaFOz1MvftzCtxGqIiAhgUKIKio+PR3JycpnXs7W1hYuLSyVUREREpD4MSlRu8fHx8Pb2QmZmVpnXtbAwR3R0DMMSERFpNQYlKrfk5GRkZmZh0+xB8HIt/cU+Y+48wMD5G5GcnMygREREWo1BiSRl/RktJiYGAODlao9mHnXK/HhF65dFVf9kpws1EhFR5WFQIgAV+xktIyOjTMsnPcyAAsDAgQPL/FhV9ZOdLtRIRESVj0GJAJTvZ7T9f1/FzJX7kZ2TXabHSsl8DAHg+0nd0KaJZ6nXq8qf7HShRiIiqnwMSqSiLD+jxdx5UKHH8nCuXa6f7KqSLtRIRESVh0GJdE5Zxg2VZ4yROpTlcW/dulV5hRARUYUwKJHOqMi4obKOoyqvitSYl5+n/oKIiKhCGJRIZ5Rn3FB5x1GVV3lqXL3vLL797STyCwoqtzgiIiozBiXSOWUZN1TRcVTlVZYanW1rVXI1RERUXgaaLoCIiIhIWzEoEREREclgUCIiIiKSwTFKVaSslwcBeCkMqjx8PRIRlQ6DUhUo7+VBeCkMqgx8PRIRlR6DUhUoz+VBeCkMqix8PRIRlR6DUhUqy+VBiCpbeV6P5TnTOX+yeyIqTn2nqrinTFNbW0T0fAxKRPRCFTnjeHX/yS4hIQEAMPSLzWpv+3FuvtrbJCJVDEp
E9ELlOeM4wJ/sACAlJQUA0GPAUHh5NlBLm4ePncD5/x5ETj7P5k5U2RiUiKjUynLGcVJl6+AAl3r11NKWpXWUWtohohdjUNJyHBNCL3Lr1i38888/pV6+PK8pIqLqikFJS3FMCL1IVk4eAGDmzJmYOXNmmdfPyMhQd0lERHqHQUlLcUwIvUjR+JRP32uHd7v4lnq9/X9fxcyV+5Gdk11ZpRER6Q0GJS3HMSH0Iq72tcr0Gom5o77D1ImI9B2v9UZEREQkg0GJiIiISAaDEhEREZEMjlEqo/JcdV0Th2OX9TF5yDgREVFxehOUwsLCsHjxYiQmJqJZs2ZYtmwZWrdurdbHKO9V14tUxeHYFTmtAMBDxqn6qYq+g4h0l14EpU2bNiEoKAjh4eFo06YNQkND4e/vj5iYGDg4OKjtccpz1XWgag/HLu9pBXjIOFVHVdV3EJHu0ougtGTJEowePRrDhw8HAISHh2PPnj1YtWoVPvroo2LL5+TkICcnR7qfmpoKAEhLe/4VuYv2tkTHPZBO9lcasYkPAQD7z97AnUe5pVrndFRcmdd5er3r9x7C2OyeVtdYFevpQo1nY+IBALdib8MQilI/1p3btwAAJ67Ew9gsslJrLO/2iEtKAfDkvfO891fRPCFEqdtWh7L0HeXtN7KynuyBvn79JnJzc567bGkl3Xvy3r535w7OnD2nde1VRpussfrUGBd3F8CT986L3l9V0ncIHZeTkyMMDQ3F9u3bVaZ/8MEH4p133ilxnTlz5ggAvPHGm5bd7ty5UwW9xhNl7TvYb/DGm/beKrPv0Pk9SkqlEgUFBXB0dFSZ7ujoiOjo6BLXmTVrFoKCgqT7hYWFePjwIWxtbaFQlP4bfWVJS0uDq6sr7ty5g1q1amm6HL3B7ap+6tqmQgikp6fD2dlZjdU9X1n7Dm3rN3Tx9ayLNQO6Wbcu1gyUve6q6Dt0PiiVh4mJCUxMTFSmWVtba6aY56hVq5ZOvcB1Bber+qljm1pZWampmsqhrf2GLr6edbFmQDfr1sWagbLVXdl9h86fR8nOzg6GhoZISkpSmZ6UlAQnJycNVUVE2o59BxGVhs4HJWNjY/j6+iIiIkKaVlhYiIiICLRt21aDlRGRNmPfQUSloRc/vQUFBSEgIAAtW7ZE69atERoaiszMTOlIFl1jYmKCOXPmFNvNTxXD7ap+ur5Ndbnv0MVtr4s1A7pZty7WDGhn3Qohqvh43Ery3XffSSeNa968OZYuXYo2bdpouiwi0nLsO4joefQmKBERERGpm86PUSIiIiKqLAxKRERERDIYlIiIiIhkMCgRERERyWBQ0qBbt25h5MiRcHd3h5mZGTw8PDBnzhzk5qpedPTixYvo0KEDTE1N4erqikWLFhVra8uWLfD29oapqSmaNGmCvXv3VtXT0AlhYWGoX78+TE1N0aZNG5w5c0bTJWmtkJAQtGrVCjVr1oSDgwN69+6NmJgYlWWys7MRGBgIW1tbWFpaol+/fsVO3BgXF4cePXrA3NwcDg4OmD59OvLz86vyqVQLpe1HtM3nn3+Odu3awdzcXCvOcC5H1/qOY8eOoWfPnnB2doZCocCOHTs0XdILlabP0SQGJQ2Kjo5GYWEhvv/+e1y+fBnffPMNwsPD8fHHH0vLpKWloWvXrqhXrx4iIyOxePFizJ07FytXrpSWOXHiBN577z2MHDkS58+fR+/evdG7d29cunRJE09L62zatAlBQUGYM2cOzp07h2bNmsHf3x/379/XdGla6ejRowgMDMSpU6dw4MAB5OXloWvXrsjMzJSWmTp1Knbt2oUtW7bg6NGjuHfvHvr27SvNLygoQI8ePZCbm4sTJ07g559/xpo1azB79mxNPCW9Vpp+RBvl5uaif//+GDdunKZLkaWLfUdmZiaaNWuGsLAwTZdSaqXpczSq0i63S+WyaNEi4e7uLt1fvny5qF27tsjJyZGmzZw5U3h5eUn3BwwYIHr06KHSTps2bcT
YsWMrv2Ad0Lp1axEYGCjdLygoEM7OziIkJESDVemO+/fvCwDi6NGjQgghUlJShJGRkdiyZYu0TFRUlAAgTp48KYQQYu/evcLAwEAkJiZKy6xYsULUqlVL5bVMlePZfkSbrV69WlhZWWm6jBLpet8BQGzfvl3TZZTZs32OpnGPkpZJTU2FjY2NdP/kyZPo2LEjjI2NpWn+/v6IiYnBo0ePpGX8/PxU2vH398fJkyerpmgtlpubi8jISJXtY2BgAD8/P26fUkpNTQUA6XUZGRmJvLw8lW3q7e0NNzc3aZuePHkSTZo0gaOjo7SMv78/0tLScPny5Sqsvnp6th+hsmPfoTnP9jmaxqCkRa5fv45ly5Zh7Nix0rTExESVDxsA0v3ExMTnLlM0vzpTKpUoKCjg9imnwsJCTJkyBe3bt0fjxo0BPHm9GRsbFxtX8vQ2Lc3rlipHSf0IlR37Ds0oqc/RNAalSvDRRx9BoVA89xYdHa2yzt27d9GtWzf0798fo0eP1lDlRKoCAwNx6dIlbNy4UdOlVDu62I+Up2aip2ljn6MXF8XVNh9++CGGDRv23GUaNGgg/f/evXvo3Lkz2rVrpzJIGwCcnJyKHU1UdN/Jyem5yxTNr87s7OxgaGjI7VMOEyZMwO7du3Hs2DG4uLhI052cnJCbm4uUlBSVvUpPb1MnJ6diRwc9+7ql51NnP1JVylqzNmPfUfXk+hyN0/QgqeouPj5eeHp6ikGDBon8/Pxi84sGc+fm5krTZs2aVWww99tvv62yXtu2bTmY+/+1bt1aTJgwQbpfUFAg6tatqzMDMqtaYWGhCAwMFM7OzuLq1avF5hcN5t66das0LTo6usTB3ElJSdIy33//vahVq5bIzs6u/CdRzbyoH9Fm2j6YW5f7DujIYO4X9TmaxqCkQfHx8eKll14Sb7zxhoiPjxcJCQnSrUhKSopwdHQU77//vrh06ZLYuHGjMDc3F99//720zPHjx0WNGjXEV199JaKiosScOXOEkZGR+PfffzXxtLTOxo0bhYmJiVizZo24cuWKGDNmjLC2tlY5Iov+Z9y4ccLKykocOXJE5TWZlZUlLfOf//xHuLm5iUOHDomzZ8+Ktm3birZt20rz8/PzRePGjUXXrl3FhQsXxL59+4S9vb2YNWuWJp6SXitNP6KNbt++Lc6fPy/mzZsnLC0txfnz58X58+dFenq6pkuT6GLfkZ6eLm1LAGLJkiXi/Pnz4vbt25ouTVZp+hxNYlDSoNWrVwsAJd6e9s8//4jXXntNmJiYiLp164ovv/yyWFubN28WDRs2FMbGxqJRo0Ziz549VfU0dMKyZcuEm5ubMDY2Fq1btxanTp3SdElaS+41uXr1ammZx48fi/Hjx4vatWsLc3Nz0adPn2IfzLdu3RLdu3cXZmZmws7OTnz44YciLy+vip+N/ittP6JtAgICSqz58OHDmi5Nha71HYcPHy5xuwYEBGi6NFml6XM0SSGEEJX62x4RERGRjuJRb0REREQyGJSIiIiIZDAoEREREclgUCIiIiKSwaBEREREJINBiYiIiEgGgxIRERGRDAYlIiIiIhkMSkREREQyGJSIiIiIZDAoEREREcn4P3i2qdg69SXeAAAAAElFTkSuQmCC\n", + "image/png": "", "text/plain": [ "
" ] @@ -802,13 +1253,6 @@ } ], "source": [ - "from sklearn.preprocessing import StandardScaler \n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns \n", - "\n", - "# vygeneruje 20 náhodných bodů\n", - "example = pd.DataFrame({\"x\": 100+np.random.randn(100), \"y\": 100*np.random.randn(100)})\n", - "\n", "example_scaler = StandardScaler()\n", "transformed_example = example_scaler.fit_transform(example)\n", "\n", @@ -819,53 +1263,105 @@ "sns.histplot(transformed_example, ax=ax2);" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Zpátky k našim datům. Transformaci musíme nastavit (fit) pouze na trénovacích datech, škálovat pak budeme stejným způsobem trénovací i testovací data." - ] - }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 25, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01
count1.000000e+021.000000e+02
mean8.699708e-155.551115e-18
std1.005038e+001.005038e+00
min-2.465865e+00-2.421800e+00
25%-6.879710e-01-6.950112e-01
50%-7.059610e-026.110893e-02
75%8.580067e-016.285318e-01
max2.022080e+002.037319e+00
\n", + "
" + ], "text/plain": [ - "array([[-1.52392312, -1.3377031 , -0.37482778, ..., -1.21007674,\n", - " -0.58292866, 0.58292866],\n", - " [-1.27246669, -0.98473364, -0.37482778, ..., -1.21007674,\n", - " -0.58292866, 0.58292866],\n", - " [ 0.57154709, 0.33890184, -0.37482778, ..., 0.82639387,\n", - " -0.58292866, 0.58292866],\n", - " ...,\n", - " [-1.3562855 , -1.072976 , 2.66789188, ..., -1.21007674,\n", - " -0.58292866, 0.58292866],\n", - " [ 0.48772828, 0.86835603, 2.66789188, ..., 0.82639387,\n", - " -0.58292866, 0.58292866],\n", - " [-0.3504598 , -0.10230999, -0.37482778, ..., 0.82639387,\n", - " -0.58292866, 0.58292866]])" + " 0 1\n", + "count 1.000000e+02 1.000000e+02\n", + "mean 8.699708e-15 5.551115e-18\n", + "std 1.005038e+00 1.005038e+00\n", + "min -2.465865e+00 -2.421800e+00\n", + "25% -6.879710e-01 -6.950112e-01\n", + "50% -7.059610e-02 6.110893e-02\n", + "75% 8.580067e-01 6.285318e-01\n", + "max 2.022080e+00 2.037319e+00" ] }, - "execution_count": 14, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "scaler = StandardScaler()\n", - "\n", - "X_train = scaler.fit_transform(X_train_transformed)\n", - "X_test = scaler.transform(X_test_transformed)\n", - "\n", - "X_train" + "pd.DataFrame(transformed_example).describe()" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -889,180 +1385,100 @@ " \n", " \n", " \n", - " yrs.since.phd\n", - " yrs.service\n", - " sex\n", - " rank_AssocProf\n", - " rank_AsstProf\n", - " rank_Prof\n", - " discipline_A\n", - " discipline_B\n", + " 0\n", + " 1\n", " \n", " \n", " \n", " \n", " 0\n", - " -1.523923\n", - " -1.337703\n", - " -0.374828\n", - " -0.470360\n", - " 1.857852\n", - " -1.210077\n", - " -0.582929\n", - " 0.582929\n", + " 1.012500\n", + " -2.421800\n", " \n", " \n", " 1\n", - " -1.272467\n", - " -0.984734\n", - " -0.374828\n", - " -0.470360\n", - " 1.857852\n", - " -1.210077\n", - " -0.582929\n", - " 0.582929\n", + 
" 0.604302\n", + " -0.599871\n", " \n", " \n", " 2\n", - " 0.571547\n", - " 0.338902\n", - " -0.374828\n", - " -0.470360\n", - " -0.538256\n", - " 0.826394\n", - " -0.582929\n", - " 0.582929\n", + " 0.146683\n", + " 0.564726\n", " \n", " \n", " 3\n", - " -0.685735\n", - " -0.367037\n", - " 2.667892\n", - " 2.126029\n", - " -0.538256\n", - " -1.210077\n", - " -0.582929\n", - " 0.582929\n", + " 0.151525\n", + " 1.142043\n", " \n", " \n", " 4\n", - " 0.152453\n", - " -0.631764\n", - " -0.374828\n", - " -0.470360\n", - " -0.538256\n", - " 0.826394\n", - " -0.582929\n", - " 0.582929\n", - " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " \n", - " \n", - " 133\n", - " 0.739185\n", - " 1.839022\n", - " -0.374828\n", - " -0.470360\n", - " -0.538256\n", - " 0.826394\n", - " -0.582929\n", - " 0.582929\n", - " \n", - " \n", - " 134\n", - " 0.487728\n", - " 0.250659\n", - " -0.374828\n", - " -0.470360\n", - " -0.538256\n", - " 0.826394\n", - " -0.582929\n", - " 0.582929\n", - " \n", - " \n", - " 135\n", - " -1.356286\n", - " -1.072976\n", - " 2.667892\n", - " -0.470360\n", - " 1.857852\n", - " -1.210077\n", - " -0.582929\n", - " 0.582929\n", - " \n", - " \n", - " 136\n", - " 0.487728\n", - " 0.868356\n", - " 2.667892\n", - " -0.470360\n", - " -0.538256\n", - " 0.826394\n", - " -0.582929\n", - " 0.582929\n", - " \n", - " \n", - " 137\n", - " -0.350460\n", - " -0.102310\n", - " -0.374828\n", - " -0.470360\n", - " -0.538256\n", - " 0.826394\n", - " -0.582929\n", - " 0.582929\n", + " -1.406483\n", + " 0.200444\n", " \n", " \n", "\n", - "

138 rows × 8 columns

\n", "" ], "text/plain": [ - " yrs.since.phd yrs.service sex rank_AssocProf rank_AsstProf \\\n", - "0 -1.523923 -1.337703 -0.374828 -0.470360 1.857852 \n", - "1 -1.272467 -0.984734 -0.374828 -0.470360 1.857852 \n", - "2 0.571547 0.338902 -0.374828 -0.470360 -0.538256 \n", - "3 -0.685735 -0.367037 2.667892 2.126029 -0.538256 \n", - "4 0.152453 -0.631764 -0.374828 -0.470360 -0.538256 \n", - ".. ... ... ... ... ... \n", - "133 0.739185 1.839022 -0.374828 -0.470360 -0.538256 \n", - "134 0.487728 0.250659 -0.374828 -0.470360 -0.538256 \n", - "135 -1.356286 -1.072976 2.667892 -0.470360 1.857852 \n", - "136 0.487728 0.868356 2.667892 -0.470360 -0.538256 \n", - "137 -0.350460 -0.102310 -0.374828 -0.470360 -0.538256 \n", - "\n", - " rank_Prof discipline_A discipline_B \n", - "0 -1.210077 -0.582929 0.582929 \n", - "1 -1.210077 -0.582929 0.582929 \n", - "2 0.826394 -0.582929 0.582929 \n", - "3 -1.210077 -0.582929 0.582929 \n", - "4 0.826394 -0.582929 0.582929 \n", - ".. ... ... ... \n", - "133 0.826394 -0.582929 0.582929 \n", - "134 0.826394 -0.582929 0.582929 \n", - "135 -1.210077 -0.582929 0.582929 \n", - "136 0.826394 -0.582929 0.582929 \n", - "137 0.826394 -0.582929 0.582929 \n", - "\n", - "[138 rows x 8 columns]" + " 0 1\n", + "0 1.012500 -2.421800\n", + "1 0.604302 -0.599871\n", + "2 0.146683 0.564726\n", + "3 0.151525 1.142043\n", + "4 -1.406483 0.200444" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(transformed_example).head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Zpátky k našim datům. Transformaci musíme nastavit (fit) pouze na trénovacích datech, škálovat pak budeme stejným způsobem trénovací i testovací data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-0.47036043, 1.85785169, -1.21007674, ..., 0.37482778,\n", + " -1.52392312, -1.3377031 ],\n", + " [-0.47036043, 1.85785169, -1.21007674, ..., 0.37482778,\n", + " -1.27246669, -0.98473364],\n", + " [-0.47036043, -0.5382561 , 0.82639387, ..., 0.37482778,\n", + " 0.57154709, 0.33890184],\n", + " ...,\n", + " [-0.47036043, 1.85785169, -1.21007674, ..., -2.66789188,\n", + " -1.3562855 , -1.072976 ],\n", + " [-0.47036043, -0.5382561 , 0.82639387, ..., -2.66789188,\n", + " 0.48772828, 0.86835603],\n", + " [-0.47036043, -0.5382561 , 0.82639387, ..., 0.37482778,\n", + " -0.3504598 , -0.10230999]])" ] }, - "execution_count": 15, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "pd.DataFrame(X_train, columns=X_train_transformed.columns)" + "scaler = StandardScaler()\n", + "\n", + "X_train = scaler.fit_transform(X_train_transformed)\n", + "X_test = scaler.transform(X_test_transformed)\n", + "\n", + "X_train" ] }, { @@ -1115,7 +1531,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -1140,9 +1556,427 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "LinearRegression()" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "model.fit(X_train, y_train)" ] @@ -1163,7 +1997,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -1180,7 +2014,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -1218,7 +2052,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -1242,71 +2076,71 @@ " \n", " \n", " \n", + " rank\n", + " discipline\n", " yrs.since.phd\n", " yrs.service\n", " sex\n", - " AssocProf\n", - " AsstProf\n", - " Prof\n", - " A\n", - " B\n", " \n", " \n", " \n", " \n", " 0\n", - " 5\n", - " 4\n", - " 0\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 1\n", + " Prof\n", + " B\n", + " 15\n", + " 10\n", + " Male\n", " \n", " \n", "\n", "" ], "text/plain": [ - " yrs.since.phd yrs.service sex AssocProf AsstProf Prof A B\n", - "0 5 4 0 0 1 0 0 1" + " rank discipline yrs.since.phd yrs.service sex\n", + "0 Prof B 15 10 Male" ] }, - "execution_count": 20, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dotaz = pd.DataFrame({\n", - " \"yrs.since.phd\": 5,\n", - " \"yrs.service\": 4,\n", - " \"sex\": 0,\n", - " \"AssocProf\": 0,\n", - " \"AsstProf\": 1,\n", - " \"Prof\": 0,\n", - " \"A\": 0,\n", - " \"B\": 1\n", + " \"rank\": \"Prof\",\n", + " \"discipline\": \"B\",\n", + " \"yrs.since.phd\": 15,\n", + " \"yrs.service\": 10,\n", + " \"sex\": \"Male\",\n", "}, index=[0])\n", "dotaz" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "X_query = scaler.transform(transformer.transform(dotaz))" + ] + }, + { + "cell_type": "code", + "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ 
- "Odhadovaný plat vašeho pracovníka je: 92517.39403735907\n" + "Odhadovaný plat vašeho pracovníka je: 132301.01\n" ] } ], "source": [ - "print(\"Odhadovaný plat vašeho pracovníka je: \", model.predict(dotaz.values)[0])" + "print(f\"Odhadovaný plat vašeho pracovníka je: {model.predict(X_query)[0]:.2f}\")" ] }, { @@ -1325,7 +2159,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -1333,7 +2167,7 @@ "output_type": "stream", "text": [ "R2 na trénovací množině: 0.5135908532845631\n", - "R2 na testovací množině: 0.5444018189943944\n" + "R2 na testovací množině: 0.5444018189943942\n" ] } ], @@ -1352,7 +2186,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 36, "metadata": {}, "outputs": [ { @@ -1401,16 +2235,16 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.5444018189943944" + "0.5444018189943942" ] }, - "execution_count": 24, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -1456,9 +2290,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.12.6" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } From 79800ddb17c427288ed87603bce2b487ad42e367 Mon Sep 17 00:00:00 2001 From: Petra Vidnerova Date: Sun, 10 Nov 2024 14:08:05 +0100 Subject: [PATCH 2/5] visualization fixed transformation changed --- lessons/pydata/homework_revisited/index.ipynb | 362 ++++++++++-------- 1 file changed, 203 insertions(+), 159 deletions(-) diff --git a/lessons/pydata/homework_revisited/index.ipynb b/lessons/pydata/homework_revisited/index.ipynb index fc82691..7923668 100644 --- a/lessons/pydata/homework_revisited/index.ipynb +++ b/lessons/pydata/homework_revisited/index.ipynb @@ -19,6 +19,17 @@ "np.random.seed(42)" ] }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], 
+ "source": [ + "import warnings\n", + "from sklearn.exceptions import ConvergenceWarning\n", + "warnings.filterwarnings(\"ignore\", category=ConvergenceWarning)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -28,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -194,7 +205,7 @@ "[123 rows x 7 columns]" ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -216,7 +227,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -235,7 +246,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -254,7 +265,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -278,73 +289,83 @@ " \n", " \n", " \n", - " Length1\n", - " Length2\n", - " Length3\n", - " Height\n", - " Width\n", - " Species_Bream\n", - " Species_Parkki\n", - " Species_Perch\n", - " Species_Pike\n", - " Species_Roach\n", - " Species_Smelt\n", - " Species_Whitefish\n", + " onehotencoder__Species_Bream\n", + " onehotencoder__Species_Parkki\n", + " onehotencoder__Species_Perch\n", + " onehotencoder__Species_Pike\n", + " onehotencoder__Species_Roach\n", + " onehotencoder__Species_Smelt\n", + " onehotencoder__Species_Whitefish\n", + " remainder__Length1\n", + " remainder__Length2\n", + " remainder__Length3\n", + " remainder__Height\n", + " remainder__Width\n", " \n", " \n", " \n", " \n", - " 11\n", + " 0\n", + " 1.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", " 28.7\n", " 31.0\n", " 36.2\n", " 14.3714\n", " 4.8146\n", - " 1.0\n", + " \n", + " \n", + " 1\n", " 0.0\n", " 0.0\n", " 0.0\n", " 0.0\n", + " 1.0\n", " 0.0\n", " 0.0\n", - " \n", - " \n", - " 45\n", " 20.5\n", " 22.5\n", " 25.3\n", " 7.0334\n", " 3.8203\n", + " \n", + " \n", + " 2\n", + " 1.0\n", " 
0.0\n", " 0.0\n", " 0.0\n", " 0.0\n", - " 1.0\n", " 0.0\n", " 0.0\n", - " \n", - " \n", - " 26\n", " 32.0\n", " 35.0\n", " 40.6\n", " 16.3618\n", " 6.0900\n", - " 1.0\n", + " \n", + " \n", + " 3\n", " 0.0\n", " 0.0\n", + " 1.0\n", " 0.0\n", " 0.0\n", " 0.0\n", " 0.0\n", - " \n", - " \n", - " 87\n", " 20.0\n", " 22.0\n", " 23.5\n", " 5.6400\n", " 3.5250\n", + " \n", + " \n", + " 4\n", " 0.0\n", " 0.0\n", " 1.0\n", @@ -352,21 +373,11 @@ " 0.0\n", " 0.0\n", " 0.0\n", - " \n", - " \n", - " 74\n", " 13.8\n", " 15.0\n", " 16.0\n", " 3.8240\n", " 2.4320\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", " \n", " \n", " ...\n", @@ -384,79 +395,79 @@ " ...\n", " \n", " \n", - " 123\n", + " 81\n", + " 0.0\n", + " 0.0\n", + " 1.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", " 39.0\n", " 42.0\n", " 44.6\n", " 12.8002\n", " 6.8684\n", + " \n", + " \n", + " 82\n", + " 1.0\n", " 0.0\n", " 0.0\n", - " 1.0\n", " 0.0\n", " 0.0\n", " 0.0\n", " 0.0\n", - " \n", - " \n", - " 29\n", " 33.5\n", " 37.0\n", " 42.6\n", " 18.9570\n", " 6.6030\n", - " 1.0\n", + " \n", + " \n", + " 83\n", " 0.0\n", " 0.0\n", " 0.0\n", + " 1.0\n", " 0.0\n", " 0.0\n", " 0.0\n", - " \n", - " \n", - " 130\n", " 32.7\n", " 35.0\n", " 38.8\n", " 5.9364\n", " 4.3844\n", + " \n", + " \n", + " 84\n", " 0.0\n", " 0.0\n", " 0.0\n", - " 1.0\n", " 0.0\n", " 0.0\n", + " 1.0\n", " 0.0\n", - " \n", - " \n", - " 153\n", " 11.4\n", " 12.0\n", " 13.2\n", " 2.2044\n", " 1.1484\n", + " \n", + " \n", + " 85\n", " 0.0\n", " 0.0\n", + " 1.0\n", " 0.0\n", " 0.0\n", " 0.0\n", - " 1.0\n", " 0.0\n", - " \n", - " \n", - " 101\n", " 25.0\n", " 26.5\n", " 28.0\n", " 7.1680\n", " 4.1440\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", " \n", " \n", "\n", @@ -464,71 +475,94 @@ "" ], "text/plain": [ - " Length1 Length2 Length3 Height Width Species_Bream \\\n", - "11 28.7 31.0 36.2 14.3714 4.8146 1.0 \n", - "45 20.5 22.5 25.3 7.0334 3.8203 0.0 \n", - "26 32.0 
35.0 40.6 16.3618 6.0900 1.0 \n", - "87 20.0 22.0 23.5 5.6400 3.5250 0.0 \n", - "74 13.8 15.0 16.0 3.8240 2.4320 0.0 \n", - ".. ... ... ... ... ... ... \n", - "123 39.0 42.0 44.6 12.8002 6.8684 0.0 \n", - "29 33.5 37.0 42.6 18.9570 6.6030 1.0 \n", - "130 32.7 35.0 38.8 5.9364 4.3844 0.0 \n", - "153 11.4 12.0 13.2 2.2044 1.1484 0.0 \n", - "101 25.0 26.5 28.0 7.1680 4.1440 0.0 \n", + " onehotencoder__Species_Bream onehotencoder__Species_Parkki \\\n", + "0 1.0 0.0 \n", + "1 0.0 0.0 \n", + "2 1.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + ".. ... ... \n", + "81 0.0 0.0 \n", + "82 1.0 0.0 \n", + "83 0.0 0.0 \n", + "84 0.0 0.0 \n", + "85 0.0 0.0 \n", + "\n", + " onehotencoder__Species_Perch onehotencoder__Species_Pike \\\n", + "0 0.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 1.0 0.0 \n", + "4 1.0 0.0 \n", + ".. ... ... \n", + "81 1.0 0.0 \n", + "82 0.0 0.0 \n", + "83 0.0 1.0 \n", + "84 0.0 0.0 \n", + "85 1.0 0.0 \n", + "\n", + " onehotencoder__Species_Roach onehotencoder__Species_Smelt \\\n", + "0 0.0 0.0 \n", + "1 1.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + ".. ... ... \n", + "81 0.0 0.0 \n", + "82 0.0 0.0 \n", + "83 0.0 0.0 \n", + "84 0.0 1.0 \n", + "85 0.0 0.0 \n", "\n", - " Species_Parkki Species_Perch Species_Pike Species_Roach \\\n", - "11 0.0 0.0 0.0 0.0 \n", - "45 0.0 0.0 0.0 1.0 \n", - "26 0.0 0.0 0.0 0.0 \n", - "87 0.0 1.0 0.0 0.0 \n", - "74 0.0 1.0 0.0 0.0 \n", - ".. ... ... ... ... \n", - "123 0.0 1.0 0.0 0.0 \n", - "29 0.0 0.0 0.0 0.0 \n", - "130 0.0 0.0 1.0 0.0 \n", - "153 0.0 0.0 0.0 0.0 \n", - "101 0.0 1.0 0.0 0.0 \n", + " onehotencoder__Species_Whitefish remainder__Length1 remainder__Length2 \\\n", + "0 0.0 28.7 31.0 \n", + "1 0.0 20.5 22.5 \n", + "2 0.0 32.0 35.0 \n", + "3 0.0 20.0 22.0 \n", + "4 0.0 13.8 15.0 \n", + ".. ... ... ... 
\n", + "81 0.0 39.0 42.0 \n", + "82 0.0 33.5 37.0 \n", + "83 0.0 32.7 35.0 \n", + "84 0.0 11.4 12.0 \n", + "85 0.0 25.0 26.5 \n", "\n", - " Species_Smelt Species_Whitefish \n", - "11 0.0 0.0 \n", - "45 0.0 0.0 \n", - "26 0.0 0.0 \n", - "87 0.0 0.0 \n", - "74 0.0 0.0 \n", - ".. ... ... \n", - "123 0.0 0.0 \n", - "29 0.0 0.0 \n", - "130 0.0 0.0 \n", - "153 1.0 0.0 \n", - "101 0.0 0.0 \n", + " remainder__Length3 remainder__Height remainder__Width \n", + "0 36.2 14.3714 4.8146 \n", + "1 25.3 7.0334 3.8203 \n", + "2 40.6 16.3618 6.0900 \n", + "3 23.5 5.6400 3.5250 \n", + "4 16.0 3.8240 2.4320 \n", + ".. ... ... ... \n", + "81 44.6 12.8002 6.8684 \n", + "82 42.6 18.9570 6.6030 \n", + "83 38.8 5.9364 4.3844 \n", + "84 13.2 2.2044 1.1484 \n", + "85 28.0 7.1680 4.1440 \n", "\n", "[86 rows x 12 columns]" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.compose import make_column_transformer\n", "\n", "categorical_columns = [\"Species\"] \n", "\n", - "encoder = OneHotEncoder()\n", - "encoder.fit(X_train_raw[categorical_columns])\n", - "column_names = encoder.get_feature_names_out()\n", - " \n", - "def transform_species(X_raw):\n", - " X_res = X_raw.drop(columns=[\"Species\"])\n", - " X_res = X_res.reindex(columns=list(X_res.columns)+list(column_names))\n", - " X_res[list(column_names)] = encoder.transform(X_raw[categorical_columns]).toarray() \n", - " return X_res\n", + "transformer = make_column_transformer(\n", + " (OneHotEncoder(sparse_output=False), [\"Species\"]),\n", + " remainder=\"passthrough\"\n", + ")\n", "\n", - "X_train_onehot = transform_species(X_train_raw)\n", - "X_test_onehot = transform_species(X_test_raw)\n", - "X_train_onehot" + "X_train_onehot = transformer.fit_transform(X_train_raw)\n", + "X_test_onehot = transformer.transform(X_test_raw)\n", + "\n", + "pd.DataFrame(X_train_onehot, 
columns=transformer.get_feature_names_out())" ] }, { @@ -540,7 +574,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -590,7 +624,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -600,7 +634,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -631,7 +665,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -658,19 +692,9 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/petra/pydata-course/podzim_2022/venv/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:648: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.667e+03, tolerance: 1.127e+03\n", - " model = cd_fast.enet_coordinate_descent(\n", - "/home/petra/pydata-course/podzim_2022/venv/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:648: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. 
Duality gap: 3.308e+05, tolerance: 1.127e+03\n", - " model = cd_fast.enet_coordinate_descent(\n" - ] - }, { "data": { "text/html": [ @@ -710,18 +734,18 @@ " \n", " \n", " 1\n", - " 65.457444\n", - " 7999.732745\n", - " 67.698824\n", - " 7481.296957\n", + " 65.461029\n", + " 8000.189809\n", + " 67.699174\n", + " 7481.774269\n", " lasso_var1\n", " \n", " \n", " 2\n", - " 65.143581\n", - " 7723.470509\n", - " 66.929641\n", - " 7734.792363\n", + " 65.162065\n", + " 7730.766645\n", + " 66.984117\n", + " 7743.530561\n", " lasso_var2\n", " \n", " \n", @@ -747,13 +771,13 @@ "text/plain": [ " MAE_train MSE_train MAE_test MSE_test model\n", "0 65.075967 7438.347512 64.076083 7327.508225 linear_regression\n", - "1 65.457444 7999.732745 67.698824 7481.296957 lasso_var1\n", - "2 65.143581 7723.470509 66.929641 7734.792363 lasso_var2\n", + "1 65.461029 8000.189809 67.699174 7481.774269 lasso_var1\n", + "2 65.162065 7730.766645 66.984117 7743.530561 lasso_var2\n", "3 14.885810 1083.558622 29.440300 1983.156024 SVR_rbf\n", "4 15.792914 1263.177350 37.137981 3351.282774 SVR_poly" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -778,7 +802,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -813,7 +837,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -823,13 +847,13 @@ "y_real_test = test_data.pop(\"Weight\")\n", "X_real_test = test_data \n", "\n", - "X_real_test = transform_species(X_real_test)\n", - "X_real_test_scaled = scaler.transform(X_real_test)" + "X_real_test_transformed = transformer.transform(X_real_test)\n", + "X_real_test_scaled = scaler.transform(X_real_test_transformed)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -837,7 +861,8 @@ "output_type": "stream", "text": [ "MAE 37.263\n", - "MSE 
4050.929\n" + "MSE 4050.929\n", + "R2 0.972\n" ] } ], @@ -845,12 +870,13 @@ "y_pred_test = best_model.predict(X_real_test_scaled)\n", "\n", "print(f\"MAE {mean_absolute_error(y_real_test, y_pred_test):.3f}\")\n", - "print(f\"MSE {mean_squared_error(y_real_test, y_pred_test):.3f}\")" + "print(f\"MSE {mean_squared_error(y_real_test, y_pred_test):.3f}\")\n", + "print(f\"R2 {r2_score(y_real_test, y_pred_test):.3f}\")" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -917,17 +943,17 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ - "is_bream = X_real_test[\"Species_Bream\"] == 1 \n", + "is_bream = X_real_test[\"Species\"] == \"Bream\"\n", "bream = X_real_test[is_bream][\"Length3\"]\n", "\n", "bream_weights = y_real_test[is_bream]\n", "predicted_bream_weights = best_model.predict(X_real_test_scaled[is_bream])\n", "\n", - "is_roach = X_real_test[\"Species_Roach\"] == 1\n", + "is_roach = X_real_test[\"Species\"] == \"Roach\"\n", "roach = X_real_test[is_roach][\"Length3\"]\n", "roach_weights = y_real_test[is_roach]\n", "predicted_roach_weights = best_model.predict(X_real_test_scaled[is_roach])" @@ -935,12 +961,31 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "result_bream = pd.DataFrame()\n", + "result_bream[\"length\"] = bream\n", + "result_bream[\"weight\"] = bream_weights\n", + "result_bream[\"predicted\"] = predicted_bream_weights\n", + "result_bream = result_bream.sort_values(\"length\")\n", + "\n", + "result_roach = pd.DataFrame()\n", + "result_roach[\"length\"] = roach\n", + "result_roach[\"weight\"] = roach_weights\n", + "result_roach[\"predicted\"] = predicted_roach_weights\n", + "result_roach = result_roach.sort_values(\"length\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, "metadata": {}, "outputs": [ { "data": { - "image/png": 
"\n", + "image/png": "", "text/plain": [ "
" ] @@ -951,17 +996,16 @@ ], "source": [ "import matplotlib.pyplot as plt \n", - "%matplotlib inline\n", "\n", "fig, ax = plt.subplots(1, 2)\n", "\n", - "ax[0].scatter(bream, bream_weights, label=\"true weight\");\n", - "ax[0].scatter(bream, predicted_bream_weights, label=\"prediction\");\n", + "ax[0].plot(result_bream[\"length\"], result_bream[\"weight\"], label=\"true weight\", marker=\"o\");\n", + "ax[0].plot(result_bream[\"length\"], result_bream[\"predicted\"], label=\"prediction\", marker=\"o\");\n", "ax[0].legend()\n", "ax[0].set_title(\"Bream\")\n", "\n", - "ax[1].scatter(roach, roach_weights, label=\"true weight\");\n", - "ax[1].scatter(roach, predicted_roach_weights, label=\"prediction\");\n", + "ax[1].plot(result_roach[\"length\"], result_roach[\"weight\"], label=\"true weight\", marker=\"o\");\n", + "ax[1].plot(result_roach[\"length\"], result_roach[\"predicted\"], label=\"prediction\", marker=\"o\");\n", "ax[1].legend()\n", "ax[1].set_title(\"Roach\");" ] @@ -983,9 +1027,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.12.6" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } From 174a67b2c1e8d2f79d946232831f0f5937104611 Mon Sep 17 00:00:00 2001 From: Petra Vidnerova Date: Sun, 10 Nov 2024 14:15:56 +0100 Subject: [PATCH 3/5] kosmeticka uprava --- lessons/pydata/homework_revisited/index.ipynb | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/lessons/pydata/homework_revisited/index.ipynb b/lessons/pydata/homework_revisited/index.ipynb index 7923668..1b2cebe 100644 --- a/lessons/pydata/homework_revisited/index.ipynb +++ b/lessons/pydata/homework_revisited/index.ipynb @@ -948,14 +948,9 @@ "outputs": [], "source": [ "is_bream = X_real_test[\"Species\"] == \"Bream\"\n", - "bream = X_real_test[is_bream][\"Length3\"]\n", - "\n", - "bream_weights = y_real_test[is_bream]\n", "predicted_bream_weights = 
best_model.predict(X_real_test_scaled[is_bream])\n", "\n", "is_roach = X_real_test[\"Species\"] == \"Roach\"\n", - "roach = X_real_test[is_roach][\"Length3\"]\n", - "roach_weights = y_real_test[is_roach]\n", "predicted_roach_weights = best_model.predict(X_real_test_scaled[is_roach])" ] }, @@ -966,14 +961,14 @@ "outputs": [], "source": [ "result_bream = pd.DataFrame()\n", - "result_bream[\"length\"] = bream\n", - "result_bream[\"weight\"] = bream_weights\n", + "result_bream[\"length\"] = X_real_test[is_bream][\"Length3\"]\n", + "result_bream[\"weight\"] = y_real_test[is_bream]\n", "result_bream[\"predicted\"] = predicted_bream_weights\n", "result_bream = result_bream.sort_values(\"length\")\n", "\n", "result_roach = pd.DataFrame()\n", - "result_roach[\"length\"] = roach\n", - "result_roach[\"weight\"] = roach_weights\n", + "result_roach[\"length\"] = X_real_test[is_roach][\"Length3\"]\n", + "result_roach[\"weight\"] = y_real_test[is_roach]\n", "result_roach[\"predicted\"] = predicted_roach_weights\n", "result_roach = result_roach.sort_values(\"length\")" ] From 64eb53ca3074ebcdb9189480379f8012fe68f326 Mon Sep 17 00:00:00 2001 From: Petra Vidnerova Date: Sun, 10 Nov 2024 15:19:14 +0100 Subject: [PATCH 4/5] aktualizovana pracovni verze notebooku --- .../homework_revisited/index_na_hodinu.ipynb | 131 ++++++++++++------ 1 file changed, 91 insertions(+), 40 deletions(-) diff --git a/lessons/pydata/homework_revisited/index_na_hodinu.ipynb b/lessons/pydata/homework_revisited/index_na_hodinu.ipynb index 8f34a53..2b829a8 100644 --- a/lessons/pydata/homework_revisited/index_na_hodinu.ipynb +++ b/lessons/pydata/homework_revisited/index_na_hodinu.ipynb @@ -19,6 +19,24 @@ "np.random.seed(42)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "from sklearn.exceptions import ConvergenceWarning\n", + "warnings.filterwarnings(\"ignore\", category=ConvergenceWarning)" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "**ÚKOL 1**: Načtěte data, vyhoďte přebytečné sloupce, vyberte vstupy a výstupy a připravte rozdělení na trénovací a testovací množinu.\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -33,7 +51,7 @@ "outputs": [], "source": [ "fish_data = pd.read_csv(\"fish_data.csv\", index_col=0)\n", - "# fish_data = fish_data.drop(columns=[____])\n", + "# fish_data = fish_data.drop(columns=[___])\n", "fish_data" ] }, @@ -73,8 +91,14 @@ "source": [ "from sklearn.model_selection import train_test_split \n", "\n", - "X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y)\n", - "# X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=X[\"Species\"])" + "X_train_raw, X_test_raw, y_train, y_test = ..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**ÚKOL 2**: Překódujte kategorické proměnné a přeškálujte všechny sloupce." ] }, { @@ -91,22 +115,19 @@ "outputs": [], "source": [ "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.compose import make_column_transformer\n", "\n", - "categorical_columns = [____] \n", + "categorical_columns = ___ \n", "\n", - "encoder = OneHotEncoder()\n", - "encoder.fit(X_train_raw[categorical_columns])\n", - "column_names = encoder.get_feature_names_out()\n", - " \n", - "def transform_species(X_raw):\n", - " X_res = X_raw.drop(columns=[\"Species\"])\n", - " X_res = X_res.reindex(columns=list(X_res.columns)+list(column_names))\n", - " X_res[list(column_names)] = encoder.transform(X_raw[categorical_columns]).toarray() \n", - " return X_res\n", + "transformer = make_column_transformer(\n", + " (_______, _____),\n", + " remainder=\"passthrough\"\n", + ")\n", "\n", - "X_train_onehot = transform_species(X_train_raw)\n", - "X_test_onehot = transform_species(X_test_raw)\n", - "X_train_onehot" + "X_train_onehot = transformer._______(X_train_raw)\n", + "X_test_onehot = transformer.________(X_test_raw)\n", + "\n", + 
"pd.DataFrame(X_train_onehot, columns=transformer.get_feature_names_out())" ] }, { @@ -124,11 +145,11 @@ "source": [ "from sklearn.preprocessing import StandardScaler\n", "\n", - "scaler = StandardScaler()\n", - "scaler.____(X_train_onehot)\n", + "scaler = ______\n", + "scaler._____(_____)\n", "\n", - "X_train = scaler.____(X_train_onehot)\n", - "X_test = scaler.____(X_test_onehot)" + "X_train = scaler.transform(X_train_onehot)\n", + "X_test = scaler.transform(X_test_onehot)" ] }, { @@ -151,6 +172,13 @@ " * C, float, optional (default=1.0)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "metadata": {}, @@ -207,6 +235,13 @@ "``` " ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**ÚKOL 3**: Dopište funkci `fit_and_eval` dle instrukcí." + ] + }, { "cell_type": "code", "execution_count": null, @@ -218,11 +253,13 @@ "def fit_and_eval(X_train, y_train, X_test, y_test, model, name):\n", " \"\"\" 1. Natrénuje model na trénovací množině.\n", " 2. Spočte hodnoty metrik na trénovací i testovací množině.\n", - " vrátí slovník ve tvaru {\"název metriky train\": hodnota , \"název metriky test\": hodnota} \n", + " vrátí slovník ve tvaru {\"název metriky\": hodnota} \n", " \"\"\" \n", - " ...\n", + " # zde dopiš kód\n", + " ... \n", " return {\n", - " .... 
\n", + " \"MSE_test\": ____,\n", + " \"MSE_train\": ____\n", " }" ] }, @@ -296,8 +333,8 @@ "y_real_test = test_data.pop(\"Weight\")\n", "X_real_test = test_data \n", "\n", - "X_real_test = transform_species(X_real_test)\n", - "X_real_test_scaled = scaler.transform(X_real_test)" + "X_real_test_transformed = _____\n", + "X_real_test_scaled = _______" ] }, { @@ -309,7 +346,8 @@ "y_pred_test = best_model.predict(X_real_test_scaled)\n", "\n", "print(f\"MAE {mean_absolute_error(y_real_test, y_pred_test):.3f}\")\n", - "print(f\"MSE {mean_squared_error(y_real_test, y_pred_test):.3f}\")" + "print(f\"MSE {mean_squared_error(y_real_test, y_pred_test):.3f}\")\n", + "print(f\"R2 {r2_score(y_real_test, y_pred_test):.3f}\")" ] }, { @@ -342,18 +380,32 @@ "metadata": {}, "outputs": [], "source": [ - "is_bream = X_real_test[\"Species_Bream\"] == 1 \n", - "bream = X_real_test[is_bream][\"Length3\"]\n", - "\n", - "bream_weights = y_real_test[is_bream]\n", + "is_bream = X_real_test[\"Species\"] == \"Bream\"\n", "predicted_bream_weights = best_model.predict(X_real_test_scaled[is_bream])\n", "\n", - "is_roach = X_real_test[\"Species_Roach\"] == 1\n", - "roach = X_real_test[is_roach][\"Length3\"]\n", - "roach_weights = y_real_test[is_roach]\n", + "is_roach = X_real_test[\"Species\"] == \"Roach\"\n", "predicted_roach_weights = best_model.predict(X_real_test_scaled[is_roach])" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result_bream = pd.DataFrame()\n", + "result_bream[\"length\"] = X_real_test[is_bream][\"Length3\"]\n", + "result_bream[\"weight\"] = y_real_test[is_bream]\n", + "result_bream[\"predicted\"] = predicted_bream_weights\n", + "result_bream = result_bream.sort_values(\"length\")\n", + "\n", + "result_roach = pd.DataFrame()\n", + "result_roach[\"length\"] = X_real_test[is_roach][\"Length3\"]\n", + "result_roach[\"weight\"] = y_real_test[is_roach]\n", + "result_roach[\"predicted\"] = predicted_roach_weights\n", + 
"result_roach = result_roach.sort_values(\"length\")" + ] + }, { "cell_type": "code", "execution_count": null, @@ -361,17 +413,16 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt \n", - "%matplotlib inline\n", "\n", "fig, ax = plt.subplots(1, 2)\n", "\n", - "ax[0].scatter(bream, bream_weights, label=\"true weight\");\n", - "ax[0].scatter(bream, predicted_bream_weights, label=\"prediction\");\n", + "ax[0].plot(result_bream[\"length\"], result_bream[\"weight\"], label=\"true weight\", marker=\"o\");\n", + "ax[0].plot(result_bream[\"length\"], result_bream[\"predicted\"], label=\"prediction\", marker=\"o\");\n", "ax[0].legend()\n", "ax[0].set_title(\"Bream\")\n", "\n", - "ax[1].scatter(roach, roach_weights, label=\"true weight\");\n", - "ax[1].scatter(roach, predicted_roach_weights, label=\"prediction\");\n", + "ax[1].plot(result_roach[\"length\"], result_roach[\"weight\"], label=\"true weight\", marker=\"o\");\n", + "ax[1].plot(result_roach[\"length\"], result_roach[\"predicted\"], label=\"prediction\", marker=\"o\");\n", "ax[1].legend()\n", "ax[1].set_title(\"Roach\");" ] @@ -393,9 +444,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.12.6" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } From d5aebd6fbe392bd44f6b2a88513c8749a8d53914 Mon Sep 17 00:00:00 2001 From: Petra Vidnerova Date: Sun, 10 Nov 2024 15:22:02 +0100 Subject: [PATCH 5/5] ML2 do obsahu kurzu --- course.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/course.yml b/course.yml index bfefe21..fa4b503 100644 --- a/course.yml +++ b/course.yml @@ -63,6 +63,14 @@ plan: - lesson: pydata/regression_exercises - lesson: pydata/regression_resume + - title: "Strojové učení - Scikit-learn, ML workflow" + slug: ml2 + date: 2024-11-11 + materials: + - lesson: pydata/scikitlearn_api + - lesson: pydata/homework_revisited + - lesson: pydata/scikitlearn_resume + - title: "Svátky klidu a konce kurzu" slug: 
konec serial: null