diff --git a/Deep_Learning/Tweet_Classificaion_DL/Disaster Tweet Classification using NLP and Deep Learning.ipynb b/Deep_Learning/Tweet_Classificaion_DL/Disaster Tweet Classification using NLP and Deep Learning.ipynb new file mode 100644 index 0000000000..9b64e902a8 --- /dev/null +++ b/Deep_Learning/Tweet_Classificaion_DL/Disaster Tweet Classification using NLP and Deep Learning.ipynb @@ -0,0 +1,3103 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Real or Not? NLP with Disaster Tweets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](https://img3.goodfon.com/wallpaper/nbig/f/29/katastrofa-razrusheniya-zdaniya.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Import" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "_uuid": "e8f35d2c74c370d4b01dc540862a67254031e8b5" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using TensorFlow backend.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['nlp-getting-started']\n" + ] + } + ], + "source": [ + "# System\n", + "import os\n", + "\n", + "# Time\n", + "import time\n", + "import datetime\n", + "\n", + "# Numerical\n", + "import numpy as np\n", + "import pandas as pd\n", + "import random\n", + "\n", + "# Tools\n", + "import itertools\n", + "from collections import Counter\n", + "\n", + "# NLP\n", + "import re\n", + "from nltk.corpus import stopwords\n", + "from nltk.tokenize import word_tokenize \n", + "from nltk.stem import SnowballStemmer\n", + "from nltk.stem import PorterStemmer\n", + "from nltk.stem import WordNetLemmatizer\n", + "# from pywsd.utils import lemmatize_sentence\n", + "\n", + "# Preprocessing\n", + "from sklearn import preprocessing\n", + "from sklearn.utils import class_weight as cw\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.preprocessing import StandardScaler\n", 
+ "from sklearn.preprocessing import LabelEncoder\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from bs4 import BeautifulSoup\n", + "\n", + "# Model Selection\n", + "from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV\n", + "\n", + "# Machine Learning Models\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn import svm\n", + "from sklearn.svm import SVC\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "\n", + "# Evaluation Metrics\n", + "from sklearn import metrics \n", + "from sklearn.metrics import f1_score, accuracy_score,confusion_matrix,classification_report\n", + "\n", + "# Deep Learing Preprocessing - Keras\n", + "from keras.preprocessing.text import Tokenizer\n", + "from keras.preprocessing import sequence\n", + "from keras.utils import to_categorical\n", + "\n", + "# Deep Learning Model - Keras\n", + "from keras.models import Model\n", + "from keras.models import Sequential\n", + "\n", + "# Deep Learning Model - Keras - CNN\n", + "from keras.layers import Conv1D, Conv2D, Convolution1D, MaxPooling1D, SeparableConv1D, SpatialDropout1D, \\\n", + " GlobalAvgPool1D, GlobalMaxPool1D, GlobalMaxPooling1D \n", + "from keras.layers.pooling import _GlobalPooling1D\n", + "from keras.layers import MaxPooling2D, GlobalMaxPooling2D, GlobalAveragePooling2D\n", + "\n", + "from keras.layers import MaxPooling3D, GlobalMaxPooling3D, GlobalAveragePooling3D\n", + "\n", + "\n", + "\n", + "# Deep Learning Model - Keras - RNN\n", + "from keras.layers import Embedding, LSTM, Bidirectional\n", + "\n", + "# Deep Learning Model - Keras - General\n", + "from keras.layers import Input, Add, concatenate, 
Dense, Activation, BatchNormalization, Dropout, Flatten\n", + "from keras.layers import LeakyReLU, PReLU, Lambda, Multiply\n", + "\n", + "\n", + "\n", + "# Deep Learning Parameters - Keras\n", + "from keras.optimizers import RMSprop, Adam\n", + "\n", + "# Deep Learning Callbacs - Keras\n", + "from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau\n", + "\n", + "# Visualization\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "%matplotlib inline\n", + "\n", + "print(os.listdir(\"../input\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "_uuid": "bcf244fb2623377e6c472520181f55b570f22225" + }, + "outputs": [], + "source": [ + "# print date and time for given type of representation\n", + "def date_time(x):\n", + " if x==1:\n", + " return 'Timestamp: {:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())\n", + " if x==2: \n", + " return 'Timestamp: {:%Y-%b-%d %H:%M:%S}'.format(datetime.datetime.now())\n", + " if x==3: \n", + " return 'Date now: %s' % datetime.datetime.now()\n", + " if x==4: \n", + " return 'Date today: %s' % datetime.date.today() " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "_uuid": "f9941e463091c0974df63b98ddf42ef039d07044" + }, + "source": [ + "# 2. 
Read Data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "_uuid": "24a1fce867bde9c19af86e6ad3fdcc9368db4907" + }, + "outputs": [], + "source": [ + "input_directory = r\"../input/nlp-getting-started/\"\n", + "output_directory = r\"../output/\"\n", + "\n", + "if not os.path.exists(output_directory):\n", + " os.mkdir(output_directory)\n", + " \n", + "figure_directory = \"../output/figures\"\n", + "if not os.path.exists(figure_directory):\n", + " os.mkdir(figure_directory)\n", + " \n", + " \n", + "file_name_pred_batch = figure_directory + r\"/result\"\n", + "file_name_pred_sample = figure_directory + r\"/sample\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", + "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(7613, 5) (3263, 4) (3263, 2)\n" + ] + } + ], + "source": [ + "# Load data\n", + "\n", + "df_train = pd.read_csv(\"../input/nlp-getting-started/train.csv\")\n", + "df_test = pd.read_csv(\"../input/nlp-getting-started/test.csv\")\n", + "sub_sample = pd.read_csv(\"/kaggle/input/nlp-getting-started/sample_submission.csv\")\n", + "\n", + "print (df_train.shape, df_test.shape, sub_sample.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. Exploratory Data Analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's look at the first few rows of the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "_uuid": "82c38ef0a849c0e1b35dfaa6609497033690ecb3" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idkeywordlocationtexttarget
01NaNNaNOur Deeds are the Reason of this #earthquake M...1
14NaNNaNForest fire near La Ronge Sask. Canada1
25NaNNaNAll residents asked to 'shelter in place' are ...1
36NaNNaN13,000 people receive #wildfires evacuation or...1
47NaNNaNJust got sent this photo from Ruby #Alaska as ...1
\n", + "
" + ], + "text/plain": [ + " id keyword location text \\\n", + "0 1 NaN NaN Our Deeds are the Reason of this #earthquake M... \n", + "1 4 NaN NaN Forest fire near La Ronge Sask. Canada \n", + "2 5 NaN NaN All residents asked to 'shelter in place' are ... \n", + "3 6 NaN NaN 13,000 people receive #wildfires evacuation or... \n", + "4 7 NaN NaN Just got sent this photo from Ruby #Alaska as ... \n", + "\n", + " target \n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_train.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "_kg_hide-input": true, + "_uuid": "5cf2744b1ea01acdb6e9100bba9c3e2a41833de0" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 7613 entries, 0 to 7612\n", + "Data columns (total 5 columns):\n", + "id 7613 non-null int64\n", + "keyword 7552 non-null object\n", + "location 5080 non-null object\n", + "text 7613 non-null object\n", + "target 7613 non-null int64\n", + "dtypes: int64(2), object(3)\n", + "memory usage: 297.5+ KB\n" + ] + } + ], + "source": [ + "df_train.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are many null values in location and keyword" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Following are the unique values of location and keyword" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "221 3341\n", + "221 1602\n" + ] + } + ], + "source": [ + "print (df_train.keyword.nunique(), df_train.location.nunique())\n", + "print (df_test.keyword.nunique(), df_test.location.nunique())" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "set()" + ] + }, + "execution_count": 8, 
+ "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Check if train and test have the same set of keywords\n", + "set(df_train.keyword.unique()) - set(df_test.keyword.unique())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's look at a few tweet texts" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 : Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all \n", + "\n", + "1 : Forest fire near La Ronge Sask. Canada \n", + "\n", + "2 : All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected \n", + "\n", + "3 : 13,000 people receive #wildfires evacuation orders in California \n", + "\n", + "4 : Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school \n", + "\n", + "5 : #RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires \n", + "\n", + "6 : #flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas \n", + "\n", + "7 : I'm on top of the hill and I can see a fire in the woods... \n", + "\n", + "8 : There's an emergency evacuation happening now in the building across the street \n", + "\n", + "9 : I'm afraid that the tornado is coming to our area... 
\n", + "\n" + ] + } + ], + "source": [ + "c = 10\n", + "for i in range(c):\n", + " r = random.randint(0, len(df_train)-1)\n", + " r=i\n", + " print(r,\" : \", df_train.iloc[r].text, \"\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "_kg_hide-input": true, + "_uuid": "cd9d8e8cccbe7be826ff80ebba85c94059b19d81" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "columns = df_train.columns\n", + "columns" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "_uuid": "13bfcbe5c9e172fad039501b7c6604dc81abe010" + }, + "source": [ + "## 3.2 Visualize Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Number of tweets for disaster and non disaster in dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "_uuid": "0ffeb6d063fb09a80db926657faddf0c719333b5" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0, 0.5, 'Count')" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "figsize=(20, 5)\n", + "\n", + "ticksize = 18\n", + "titlesize = ticksize + 8\n", + "labelsize = ticksize + 5\n", + "\n", + "params = {'figure.figsize' : figsize,\n", + " 'axes.labelsize' : labelsize,\n", + " 'axes.titlesize' : titlesize,\n", + " 'xtick.labelsize': ticksize,\n", + " 'ytick.labelsize': ticksize}\n", + "\n", + "plt.rcParams.update(params)\n", + "\n", + "col = \"target\"\n", + "xlabel = \"Real vs Not\"\n", + "ylabel = \"Count\"\n", + "\n", + "sns.countplot(x=df_train[col])\n", + "plt.title(\"Real vs Not Count\")\n", + "# plt.xticks(rotation=90)\n", + "plt.xlabel(xlabel)\n", + "plt.ylabel(ylabel)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Number of null values in feature columns" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "_uuid": "6415fba24b67ddc166a43327d1327bc8c75f7f2c" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "figsize=(18, 5)\n", + "\n", + "ticksize = 14\n", + "titlesize = ticksize + 8\n", + "labelsize = ticksize + 5\n", + "\n", + "xlabel = \"Train Feature\"\n", + "ylabel = \"Null Count\"\n", + "\n", + "title = \"Train Feature Null Count\"\n", + "\n", + "\n", + "params = {'figure.figsize' : figsize,\n", + " 'axes.labelsize' : labelsize,\n", + " 'axes.titlesize' : titlesize,\n", + " 'xtick.labelsize': ticksize,\n", + " 'ytick.labelsize': ticksize}\n", + "\n", + "plt.rcParams.update(params)\n", + "\n", + "df_train.isnull().sum().plot(kind=\"bar\")\n", + "plt.title(title)\n", + "plt.xlabel(xlabel)\n", + "plt.ylabel(ylabel)\n", + "plt.xticks(rotation=90)\n", + "plt.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "figsize=(18, 5)\n", + "\n", + "ticksize = 14\n", + "titlesize = ticksize + 8\n", + "labelsize = ticksize + 5\n", + "\n", + "xlabel = \"Test Feature\"\n", + "ylabel = \"Null Count\"\n", + "\n", + "title = \"Test Feature Null Count\"\n", + "\n", + "\n", + "params = {'figure.figsize' : figsize,\n", + " 'axes.labelsize' : labelsize,\n", + " 'axes.titlesize' : titlesize,\n", + " 'xtick.labelsize': ticksize,\n", + " 'ytick.labelsize': ticksize}\n", + "\n", + "plt.rcParams.update(params)\n", + "\n", + "df_train.isnull().sum().plot(kind=\"bar\")\n", + "plt.title(title)\n", + "plt.xlabel(xlabel)\n", + "plt.ylabel(ylabel)\n", + "plt.xticks(rotation=90)\n", + "plt.plot()\n", + "\n", + "\n", + "df_test.isnull().sum().plot(kind=\"bar\")\n", + "plt.title(title)\n", + "plt.xlabel(xlabel)\n", + "plt.ylabel(ylabel)\n", + "plt.xticks(rotation=90)\n", + "plt.plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Top keywords for tweets" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "figsize=(22, 5)\n", + "\n", + "ticksize = 14\n", + "titlesize = ticksize + 8\n", + "labelsize = ticksize + 5\n", + "\n", + "# xlabel = \"Train Feature\"\n", + "# ylabel = \"Null Count\"\n", + "\n", + "# title = \"Train Feature Null Count\"\n", + "\n", + "\n", + "params = {'figure.figsize' : figsize,\n", + " 'axes.labelsize' : labelsize,\n", + " 'axes.titlesize' : titlesize,\n", + " 'xtick.labelsize': ticksize,\n", + " 'ytick.labelsize': ticksize}\n", + "\n", + "plt.rcParams.update(params)\n", + "\n", + "kw = df_train.keyword.value_counts().head(10)\n", + "kw_d = df_train[df_train.target==1].keyword.value_counts().head(10)\n", + "kw_nd = df_train[df_train.target==0].keyword.value_counts().head(10)\n", + "\n", + "# plt.figure(figsize=(13,5))\n", + "sns.barplot(kw, kw.index)\n", + "plt.title('Top keywords for tweets')\n", + "plt.show()\n", + "plt.subplot(121)\n", + "sns.barplot(kw_d, kw_d.index)\n", + "plt.title('Top keywords for disaster tweets')\n", + "plt.subplot(122)\n", + "sns.barplot(kw_nd, kw_nd.index)\n", + "plt.title('Top keywords for non-disaster tweets')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Top locations for tweets" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "figsize=(22, 5)\n", + "\n", + "ticksize = 14\n", + "titlesize = ticksize + 8\n", + "labelsize = ticksize + 5\n", + "\n", + "# xlabel = \"Train Feature\"\n", + "# ylabel = \"Null Count\"\n", + "\n", + "# title = \"Train Feature Null Count\"\n", + "\n", + "\n", + "params = {'figure.figsize' : figsize,\n", + " 'axes.labelsize' : labelsize,\n", + " 'axes.titlesize' : titlesize,\n", + " 'xtick.labelsize': ticksize,\n", + " 'ytick.labelsize': ticksize}\n", + "\n", + "plt.rcParams.update(params)\n", + "\n", + "kw = df_train.location.value_counts().head(10)\n", + "kw_d = df_train[df_train.target==1].location.value_counts().head(10)\n", + "kw_nd = df_train[df_train.target==0].location.value_counts().head(10)\n", + "\n", + "# plt.figure(figsize=(13,5))\n", + "sns.barplot(kw, kw.index)\n", + "plt.title('Top locations for tweets')\n", + "plt.show()\n", + "plt.subplot(121)\n", + "sns.barplot(kw_d, kw_d.index)\n", + "plt.title('Top locations for disaster tweets')\n", + "plt.subplot(122)\n", + "sns.barplot(kw_nd, kw_nd.index)\n", + "plt.title('Top locations for non-disaster tweets')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 4. Feature Extraction" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idkeywordlocationtexttargettext_lenword_countunique_word_counthash_countmention_count
01NaNNaNOur Deeds are the Reason of this #earthquake M...169131310
14NaNNaNForest fire near La Ronge Sask. Canada1387700
25NaNNaNAll residents asked to 'shelter in place' are ...1133222000
36NaNNaN13,000 people receive #wildfires evacuation or...1658810
47NaNNaNJust got sent this photo from Ruby #Alaska as ...188161520
\n", + "
" + ], + "text/plain": [ + " id keyword location text \\\n", + "0 1 NaN NaN Our Deeds are the Reason of this #earthquake M... \n", + "1 4 NaN NaN Forest fire near La Ronge Sask. Canada \n", + "2 5 NaN NaN All residents asked to 'shelter in place' are ... \n", + "3 6 NaN NaN 13,000 people receive #wildfires evacuation or... \n", + "4 7 NaN NaN Just got sent this photo from Ruby #Alaska as ... \n", + "\n", + " target text_len word_count unique_word_count hash_count mention_count \n", + "0 1 69 13 13 1 0 \n", + "1 1 38 7 7 0 0 \n", + "2 1 133 22 20 0 0 \n", + "3 1 65 8 8 1 0 \n", + "4 1 88 16 15 2 0 " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Tweet length\n", + "df_train['text_len'] = df_train['text'].apply(len)\n", + "df_test['text_len'] = df_test['text'].apply(len)\n", + "# Word count\n", + "df_train[\"word_count\"] = df_train[\"text\"].apply(lambda x: len(str(x).split()))\n", + "df_test[\"word_count\"] = df_test[\"text\"].apply(lambda x: len(str(x).split()))\n", + "# Unique word count\n", + "df_train[\"unique_word_count\"] = df_train[\"text\"].apply(lambda x: len(set(str(x).split())))\n", + "df_test[\"unique_word_count\"] = df_test[\"text\"].apply(lambda x: len(set(str(x).split())))\n", + "# Count of hashtags (#)\n", + "df_train['hash_count'] = df_train['text'].apply(lambda x: str(x).count(\"#\"))\n", + "df_test['hash_count'] = df_test['text'].apply(lambda x: str(x).count(\"#\"))\n", + "# Count of mentions (@)\n", + "df_train['mention_count'] = df_train['text'].apply(lambda x: str(x).count(\"@\"))\n", + "df_test['mention_count'] = df_test['text'].apply(lambda x: str(x).count(\"@\"))\n", + "\n", + "df_train.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idkeywordlocationtexttext_lenword_countunique_word_counthash_countmention_counttarget
01NaNNaNOur Deeds are the Reason of this #earthquake M...691313101
14NaNNaNForest fire near La Ronge Sask. Canada3877001
25NaNNaNAll residents asked to 'shelter in place' are ...1332220001
36NaNNaN13,000 people receive #wildfires evacuation or...6588101
47NaNNaNJust got sent this photo from Ruby #Alaska as ...881615201
\n", + "
" + ], + "text/plain": [ + " id keyword location text \\\n", + "0 1 NaN NaN Our Deeds are the Reason of this #earthquake M... \n", + "1 4 NaN NaN Forest fire near La Ronge Sask. Canada \n", + "2 5 NaN NaN All residents asked to 'shelter in place' are ... \n", + "3 6 NaN NaN 13,000 people receive #wildfires evacuation or... \n", + "4 7 NaN NaN Just got sent this photo from Ruby #Alaska as ... \n", + "\n", + " text_len word_count unique_word_count hash_count mention_count target \n", + "0 69 13 13 1 0 1 \n", + "1 38 7 7 0 0 1 \n", + "2 133 22 20 0 0 1 \n", + "3 65 8 8 1 0 1 \n", + "4 88 16 15 2 0 1 " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_train = df_train[['id', 'keyword', 'location', 'text', 'text_len', 'word_count',\n", + " 'unique_word_count', 'hash_count', 'mention_count', 'target']]\n", + "df_train.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Lets see" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idkeywordlocationtexttext_lenword_countunique_word_counthash_countmention_count
00NaNNaNJust happened a terrible car crash346600
12NaNNaNHeard about #earthquake is different cities, s...649910
23NaNNaNthere is a forest fire at spot pond, geese are...96191900
39NaNNaNApocalypse lighting. #Spokane #wildfires404420
411NaNNaNTyphoon Soudelor kills 28 in China and Taiwan458800
\n", + "
" + ], + "text/plain": [ + " id keyword location text \\\n", + "0 0 NaN NaN Just happened a terrible car crash \n", + "1 2 NaN NaN Heard about #earthquake is different cities, s... \n", + "2 3 NaN NaN there is a forest fire at spot pond, geese are... \n", + "3 9 NaN NaN Apocalypse lighting. #Spokane #wildfires \n", + "4 11 NaN NaN Typhoon Soudelor kills 28 in China and Taiwan \n", + "\n", + " text_len word_count unique_word_count hash_count mention_count \n", + "0 34 6 6 0 0 \n", + "1 64 9 9 1 0 \n", + "2 96 19 19 0 0 \n", + "3 40 4 4 2 0 \n", + "4 45 8 8 0 0 " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_test.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idkeywordlocationtexttext_modifiedtext_lenword_countunique_word_counthash_countmention_counttarget
01NaNNaNOur Deeds are the Reason of this #earthquake M...Our Deeds are the Reason of this #earthquake M...691313101
14NaNNaNForest fire near La Ronge Sask. CanadaForest fire near La Ronge Sask . Canada3877001
25NaNNaNAll residents asked to 'shelter in place' are ...All residents asked to ' shelter in place ' ar...1332220001
36NaNNaN13,000 people receive #wildfires evacuation or...13 , 0 0 0 people receive #wildfires evacuatio...6588101
47NaNNaNJust got sent this photo from Ruby #Alaska as ...Just got sent this photo from Ruby #Alaska as ...881615201
\n", + "
" + ], + "text/plain": [ + " id keyword location text \\\n", + "0 1 NaN NaN Our Deeds are the Reason of this #earthquake M... \n", + "1 4 NaN NaN Forest fire near La Ronge Sask. Canada \n", + "2 5 NaN NaN All residents asked to 'shelter in place' are ... \n", + "3 6 NaN NaN 13,000 people receive #wildfires evacuation or... \n", + "4 7 NaN NaN Just got sent this photo from Ruby #Alaska as ... \n", + "\n", + " text_modified text_len word_count \\\n", + "0 Our Deeds are the Reason of this #earthquake M... 69 13 \n", + "1 Forest fire near La Ronge Sask . Canada 38 7 \n", + "2 All residents asked to ' shelter in place ' ar... 133 22 \n", + "3 13 , 0 0 0 people receive #wildfires evacuatio... 65 8 \n", + "4 Just got sent this photo from Ruby #Alaska as ... 88 16 \n", + "\n", + " unique_word_count hash_count mention_count target \n", + "0 13 1 0 1 \n", + "1 7 0 0 1 \n", + "2 20 0 0 1 \n", + "3 8 1 0 1 \n", + "4 15 2 0 1 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Tweet length\n", + "df_train['text_modified'] = df_train['text']\n", + "df_train['text_modified'] = df_train['text_modified'].apply(lambda x: re.sub(r\"(%20)\", r\" \", x, flags=re.MULTILINE))\n", + "df_train['text_modified'] = df_train['text_modified'].apply(lambda x: re.sub(r'^https?:\\/\\/.*[\\r\\n]*', 'http://', x, flags=re.MULTILINE))\n", + "df_train['text_modified'] = df_train['text_modified'].apply(lambda x: re.sub(r\"([^a-zA-Z1-9#])\", r\" \\1 \", x, flags=re.MULTILINE))\n", + "df_train['text_modified'] = df_train['text_modified'].apply(lambda x: re.sub(r\"\\s+\", \" \", x, flags=re.MULTILINE))\n", + "\n", + "\n", + "df_train = df_train[['id', 'keyword', 'location', 'text', 'text_modified', 'text_len', 'word_count',\n", + " 'unique_word_count', 'hash_count', 'mention_count', 'target']]\n", + "df_train.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "0 : @ Camilla _ 33 @ CrayKain Hate to shatter your delusions but a hatchet is a deadly weapon justifying lethal force . #gunsense\n", + "1 : Thu Aug 0 6 2 0 15 0 1 : 2 0 : 32 GMT + 0 0 0 0 ( UTC ) #millcityio #2 0 15 0 613 theramin sirens\n", + "1 : @ Bill _ Roose That looks so desolate and just . . . depressing\n", + "0 : @ nprfreshair I really can ' t believe he is skipping out before the Republican meltdown . . . I mean ' debate ' . \n", + "0 : @ RaynbowAffair Editor In Chief @ DiamondKesawn Releases Issue #7 http : / / t . co / 7mzYcU2IHo of #RAmag . #Fashion #Models and #Mayhem\n", + "1 : California man facing manslaughter charge in Sunday ' s wrong - way fatal crash in . . . - http : / / t . co / 1vz3RmjHy4 : Ca . . . http : / / t . co / xevUEEfQBZ\n", + "0 : Yay for sirens\n", + "1 : Udhampur terror attack : Militants attack police post 2 SPOs injured : Suspected militants tonight attacked a p . . . http : / / t . co / FPhFESemyJ\n", + "1 : A Look at State Actions a Year After #Ferguson ' s Upheaval http : / / t . co / qwSbVfLPE1\n", + "0 : ' Education is the most powerful weapon which you can use to change the world . ' Nelson #Mandela #quote http : / / t . co / QR1L2JYUEZ\n" + ] + } + ], + "source": [ + "for i in range(10):\n", + " r = random.randrange(len(df_train)-1)\n", + " print(df_train.target[r], \" : \", df_train.text_modified\t[r])" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "# from sklearn.model_selection import train_test_split\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(df_train.loc[:,'id':'mention_count'], df_train['target'], test_size=0.2, random_state=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
Training Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4.1 Gaussian Naive Bayes and TFIDF Vectorizer " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.1.1 Original Text\n", + "Using Gaussian Naive Bayes as the machine learning algorithm and a TF-IDF vectorizer on the original text" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of mislabeled points out of a total 1523 points : 359\n", + "Percentage of Correct points : 76.43\n" + ] + } + ], + "source": [ + "from sklearn.naive_bayes import GaussianNB\n", + "\n", + "vectorizer = TfidfVectorizer(\n", + "# input='content', \n", + "# encoding='utf-8', \n", + "# decode_error='strict', \n", + "# strip_accents=None, \n", + "# lowercase=False,\n", + "# preprocessor=None,\n", + "# tokenizer=None, \n", + "# analyzer='word',\n", + " stop_words=None, \n", + " ngram_range=(1, 2), \n", + " max_df=0.6, \n", + " min_df=4, \n", + " max_features=None, \n", + "# vocabulary=None, \n", + "# binary=False, \n", + " norm='l2', \n", + " use_idf=True, \n", + " smooth_idf=True,\n", + " sublinear_tf=True\n", + ")\n", + "\n", + "X_train_tf = vectorizer.fit_transform(X_train.text.tolist()).toarray()\n", + "y_train = y_train\n", + "\n", + "# print(train_X.shape)\n", + "# print(len(Y_train))\n", + "\n", + "\n", + "############################\n", + "\n", + "gnb = GaussianNB()\n", + "\n", + "X_test_tf = vectorizer.transform(X_test.text.tolist()).toarray()\n", + "\n", + "y_pred = gnb.fit(X_train_tf, y_train).predict(X_test_tf)\n", + "\n", + "print(\"Number of mislabeled points out of a total %d points : %d\" % (X_test.shape[0], (y_test != y_pred).sum()))\n", + "print(\"Percentage of Correct points : %.2f\" % (100-((y_test != y_pred).sum())/(len(y_test))*100))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.1.2 Modified Text\n", + "Using 
Gaussian Naive Bayes as the machine learning algorithm with a TF-IDF vectorizer on the modified text" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of mislabeled points out of a total 1523 points : 379\n", + "Percentage of Correct points : 75.11\n" + ] + } + ], + "source": [ + "from sklearn.naive_bayes import GaussianNB\n", + "\n", + "vectorizer = TfidfVectorizer(\n", + "# input='content', \n", + "# encoding='utf-8', \n", + "# decode_error='strict', \n", + "# strip_accents=None, \n", + " lowercase=False,\n", + "# preprocessor=None,\n", + "# tokenizer=None, \n", + "# analyzer='word',\n", + " stop_words=None, \n", + " ngram_range=(1, 2), \n", + " max_df=.6, \n", + " min_df=4, \n", + " max_features=None, \n", + "# vocabulary=None, \n", + "# binary=False, \n", + " norm='l2', \n", + " use_idf=True, \n", + " smooth_idf=True,\n", + " sublinear_tf=True\n", + ")\n", + "\n", + "X_train_tf = vectorizer.fit_transform(X_train.text_modified.tolist()).toarray()\n", + "y_train = y_train\n", + "\n", + "# print(train_X.shape)\n", + "# print(len(Y_train))\n", + "\n", + "\n", + "############################\n", + "\n", + "gnb = GaussianNB()\n", + "\n", + "X_test_tf = vectorizer.transform(X_test.text_modified.tolist()).toarray()\n", + "\n", + "y_pred = gnb.fit(X_train_tf, y_train).predict(X_test_tf)\n", + "\n", + "print(\"Number of mislabeled points out of a total %d points : %d\" % (X_test.shape[0], (y_test != y_pred).sum()))\n", + "print(\"Percentage of Correct points : %.2f\" % (100-((y_test != y_pred).sum())/(len(y_test))*100))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4.2 Logistic Regression of derived numerical features" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "macro: 0.5968089777364534\n", + "micro: 
0.6231122783978988\n", + "weighted: 0.6136457948693581\n", + "None: [0.69979079 0.49382716]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", + " FutureWarning)\n" + ] + } + ], + "source": [ + "# Most stupid model: Just to test submission\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "\n", + "X_train_at = X_train.loc[:,'text_len':'mention_count']\n", + "X_test_at = X_test.loc[:,'text_len':'mention_count']\n", + "\n", + "\n", + "model = LogisticRegression()\n", + "model.fit(X_train_at, y_train)\n", + "y_pred = model.predict(X_test_at)\n", + "\n", + "from sklearn.metrics import f1_score\n", + "print('macro: ', f1_score(y_test, y_pred, average='macro'))\n", + "print('micro: ', f1_score(y_test, y_pred, average='micro'))\n", + "print('weighted: ', f1_score(y_test, y_pred, average='weighted'))\n", + "print('None: ', f1_score(y_test, y_pred, average=None))\n", + "# f1_score(y_test, y_pred, zero_division=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "_uuid": "63bd6eb6b6e61629f6584772897655d98fa4253e" + }, + "source": [ + "## 4.3. 
Deep Learning" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "_uuid": "727e5f0c3130d2df7403d96c743c8d4f4c3b7c4d" + }, + "source": [ + "### 4.3.1 Output Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "_uuid": "c8703accc04a96fda2e561b6e66e474a23dde81a" + }, + "outputs": [], + "source": [ + "main_model_dir = output_directory + r\"models/\"\n", + "main_log_dir = output_directory + r\"logs/\"\n", + "\n", + "try:\n", + " os.mkdir(main_model_dir)\n", + "except:\n", + " print(\"Could not create main model directory\")\n", + " \n", + "try:\n", + " os.mkdir(main_log_dir)\n", + "except:\n", + " print(\"Could not create main log directory\")\n", + "\n", + "\n", + "\n", + "model_dir = main_model_dir + time.strftime('%Y-%m-%d %H-%M-%S') + \"/\"\n", + "log_dir = main_log_dir + time.strftime('%Y-%m-%d %H-%M-%S')\n", + "\n", + "\n", + "try:\n", + " os.mkdir(model_dir)\n", + "except:\n", + " print(\"Could not create model directory\")\n", + " \n", + "try:\n", + " os.mkdir(log_dir)\n", + "except:\n", + " print(\"Could not create log directory\")\n", + " \n", + "model_file = model_dir + \"{epoch:02d}-val_acc-{val_acc:.2f}-val_loss-{val_loss:.2f}.hdf5\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.3.2 Callback Settings" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "_uuid": "0eeca633f28358dbfafd3db7fd22518fc77a276a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Settting Callbacks\n", + "Set Callbacks at Timestamp: 2020-01-30 08:14:01\n" + ] + } + ], + "source": [ + "print(\"Settting Callbacks\")\n", + "\n", + "checkpoint = ModelCheckpoint(\n", + " model_file, \n", + " monitor='val_accuracy', \n", + " save_best_only=True)\n", + "\n", + "early_stopping = EarlyStopping(\n", + " monitor='val_loss',\n", + " patience=5,\n", + " verbose=1,\n", + " restore_best_weights=True)\n", + "\n", + "\n", + "reduce_lr = 
ReduceLROnPlateau(\n", + " monitor='val_loss',\n", + " factor=0.5,\n", + " patience=2,\n", + " verbose=1)\n", + "\n", + "\n", + "# callbacks = [checkpoint, reduce_lr, early_stopping]\n", + "\n", + "callbacks = [reduce_lr, early_stopping]\n", + "\n", + "print(\"Set Callbacks at \", date_time(1))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "_uuid": "42410b8507438ce15c28b5e9778908515dfaeb51" + }, + "source": [ + "### 4.3.3. Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate Class Weights\n", + "def get_weight(y):\n", + " class_weight_current = cw.compute_class_weight('balanced', np.unique(y), y)\n", + " return class_weight_current" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "_uuid": "c97ce5c809f58080f13ab1da76fbeff96817a321" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1 1 1 ... 1 1 1]\n", + "[[0. 1.]\n", + " [0. 1.]\n", + " [0. 1.]\n", + " ...\n", + " [0. 1.]\n", + " [0. 1.]\n", + " [0. 
1.]]\n" + ] + }, + { + "data": { + "text/plain": [ + "array([[0., 1.],\n", + " [0., 1.],\n", + " [0., 1.],\n", + " ...,\n", + " [0., 1.],\n", + " [0., 1.],\n", + " [0., 1.]], dtype=float32)" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X = df_train.text\n", + "Y = df_train.target\n", + "\n", + "label_encoder = LabelEncoder()\n", + "\n", + "Y = label_encoder.fit_transform(Y)\n", + "print(Y)\n", + "Y = to_categorical(Y)\n", + "print(Y)\n", + "# Y = Y.reshape(-1, 1)\n", + "Y" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "_uuid": "972b31926122af0747be40fa2b5d3bf14dcbe16a" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(28366, 157)" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15)\n", + "\n", + "max_words = len(set(\" \".join(X_train).split()))\n", + "max_len = X_train.apply(lambda x: len(x)).max()\n", + "\n", + "# max_words = 1000\n", + "# max_len = 150\n", + "max_words, max_len" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "_uuid": "6f82546a785e1eba0e6df397078497dffc94e902" + }, + "outputs": [], + "source": [ + "tokenizer = Tokenizer(num_words=max_words)\n", + "\n", + "tokenizer.fit_on_texts(X_train)\n", + "\n", + "X_train_seq = tokenizer.texts_to_sequences(X_train)\n", + "X_train_seq = sequence.pad_sequences(X_train_seq, maxlen=max_len)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "_uuid": "15756f1979d321902f18964c00c515fb31342613" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0.87540584, 1.16594595])" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "class_weight = get_weight(np.argmax(Y_train ,axis=1))\n", + "class_weight" + ] + }, + { + "cell_type": "markdown", + "metadata": 
{ + "_uuid": "19a3f4532b9398234b6f78e758688d7903d6b66b" + }, + "source": [ + "### 4.3.4. Model" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "_uuid": "ffc3ae3a4fb844cd20a9cb33750de42b2c71392d" + }, + "outputs": [], + "source": [ + "# def get_rnn_model(num_class=2):\n", + "# model = Sequential()\n", + " \n", + "# model.add(Embedding(max_words, 100, input_length=max_len))\n", + "# model.add(LSTM(256))\n", + " \n", + "# model.add(Dropout(0.5))\n", + "# model.add(BatchNormalization())\n", + "# model.add(Dropout(0.5))\n", + " \n", + "# model.add(Dense(512, activation='relu'))\n", + " \n", + "# model.add(Dropout(0.5))\n", + "# model.add(BatchNormalization())\n", + "# model.add(Dropout(0.5))\n", + " \n", + "# if num_class>=2:\n", + "# model.add(Dense(num_class, activation='softmax'))\n", + "# else:\n", + "# model.add(Dense(1, activation='sigmoid'))\n", + " \n", + "# model.summary()\n", + " \n", + "# return model\n", + "\n", + "def get_rnn_model(num_class=2):\n", + " model = Sequential()\n", + " \n", + " model.add(Embedding(max_words, 100, input_length=max_len))\n", + " \n", + " model.add(Dropout(0.2))\n", + " model.add(Bidirectional(LSTM(256)))\n", + " model.add(Dropout(0.2))\n", + " \n", + " model.add(Dense(512, activation='relu'))\n", + " \n", + " model.add(Dropout(0.5))\n", + " model.add(BatchNormalization())\n", + " \n", + " if num_class>=2:\n", + " model.add(Dense(num_class, activation='softmax'))\n", + " else:\n", + " model.add(Dense(1, activation='sigmoid'))\n", + " \n", + " model.summary()\n", + " \n", + " return model\n", + "\n", + "\n", + "def get_cnn_model(num_class=2): \n", + " model = Sequential()\n", + " \n", + " model.add(Embedding(max_words, 100, input_length=max_len))\n", + " \n", + " model.add(Conv1D(1024, 3, padding='valid', activation='relu', strides=1))\n", + " model.add(GlobalMaxPooling1D())\n", + " \n", + " \n", + " model.add(Dropout(0.5))\n", + " model.add(BatchNormalization())\n", + " model.add(Dropout(0.5))\n", + 
" \n", + " model.add(Dense(2048, activation='relu'))\n", + " \n", + " model.add(Dropout(0.5))\n", + " model.add(BatchNormalization())\n", + " model.add(Dropout(0.5))\n", + " \n", + " if num_class>=2:\n", + " model.add(Dense(num_class, activation='softmax'))\n", + " else:\n", + " model.add(Dense(1, activation='sigmoid'))\n", + " \n", + " model.summary()\n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "_uuid": "a5472569b1c7c8f13fe14c9dcac3a7957db1300e" + }, + "outputs": [], + "source": [ + "def plot_performance(history=None, figure_directory=None):\n", + " xlabel = 'Epoch'\n", + " legends = ['Training', 'Validation']\n", + "\n", + "# ylim_pad = [0.1, 0.005]\n", + " ylim_pad = [0, 0.5]\n", + "\n", + "\n", + " plt.figure(figsize=(20, 5))\n", + "\n", + " # Plot training & validation Accuracy values\n", + "\n", + " y1 = history.history['accuracy']\n", + " y2 = history.history['val_accuracy']\n", + "\n", + " min_y = min(min(y1), min(y2))-ylim_pad[0]\n", + " max_y = max(max(y1), max(y2))+ylim_pad[0]\n", + " \n", + " min_y = 0\n", + " max_y = 1\n", + "\n", + "\n", + " plt.subplot(121)\n", + "\n", + " plt.plot(y1)\n", + " plt.plot(y2)\n", + "\n", + " plt.title('Model Accuracy\\n'+date_time(1), fontsize=17)\n", + " plt.xlabel(xlabel, fontsize=15)\n", + " plt.ylabel('Accuracy', fontsize=15)\n", + " plt.ylim(min_y, max_y)\n", + " plt.legend(legends, loc='upper left')\n", + " plt.grid()\n", + "\n", + "\n", + " # Plot training & validation loss values\n", + "\n", + " y1 = history.history['loss']\n", + " y2 = history.history['val_loss']\n", + "\n", + " min_y = min(min(y1), min(y2))-ylim_pad[1]\n", + " max_y = max(max(y1), max(y2))+ylim_pad[1]\n", + "\n", + "# min_y = 0\n", + "# max_y = .8\n", + "\n", + " plt.subplot(122)\n", + "\n", + " plt.plot(y1)\n", + " plt.plot(y2)\n", + "\n", + " plt.title('Model Loss\\n'+date_time(1), fontsize=17)\n", + " plt.xlabel(xlabel, fontsize=15)\n", + " plt.ylabel('Loss', fontsize=15)\n", + " 
plt.ylim(min_y, max_y)\n", + " plt.legend(legends, loc='upper left')\n", + " plt.grid()\n", + " \n", + " if figure_directory:\n", + " plt.savefig(figure_directory+\"/history\")\n", + "\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "_uuid": "b424eb9a6e314ef397df89dcfeffc4c9c15839ee" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: \"sequential_1\"\n", + "_________________________________________________________________\n", + "Layer (type) Output Shape Param # \n", + "=================================================================\n", + "embedding_1 (Embedding) (None, 157, 100) 2836600 \n", + "_________________________________________________________________\n", + "dropout_1 (Dropout) (None, 157, 100) 0 \n", + "_________________________________________________________________\n", + "bidirectional_1 (Bidirection (None, 512) 731136 \n", + "_________________________________________________________________\n", + "dropout_2 (Dropout) (None, 512) 0 \n", + "_________________________________________________________________\n", + "dense_1 (Dense) (None, 512) 262656 \n", + "_________________________________________________________________\n", + "dropout_3 (Dropout) (None, 512) 0 \n", + "_________________________________________________________________\n", + "batch_normalization_1 (Batch (None, 512) 2048 \n", + "_________________________________________________________________\n", + "dense_2 (Dense) (None, 2) 1026 \n", + "=================================================================\n", + "Total params: 3,833,466\n", + "Trainable params: 3,832,442\n", + "Non-trainable params: 1,024\n", + "_________________________________________________________________\n" + ] + } + ], + "source": [ + "num_class = 2\n", + "model1 = get_rnn_model(num_class=num_class)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "_uuid": 
"72903b1167548759e416d9b8ba643fdf90305e06" + }, + "outputs": [], + "source": [ + "loss = 'categorical_crossentropy'\n", + "# loss = 'binary_crossentropy'\n", + "metrics = ['accuracy']" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "_uuid": "2bd1166ccac602ca7d908e9f9b17c8ffb2a1f92c" + }, + "source": [ + "### 4.3.5 Model Training" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "_uuid": "a1ce715c032b60131950b0946776b41040ec97f4" + }, + "source": [ + "#### 4.3.5.1 RNN" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "_uuid": "3499506821ecfcfa499a1195742796152939cc94" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting...\n", + "\n", + "Timestamp: 2020-01-30 08:14:06\n", + "\n", + "\n", + "Compliling Model ...\n", + "\n", + "Trainning Model ...\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/indexed_slices.py:433: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n", + " \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. 
\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train on 5176 samples, validate on 1295 samples\n", + "Epoch 1/100\n", + "5176/5176 [==============================] - 96s 19ms/step - loss: 0.6403 - accuracy: 0.6285 - val_loss: 0.6457 - val_accuracy: 0.5807\n", + "Epoch 2/100\n", + "5176/5176 [==============================] - 94s 18ms/step - loss: 0.4553 - accuracy: 0.7960 - val_loss: 0.4862 - val_accuracy: 0.7792\n", + "Epoch 3/100\n", + "5176/5176 [==============================] - 94s 18ms/step - loss: 0.3238 - accuracy: 0.8644 - val_loss: 0.5362 - val_accuracy: 0.7776\n", + "Epoch 4/100\n", + "5176/5176 [==============================] - 94s 18ms/step - loss: 0.2323 - accuracy: 0.9042 - val_loss: 0.7195 - val_accuracy: 0.7575\n", + "\n", + "Epoch 00004: ReduceLROnPlateau reducing learning rate to 4.999999873689376e-05.\n", + "Epoch 5/100\n", + "5176/5176 [==============================] - 94s 18ms/step - loss: 0.1582 - accuracy: 0.9397 - val_loss: 0.8979 - val_accuracy: 0.6903\n", + "Epoch 6/100\n", + "5176/5176 [==============================] - 94s 18ms/step - loss: 0.1274 - accuracy: 0.9507 - val_loss: 0.8110 - val_accuracy: 0.7483\n", + "\n", + "Epoch 00006: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.\n", + "Epoch 7/100\n", + "5176/5176 [==============================] - 95s 18ms/step - loss: 0.1064 - accuracy: 0.9614 - val_loss: 0.8494 - val_accuracy: 0.7367\n", + "Restoring model weights from the end of the best epoch\n", + "Epoch 00007: early stopping\n", + "\n", + "Elapsed Time: 00:11:02\n", + "Completed Model Trainning Timestamp: 2020-01-30 08:25:08\n" + ] + } + ], + "source": [ + "print(\"Starting...\\n\")\n", + "\n", + "start_time = time.time()\n", + "print(date_time(1))\n", + "\n", + "print(\"\\n\\nCompliling Model ...\\n\")\n", + "learning_rate = 0.0001\n", + "optimizer = Adam(learning_rate)\n", + "# optimizer = Adam()\n", + "\n", + "model1.compile(optimizer=optimizer, loss=loss, 
metrics=metrics)\n", + "\n", + "verbose = 1\n", + "epochs = 100\n", + "batch_size = 16\n", + "validation_split = 0.2\n", + "\n", + "print(\"Trainning Model ...\\n\")\n", + "\n", + "history1 = model1.fit(\n", + " X_train_seq,\n", + " Y_train,\n", + " batch_size=batch_size,\n", + " epochs=epochs,\n", + " verbose=verbose,\n", + " callbacks=callbacks,\n", + " validation_split=validation_split,\n", + " class_weight=class_weight\n", + " )\n", + "\n", + "elapsed_time = time.time() - start_time\n", + "elapsed_time = time.strftime(\"%H:%M:%S\", time.gmtime(elapsed_time))\n", + "\n", + "print(\"\\nElapsed Time: \" + elapsed_time)\n", + "print(\"Completed Model Trainning\", date_time(1))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "_uuid": "174b2e76795372f6e7d09d6fa48579f782609b78" + }, + "source": [ + "##### 4.3.5.1.2 Visualization" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "_uuid": "5e6865d7574bb08bfc6e20ce51c8c0a6ab39ef4c" + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plot_performance(history=history1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "_uuid": "29bb4e66b2f030e88d88ca12d48209a93991647c" + }, + "source": [ + "#### 4.3.5.2. CNN" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "_uuid": "106063a66e2031658fd6649c2bc68ff54a2d6e9a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: \"sequential_2\"\n", + "_________________________________________________________________\n", + "Layer (type) Output Shape Param # \n", + "=================================================================\n", + "embedding_2 (Embedding) (None, 157, 100) 2836600 \n", + "_________________________________________________________________\n", + "conv1d_1 (Conv1D) (None, 155, 1024) 308224 \n", + "_________________________________________________________________\n", + "global_max_pooling1d_1 (Glob (None, 1024) 0 \n", + "_________________________________________________________________\n", + "dropout_4 (Dropout) (None, 1024) 0 \n", + "_________________________________________________________________\n", + "batch_normalization_2 (Batch (None, 1024) 4096 \n", + "_________________________________________________________________\n", + "dropout_5 (Dropout) (None, 1024) 0 \n", + "_________________________________________________________________\n", + "dense_3 (Dense) (None, 2048) 2099200 \n", + "_________________________________________________________________\n", + "dropout_6 (Dropout) (None, 2048) 0 \n", + "_________________________________________________________________\n", + "batch_normalization_3 (Batch (None, 2048) 8192 \n", + "_________________________________________________________________\n", + "dropout_7 (Dropout) (None, 2048) 0 \n", + "_________________________________________________________________\n", + "dense_4 (Dense) (None, 2) 4098 \n", + 
"=================================================================\n", + "Total params: 5,260,410\n", + "Trainable params: 5,254,266\n", + "Non-trainable params: 6,144\n", + "_________________________________________________________________\n" + ] + } + ], + "source": [ + "num_class = 2\n", + "model2 = get_cnn_model(num_class=num_class)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "_uuid": "f15fae20f343530b2e83e9c890fe674fc0d3ea1a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting...\n", + "\n", + "Timestamp: 2020-01-30 08:25:10\n", + "\n", + "\n", + "Compliling Model ...\n", + "\n", + "Trainning Model ...\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/indexed_slices.py:433: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n", + " \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. 
\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train on 5176 samples, validate on 1295 samples\n", + "Epoch 1/100\n", + "5176/5176 [==============================] - 7s 1ms/step - loss: 1.2181 - accuracy: 0.5189 - val_loss: 0.7872 - val_accuracy: 0.4270\n", + "Epoch 2/100\n", + "5176/5176 [==============================] - 3s 574us/step - loss: 1.1197 - accuracy: 0.5526 - val_loss: 0.7037 - val_accuracy: 0.5784\n", + "Epoch 3/100\n", + "5176/5176 [==============================] - 3s 580us/step - loss: 1.0146 - accuracy: 0.5767 - val_loss: 0.6748 - val_accuracy: 0.6255\n", + "Epoch 4/100\n", + "5176/5176 [==============================] - 3s 579us/step - loss: 0.9761 - accuracy: 0.5852 - val_loss: 0.6368 - val_accuracy: 0.6394\n", + "Epoch 5/100\n", + "5176/5176 [==============================] - 3s 571us/step - loss: 0.8764 - accuracy: 0.6086 - val_loss: 0.6256 - val_accuracy: 0.6425\n", + "Epoch 6/100\n", + "5176/5176 [==============================] - 3s 565us/step - loss: 0.8392 - accuracy: 0.6190 - val_loss: 0.6117 - val_accuracy: 0.6502\n", + "Epoch 7/100\n", + "5176/5176 [==============================] - 3s 592us/step - loss: 0.7346 - accuracy: 0.6528 - val_loss: 0.5723 - val_accuracy: 0.6842\n", + "Epoch 8/100\n", + "5176/5176 [==============================] - 4s 701us/step - loss: 0.6582 - accuracy: 0.7031 - val_loss: 0.5310 - val_accuracy: 0.7320\n", + "Epoch 9/100\n", + "5176/5176 [==============================] - 3s 603us/step - loss: 0.5772 - accuracy: 0.7344 - val_loss: 0.5211 - val_accuracy: 0.7475\n", + "Epoch 10/100\n", + "5176/5176 [==============================] - 3s 641us/step - loss: 0.4970 - accuracy: 0.7734 - val_loss: 0.5228 - val_accuracy: 0.7552\n", + "Epoch 11/100\n", + "5176/5176 [==============================] - 3s 576us/step - loss: 0.4468 - accuracy: 0.8058 - val_loss: 0.5314 - val_accuracy: 0.7506\n", + "\n", + "Epoch 00011: ReduceLROnPlateau reducing learning rate to 4.999999873689376e-05.\n", 
+ "Epoch 12/100\n", + "5176/5176 [==============================] - 3s 591us/step - loss: 0.3779 - accuracy: 0.8360 - val_loss: 0.5435 - val_accuracy: 0.7568\n", + "Epoch 13/100\n", + "5176/5176 [==============================] - 3s 583us/step - loss: 0.3420 - accuracy: 0.8547 - val_loss: 0.5502 - val_accuracy: 0.7544\n", + "\n", + "Epoch 00013: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.\n", + "Epoch 14/100\n", + "5176/5176 [==============================] - 3s 567us/step - loss: 0.3103 - accuracy: 0.8729 - val_loss: 0.5540 - val_accuracy: 0.7575\n", + "Restoring model weights from the end of the best epoch\n", + "Epoch 00014: early stopping\n", + "\n", + "Elapsed Time: 00:00:48\n", + "Completed Model Trainning Timestamp: 2020-01-30 08:25:58\n" + ] + } + ], + "source": [ + "print(\"Starting...\\n\")\n", + "\n", + "start_time = time.time()\n", + "print(date_time(1))\n", + "\n", + "print(\"\\n\\nCompliling Model ...\\n\")\n", + "learning_rate = 0.0001\n", + "optimizer = Adam(learning_rate)\n", + "# optimizer = Adam()\n", + "\n", + "model2.compile(optimizer=optimizer, loss=loss, metrics=metrics)\n", + "\n", + "verbose = 1\n", + "epochs = 100\n", + "batch_size = 16\n", + "validation_split = 0.2\n", + "\n", + "print(\"Trainning Model ...\\n\")\n", + "\n", + "history2 = model2.fit(\n", + " X_train_seq,\n", + " Y_train,\n", + " batch_size=batch_size,\n", + " epochs=epochs,\n", + " verbose=verbose,\n", + " callbacks=callbacks,\n", + " validation_split=validation_split,\n", + " class_weight =class_weight\n", + " )\n", + "\n", + "elapsed_time = time.time() - start_time\n", + "elapsed_time = time.strftime(\"%H:%M:%S\", time.gmtime(elapsed_time))\n", + "\n", + "print(\"\\nElapsed Time: \" + elapsed_time)\n", + "print(\"Completed Model Trainning\", date_time(1))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "_uuid": "357b590e09d3e623822c3e68b280ef42db952075" + }, + "source": [ + "##### 4.3.5.2.2 Visualization" + ] + }, + { + "cell_type": 
"code", + "execution_count": 40, + "metadata": { + "_uuid": "50d81ffc4f78b45a59b7f08ec404d008a3c63f94" + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plot_performance(history=history2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "_uuid": "b64920b6ac5a1bb4e60f1d937245007292dd1681" + }, + "source": [ + "### 4.3.6 Inference / Prediction" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "_uuid": "a6e1d8e3639c5e739a67c484b00faf148fa6edd4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1142/1142 [==============================] - 2s 2ms/step\n", + "1142/1142 [==============================] - 0s 108us/step\n" + ] + } + ], + "source": [ + "test_X_seq = tokenizer.texts_to_sequences(X_test)\n", + "test_X_seq = sequence.pad_sequences(test_X_seq, maxlen=max_len)\n", + "accuracy1 = model1.evaluate(test_X_seq, Y_test)\n", + "accuracy2 = model2.evaluate(test_X_seq, Y_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "_uuid": "08c78c65d45aaa855de2d16533dc5ddeb1a52c9b" + }, + "source": [ + "### 4.3.7 Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "_uuid": "fd3d9e8a332ab50eb69d3995fa0847c642ca942d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model Performance of RNN (Test Accuracy):\n", + "Accuracy: 79.16%\n", + "Loss: 0.479\n", + "\n", + "\n", + "Model Performance of RNN (Test Accuracy):\n", + "v: 76.01%\n", + "Loss: 0.486\n", + "\n" + ] + } + ], + "source": [ + "print(\"Model Performance of RNN (Test Accuracy):\")\n", + "print('Accuracy: {:0.2f}%\\nLoss: {:0.3f}\\n'.format(accuracy1[1]*100, accuracy1[0]))\n", + "\n", + "print(\"\\nModel Performance of RNN (Test Accuracy):\")\n", + "print('v: {:0.2f}%\\nLoss: {:0.3f}\\n'.format(accuracy2[1]*100, accuracy2[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "_uuid": "68d1b5720cecae415e852c1f35563da08cfea2b5" + }, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "1142/1142 [==============================] - 2s 2ms/step\n", + "1142/1142 [==============================] - 0s 144us/step\n" + ] + } + ], + "source": [ + "ypreds1 = model1.predict_classes(test_X_seq, verbose=1)\n", + "ypreds2 = model2.predict_classes(test_X_seq, verbose=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "_uuid": "3feb7baf3e5658dce6bdcf929af34676b5f23fd7" + }, + "outputs": [], + "source": [ + "def plot_model_performace(result):\n", + " sns.set_style(\"ticks\")\n", + " figsize=(22, 6)\n", + "\n", + " ticksize = 12\n", + " titlesize = ticksize + 8\n", + " labelsize = ticksize + 5\n", + "\n", + " xlabel = \"Model\"\n", + " ylabel = \"Score\"\n", + "\n", + " title = \"Model Performance\"\n", + "\n", + " params = {'figure.figsize' : figsize,\n", + " 'axes.labelsize' : labelsize,\n", + " 'axes.titlesize' : titlesize,\n", + " 'xtick.labelsize': ticksize,\n", + " 'ytick.labelsize': ticksize}\n", + "\n", + " plt.rcParams.update(params)\n", + "\n", + " col1 = \"model\"\n", + " col2 = \"score\"\n", + " sns.barplot(x=col1, y=col2, data=result)\n", + " plt.title(title.title())\n", + " plt.xlabel(xlabel)\n", + " plt.ylabel(ylabel)\n", + " plt.xticks(rotation=90)\n", + " plt.grid()\n", + " plt.plot()\n", + " plt.show()\n", + " print(result)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "_uuid": "dc19b344d18b16e4f15d4b21eb86bf6acde8c2f0" + }, + "outputs": [], + "source": [ + "# print(classification_report(Y_test, ypreds1))" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "_uuid": "47e7e60a9e06662083a4971f84589129d307be40" + }, + "outputs": [], + "source": [ + "# plot_confusion_matrix(Y_test, ypreds1, title=\"RNN\")" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "_uuid": "0fc73dddd076f5c15ff41a9dff2abf47d7126589" + }, + "outputs": [], + "source": [ + "# 
print(classification_report(Y_test, ypreds2))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "_uuid": "73e7f87fdfb70dd3eddabd607a9eefb96a086831" + }, + "source": [ + "#### 4.3.7.1 Visualization" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "_uuid": "675a75e41d24b34491a9b9e310b30803b90e9188" + }, + "outputs": [], + "source": [ + "# plot_confusion_matrix(Y_test, ypreds2, title=\"CNN\")" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "_uuid": "6679f27d3d8d66fcc4c99d3183821051f823f688" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:3: FutureWarning: \n", + ".ix is deprecated. Please use\n", + ".loc for label based indexing or\n", + ".iloc for positional indexing\n", + "\n", + "See the documentation here:\n", + "http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated\n", + " This is separate from the ipykernel package so we can avoid doing imports until\n" + ] + } + ], + "source": [ + "result = pd.DataFrame({'model': 'RNN', 'score': accuracy1[1]*100}, index=[-1])\n", + "row2 = pd.DataFrame({'model': 'CNN', 'score': accuracy2[1]*100}, index=[-1])\n", + "result = pd.concat([row2, result.ix[:]]).reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "_uuid": "fa8ce5f06a1b2884dd5b1c113dcd041c86aa26bb" + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " model score\n", + "0 CNN 76.007003\n", + "1 RNN 79.159367\n" + ] + } + ], + "source": [ + "plot_model_performace(result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Final" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idkeywordlocationtexttext_modifiedtext_lenword_countunique_word_counthash_countmention_counttarget
01NaNNaNOur Deeds are the Reason of this #earthquake M...Our Deeds are the Reason of this #earthquake M...691313101
14NaNNaNForest fire near La Ronge Sask. CanadaForest fire near La Ronge Sask . Canada3877001
25NaNNaNAll residents asked to 'shelter in place' are ...All residents asked to ' shelter in place ' ar...1332220001
36NaNNaN13,000 people receive #wildfires evacuation or...13 , 0 0 0 people receive #wildfires evacuatio...6588101
47NaNNaNJust got sent this photo from Ruby #Alaska as ...Just got sent this photo from Ruby #Alaska as ...881615201
\n", + "
" + ], + "text/plain": [ + " id keyword location text \\\n", + "0 1 NaN NaN Our Deeds are the Reason of this #earthquake M... \n", + "1 4 NaN NaN Forest fire near La Ronge Sask. Canada \n", + "2 5 NaN NaN All residents asked to 'shelter in place' are ... \n", + "3 6 NaN NaN 13,000 people receive #wildfires evacuation or... \n", + "4 7 NaN NaN Just got sent this photo from Ruby #Alaska as ... \n", + "\n", + " text_modified text_len word_count \\\n", + "0 Our Deeds are the Reason of this #earthquake M... 69 13 \n", + "1 Forest fire near La Ronge Sask . Canada 38 7 \n", + "2 All residents asked to ' shelter in place ' ar... 133 22 \n", + "3 13 , 0 0 0 people receive #wildfires evacuatio... 65 8 \n", + "4 Just got sent this photo from Ruby #Alaska as ... 88 16 \n", + "\n", + " unique_word_count hash_count mention_count target \n", + "0 13 1 0 1 \n", + "1 7 0 0 1 \n", + "2 20 0 0 1 \n", + "3 8 1 0 1 \n", + "4 15 2 0 1 " + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_train.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.naive_bayes import GaussianNB\n", + "\n", + "vectorizer = TfidfVectorizer(\n", + "# input='content', \n", + "# encoding='utf-8', \n", + "# decode_error='strict', \n", + "# strip_accents=None, \n", + "# lowercase=False,\n", + "# preprocessor=None,\n", + "# tokenizer=None, \n", + "# analyzer='word',\n", + " stop_words=None, \n", + " ngram_range=(1, 2), \n", + " max_df=0.6, \n", + " min_df=4, \n", + " max_features=None, \n", + "# vocabulary=None, \n", + "# binary=False, \n", + " norm='l2', \n", + " use_idf=True, \n", + " smooth_idf=True,\n", + " sublinear_tf=True\n", + ")\n", + "\n", + "X_train_tf = vectorizer.fit_transform(df_train.text.tolist()).toarray()\n", + "\n", + "y_train = df_train.target.tolist()\n", + "\n", + "# print(train_X.shape)\n", + "# print(len(Y_train))\n", + "\n", + "\n", + 
"############################\n", + "\n", + "gnb = GaussianNB()\n", + "\n", + "X_test_tf = vectorizer.transform(df_test.text.tolist()).toarray()\n", + "\n", + "y_pred = gnb.fit(X_train_tf, y_train).predict(X_test_tf)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idkeywordlocationtexttext_lenword_countunique_word_counthash_countmention_count
00NaNNaNJust happened a terrible car crash346600
12NaNNaNHeard about #earthquake is different cities, s...649910
23NaNNaNthere is a forest fire at spot pond, geese are...96191900
39NaNNaNApocalypse lighting. #Spokane #wildfires404420
411NaNNaNTyphoon Soudelor kills 28 in China and Taiwan458800
\n", + "
" + ], + "text/plain": [ + " id keyword location text \\\n", + "0 0 NaN NaN Just happened a terrible car crash \n", + "1 2 NaN NaN Heard about #earthquake is different cities, s... \n", + "2 3 NaN NaN there is a forest fire at spot pond, geese are... \n", + "3 9 NaN NaN Apocalypse lighting. #Spokane #wildfires \n", + "4 11 NaN NaN Typhoon Soudelor kills 28 in China and Taiwan \n", + "\n", + " text_len word_count unique_word_count hash_count mention_count \n", + "0 34 6 6 0 0 \n", + "1 64 9 9 1 0 \n", + "2 96 19 19 0 0 \n", + "3 40 4 4 2 0 \n", + "4 45 8 8 0 0 " + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_test.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "sample_list = list(sub_sample.id)\n", + "\n", + "pred_dict = dict((key, value) for (key, value) in zip(df_test.id, y_pred))\n", + "\n", + "pred_list_new = [pred_dict[f] for f in sample_list]\n", + "\n", + "test_df = pd.DataFrame({'id': sample_list, 'target': pred_list_new})\n", + "\n", + "test_df.to_csv(\"submission.csv\", header=True, index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Project-Structure.md b/Project-Structure.md index 06ebc58d7b..6b2d7b8147 100644 --- a/Project-Structure.md +++ b/Project-Structure.md @@ -554,6 +554,8 @@ * [Thyroid-Disease-Detection-Using-Deep-Learning](Deep_Learning/Thyroid_detection/thyroid-disease-detection-using-deep-learning.ipynb) * Traffic Accident Prediction Model Using Deep Learning * [Traffic Accident Prediction Model 
Using Deep Learning](Deep_Learning/Traffic%20Accident%20Prediction%20Model%20using%20Deep%20Learning/Traffic%20Accident%20Prediction%20Model%20using%20Deep%20Learning.ipynb) + * Tweet Classification DL + * [Disaster Tweet Classification Using NLP And Deep Learning](Deep_Learning/Tweet_Classificaion_DL/Disaster%20Tweet%20Classification%20using%20NLP%20and%20Deep%20Learning.ipynb) * Yolo-Drowsiness-Detection-Main * [Drowsiness Detection Tutorial](Deep_Learning/YOLO-Drowsiness-Detection-main/Drowsiness%20Detection%20Tutorial.ipynb) * Smart Attendance System