diff --git a/Toxic Comment Analysis/Dataset/README.md b/Toxic Comment Analysis/Dataset/README.md
new file mode 100644
index 000000000..d5e3ea76c
--- /dev/null
+++ b/Toxic Comment Analysis/Dataset/README.md
@@ -0,0 +1,7 @@
+DATASET
+
+
+
+Dataset can be found at below link:
+
+https://www.kaggle.com/datasets/devkhant24/toxic-comment
diff --git a/Toxic Comment Analysis/Images/Accuracy_score.png b/Toxic Comment Analysis/Images/Accuracy_score.png
new file mode 100644
index 000000000..118afb62e
Binary files /dev/null and b/Toxic Comment Analysis/Images/Accuracy_score.png differ
diff --git a/Toxic Comment Analysis/Model/README.md b/Toxic Comment Analysis/Model/README.md
new file mode 100644
index 000000000..db8ff72a0
--- /dev/null
+++ b/Toxic Comment Analysis/Model/README.md
@@ -0,0 +1,49 @@
+TOXIC COMMENT ANALYSIS
+
+
+GOAL
+
+Implementtion of algorithm like keras Model with GRU units to see the accuracy score.
+
+
+DATASET
+
+https://www.kaggle.com/datasets/devkhant24/toxic-comment
+
+
+DESCRIPTION
+
+The main aim of the project is to make a model that helps to predict toxic comment on the given dataset.
+
+
+WORK DONE
+
+Defining constants and cleaning of the comments.
+Balacing the dataset and defining keras Model with GRU units.
+Then tokenizing the comments from train dataset and predicting acuracy score of comment id.
+
+
+LIBRARIES NEEDED
+
+Numpy
+Pandas
+math
+os
+re
+unidecode
+
+
+ACCURACY SCORE
+
+
+
+
+CONCLUSION
+
+We created a model which predict the toxic comments. The model gives an accuracy score of the comments on the dataset.
+
+
+CONTRIBUTION BY
+
+Sumaiya Khan
+
diff --git a/Toxic Comment Analysis/Model/TOXIC_COMMENT_ANALYSIS.ipynb b/Toxic Comment Analysis/Model/TOXIC_COMMENT_ANALYSIS.ipynb
new file mode 100644
index 000000000..e4fa9f0c5
--- /dev/null
+++ b/Toxic Comment Analysis/Model/TOXIC_COMMENT_ANALYSIS.ipynb
@@ -0,0 +1,825 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "4c7ae996",
+ "metadata": {
+ "papermill": {
+ "duration": 0.013811,
+ "end_time": "2021-11-14T04:34:51.059041",
+ "exception": false,
+ "start_time": "2021-11-14T04:34:51.045230",
+ "status": "completed"
+ },
+ "tags": [],
+ "id": "4c7ae996"
+ },
+ "source": [
+ "\n",
+ "##Introduction \n",
+ "\n",
+ "It is simple approach to rate severity of toxic comments. And find probability of the comment to be toxic one and give the score based on the probability.\n",
+ "The final score is the probability predicted by the Model.\n",
+ "\n",
+ "Notebook is build upon this public kernel: https://www.kaggle.com/datasets/devkhant24/toxic-comment"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9f79be62",
+ "metadata": {
+ "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
+ "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
+ "execution": {
+ "iopub.execute_input": "2021-11-14T04:34:51.092708Z",
+ "iopub.status.busy": "2021-11-14T04:34:51.091887Z",
+ "iopub.status.idle": "2021-11-14T04:34:57.436680Z",
+ "shell.execute_reply": "2021-11-14T04:34:57.435929Z",
+ "shell.execute_reply.started": "2021-11-14T04:24:35.812950Z"
+ },
+ "papermill": {
+ "duration": 6.366657,
+ "end_time": "2021-11-14T04:34:57.436825",
+ "exception": false,
+ "start_time": "2021-11-14T04:34:51.070168",
+ "status": "completed"
+ },
+ "tags": [],
+ "id": "9f79be62"
+ },
+ "outputs": [],
+ "source": [
+ "# Importing libraries\n",
+ "\n",
+ "import math\n",
+ "import os\n",
+ "import random\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import re\n",
+ "import unidecode\n",
+ "\n",
+ "from sklearn.model_selection import StratifiedKFold\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "from bs4 import BeautifulSoup\n",
+ "from nltk.corpus import stopwords\n",
+ "from nltk.tokenize import word_tokenize\n",
+ "from imblearn.under_sampling import RandomUnderSampler\n",
+ "\n",
+ "import tensorflow as tf\n",
+ "from tensorflow.keras.layers import Dense, Dropout, GRU, Embedding, Bidirectional\n",
+ "from tensorflow.keras.models import Sequential\n",
+ "from tensorflow.keras.preprocessing.text import Tokenizer\n",
+ "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
+ "from tensorflow.keras.callbacks import EarlyStopping"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ " Defining constants"
+ ],
+ "metadata": {
+ "id": "F8Kg8G-p2zAr"
+ },
+ "id": "F8Kg8G-p2zAr"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "49a3ab06",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2021-11-14T04:34:57.467075Z",
+ "iopub.status.busy": "2021-11-14T04:34:57.466106Z",
+ "iopub.status.idle": "2021-11-14T04:34:57.467969Z",
+ "shell.execute_reply": "2021-11-14T04:34:57.468493Z",
+ "shell.execute_reply.started": "2021-11-14T04:19:34.522370Z"
+ },
+ "papermill": {
+ "duration": 0.019856,
+ "end_time": "2021-11-14T04:34:57.468627",
+ "exception": false,
+ "start_time": "2021-11-14T04:34:57.448771",
+ "status": "completed"
+ },
+ "tags": [],
+ "id": "49a3ab06"
+ },
+ "outputs": [],
+ "source": [
+ "\n",
+ "voc_size = 50000\n",
+ "max_sequence_length = 250\n",
+ "embedding_dim = 100\n",
+ "Batch_size = 16\n",
+ "\n",
+ "train_prev_comp = \"../input/toxic-comment/jigsaw-toxic-comment-train.csv\"\n",
+ "test_cur_comp = \"../input/jigsaw-toxic-severity-rating/comments_to_score.csv\"\n",
+ "\n",
+ "\n",
+ "def seed_everything():\n",
+ " np.random.seed(123)\n",
+ " random.seed(123)\n",
+ " tf.random.set_seed(123)\n",
+ " os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = '2'\n",
+ " os.environ['PYTHONHASHSEED'] = str(123)\n",
+ "\n",
+ "seed_everything()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ab0dd5fe",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2021-11-14T04:34:57.500523Z",
+ "iopub.status.busy": "2021-11-14T04:34:57.499735Z",
+ "iopub.status.idle": "2021-11-14T04:34:57.501985Z",
+ "shell.execute_reply": "2021-11-14T04:34:57.502459Z",
+ "shell.execute_reply.started": "2021-11-14T04:19:34.533779Z"
+ },
+ "papermill": {
+ "duration": 0.022171,
+ "end_time": "2021-11-14T04:34:57.502606",
+ "exception": false,
+ "start_time": "2021-11-14T04:34:57.480435",
+ "status": "completed"
+ },
+ "tags": [],
+ "id": "ab0dd5fe"
+ },
+ "outputs": [],
+ "source": [
+ "# Function for cleaning comments\n",
+ "\n",
+ "def clean_data(data):\n",
+ " final = []\n",
+ " for sent in data:\n",
+ " sent = sent.replace('\\\\n', ' ').replace('\\n', ' ').replace('\\t',' ').replace('\\\\', ' ').replace('. com', '.com')\n",
+ " soup = BeautifulSoup(sent, \"html.parser\")\n",
+ " sent = soup.get_text(separator=\" \")\n",
+ " remove_https = re.sub(r'http\\S+', '', sent)\n",
+ " sent = re.sub(r\"\\ [A-Za-z]*\\.com\", \" \", remove_https)\n",
+ " sent = unidecode.unidecode(sent)\n",
+ " sent = sent.lower()\n",
+ " sent = re.sub(r\"[^a-zA-Z0-9:$-,()%.?!]+\", ' ', sent) \n",
+ " sent = re.sub(r\"[:$-,()%.?!]+\", ' ',sent)\n",
+ " stoplist = stopwords.words(\"english\")\n",
+ " sent = [word for word in word_tokenize(sent) if word not in stoplist]\n",
+ " sent = \" \".join(sent)\n",
+ " final.append(sent)\n",
+ " \n",
+ " return final"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "905c0723",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2021-11-14T04:34:57.531781Z",
+ "iopub.status.busy": "2021-11-14T04:34:57.531111Z",
+ "iopub.status.idle": "2021-11-14T04:35:00.259353Z",
+ "shell.execute_reply": "2021-11-14T04:35:00.260529Z",
+ "shell.execute_reply.started": "2021-11-14T04:19:34.546950Z"
+ },
+ "papermill": {
+ "duration": 2.746227,
+ "end_time": "2021-11-14T04:35:00.260737",
+ "exception": false,
+ "start_time": "2021-11-14T04:34:57.514510",
+ "status": "completed"
+ },
+ "tags": [],
+ "id": "905c0723",
+ "outputId": "45b5bf8b-756a-4a5a-8460-fda673716ded"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " comment_text | \n",
+ " y | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Explanation\\nWhy the edits made under my usern... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " D'aww! He matches this background colour I'm s... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Hey man, I'm really not trying to edit war. It... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " \"\\nMore\\nI can't make any real suggestions on ... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " You, sir, are my hero. Any chance you remember... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " comment_text y\n",
+ "0 Explanation\\nWhy the edits made under my usern... 0\n",
+ "1 D'aww! He matches this background colour I'm s... 0\n",
+ "2 Hey man, I'm really not trying to edit war. It... 0\n",
+ "3 \"\\nMore\\nI can't make any real suggestions on ... 0\n",
+ "4 You, sir, are my hero. Any chance you remember... 0"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Reading train file from previous competition\n",
+ "\n",
+ "df = pd.read_csv(train_prev_comp)\n",
+ "\n",
+ "\n",
+ "df[\"y\"] = (df[[\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]].sum(axis=1) > 0).astype(int)\n",
+ "df.drop([\"id\",\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"], axis=1, inplace = True)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2c0e3db0",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2021-11-14T04:35:00.316965Z",
+ "iopub.status.busy": "2021-11-14T04:35:00.314873Z",
+ "iopub.status.idle": "2021-11-14T04:35:00.321233Z",
+ "shell.execute_reply": "2021-11-14T04:35:00.322026Z",
+ "shell.execute_reply.started": "2021-11-14T04:19:37.178205Z"
+ },
+ "papermill": {
+ "duration": 0.039247,
+ "end_time": "2021-11-14T04:35:00.322245",
+ "exception": false,
+ "start_time": "2021-11-14T04:35:00.282998",
+ "status": "completed"
+ },
+ "tags": [],
+ "id": "2c0e3db0",
+ "outputId": "9cd6dfda-1f09-4669-e48b-1289938d74f0"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 201081\n",
+ "1 22468\n",
+ "Name: y, dtype: int64"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Seeing that dataset is imbalanced\n",
+ "\n",
+ "df[\"y\"].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Balacing dataset"
+ ],
+ "metadata": {
+ "id": "XSrdS9zI2-8p"
+ },
+ "id": "XSrdS9zI2-8p"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dbed13aa",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2021-11-14T04:35:00.375695Z",
+ "iopub.status.busy": "2021-11-14T04:35:00.374580Z",
+ "iopub.status.idle": "2021-11-14T04:35:00.540196Z",
+ "shell.execute_reply": "2021-11-14T04:35:00.541009Z",
+ "shell.execute_reply.started": "2021-11-14T04:19:37.189664Z"
+ },
+ "papermill": {
+ "duration": 0.196883,
+ "end_time": "2021-11-14T04:35:00.541396",
+ "exception": false,
+ "start_time": "2021-11-14T04:35:00.344513",
+ "status": "completed"
+ },
+ "tags": [],
+ "id": "dbed13aa",
+ "outputId": "158e832e-a549-4401-bd85-bc1f95db87c5"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 22468\n",
+ "1 22468\n",
+ "Name: target, dtype: int64"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "X = np.array(df[\"comment_text\"].values)\n",
+ "X = X.reshape(-1,1)\n",
+ "y = np.array(df[\"y\"].values)\n",
+ "rus = RandomUnderSampler(random_state=0)\n",
+ "x, y = rus.fit_resample(X, y)\n",
+ "\n",
+ "x = x.flatten()\n",
+ "df = pd.DataFrame()\n",
+ "df[\"text\"] = x\n",
+ "df[\"target\"] = y\n",
+ "\n",
+ "\n",
+ "# Now its balanced\n",
+ "\n",
+ "df[\"target\"].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bf1cc71c",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2021-11-14T04:35:00.600340Z",
+ "iopub.status.busy": "2021-11-14T04:35:00.599491Z",
+ "iopub.status.idle": "2021-11-14T04:35:46.833670Z",
+ "shell.execute_reply": "2021-11-14T04:35:46.833209Z",
+ "shell.execute_reply.started": "2021-11-14T04:19:37.291635Z"
+ },
+ "papermill": {
+ "duration": 46.26705,
+ "end_time": "2021-11-14T04:35:46.833796",
+ "exception": false,
+ "start_time": "2021-11-14T04:35:00.566746",
+ "status": "completed"
+ },
+ "tags": [],
+ "id": "bf1cc71c"
+ },
+ "outputs": [],
+ "source": [
+ "# Creating column clean_text for cleaned comments\n",
+ "\n",
+ "df[\"text\"] = clean_data(df[\"text\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Defining keras Model with GRU units"
+ ],
+ "metadata": {
+ "id": "t66LA_jV3Ikb"
+ },
+ "id": "t66LA_jV3Ikb"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6218d295",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2021-11-14T04:35:46.866116Z",
+ "iopub.status.busy": "2021-11-14T04:35:46.865533Z",
+ "iopub.status.idle": "2021-11-14T04:35:46.868820Z",
+ "shell.execute_reply": "2021-11-14T04:35:46.868396Z",
+ "shell.execute_reply.started": "2021-11-14T04:24:27.459783Z"
+ },
+ "papermill": {
+ "duration": 0.023249,
+ "end_time": "2021-11-14T04:35:46.868944",
+ "exception": false,
+ "start_time": "2021-11-14T04:35:46.845695",
+ "status": "completed"
+ },
+ "tags": [],
+ "id": "6218d295"
+ },
+ "outputs": [],
+ "source": [
+ "\n",
+ "class GRU_model(tf.keras.Model):\n",
+ " def __init__(self):\n",
+ " super().__init__()\n",
+ " self.Embedding = Embedding(voc_size, embedding_dim, input_length = max_sequence_length)\n",
+ " self.GRU1 = Bidirectional(GRU(128, return_sequences=True))\n",
+ " self.Dropout1 = Dropout(0.25)\n",
+ " self.GRU2 = Bidirectional(GRU(64, return_sequences = False))\n",
+ " self.Dropout2 = Dropout(0.25)\n",
+ " self.Dense1 = Dense(64, activation=\"relu\")\n",
+ " self.Dropout3 = Dropout(0.2)\n",
+ " self.Dense2 = Dense(1, activation=\"sigmoid\")\n",
+ " \n",
+ " def call(self, inputs):\n",
+ " x = self.Embedding(inputs)\n",
+ " x = self.GRU1(x)\n",
+ " x = self.Dropout1(x)\n",
+ " x = self.GRU2(x)\n",
+ " x = self.Dropout2(x)\n",
+ " x = self.Dense1(x)\n",
+ " x = self.Dropout3(x)\n",
+ " x = self.Dense2(x)\n",
+ " \n",
+ " return x"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dcfbed0d",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2021-11-14T04:35:46.896473Z",
+ "iopub.status.busy": "2021-11-14T04:35:46.895748Z",
+ "iopub.status.idle": "2021-11-14T04:35:46.897736Z",
+ "shell.execute_reply": "2021-11-14T04:35:46.898129Z",
+ "shell.execute_reply.started": "2021-11-14T04:20:31.747499Z"
+ },
+ "papermill": {
+ "duration": 0.017612,
+ "end_time": "2021-11-14T04:35:46.898247",
+ "exception": false,
+ "start_time": "2021-11-14T04:35:46.880635",
+ "status": "completed"
+ },
+ "tags": [],
+ "id": "dcfbed0d"
+ },
+ "outputs": [],
+ "source": [
+ "# Using early_stopping as callback function \n",
+ "# It takes the weigths of epoch with the best val_accuracy\n",
+ "\n",
+ "early_stopping = EarlyStopping(patience = 5,restore_best_weights = True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Tokenizing the comments from train dataset"
+ ],
+ "metadata": {
+ "id": "_Bom6ucC3N9L"
+ },
+ "id": "_Bom6ucC3N9L"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c3898633",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2021-11-14T04:35:46.964608Z",
+ "iopub.status.busy": "2021-11-14T04:35:46.954704Z",
+ "iopub.status.idle": "2021-11-14T04:35:50.091614Z",
+ "shell.execute_reply": "2021-11-14T04:35:50.091002Z",
+ "shell.execute_reply.started": "2021-11-14T04:20:31.777482Z"
+ },
+ "papermill": {
+ "duration": 3.181771,
+ "end_time": "2021-11-14T04:35:50.091743",
+ "exception": false,
+ "start_time": "2021-11-14T04:35:46.909972",
+ "status": "completed"
+ },
+ "tags": [],
+ "id": "c3898633"
+ },
+ "outputs": [],
+ "source": [
+ "\n",
+ "tokenizer = Tokenizer(num_words = voc_size)\n",
+ "tokenizer.fit_on_texts(df[\"text\"].values)\n",
+ "X = tokenizer.texts_to_sequences(df[\"text\"].values)\n",
+ "X = pad_sequences(X, maxlen = max_sequence_length)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "58864ac8",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2021-11-14T04:35:50.121160Z",
+ "iopub.status.busy": "2021-11-14T04:35:50.120539Z",
+ "iopub.status.idle": "2021-11-14T04:46:58.436311Z",
+ "shell.execute_reply": "2021-11-14T04:46:58.435629Z",
+ "shell.execute_reply.started": "2021-11-14T04:24:39.828475Z"
+ },
+ "papermill": {
+ "duration": 668.332415,
+ "end_time": "2021-11-14T04:46:58.436449",
+ "exception": false,
+ "start_time": "2021-11-14T04:35:50.104034",
+ "status": "completed"
+ },
+ "tags": [],
+ "id": "58864ac8",
+ "outputId": "5d9d8afa-b5ba-4f72-fa0d-e43a08fc90f9"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch 1/10\n",
+ "2247/2247 [==============================] - 101s 42ms/step - loss: 0.2942 - accuracy: 0.8761 - val_loss: 0.2347 - val_accuracy: 0.9227\n",
+ "Epoch 2/10\n",
+ "2247/2247 [==============================] - 94s 42ms/step - loss: 0.1639 - accuracy: 0.9375 - val_loss: 0.2342 - val_accuracy: 0.9211\n",
+ "Epoch 3/10\n",
+ "2247/2247 [==============================] - 94s 42ms/step - loss: 0.0894 - accuracy: 0.9676 - val_loss: 0.2741 - val_accuracy: 0.9000\n",
+ "Epoch 4/10\n",
+ "2247/2247 [==============================] - 94s 42ms/step - loss: 0.0501 - accuracy: 0.9830 - val_loss: 0.4452 - val_accuracy: 0.8856\n",
+ "Epoch 5/10\n",
+ "2247/2247 [==============================] - 94s 42ms/step - loss: 0.0312 - accuracy: 0.9898 - val_loss: 0.6138 - val_accuracy: 0.8623\n",
+ "Epoch 6/10\n",
+ "2247/2247 [==============================] - 94s 42ms/step - loss: 0.0215 - accuracy: 0.9927 - val_loss: 0.5662 - val_accuracy: 0.8569\n",
+ "Epoch 7/10\n",
+ "2247/2247 [==============================] - 94s 42ms/step - loss: 0.0147 - accuracy: 0.9951 - val_loss: 0.4859 - val_accuracy: 0.9047\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "model = GRU_model()\n",
+ "model.compile(\n",
+ " loss = tf.keras.losses.BinaryCrossentropy(),\n",
+ " optimizer = \"Adam\",\n",
+ " metrics = [\"accuracy\"]\n",
+ " )\n",
+ "\n",
+ "model.fit(\n",
+ " X, \n",
+ " df.target, \n",
+ " epochs = 10, \n",
+ " validation_split = 0.2,\n",
+ " batch_size = Batch_size, \n",
+ " callbacks = [early_stopping]\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5499067b",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2021-11-14T04:47:02.789433Z",
+ "iopub.status.busy": "2021-11-14T04:47:02.788712Z",
+ "iopub.status.idle": "2021-11-14T04:47:32.291140Z",
+ "shell.execute_reply": "2021-11-14T04:47:32.290642Z",
+ "shell.execute_reply.started": "2021-11-14T04:34:08.703401Z"
+ },
+ "papermill": {
+ "duration": 31.545177,
+ "end_time": "2021-11-14T04:47:32.291279",
+ "exception": false,
+ "start_time": "2021-11-14T04:47:00.746102",
+ "status": "completed"
+ },
+ "tags": [],
+ "id": "5499067b"
+ },
+ "outputs": [],
+ "source": [
+ "# Reading given test dataset \n",
+ "\n",
+ "test = pd.read_csv(test_cur_comp)\n",
+ "\n",
+ "test[\"text\"] = clean_data(test[\"text\"])\n",
+ "x_test = tokenizer.texts_to_sequences(test[\"text\"].values)\n",
+ "x_test = pad_sequences(x_test, maxlen = max_sequence_length)\n",
+ "\n",
+ "pred = model.predict(x_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2d11ff76",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2021-11-14T04:47:36.685493Z",
+ "iopub.status.busy": "2021-11-14T04:47:36.684828Z",
+ "iopub.status.idle": "2021-11-14T04:47:36.710436Z",
+ "shell.execute_reply": "2021-11-14T04:47:36.709543Z",
+ "shell.execute_reply.started": "2021-11-14T04:34:27.686805Z"
+ },
+ "papermill": {
+ "duration": 2.10009,
+ "end_time": "2021-11-14T04:47:36.710555",
+ "exception": false,
+ "start_time": "2021-11-14T04:47:34.610465",
+ "status": "completed"
+ },
+ "tags": [],
+ "id": "2d11ff76"
+ },
+ "outputs": [],
+ "source": [
+ "# Making submission file\n",
+ "\n",
+ "final = pd.DataFrame()\n",
+ "final[\"comment_id\"] = test[\"comment_id\"]\n",
+ "final[\"score\"] = pred\n",
+ "final.to_csv(\"submission.csv\", index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5ce5f54a",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2021-11-14T04:47:41.313743Z",
+ "iopub.status.busy": "2021-11-14T04:47:41.313173Z",
+ "iopub.status.idle": "2021-11-14T04:47:41.317997Z",
+ "shell.execute_reply": "2021-11-14T04:47:41.318382Z",
+ "shell.execute_reply.started": "2021-11-14T04:34:27.719860Z"
+ },
+ "papermill": {
+ "duration": 2.503346,
+ "end_time": "2021-11-14T04:47:41.318536",
+ "exception": false,
+ "start_time": "2021-11-14T04:47:38.815190",
+ "status": "completed"
+ },
+ "tags": [],
+ "id": "5ce5f54a",
+ "outputId": "88ed6f34-c7c6-497c-855d-58a8025b72e1"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " comment_id | \n",
+ " score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 114890 | \n",
+ " 0.001387 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 732895 | \n",
+ " 0.007995 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1139051 | \n",
+ " 0.001851 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1434512 | \n",
+ " 0.014324 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2084821 | \n",
+ " 0.828060 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " comment_id score\n",
+ "0 114890 0.001387\n",
+ "1 732895 0.007995\n",
+ "2 1139051 0.001851\n",
+ "3 1434512 0.014324\n",
+ "4 2084821 0.828060"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "final.head()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.10"
+ },
+ "papermill": {
+ "default_parameters": {},
+ "duration": 783.084669,
+ "end_time": "2021-11-14T04:47:46.757169",
+ "environment_variables": {},
+ "exception": null,
+ "input_path": "__notebook__.ipynb",
+ "output_path": "__notebook__.ipynb",
+ "parameters": {},
+ "start_time": "2021-11-14T04:34:43.672500",
+ "version": "2.3.3"
+ },
+ "colab": {
+ "provenance": []
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
\ No newline at end of file