diff --git a/Toxic Comment Analysis/Dataset/README.md b/Toxic Comment Analysis/Dataset/README.md new file mode 100644 index 000000000..d5e3ea76c --- /dev/null +++ b/Toxic Comment Analysis/Dataset/README.md @@ -0,0 +1,7 @@ +DATASET + + + +The dataset can be found at the link below: + +https://www.kaggle.com/datasets/devkhant24/toxic-comment diff --git a/Toxic Comment Analysis/Images/Accuracy_score.png b/Toxic Comment Analysis/Images/Accuracy_score.png new file mode 100644 index 000000000..118afb62e Binary files /dev/null and b/Toxic Comment Analysis/Images/Accuracy_score.png differ diff --git a/Toxic Comment Analysis/Model/README.md b/Toxic Comment Analysis/Model/README.md new file mode 100644 index 000000000..db8ff72a0 --- /dev/null +++ b/Toxic Comment Analysis/Model/README.md @@ -0,0 +1,54 @@ +TOXIC COMMENT ANALYSIS + + +GOAL + +Implementation of a Keras model with GRU units and evaluation of its accuracy score. + + +DATASET + +https://www.kaggle.com/datasets/devkhant24/toxic-comment + + +DESCRIPTION + +The main aim of the project is to build a model that predicts toxic comments on the given dataset. + + +WORK DONE + +Defining constants and cleaning the comments. +Balancing the dataset and defining a Keras model with GRU units. +Then tokenizing the comments from the train dataset and predicting a score for each comment id. + + +LIBRARIES NEEDED + +Numpy +Pandas +math +os +re +unidecode +nltk +BeautifulSoup (bs4) +scikit-learn +imbalanced-learn +TensorFlow (Keras) + + +ACCURACY SCORE + +The accuracy score achieved by the model is shown in ../Images/Accuracy_score.png. + + +CONCLUSION + +We created a model that predicts toxic comments. The model gives a toxicity score for each comment in the dataset. + + +CONTRIBUTION BY + +Sumaiya Khan + diff --git a/Toxic Comment Analysis/Model/TOXIC_COMMENT_ANALYSIS.ipynb b/Toxic Comment Analysis/Model/TOXIC_COMMENT_ANALYSIS.ipynb new file mode 100644 index 000000000..e4fa9f0c5 --- /dev/null +++ b/Toxic Comment Analysis/Model/TOXIC_COMMENT_ANALYSIS.ipynb @@ -0,0 +1,825 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4c7ae996", + "metadata": { + "papermill": { + "duration": 0.013811, + "end_time": "2021-11-14T04:34:51.059041", + "exception": false, + "start_time": "2021-11-14T04:34:51.045230", + "status": "completed" + }, + "tags": [], + "id": "4c7ae996" + }, + "source": [ + "\n", + "## Introduction \n", + "\n", + "This is a simple approach to rate the severity of toxic comments. 
We find the probability that a comment is toxic and give a score based on that probability.\n", + "The final score is the probability predicted by the model.\n", + "\n", + "The notebook is built upon this public dataset: https://www.kaggle.com/datasets/devkhant24/toxic-comment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f79be62", + "metadata": { + "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", + "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", + "execution": { + "iopub.execute_input": "2021-11-14T04:34:51.092708Z", + "iopub.status.busy": "2021-11-14T04:34:51.091887Z", + "iopub.status.idle": "2021-11-14T04:34:57.436680Z", + "shell.execute_reply": "2021-11-14T04:34:57.435929Z", + "shell.execute_reply.started": "2021-11-14T04:24:35.812950Z" + }, + "papermill": { + "duration": 6.366657, + "end_time": "2021-11-14T04:34:57.436825", + "exception": false, + "start_time": "2021-11-14T04:34:51.070168", + "status": "completed" + }, + "tags": [], + "id": "9f79be62" + }, + "outputs": [], + "source": [ + "# Importing libraries\n", + "\n", + "import math\n", + "import os\n", + "import random\n", + "import numpy as np\n", + "import pandas as pd\n", + "import re\n", + "import unidecode\n", + "\n", + "from sklearn.model_selection import StratifiedKFold\n", + "from sklearn.metrics import accuracy_score\n", + "from bs4 import BeautifulSoup\n", + "from nltk.corpus import stopwords\n", + "from nltk.tokenize import word_tokenize\n", + "from imblearn.under_sampling import RandomUnderSampler\n", + "\n", + "import tensorflow as tf\n", + "from tensorflow.keras.layers import Dense, Dropout, GRU, Embedding, Bidirectional\n", + "from tensorflow.keras.models import Sequential\n", + "from tensorflow.keras.preprocessing.text import Tokenizer\n", + "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", + "from tensorflow.keras.callbacks import EarlyStopping" + ] + }, + { + "cell_type": "markdown", + "source": [ + " Defining constants" + ], + "metadata": { + "id": "F8Kg8G-p2zAr" + }, + "id": "F8Kg8G-p2zAr" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49a3ab06", + "metadata": { + "execution": { + "iopub.execute_input": "2021-11-14T04:34:57.467075Z", + "iopub.status.busy": "2021-11-14T04:34:57.466106Z", + "iopub.status.idle": "2021-11-14T04:34:57.467969Z", + "shell.execute_reply": "2021-11-14T04:34:57.468493Z", + "shell.execute_reply.started": "2021-11-14T04:19:34.522370Z" + }, + "papermill": { + "duration": 0.019856, + "end_time": "2021-11-14T04:34:57.468627", + "exception": false, + "start_time": "2021-11-14T04:34:57.448771", + "status": "completed" + }, + "tags": [], + "id": "49a3ab06" + }, + "outputs": [], + "source": [ + "\n", + "voc_size = 50000\n", + "max_sequence_length = 250\n", + "embedding_dim = 100\n", + "Batch_size = 16\n", + "\n", + "train_prev_comp = \"../input/toxic-comment/jigsaw-toxic-comment-train.csv\"\n", + "test_cur_comp = \"../input/jigsaw-toxic-severity-rating/comments_to_score.csv\"\n", + "\n", + "\n", + "def seed_everything():\n", + " np.random.seed(123)\n", + " random.seed(123)\n", + " tf.random.set_seed(123)\n", + " os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = '2'\n", + " os.environ['PYTHONHASHSEED'] = str(123)\n", + "\n", + "seed_everything()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab0dd5fe", + "metadata": { + "execution": { + "iopub.execute_input": "2021-11-14T04:34:57.500523Z", + "iopub.status.busy": "2021-11-14T04:34:57.499735Z", + "iopub.status.idle": "2021-11-14T04:34:57.501985Z", + 
"shell.execute_reply": "2021-11-14T04:34:57.502459Z", + "shell.execute_reply.started": "2021-11-14T04:19:34.533779Z" + }, + "papermill": { + "duration": 0.022171, + "end_time": "2021-11-14T04:34:57.502606", + "exception": false, + "start_time": "2021-11-14T04:34:57.480435", + "status": "completed" + }, + "tags": [], + "id": "ab0dd5fe" + }, + "outputs": [], + "source": [ + "# Function for cleaning comments\n", + "\n", + "def clean_data(data):\n", + " final = []\n", + " for sent in data:\n", + " sent = sent.replace('\\\\n', ' ').replace('\\n', ' ').replace('\\t',' ').replace('\\\\', ' ').replace('. com', '.com')\n", + " soup = BeautifulSoup(sent, \"html.parser\")\n", + " sent = soup.get_text(separator=\" \")\n", + " remove_https = re.sub(r'http\\S+', '', sent)\n", + " sent = re.sub(r\"\\ [A-Za-z]*\\.com\", \" \", remove_https)\n", + " sent = unidecode.unidecode(sent)\n", + " sent = sent.lower()\n", + " sent = re.sub(r\"[^a-zA-Z0-9:$-,()%.?!]+\", ' ', sent) \n", + " sent = re.sub(r\"[:$-,()%.?!]+\", ' ',sent)\n", + " stoplist = stopwords.words(\"english\")\n", + " sent = [word for word in word_tokenize(sent) if word not in stoplist]\n", + " sent = \" \".join(sent)\n", + " final.append(sent)\n", + " \n", + " return final" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "905c0723", + "metadata": { + "execution": { + "iopub.execute_input": "2021-11-14T04:34:57.531781Z", + "iopub.status.busy": "2021-11-14T04:34:57.531111Z", + "iopub.status.idle": "2021-11-14T04:35:00.259353Z", + "shell.execute_reply": "2021-11-14T04:35:00.260529Z", + "shell.execute_reply.started": "2021-11-14T04:19:34.546950Z" + }, + "papermill": { + "duration": 2.746227, + "end_time": "2021-11-14T04:35:00.260737", + "exception": false, + "start_time": "2021-11-14T04:34:57.514510", + "status": "completed" + }, + "tags": [], + "id": "905c0723", + "outputId": "45b5bf8b-756a-4a5a-8460-fda673716ded" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
comment_texty
0Explanation\\nWhy the edits made under my usern...0
1D'aww! He matches this background colour I'm s...0
2Hey man, I'm really not trying to edit war. It...0
3\"\\nMore\\nI can't make any real suggestions on ...0
4You, sir, are my hero. Any chance you remember...0
\n", + "
" + ], + "text/plain": [ + " comment_text y\n", + "0 Explanation\\nWhy the edits made under my usern... 0\n", + "1 D'aww! He matches this background colour I'm s... 0\n", + "2 Hey man, I'm really not trying to edit war. It... 0\n", + "3 \"\\nMore\\nI can't make any real suggestions on ... 0\n", + "4 You, sir, are my hero. Any chance you remember... 0" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Reading train file from previous competition\n", + "\n", + "df = pd.read_csv(train_prev_comp)\n", + "\n", + "\n", + "df[\"y\"] = (df[[\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]].sum(axis=1) > 0).astype(int)\n", + "df.drop([\"id\",\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"], axis=1, inplace = True)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c0e3db0", + "metadata": { + "execution": { + "iopub.execute_input": "2021-11-14T04:35:00.316965Z", + "iopub.status.busy": "2021-11-14T04:35:00.314873Z", + "iopub.status.idle": "2021-11-14T04:35:00.321233Z", + "shell.execute_reply": "2021-11-14T04:35:00.322026Z", + "shell.execute_reply.started": "2021-11-14T04:19:37.178205Z" + }, + "papermill": { + "duration": 0.039247, + "end_time": "2021-11-14T04:35:00.322245", + "exception": false, + "start_time": "2021-11-14T04:35:00.282998", + "status": "completed" + }, + "tags": [], + "id": "2c0e3db0", + "outputId": "9cd6dfda-1f09-4669-e48b-1289938d74f0" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 201081\n", + "1 22468\n", + "Name: y, dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Seeing that dataset is imbalanced\n", + "\n", + "df[\"y\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Balacing dataset" + ], + "metadata": { + "id": "XSrdS9zI2-8p" + }, + "id": "XSrdS9zI2-8p" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbed13aa", + "metadata": { + "execution": { + "iopub.execute_input": "2021-11-14T04:35:00.375695Z", + "iopub.status.busy": "2021-11-14T04:35:00.374580Z", + "iopub.status.idle": "2021-11-14T04:35:00.540196Z", + "shell.execute_reply": "2021-11-14T04:35:00.541009Z", + "shell.execute_reply.started": "2021-11-14T04:19:37.189664Z" + }, + "papermill": { + "duration": 0.196883, + "end_time": "2021-11-14T04:35:00.541396", + "exception": false, + "start_time": "2021-11-14T04:35:00.344513", + "status": "completed" + }, + "tags": [], + "id": "dbed13aa", + "outputId": "158e832e-a549-4401-bd85-bc1f95db87c5" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 22468\n", + "1 22468\n", + "Name: target, dtype: int64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "X = np.array(df[\"comment_text\"].values)\n", + "X = X.reshape(-1,1)\n", + "y = np.array(df[\"y\"].values)\n", + "rus = RandomUnderSampler(random_state=0)\n", + "x, y = rus.fit_resample(X, y)\n", + "\n", + "x = x.flatten()\n", + "df = pd.DataFrame()\n", + "df[\"text\"] = x\n", + "df[\"target\"] = y\n", + "\n", + "\n", + "# Now its balanced\n", + "\n", + "df[\"target\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf1cc71c", + "metadata": { + "execution": { + "iopub.execute_input": "2021-11-14T04:35:00.600340Z", + "iopub.status.busy": "2021-11-14T04:35:00.599491Z", + "iopub.status.idle": 
"2021-11-14T04:35:46.833670Z", + "shell.execute_reply": "2021-11-14T04:35:46.833209Z", + "shell.execute_reply.started": "2021-11-14T04:19:37.291635Z" + }, + "papermill": { + "duration": 46.26705, + "end_time": "2021-11-14T04:35:46.833796", + "exception": false, + "start_time": "2021-11-14T04:35:00.566746", + "status": "completed" + }, + "tags": [], + "id": "bf1cc71c" + }, + "outputs": [], + "source": [ + "# Creating column clean_text for cleaned comments\n", + "\n", + "df[\"text\"] = clean_data(df[\"text\"])" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Defining keras Model with GRU units" + ], + "metadata": { + "id": "t66LA_jV3Ikb" + }, + "id": "t66LA_jV3Ikb" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6218d295", + "metadata": { + "execution": { + "iopub.execute_input": "2021-11-14T04:35:46.866116Z", + "iopub.status.busy": "2021-11-14T04:35:46.865533Z", + "iopub.status.idle": "2021-11-14T04:35:46.868820Z", + "shell.execute_reply": "2021-11-14T04:35:46.868396Z", + "shell.execute_reply.started": "2021-11-14T04:24:27.459783Z" + }, + "papermill": { + "duration": 0.023249, + "end_time": "2021-11-14T04:35:46.868944", + "exception": false, + "start_time": "2021-11-14T04:35:46.845695", + "status": "completed" + }, + "tags": [], + "id": "6218d295" + }, + "outputs": [], + "source": [ + "\n", + "class GRU_model(tf.keras.Model):\n", + " def __init__(self):\n", + " super().__init__()\n", + " self.Embedding = Embedding(voc_size, embedding_dim, input_length = max_sequence_length)\n", + " self.GRU1 = Bidirectional(GRU(128, return_sequences=True))\n", + " self.Dropout1 = Dropout(0.25)\n", + " self.GRU2 = Bidirectional(GRU(64, return_sequences = False))\n", + " self.Dropout2 = Dropout(0.25)\n", + " self.Dense1 = Dense(64, activation=\"relu\")\n", + " self.Dropout3 = Dropout(0.2)\n", + " self.Dense2 = Dense(1, activation=\"sigmoid\")\n", + " \n", + " def call(self, inputs):\n", + " x = self.Embedding(inputs)\n", + " x = self.GRU1(x)\n", + " x = self.Dropout1(x)\n", + " x = self.GRU2(x)\n", + " x = self.Dropout2(x)\n", + " x = self.Dense1(x)\n", + " x = self.Dropout3(x)\n", + " x = self.Dense2(x)\n", + " \n", + " return x" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dcfbed0d", + "metadata": { + "execution": { + "iopub.execute_input": "2021-11-14T04:35:46.896473Z", + "iopub.status.busy": "2021-11-14T04:35:46.895748Z", + "iopub.status.idle": "2021-11-14T04:35:46.897736Z", + "shell.execute_reply": "2021-11-14T04:35:46.898129Z", + "shell.execute_reply.started": "2021-11-14T04:20:31.747499Z" + }, + "papermill": { + "duration": 0.017612, + "end_time": "2021-11-14T04:35:46.898247", + "exception": false, + "start_time": "2021-11-14T04:35:46.880635", + "status": "completed" + }, + "tags": [], + "id": "dcfbed0d" + }, + "outputs": [], + "source": [ + "# Using early_stopping as callback function \n", + "# It takes the weigths of epoch with the best val_accuracy\n", + "\n", + "early_stopping = EarlyStopping(patience = 5,restore_best_weights = True)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Tokenizing the comments from train dataset" + ], + "metadata": { + "id": "_Bom6ucC3N9L" + }, + "id": "_Bom6ucC3N9L" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3898633", + "metadata": { + "execution": { + "iopub.execute_input": "2021-11-14T04:35:46.964608Z", + "iopub.status.busy": "2021-11-14T04:35:46.954704Z", + "iopub.status.idle": "2021-11-14T04:35:50.091614Z", + "shell.execute_reply": "2021-11-14T04:35:50.091002Z", + 
"shell.execute_reply.started": "2021-11-14T04:20:31.777482Z" + }, + "papermill": { + "duration": 3.181771, + "end_time": "2021-11-14T04:35:50.091743", + "exception": false, + "start_time": "2021-11-14T04:35:46.909972", + "status": "completed" + }, + "tags": [], + "id": "c3898633" + }, + "outputs": [], + "source": [ + "\n", + "tokenizer = Tokenizer(num_words = voc_size)\n", + "tokenizer.fit_on_texts(df[\"text\"].values)\n", + "X = tokenizer.texts_to_sequences(df[\"text\"].values)\n", + "X = pad_sequences(X, maxlen = max_sequence_length)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58864ac8", + "metadata": { + "execution": { + "iopub.execute_input": "2021-11-14T04:35:50.121160Z", + "iopub.status.busy": "2021-11-14T04:35:50.120539Z", + "iopub.status.idle": "2021-11-14T04:46:58.436311Z", + "shell.execute_reply": "2021-11-14T04:46:58.435629Z", + "shell.execute_reply.started": "2021-11-14T04:24:39.828475Z" + }, + "papermill": { + "duration": 668.332415, + "end_time": "2021-11-14T04:46:58.436449", + "exception": false, + "start_time": "2021-11-14T04:35:50.104034", + "status": "completed" + }, + "tags": [], + "id": "58864ac8", + "outputId": "5d9d8afa-b5ba-4f72-fa0d-e43a08fc90f9" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/10\n", + "2247/2247 [==============================] - 101s 42ms/step - loss: 0.2942 - accuracy: 0.8761 - val_loss: 0.2347 - val_accuracy: 0.9227\n", + "Epoch 2/10\n", + "2247/2247 [==============================] - 94s 42ms/step - loss: 0.1639 - accuracy: 0.9375 - val_loss: 0.2342 - val_accuracy: 0.9211\n", + "Epoch 3/10\n", + "2247/2247 [==============================] - 94s 42ms/step - loss: 0.0894 - accuracy: 0.9676 - val_loss: 0.2741 - val_accuracy: 0.9000\n", + "Epoch 4/10\n", + "2247/2247 [==============================] - 94s 42ms/step - loss: 0.0501 - accuracy: 0.9830 - val_loss: 0.4452 - val_accuracy: 0.8856\n", + "Epoch 5/10\n", + "2247/2247 [==============================] - 94s 42ms/step - loss: 0.0312 - accuracy: 0.9898 - val_loss: 0.6138 - val_accuracy: 0.8623\n", + "Epoch 6/10\n", + "2247/2247 [==============================] - 94s 42ms/step - loss: 0.0215 - accuracy: 0.9927 - val_loss: 0.5662 - val_accuracy: 0.8569\n", + "Epoch 7/10\n", + "2247/2247 [==============================] - 94s 42ms/step - loss: 0.0147 - accuracy: 0.9951 - val_loss: 0.4859 - val_accuracy: 0.9047\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = GRU_model()\n", + "model.compile(\n", + " loss = tf.keras.losses.BinaryCrossentropy(),\n", + " optimizer = \"Adam\",\n", + " metrics = [\"accuracy\"]\n", + " )\n", + "\n", + "model.fit(\n", + " X, \n", + " df.target, \n", + " epochs = 10, \n", + " validation_split = 0.2,\n", + " batch_size = Batch_size, \n", + " callbacks = [early_stopping]\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5499067b", + "metadata": { + "execution": { + "iopub.execute_input": "2021-11-14T04:47:02.789433Z", + "iopub.status.busy": "2021-11-14T04:47:02.788712Z", + "iopub.status.idle": "2021-11-14T04:47:32.291140Z", + "shell.execute_reply": "2021-11-14T04:47:32.290642Z", + "shell.execute_reply.started": "2021-11-14T04:34:08.703401Z" + }, + "papermill": { + "duration": 31.545177, + "end_time": "2021-11-14T04:47:32.291279", + "exception": false, + "start_time": "2021-11-14T04:47:00.746102", + "status": "completed" + }, + "tags": [], + "id": 
"5499067b" + }, + "outputs": [], + "source": [ + "# Reading given test dataset \n", + "\n", + "test = pd.read_csv(test_cur_comp)\n", + "\n", + "test[\"text\"] = clean_data(test[\"text\"])\n", + "x_test = tokenizer.texts_to_sequences(test[\"text\"].values)\n", + "x_test = pad_sequences(x_test, maxlen = max_sequence_length)\n", + "\n", + "pred = model.predict(x_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d11ff76", + "metadata": { + "execution": { + "iopub.execute_input": "2021-11-14T04:47:36.685493Z", + "iopub.status.busy": "2021-11-14T04:47:36.684828Z", + "iopub.status.idle": "2021-11-14T04:47:36.710436Z", + "shell.execute_reply": "2021-11-14T04:47:36.709543Z", + "shell.execute_reply.started": "2021-11-14T04:34:27.686805Z" + }, + "papermill": { + "duration": 2.10009, + "end_time": "2021-11-14T04:47:36.710555", + "exception": false, + "start_time": "2021-11-14T04:47:34.610465", + "status": "completed" + }, + "tags": [], + "id": "2d11ff76" + }, + "outputs": [], + "source": [ + "# Making submission file\n", + "\n", + "final = pd.DataFrame()\n", + "final[\"comment_id\"] = test[\"comment_id\"]\n", + "final[\"score\"] = pred\n", + "final.to_csv(\"submission.csv\", index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ce5f54a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-11-14T04:47:41.313743Z", + "iopub.status.busy": "2021-11-14T04:47:41.313173Z", + "iopub.status.idle": "2021-11-14T04:47:41.317997Z", + "shell.execute_reply": "2021-11-14T04:47:41.318382Z", + "shell.execute_reply.started": "2021-11-14T04:34:27.719860Z" + }, + "papermill": { + "duration": 2.503346, + "end_time": "2021-11-14T04:47:41.318536", + "exception": false, + "start_time": "2021-11-14T04:47:38.815190", + "status": "completed" + }, + "tags": [], + "id": "5ce5f54a", + "outputId": "88ed6f34-c7c6-497c-855d-58a8025b72e1" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
comment_idscore
01148900.001387
17328950.007995
211390510.001851
314345120.014324
420848210.828060
\n", + "
" + ], + "text/plain": [ + " comment_id score\n", + "0 114890 0.001387\n", + "1 732895 0.007995\n", + "2 1139051 0.001851\n", + "3 1434512 0.014324\n", + "4 2084821 0.828060" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + }, + "papermill": { + "default_parameters": {}, + "duration": 783.084669, + "end_time": "2021-11-14T04:47:46.757169", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2021-11-14T04:34:43.672500", + "version": "2.3.3" + }, + "colab": { + "provenance": [] + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file