MainCode.py

# -*- coding: utf-8 -*-
"""MJAhmadi_NNDL_HW4_Q2.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1FHXdllZnBlbuiKHE1BtCAloh5R6ev_BF
"""

!nvidia-smi

"""# **2. Data Download and Prepare**"""

!pip install --upgrade --no-cache-dir gdown
!gdown 1lDpsLB-erPd4rvvRguui6i06h1hTKE1K
!gdown 1HctYMsZ-V7t7ipdLOO05S-WNWH5X_l4b
!gdown 1nZ-JnAIQ5fku0FhIFUHCFKguCtEyuNNB

# Load pre-trained GloVe embeddings
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

import nltk
nltk.download('punkt')

"""## Method1"""

import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import warnings
# Ignore all warnings
warnings.filterwarnings("ignore")

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Clean the text by removing special characters and converting to lowercase.
# `regex=True` is passed explicitly: from pandas 2.0 on, `Series.str.replace`
# treats the pattern as a literal string by default, which would leave the
# text uncleaned. The raw string keeps `\w`, `\s` and `\?` from being parsed
# as (invalid) Python string escapes.
train_data['cleaned_text'] = train_data['text'].str.replace(r'[^\w\s\?]', '', regex=True).str.lower()
test_data['cleaned_text'] = test_data['text'].str.replace(r'[^\w\s\?]', '', regex=True).str.lower()

# Split every cleaned sentence into word tokens
for frame in (train_data, test_data):
    frame['tokenized_text'] = frame['cleaned_text'].apply(word_tokenize)

# Drop English stopwords from each token list
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
for frame in (train_data, test_data):
    frame['filtered_text'] = frame['tokenized_text'].apply(
        lambda tokens: [word for word in tokens if word not in stop_words]
    )

# Pull the stopword-filtered token lists and the fine-grained labels out as arrays
X_train, y_train = train_data['filtered_text'].values, train_data['label-fine'].values
X_test, y_test = test_data['filtered_text'].values, test_data['label-fine'].values

# Fit the tokenizer on the training texts only, then map both splits to id sequences
tokenizer = Tokenizer(num_words=400000)
tokenizer.fit_on_texts(X_train)

X_train, X_test = tokenizer.texts_to_sequences(X_train), tokenizer.texts_to_sequences(X_test)

# Pad both splits to the length of the longest training sequence
max_sequence_length = max(map(len, X_train))
X_train = pad_sequences(X_train, maxlen=max_sequence_length)
X_test = pad_sequences(X_test, maxlen=max_sequence_length)

# One-hot encode the labels by indexing rows of an identity matrix.
# NOTE(review): this assumes 'label-fine' holds integer ids in
# 0..num_classes-1 — confirm against the CSV.
num_classes = train_data['label-fine'].nunique()
y_train = np.eye(num_classes)[y_train]
y_test = np.eye(num_classes)[y_test]

# Replace 'glove_path' with the path to your GloVe embeddings file
glove_path = '/content/glove.6B.300d.txt'
# Maps each word in the GloVe file to its float32 vector
embeddings_index = {}
with open(glove_path, encoding='utf8') as f:
    for line in f:
        # Each line is whitespace-separated: the word first, then its coefficients
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Build the embedding matrix: row i holds the GloVe vector of the word with
# tokenizer id i; rows for words missing from GloVe stay all-zero.
embedding_dim = 300
word_index = tokenizer.word_index
num_words = min(400000, len(word_index) + 1)
embedding_matrix = np.zeros(shape=(num_words, embedding_dim))
for word, i in word_index.items():
    # Ids at or beyond num_words have no row in the matrix
    if i < num_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

"""## Method2"""

# Import necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Load dataset (this rebinds train_data/test_data to fresh frames read from disk)
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Read 'QA_data.csv' with 'errors' parameter and pass the file object to 'pd.read_csv()'
# (errors='replace' substitutes undecodable bytes instead of raising while reading)
with open('QA_data.csv', 'r', encoding='utf-8', errors='replace') as file:
    qa_data = pd.read_csv(file)

# Define function for text normalization
def normalize_text(text):
    """Return `text` lowercased with non-alphanumeric runs collapsed.

    Every run of characters other than ASCII letters, digits and '?' is
    replaced by a single space before lowercasing.
    """
    return re.sub(r'[^a-zA-Z0-9\?]+', ' ', text).lower()

# Normalize the raw text of all three datasets in place
for frame in (train_data, test_data, qa_data):
    frame['text'] = frame['text'].apply(normalize_text)

# Split the normalized text into word tokens
for frame in (train_data, test_data, qa_data):
    frame['tokens'] = frame['text'].apply(word_tokenize)

# Load GloVe embeddings into a dictionary mapping word -> float32 vector.
# The encoding is given explicitly (as in the Method1 loader) so the read does
# not depend on the platform's default text encoding.
embeddings_index = {}
with open('glove.6B.300d.txt', encoding='utf8') as f:
    for line in f:
        # Each line is whitespace-separated: the word first, then its coefficients
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Fit the tokenizer vocabulary on the training tokens only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['tokens'])

# Map every dataset's tokens to integer id sequences with that vocabulary
for frame in (train_data, test_data, qa_data):
    frame['sequences'] = tokenizer.texts_to_sequences(frame['tokens'])

# Pad all datasets to the length of the longest training sequence
maxlen = max(train_data['sequences'].apply(len))
train_padded_sequences = pad_sequences(train_data['sequences'], maxlen=maxlen)
test_padded_sequences = pad_sequences(test_data['sequences'], maxlen=maxlen)
qa_padded_sequences = pad_sequences(qa_data['sequences'], maxlen=maxlen)

# Learn the label -> integer mapping from the training labels, then apply the
# same mapping to all three datasets
encoder = LabelEncoder()
encoder.fit(train_data['label-coarse'])
for frame in (train_data, test_data, qa_data):
    frame['encoded_labels'] = encoder.transform(frame['label-coarse'])

# Build the embedding matrix: row i holds the GloVe vector of the word with
# tokenizer id i; words missing from GloVe keep an all-zero row.
embedding_dim = 300
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros(shape=(vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

"""# **3. Models, Training, and Evaluation**

## **3.1 (Model 1)**

### Method 1
"""

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import time

# Seed TensorFlow and NumPy so repeated runs are reproducible
tf.random.set_seed(42)
np.random.seed(42)

# Inputs are the padded token-id sequences; targets are the integer-encoded labels
X_train, X_test = train_padded_sequences, test_padded_sequences
y_train, y_test = train_data['encoded_labels'], test_data['encoded_labels']

# Function to encode labels as one-hot vectors
def encode_labels(labels, num_classes):
    """One-hot encode integer class labels.

    Args:
        labels: Integer class ids, passed straight to Keras.
        num_classes: Number of columns in the one-hot output.

    Returns:
        The result of `tf.keras.utils.to_categorical` for `labels`.
    """
    one_hot = tf.keras.utils.to_categorical(labels, num_classes=num_classes)
    return one_hot

from keras.regularizers import l2

# Function to create the LSTM model
def create_lstm_model(input_length, h_dim, num_classes):
    """Build and compile a stacked two-layer LSTM classifier.

    The embedding layer is sized from the module-level `vocab_size` and
    `embedding_dim`, initialised with `embedding_matrix`, and frozen.

    Args:
        input_length: Length of each padded input sequence.
        h_dim: Number of units in each of the two LSTM layers.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A compiled `Sequential` model using categorical cross-entropy,
        Adam (learning rate 0.001) and the accuracy metric.
    """
    stack = (
        Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            input_length=input_length,
            weights=[embedding_matrix],
            trainable=False,
        ),
        # return_sequences=True so the second LSTM receives every timestep
        LSTM(h_dim, return_sequences=True, activation='tanh'),
        LSTM(h_dim, activation='tanh'),
        # The L2 penalty is applied to the output layer's kernel only
        Dense(num_classes, activation='softmax', kernel_regularizer=l2(0.001)),
    )
    network = Sequential()
    for layer in stack:
        network.add(layer)
    network.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(learning_rate=0.001),
        metrics=['accuracy'],
    )
    return network

# Function to save plots to PDF
def save_plots_to_pdf(filename):
    """Plot the training accuracy and loss curves and save them to a PDF.

    Reads the module-level `history` (for the per-epoch values) and `h_dim`
    (for the subplot titles); neither is passed in as an argument.

    Args:
        filename: Path of the PDF file to write.
    """
    # (key in history.history, text used for the y-axis label and the title)
    panels = (('accuracy', 'Accuracy'), ('loss', 'Loss'))

    with PdfPages(filename) as pdf:
        plt.figure(figsize=(10, 4))

        for slot, (metric, caption) in enumerate(panels, start=1):
            plt.subplot(1, 2, slot)
            plt.plot(history.history[metric], label='Training')
            plt.xlabel('Epoch')
            plt.ylabel(caption)
            plt.legend()
            plt.title(f'{caption} (h_dim = {h_dim})')

        # Write the current figure into the PDF before displaying it
        pdf.savefig(bbox_inches='tight')

        # Show the plots in Colab output
        plt.show()

    plt.close()

# Function to save confusion matrix to PDF
def save_confusion_matrix_to_pdf(y_true, y_pred, filename):
    """Draw the confusion matrix as an annotated heatmap, save it and show it.

    The plot title reads the module-level `h_dim`.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        filename: Output path handed to `plt.savefig`.
    """
    counts = confusion_matrix(y_true, y_pred)
    heatmap_options = {'annot': True, 'fmt': 'd', 'cmap': 'Blues', 'cbar': False}

    plt.figure(figsize=(6, 6))
    sns.heatmap(counts, **heatmap_options)
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.title(f'Confusion matrix (h_dim = {h_dim})')
    plt.savefig(filename, bbox_inches='tight')

    # Show the plot in Colab output
    plt.show()
    plt.close()

# Set hyperparameters
h_dimensions = [25, 50, 75, 100]
epochs = 50
num_classes = 6
batch_size = 64

# The one-hot targets are the same for every h_dim, so encode them once here
# instead of re-encoding inside the loop and again for each evaluate() call.
y_train_encoded = encode_labels(y_train, num_classes)
y_test_encoded = encode_labels(y_test, num_classes)

# Iterate over h_dimensions and train the models
for h_dim in h_dimensions:
    print(f"Training model with h_dim = {h_dim}")
    model = create_lstm_model(maxlen, h_dim, num_classes)

    # `history` and `h_dim` are read as globals by save_plots_to_pdf and
    # save_confusion_matrix_to_pdf, so they must keep these names.
    history = model.fit(X_train, y_train_encoded, epochs=epochs, batch_size=batch_size, verbose=2)

    # evaluate() yields [loss, accuracy] for this compiled model; take the accuracy
    train_accuracy = model.evaluate(X_train, y_train_encoded, verbose=0)[1] * 100
    test_accuracy = model.evaluate(X_test, y_test_encoded, verbose=0)[1] * 100
    print(f"Training set accuracy: {train_accuracy:.2f}%")
    print(f"Test set accuracy: {test_accuracy:.2f}%")

    # Predicted class id = index of the largest softmax output
    y_pred_train = np.argmax(model.predict(X_train), axis=1)
    y_pred_test = np.argmax(model.predict(X_test), axis=1)

    train_f1 = f1_score(y_train, y_pred_train, average='weighted')
    test_f1 = f1_score(y_test, y_pred_test, average='weighted')

    train_precision = precision_score(y_train, y_pred_train, average='weighted')
    test_precision = precision_score(y_test, y_pred_test, average='weighted')

    train_recall = recall_score(y_train, y_pred_train, average='weighted')
    test_recall = recall_score(y_test, y_pred_test, average='weighted')

    print(f"Training set F1-score: {train_f1:.2f}")
    print(f"Test set F1-score: {test_f1:.2f}")

    print(f"Training set Precision: {train_precision:.2f}")
    print(f"Test set Precision: {test_precision:.2f}")

    print(f"Training set Recall: {train_recall:.2f}")
    print(f"Test set Recall: {test_recall:.2f}")

    save_plots_to_pdf(f"model_{h_dim}_plots.pdf")

    print("Confusion matrix (training set):")
    save_confusion_matrix_to_pdf(y_train, y_pred_train, f"model{h_dim}confusion_matrix_train.pdf")

    print("Confusion matrix (test set):")
    save_confusion_matrix_to_pdf(y_test, y_pred_test, f"model{h_dim}confusion_matrix_test.pdf")

    print("\n")

    time.sleep(2)  # Wait for 2 seconds before proceeding to the next h_dim

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import time

# Seed TensorFlow and NumPy so repeated runs are reproducible
tf.random.set_seed(42)
np.random.seed(42)

# Inputs are the padded token-id sequences; targets are the integer-encoded labels
X_train, X_test = train_padded_sequences, test_padded_sequences
y_train, y_test = train_data['encoded_labels'], test_data['encoded_labels']

# Function to encode labels as one-hot vectors
def encode_labels(labels, num_classes):
    """One-hot encode integer class labels.

    Args:
        labels: Integer class ids, passed straight to Keras.
        num_classes: Number of columns in the one-hot output.

    Returns:
        The result of `tf.keras.utils.to_categorical` for `labels`.
    """
    one_hot = tf.keras.utils.to_categorical(labels, num_classes=num_classes)
    return one_hot

from keras.regularizers import l2

# Function to create the LSTM model
def create_lstm_model(input_length, h_dim, num_classes):
    """Build and compile a stacked two-layer LSTM classifier.

    The embedding layer is sized from the module-level `vocab_size` and
    `embedding_dim`, initialised with `embedding_matrix`, and frozen.

    Args:
        input_length: Length of each padded input sequence.
        h_dim: Number of units in each of the two LSTM layers.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A compiled `Sequential` model using categorical cross-entropy,
        Adam (learning rate 0.001) and the accuracy metric.
    """
    stack = (
        Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            input_length=input_length,
            weights=[embedding_matrix],
            trainable=False,
        ),
        # return_sequences=True so the second LSTM receives every timestep
        LSTM(h_dim, return_sequences=True, activation='tanh'),
        LSTM(h_dim, activation='tanh'),
        # The L2 penalty is applied to the output layer's kernel only
        Dense(num_classes, activation='softmax', kernel_regularizer=l2(0.001)),
    )
    network = Sequential()
    for layer in stack:
        network.add(layer)
    network.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(learning_rate=0.001),
        metrics=['accuracy'],
    )
    return network

# Function to save plots to PDF
def save_plots_to_pdf(filename):
    """Plot the training accuracy and loss curves and save them to a PDF.

    Reads the module-level `history` (for the per-epoch values) and `h_dim`
    (for the subplot titles); neither is passed in as an argument.

    Args:
        filename: Path of the PDF file to write.
    """
    # (key in history.history, text used for the y-axis label and the title)
    panels = (('accuracy', 'Accuracy'), ('loss', 'Loss'))

    with PdfPages(filename) as pdf:
        plt.figure(figsize=(10, 4))

        for slot, (metric, caption) in enumerate(panels, start=1):
            plt.subplot(1, 2, slot)
            plt.plot(history.history[metric], label='Training')
            plt.xlabel('Epoch')
            plt.ylabel(caption)
            plt.legend()
            plt.title(f'{caption} (h_dim = {h_dim})')

        # Write the current figure into the PDF before displaying it
        pdf.savefig(bbox_inches='tight')

        # Show the plots in Colab output
        plt.show()

    plt.close()

# Function to save confusion matrix to PDF
def save_confusion_matrix_to_pdf(y_true, y_pred, filename):
    """Draw the confusion matrix as an annotated heatmap, save it and show it.

    The plot title reads the module-level `h_dim`.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        filename: Output path handed to `plt.savefig`.
    """
    counts = confusion_matrix(y_true, y_pred)
    heatmap_options = {'annot': True, 'fmt': 'd', 'cmap': 'Blues', 'cbar': False}

    plt.figure(figsize=(6, 6))
    sns.heatmap(counts, **heatmap_options)
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.title(f'Confusion matrix (h_dim = {h_dim})')
    plt.savefig(filename, bbox_inches='tight')

    # Show the plot in Colab output
    plt.show()
    plt.close()

# Set hyperparameters
h_dimensions = [25, 50, 75, 100]
epochs = 50
num_classes = 6
batch_size = 64

# The one-hot targets are the same for every h_dim, so encode them once here
# instead of re-encoding inside the loop and again for each evaluate() call.
y_train_encoded = encode_labels(y_train, num_classes)
y_test_encoded = encode_labels(y_test, num_classes)

# Iterate over h_dimensions and train the models
for h_dim in h_dimensions:
    print(f"Training model with h_dim = {h_dim}")
    model = create_lstm_model(maxlen, h_dim, num_classes)

    # `history` and `h_dim` are read as globals by save_plots_to_pdf and
    # save_confusion_matrix_to_pdf, so they must keep these names.
    history = model.fit(X_train, y_train_encoded, epochs=epochs, batch_size=batch_size, verbose=2)

    # evaluate() yields [loss, accuracy] for this compiled model; take the accuracy
    train_accuracy = model.evaluate(X_train, y_train_encoded, verbose=0)[1] * 100
    test_accuracy = model.evaluate(X_test, y_test_encoded, verbose=0)[1] * 100
    print(f"Training set accuracy: {train_accuracy:.2f}%")
    print(f"Test set accuracy: {test_accuracy:.2f}%")

    # Predicted class id = index of the largest softmax output
    y_pred_train = np.argmax(model.predict(X_train), axis=1)
    y_pred_test = np.argmax(model.predict(X_test), axis=1)

    train_f1 = f1_score(y_train, y_pred_train, average='weighted')
    test_f1 = f1_score(y_test, y_pred_test, average='weighted')

    train_precision = precision_score(y_train, y_pred_train, average='weighted')
    test_precision = precision_score(y_test, y_pred_test, average='weighted')

    train_recall = recall_score(y_train, y_pred_train, average='weighted')
    test_recall = recall_score(y_test, y_pred_test, average='weighted')

    print(f"Training set F1-score: {train_f1:.2f}")
    print(f"Test set F1-score: {test_f1:.2f}")

    print(f"Training set Precision: {train_precision:.2f}")
    print(f"Test set Precision: {test_precision:.2f}")

    print(f"Training set Recall: {train_recall:.2f}")
    print(f"Test set Recall: {test_recall:.2f}")

    save_plots_to_pdf(f"model_{h_dim}_plots.pdf")

    print("Confusion matrix (training set):")
    save_confusion_matrix_to_pdf(y_train, y_pred_train, f"model{h_dim}confusion_matrix_train.pdf")

    print("Confusion matrix (test set):")
    save_confusion_matrix_to_pdf(y_test, y_pred_test, f"model{h_dim}confusion_matrix_test.pdf")

    print("\n")

    time.sleep(2)  # Wait for 2 seconds before proceeding to the next h_dim

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import time

# Seed TensorFlow and NumPy so repeated runs are reproducible
tf.random.set_seed(42)
np.random.seed(42)

# Inputs are the padded token-id sequences; targets are the integer-encoded labels
X_train, X_test = train_padded_sequences, test_padded_sequences
y_train, y_test = train_data['encoded_labels'], test_data['encoded_labels']

# Function to encode labels as one-hot vectors
def encode_labels(labels, num_classes):
    """One-hot encode integer class labels.

    Args:
        labels: Integer class ids, passed straight to Keras.
        num_classes: Number of columns in the one-hot output.

    Returns:
        The result of `tf.keras.utils.to_categorical` for `labels`.
    """
    one_hot = tf.keras.utils.to_categorical(labels, num_classes=num_classes)
    return one_hot

from keras.regularizers import l2

# Function to create the LSTM model
def create_lstm_model(input_length, h_dim, num_classes):
    """Build and compile a stacked two-layer LSTM classifier.

    The embedding layer is sized from the module-level `vocab_size` and
    `embedding_dim`, initialised with `embedding_matrix`, and frozen.

    Args:
        input_length: Length of each padded input sequence.
        h_dim: Number of units in each of the two LSTM layers.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A compiled `Sequential` model using categorical cross-entropy,
        Adam (learning rate 0.001) and the accuracy metric.
    """
    stack = (
        Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            input_length=input_length,
            weights=[embedding_matrix],
            trainable=False,
        ),
        # return_sequences=True so the second LSTM receives every timestep
        LSTM(h_dim, return_sequences=True, activation='tanh'),
        LSTM(h_dim, activation='tanh'),
        # The L2 penalty is applied to the output layer's kernel only
        Dense(num_classes, activation='softmax', kernel_regularizer=l2(0.001)),
    )
    network = Sequential()
    for layer in stack:
        network.add(layer)
    network.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(learning_rate=0.001),
        metrics=['accuracy'],
    )
    return network

# Function to save plots to PDF
def save_plots_to_pdf(filename):
    """Plot the training accuracy and loss curves and save them to a PDF.

    Reads the module-level `history` (for the per-epoch values) and `h_dim`
    (for the subplot titles); neither is passed in as an argument.

    Args:
        filename: Path of the PDF file to write.
    """
    # (key in history.history, text used for the y-axis label and the title)
    panels = (('accuracy', 'Accuracy'), ('loss', 'Loss'))

    with PdfPages(filename) as pdf:
        plt.figure(figsize=(10, 4))

        for slot, (metric, caption) in enumerate(panels, start=1):
            plt.subplot(1, 2, slot)
            plt.plot(history.history[metric], label='Training')
            plt.xlabel('Epoch')
            plt.ylabel(caption)
            plt.legend()
            plt.title(f'{caption} (h_dim = {h_dim})')

        # Write the current figure into the PDF before displaying it
        pdf.savefig(bbox_inches='tight')

        # Show the plots in Colab output
        plt.show()

    plt.close()

# Function to save confusion matrix to PDF
def save_confusion_matrix_to_pdf(y_true, y_pred, filename):
    """Draw the confusion matrix as an annotated heatmap, save it and show it.

    The plot title reads the module-level `h_dim`.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        filename: Output path handed to `plt.savefig`.
    """
    counts = confusion_matrix(y_true, y_pred)
    heatmap_options = {'annot': True, 'fmt': 'd', 'cmap': 'Blues', 'cbar': False}

    plt.figure(figsize=(6, 6))
    sns.heatmap(counts, **heatmap_options)
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.title(f'Confusion matrix (h_dim = {h_dim})')
    plt.savefig(filename, bbox_inches='tight')

    # Show the plot in Colab output
    plt.show()
    plt.close()

# Set hyperparameters
h_dimensions = [100]
epochs = 50
num_classes = 6
batch_size = 64

# The one-hot targets do not depend on h_dim, so encode them once here
# instead of re-encoding inside the loop and again for each evaluate() call.
y_train_encoded = encode_labels(y_train, num_classes)
y_test_encoded = encode_labels(y_test, num_classes)

# Iterate over h_dimensions and train the models
for h_dim in h_dimensions:
    print(f"Training model with h_dim = {h_dim}")
    model = create_lstm_model(maxlen, h_dim, num_classes)

    # `history` and `h_dim` are read as globals by save_plots_to_pdf and
    # save_confusion_matrix_to_pdf, so they must keep these names.
    history = model.fit(X_train, y_train_encoded, epochs=epochs, batch_size=batch_size, verbose=2)

    # evaluate() yields [loss, accuracy] for this compiled model; take the accuracy
    train_accuracy = model.evaluate(X_train, y_train_encoded, verbose=0)[1] * 100
    test_accuracy = model.evaluate(X_test, y_test_encoded, verbose=0)[1] * 100
    print(f"Training set accuracy: {train_accuracy:.2f}%")
    print(f"Test set accuracy: {test_accuracy:.2f}%")

    # Predicted class id = index of the largest softmax output
    y_pred_train = np.argmax(model.predict(X_train), axis=1)
    y_pred_test = np.argmax(model.predict(X_test), axis=1)

    train_f1 = f1_score(y_train, y_pred_train, average='weighted')
    test_f1 = f1_score(y_test, y_pred_test, average='weighted')

    train_precision = precision_score(y_train, y_pred_train, average='weighted')
    test_precision = precision_score(y_test, y_pred_test, average='weighted')

    train_recall = recall_score(y_train, y_pred_train, average='weighted')
    test_recall = recall_score(y_test, y_pred_test, average='weighted')

    print(f"Training set F1-score: {train_f1:.2f}")
    print(f"Test set F1-score: {test_f1:.2f}")

    print(f"Training set Precision: {train_precision:.2f}")
    print(f"Test set Precision: {test_precision:.2f}")

    print(f"Training set Recall: {train_recall:.2f}")
    print(f"Test set Recall: {test_recall:.2f}")

    save_plots_to_pdf(f"model_{h_dim}_plots.pdf")

    print("Confusion matrix (training set):")
    save_confusion_matrix_to_pdf(y_train, y_pred_train, f"model{h_dim}confusion_matrix_train.pdf")

    print("Confusion matrix (test set):")
    save_confusion_matrix_to_pdf(y_test, y_pred_test, f"model{h_dim}confusion_matrix_test.pdf")

    print("\n")

    time.sleep(2)  # Wait for 2 seconds before proceeding to the next h_dim

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import time

# Seed TensorFlow and NumPy so repeated runs are reproducible
tf.random.set_seed(42)
np.random.seed(42)

# Inputs are the padded token-id sequences; targets are the integer-encoded labels
X_train, X_test = train_padded_sequences, test_padded_sequences
y_train, y_test = train_data['encoded_labels'], test_data['encoded_labels']

# Function to encode labels as one-hot vectors
def encode_labels(labels, num_classes):
    """One-hot encode integer class labels.

    Args:
        labels: Integer class ids, passed straight to Keras.
        num_classes: Number of columns in the one-hot output.

    Returns:
        The result of `tf.keras.utils.to_categorical` for `labels`.
    """
    one_hot = tf.keras.utils.to_categorical(labels, num_classes=num_classes)
    return one_hot

from keras.regularizers import l2

# Function to create the LSTM model
def create_lstm_model(input_length, h_dim, num_classes):
    """Build and compile a stacked two-layer LSTM classifier.

    The embedding layer is sized from the module-level `vocab_size` and
    `embedding_dim`, initialised with `embedding_matrix`, and frozen.

    Args:
        input_length: Length of each padded input sequence.
        h_dim: Number of units in each of the two LSTM layers.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A compiled `Sequential` model using categorical cross-entropy,
        Adam (learning rate 0.001) and the accuracy metric.
    """
    stack = (
        Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            input_length=input_length,
            weights=[embedding_matrix],
            trainable=False,
        ),
        # return_sequences=True so the second LSTM receives every timestep
        LSTM(h_dim, return_sequences=True, activation='tanh'),
        LSTM(h_dim, activation='tanh'),
        # The L2 penalty is applied to the output layer's kernel only
        Dense(num_classes, activation='softmax', kernel_regularizer=l2(0.001)),
    )
    network = Sequential()
    for layer in stack:
        network.add(layer)
    network.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(learning_rate=0.001),
        metrics=['accuracy'],
    )
    return network

# Function to save plots to PDF
def save_plots_to_pdf(filename):
    """Plot the training accuracy and loss curves and save them to a PDF.

    Reads the module-level `history` (for the per-epoch values) and `h_dim`
    (for the subplot titles); neither is passed in as an argument.

    Args:
        filename: Path of the PDF file to write.
    """
    # (key in history.history, text used for the y-axis label and the title)
    panels = (('accuracy', 'Accuracy'), ('loss', 'Loss'))

    with PdfPages(filename) as pdf:
        plt.figure(figsize=(10, 4))

        for slot, (metric, caption) in enumerate(panels, start=1):
            plt.subplot(1, 2, slot)
            plt.plot(history.history[metric], label='Training')
            plt.xlabel('Epoch')
            plt.ylabel(caption)
            plt.legend()
            plt.title(f'{caption} (h_dim = {h_dim})')

        # Write the current figure into the PDF before displaying it
        pdf.savefig(bbox_inches='tight')

        # Show the plots in Colab output
        plt.show()

    plt.close()

# Function to save confusion matrix to PDF
def save_confusion_matrix_to_pdf(y_true, y_pred, filename):
    """Draw the confusion matrix as an annotated heatmap, save it and show it.

    The plot title reads the module-level `h_dim`.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        filename: Output path handed to `plt.savefig`.
    """
    counts = confusion_matrix(y_true, y_pred)
    heatmap_options = {'annot': True, 'fmt': 'd', 'cmap': 'Blues', 'cbar': False}

    plt.figure(figsize=(6, 6))
    sns.heatmap(counts, **heatmap_options)
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.title(f'Confusion matrix (h_dim = {h_dim})')
    plt.savefig(filename, bbox_inches='tight')

    # Show the plot in Colab output
    plt.show()
    plt.close()

# Set hyperparameters
h_dimensions = [100]
epochs = 50
num_classes = 6
batch_size = 32

# The one-hot targets do not depend on h_dim, so encode them once here
# instead of re-encoding inside the loop and again for each evaluate() call.
y_train_encoded = encode_labels(y_train, num_classes)
y_test_encoded = encode_labels(y_test, num_classes)

# Iterate over h_dimensions and train the models
for h_dim in h_dimensions:
    print(f"Training model with h_dim = {h_dim}")
    model = create_lstm_model(maxlen, h_dim, num_classes)

    # `history` and `h_dim` are read as globals by save_plots_to_pdf and
    # save_confusion_matrix_to_pdf, so they must keep these names.
    history = model.fit(X_train, y_train_encoded, epochs=epochs, batch_size=batch_size, verbose=2)

    # evaluate() yields [loss, accuracy] for this compiled model; take the accuracy
    train_accuracy = model.evaluate(X_train, y_train_encoded, verbose=0)[1] * 100
    test_accuracy = model.evaluate(X_test, y_test_encoded, verbose=0)[1] * 100
    print(f"Training set accuracy: {train_accuracy:.2f}%")
    print(f"Test set accuracy: {test_accuracy:.2f}%")

    # Predicted class id = index of the largest softmax output
    y_pred_train = np.argmax(model.predict(X_train), axis=1)
    y_pred_test = np.argmax(model.predict(X_test), axis=1)

    train_f1 = f1_score(y_train, y_pred_train, average='weighted')
    test_f1 = f1_score(y_test, y_pred_test, average='weighted')

    train_precision = precision_score(y_train, y_pred_train, average='weighted')
    test_precision = precision_score(y_test, y_pred_test, average='weighted')

    train_recall = recall_score(y_train, y_pred_train, average='weighted')
    test_recall = recall_score(y_test, y_pred_test, average='weighted')

    print(f"Training set F1-score: {train_f1:.2f}")
    print(f"Test set F1-score: {test_f1:.2f}")

    print(f"Training set Precision: {train_precision:.2f}")
    print(f"Test set Precision: {test_precision:.2f}")

    print(f"Training set Recall: {train_recall:.2f}")
    print(f"Test set Recall: {test_recall:.2f}")

    save_plots_to_pdf(f"model_{h_dim}_plots.pdf")

    print("Confusion matrix (training set):")
    save_confusion_matrix_to_pdf(y_train, y_pred_train, f"model{h_dim}confusion_matrix_train.pdf")

    print("Confusion matrix (test set):")
    save_confusion_matrix_to_pdf(y_test, y_pred_test, f"model{h_dim}confusion_matrix_test.pdf")

    print("\n")

    time.sleep(2)  # Wait for 2 seconds before proceeding to the next h_dim

"""### Method 1 (+ Val)"""

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import time

# Seed TensorFlow and NumPy so repeated runs are reproducible
tf.random.set_seed(42)
np.random.seed(42)

# Inputs are the padded token-id sequences; targets are the integer-encoded labels
X_train, X_test = train_padded_sequences, test_padded_sequences
y_train, y_test = train_data['encoded_labels'], test_data['encoded_labels']

# Function to encode labels as one-hot vectors
def encode_labels(labels, num_classes):
    """One-hot encode integer class labels.

    Args:
        labels: Integer class ids, passed straight to Keras.
        num_classes: Number of columns in the one-hot output.

    Returns:
        The result of `tf.keras.utils.to_categorical` for `labels`.
    """
    one_hot = tf.keras.utils.to_categorical(labels, num_classes=num_classes)
    return one_hot

from keras.regularizers import l2

# Function to create the LSTM model
def create_lstm_model(input_length, h_dim, num_classes):
    """Build and compile a stacked two-layer LSTM classifier.

    The embedding layer is sized from the module-level `vocab_size` and
    `embedding_dim`, initialised with `embedding_matrix`, and frozen.

    Args:
        input_length: Length of each padded input sequence.
        h_dim: Number of units in each of the two LSTM layers.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A compiled `Sequential` model using categorical cross-entropy,
        Adam (learning rate 0.001) and the accuracy metric.
    """
    stack = (
        Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            input_length=input_length,
            weights=[embedding_matrix],
            trainable=False,
        ),
        # return_sequences=True so the second LSTM receives every timestep
        LSTM(h_dim, return_sequences=True, activation='tanh'),
        LSTM(h_dim, activation='tanh'),
        # The L2 penalty is applied to the output layer's kernel only
        Dense(num_classes, activation='softmax', kernel_regularizer=l2(0.001)),
    )
    network = Sequential()
    for layer in stack:
        network.add(layer)
    network.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(learning_rate=0.001),
        metrics=['accuracy'],
    )
    return network

# Function to save plots to PDF
def save_plots_to_pdf(filename):
    """Plot training and validation accuracy/loss curves and save them to a PDF.

    Reads the module-level `history` (for the per-epoch values, including
    the `val_` entries) and `h_dim` (for the subplot titles); neither is
    passed in as an argument.

    Args:
        filename: Path of the PDF file to write.
    """
    # (key in history.history, text used for the y-axis label and the title)
    panels = (('accuracy', 'Accuracy'), ('loss', 'Loss'))

    with PdfPages(filename) as pdf:
        plt.figure(figsize=(10, 4))

        for slot, (metric, caption) in enumerate(panels, start=1):
            plt.subplot(1, 2, slot)
            plt.plot(history.history[metric], label='Training')
            # Validation curves are stored under the same key prefixed with 'val_'
            plt.plot(history.history[f'val_{metric}'], label='Validation')
            plt.xlabel('Epoch')
            plt.ylabel(caption)
            plt.legend()
            plt.title(f'{caption} (h_dim = {h_dim})')

        # Write the current figure into the PDF before displaying it
        pdf.savefig(bbox_inches='tight')

        # Show the plots in Colab output
        plt.show()

    plt.close()

# Function to save confusion matrix to PDF
def save_confusion_matrix_to_pdf(y_true, y_pred, filename):
    """Draw the confusion matrix as an annotated heatmap, save it and show it.

    The plot title reads the module-level `h_dim`.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        filename: Output path handed to `plt.savefig`.
    """
    counts = confusion_matrix(y_true, y_pred)
    heatmap_options = {'annot': True, 'fmt': 'd', 'cmap': 'Blues', 'cbar': False}

    plt.figure(figsize=(6, 6))
    sns.heatmap(counts, **heatmap_options)
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.title(f'Confusion matrix (h_dim = {h_dim})')
    plt.savefig(filename, bbox_inches='tight')

    # Show the plot in Colab output
    plt.show()
    plt.close()

# Set hyperparameters
h_dimensions = [25, 50, 75, 100]
epochs = 23
num_classes = 6
batch_size = 64

# Split the training set for validation. The split uses a fixed random_state,
# so it is identical for every h_dim; compute it once instead of per iteration.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

# Encode labels as one-hot vectors (also loop-invariant, so done once)
y_train_encoded = encode_labels(y_train_split, num_classes)
y_val_encoded = encode_labels(y_val_split, num_classes)
y_train_full_encoded = encode_labels(y_train, num_classes)
y_test_encoded = encode_labels(y_test, num_classes)

# Iterate over h_dimensions and train the models
for h_dim in h_dimensions:
    print(f"Training model with h_dim = {h_dim}")
    model = create_lstm_model(maxlen, h_dim, num_classes)

    # `history` and `h_dim` are read as globals by save_plots_to_pdf and
    # save_confusion_matrix_to_pdf, so they must keep these names.
    history = model.fit(X_train_split, y_train_encoded, epochs=epochs, batch_size=batch_size, validation_data=(X_val_split, y_val_encoded), verbose=2)

    # NOTE: the "Training set" figures below are computed on the full X_train,
    # i.e. the rows used for fitting plus the held-out validation rows.
    # evaluate() yields [loss, accuracy] for this compiled model; take the accuracy.
    train_accuracy = model.evaluate(X_train, y_train_full_encoded, verbose=0)[1] * 100
    test_accuracy = model.evaluate(X_test, y_test_encoded, verbose=0)[1] * 100
    print(f"Training set accuracy: {train_accuracy:.2f}%")
    print(f"Test set accuracy: {test_accuracy:.2f}%")

    # Predicted class id = index of the largest softmax output
    y_pred_train = np.argmax(model.predict(X_train), axis=1)
    y_pred_test = np.argmax(model.predict(X_test), axis=1)

    train_f1 = f1_score(y_train, y_pred_train, average='weighted')
    test_f1 = f1_score(y_test, y_pred_test, average='weighted')

    train_precision = precision_score(y_train, y_pred_train, average='weighted')
    test_precision = precision_score(y_test, y_pred_test, average='weighted')

    train_recall = recall_score(y_train, y_pred_train, average='weighted')
    test_recall = recall_score(y_test, y_pred_test, average='weighted')

    print(f"Training set F1-score: {train_f1:.2f}")
    print(f"Test set F1-score: {test_f1:.2f}")

    print(f"Training set Precision: {train_precision:.2f}")
    print(f"Test set Precision: {test_precision:.2f}")

    print(f"Training set Recall: {train_recall:.2f}")
    print(f"Test set Recall: {test_recall:.2f}")

    save_plots_to_pdf(f"model_{h_dim}_plots.pdf")

    print("Confusion matrix (training set):")
    save_confusion_matrix_to_pdf(y_train, y_pred_train, f"model_{h_dim}_confusion_matrix_train.pdf")

    print("Confusion matrix (test set):")
    save_confusion_matrix_to_pdf(y_test, y_pred_test, f"model_{h_dim}_confusion_matrix_test.pdf")

    print("\n")

    time.sleep(2)  # Wait for 2 seconds before proceeding to the next h_dim

"""### Method 2"""

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import time

# Seed TensorFlow and NumPy so repeated runs are reproducible
tf.random.set_seed(42)
np.random.seed(42)

# Inputs are the padded token-id sequences; targets are the integer-encoded labels
X_train, X_test = train_padded_sequences, test_padded_sequences
y_train, y_test = train_data['encoded_labels'], test_data['encoded_labels']

# Function to encode labels as one-hot vectors
def encode_labels(labels, num_classes):
    """One-hot encode integer class labels.

    Args:
        labels: Integer class ids, passed straight to Keras.
        num_classes: Number of columns in the one-hot output.

    Returns:
        The result of `tf.keras.utils.to_categorical` for `labels`.
    """
    one_hot = tf.keras.utils.to_categorical(labels, num_classes=num_classes)
    return one_hot

from keras.regularizers import l2

# Function to create the LSTM model
def create_lstm_model(input_length, h_dim, num_classes):
    """Build and compile a single-layer LSTM classifier.

    The embedding layer is sized from the module-level `vocab_size` and
    `embedding_dim`, initialised with `embedding_matrix`, and frozen.

    Args:
        input_length: Length of each padded input sequence.
        h_dim: Number of units in the LSTM layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A compiled `Sequential` model using categorical cross-entropy,
        Adam (learning rate 0.001) and the accuracy metric.
    """
    stack = (
        Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            input_length=input_length,
            weights=[embedding_matrix],
            trainable=False,
        ),
        LSTM(h_dim, activation='tanh'),
        Dense(num_classes, activation='softmax'),
    )
    network = Sequential()
    for layer in stack:
        network.add(layer)
    network.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(learning_rate=0.001),
        metrics=['accuracy'],
    )
    return network

# Function to save plots to PDF
def save_plots_to_pdf(filename):
    """Plot training accuracy and loss side by side and save them to a PDF.

    Reads the module-level ``history`` (result of ``model.fit``) and ``h_dim``
    rather than taking them as arguments, so it must be called after both
    have been set for the current run.

    Args:
        filename: Path of the PDF file to write.
    """
    # (history key, y-axis label, panel title) for the left and right panels.
    panels = (
        ('accuracy', 'Accuracy', f'Accuracy (h_dim = {h_dim})'),
        ('loss', 'Loss', f'Loss (h_dim = {h_dim})'),
    )

    with PdfPages(filename) as pdf:
        plt.figure(figsize=(10, 4))

        for position, (metric_key, y_label, heading) in enumerate(panels, start=1):
            plt.subplot(1, 2, position)
            plt.plot(history.history[metric_key], label='Training')
            plt.xlabel('Epoch')
            plt.ylabel(y_label)
            plt.legend()
            plt.title(heading)

        pdf.savefig(bbox_inches='tight')

        # Also render the figure in the notebook output.
        plt.show()

    plt.close()

# Function to save confusion matrix to PDF
def save_confusion_matrix_to_pdf(y_true, y_pred, filename):
    """Draw the raw-count confusion matrix as a heatmap and save it.

    The title uses the module-level ``h_dim`` of the current run.

    Args:
        y_true: True class ids.
        y_pred: Predicted class ids.
        filename: Output path handed to ``plt.savefig``.
    """
    counts = confusion_matrix(y_true, y_pred)

    plt.figure(figsize=(6, 6))
    ax = sns.heatmap(counts, annot=True, fmt='d', cmap='Blues', cbar=False)
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    ax.set_title(f'Confusion matrix (h_dim = {h_dim})')
    plt.savefig(filename, bbox_inches='tight')

    # Also render the figure in the notebook output.
    plt.show()
    plt.close()

# Set hyperparameters
h_dimensions = [25, 50, 75, 100]
epochs = 50
num_classes = 6
batch_size = 32

# The one-hot targets do not depend on h_dim, so encode them once up front
# instead of re-encoding them for every fit/evaluate call inside the loop.
y_train_encoded = encode_labels(y_train, num_classes)
y_test_encoded = encode_labels(y_test, num_classes)

# Iterate over h_dimensions and train the models
for h_dim in h_dimensions:
    print(f"Training model with h_dim = {h_dim}")
    model = create_lstm_model(maxlen, h_dim, num_classes)

    # No validation data here: the whole training set is used for fitting.
    history = model.fit(X_train, y_train_encoded, epochs=epochs, batch_size=batch_size, verbose=2)

    # evaluate() returns [loss, accuracy]; index 1 is accuracy, scaled to percent.
    train_accuracy = model.evaluate(X_train, y_train_encoded, verbose=0)[1] * 100
    test_accuracy = model.evaluate(X_test, y_test_encoded, verbose=0)[1] * 100
    print(f"Training set accuracy: {train_accuracy:.2f}%")
    print(f"Test set accuracy: {test_accuracy:.2f}%")

    # Turn softmax probabilities into predicted class ids.
    y_pred_train = np.argmax(model.predict(X_train), axis=1)
    y_pred_test = np.argmax(model.predict(X_test), axis=1)

    # sklearn metrics compare against the integer labels, not the one-hot targets.
    train_f1 = f1_score(y_train, y_pred_train, average='weighted')
    test_f1 = f1_score(y_test, y_pred_test, average='weighted')

    train_precision = precision_score(y_train, y_pred_train, average='weighted')
    test_precision = precision_score(y_test, y_pred_test, average='weighted')

    train_recall = recall_score(y_train, y_pred_train, average='weighted')
    test_recall = recall_score(y_test, y_pred_test, average='weighted')

    print(f"Training set F1-score: {train_f1:.2f}")
    print(f"Test set F1-score: {test_f1:.2f}")

    print(f"Training set Precision: {train_precision:.2f}")
    print(f"Test set Precision: {test_precision:.2f}")

    print(f"Training set Recall: {train_recall:.2f}")
    print(f"Test set Recall: {test_recall:.2f}")

    # save_plots_to_pdf reads the globals `history` and `h_dim` set above.
    save_plots_to_pdf(f"model_{h_dim}_plots.pdf")

    # NOTE(review): these two file names lack the underscores used for the
    # confusion-matrix files in the other sections; kept unchanged so that the
    # files written by those sections are not overwritten.
    print("Confusion matrix (training set):")
    save_confusion_matrix_to_pdf(y_train, y_pred_train, f"model{h_dim}confusion_matrix_train.pdf")

    print("Confusion matrix (test set):")
    save_confusion_matrix_to_pdf(y_test, y_pred_test, f"model{h_dim}confusion_matrix_test.pdf")

    print("\n")

    time.sleep(2)  # Wait for 2 seconds before proceeding to the next h_dim

"""### Method 2 (+ Val)"""

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

# Set random seeds for reproducibility
# (seeds NumPy's global RNG and TensorFlow's global seed)
np.random.seed(42)
tf.random.set_seed(42)

# Prepare the data
# Plain aliases of objects built by earlier cells; nothing is copied here.
X_train = train_padded_sequences
y_train = train_data['encoded_labels']
X_test = test_padded_sequences
y_test = test_data['encoded_labels']

# Set hyperparameters
h_dimensions = [25, 50, 75, 100]  # LSTM unit counts, one model trained per value
epochs = 25
num_classes = 6  # width of the softmax output layer
batch_size = 32

# Function to create the LSTM model
def create_lstm_model(input_length, h_dim, num_classes):
    """Build and compile a single-output LSTM classifier for integer labels.

    The embedding layer is sized by the module-level ``vocab_size`` and
    ``embedding_dim``, initialised from ``embedding_matrix`` and frozen.

    Args:
        input_length: Length of each padded input sequence.
        h_dim: Number of LSTM units.
        num_classes: Width of the softmax output layer.

    Returns:
        A compiled ``Sequential`` model using sparse categorical
        cross-entropy, so targets are class ids rather than one-hot rows.
    """
    frozen_embedding = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=input_length,
        weights=[embedding_matrix],
        trainable=False,
    )
    network = Sequential([
        frozen_embedding,
        LSTM(h_dim, activation='tanh'),
        Dense(num_classes, activation='softmax'),
    ])
    network.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=Adam(learning_rate=0.001),
        metrics=['accuracy'],
    )
    return network

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import time

# Function to save plots to PDF
def save_plots_to_pdf(filename):
    """Plot training vs. validation accuracy and loss and save them to a PDF.

    Reads the module-level ``history`` (result of ``model.fit`` run with
    validation data) and ``h_dim`` rather than taking them as arguments.

    Args:
        filename: Path of the PDF file to write.
    """
    # (training key, validation key, y-axis label, panel title) per panel.
    panels = (
        ('accuracy', 'val_accuracy', 'Accuracy', f'Accuracy (h_dim = {h_dim})'),
        ('loss', 'val_loss', 'Loss', f'Loss (h_dim = {h_dim})'),
    )

    with PdfPages(filename) as pdf:
        plt.figure(figsize=(10, 4))

        for position, (train_key, val_key, y_label, heading) in enumerate(panels, start=1):
            plt.subplot(1, 2, position)
            plt.plot(history.history[train_key], label='Training')
            plt.plot(history.history[val_key], label='Validation')
            plt.xlabel('Epoch')
            plt.ylabel(y_label)
            plt.legend()
            plt.title(heading)

        pdf.savefig(bbox_inches='tight')

        # Also render the figure in the notebook output.
        plt.show()

    plt.close()

# Function to save confusion matrix to PDF
def save_confusion_matrix_to_pdf(y_true, y_pred, filename):
    """Draw the raw-count confusion matrix as a heatmap and save it.

    The title uses the module-level ``h_dim`` of the current run.

    Args:
        y_true: True class ids.
        y_pred: Predicted class ids.
        filename: Output path handed to ``plt.savefig``.
    """
    counts = confusion_matrix(y_true, y_pred)

    plt.figure(figsize=(6, 6))
    ax = sns.heatmap(counts, annot=True, fmt='d', cmap='Blues', cbar=False)
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    ax.set_title(f'Confusion matrix (h_dim = {h_dim})')
    plt.savefig(filename, bbox_inches='tight')

    # Also render the figure in the notebook output.
    plt.show()
    plt.close()

# Split the training set for validation.
# The split uses a fixed random_state, so it is identical for every h_dim;
# do it once here instead of repeating it inside the loop.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Iterate over h_dimensions and train the models
for h_dim in h_dimensions:
    print(f"Training model with h_dim = {h_dim}")
    model = create_lstm_model(maxlen, h_dim, num_classes)

    # Fit on 80% of the training data and monitor the held-out 20%.
    history = model.fit(X_train_split, y_train_split, epochs=epochs, batch_size=batch_size, validation_data=(X_val_split, y_val_split), verbose=2)

    # evaluate() returns [loss, accuracy]; index 1 is accuracy, scaled to percent.
    # Note that the "training" figures below are computed on the full X_train,
    # i.e. including the rows that were held out for validation.
    train_accuracy = model.evaluate(X_train, y_train, verbose=0)[1] * 100
    test_accuracy = model.evaluate(X_test, y_test, verbose=0)[1] * 100
    print(f"Training set accuracy: {train_accuracy:.2f}%")
    print(f"Test set accuracy: {test_accuracy:.2f}%")

    # Turn softmax probabilities into predicted class ids.
    y_pred_train = np.argmax(model.predict(X_train), axis=1)
    y_pred_test = np.argmax(model.predict(X_test), axis=1)

    train_f1 = f1_score(y_train, y_pred_train, average='weighted')
    test_f1 = f1_score(y_test, y_pred_test, average='weighted')

    train_precision = precision_score(y_train, y_pred_train, average='weighted')
    test_precision = precision_score(y_test, y_pred_test, average='weighted')

    train_recall = recall_score(y_train, y_pred_train, average='weighted')
    test_recall = recall_score(y_test, y_pred_test, average='weighted')

    print(f"Training set F1-score: {train_f1:.2f}")
    print(f"Test set F1-score: {test_f1:.2f}")

    print(f"Training set Precision: {train_precision:.2f}")
    print(f"Test set Precision: {test_precision:.2f}")

    print(f"Training set Recall: {train_recall:.2f}")
    print(f"Test set Recall: {test_recall:.2f}")

    # save_plots_to_pdf reads the globals `history` and `h_dim` set above.
    save_plots_to_pdf(f"model_{h_dim}_plots.pdf")

    print("Confusion matrix (training set):")
    save_confusion_matrix_to_pdf(y_train, y_pred_train, f"model_{h_dim}_confusion_matrix_train.pdf")

    print("Confusion matrix (test set):")
    save_confusion_matrix_to_pdf(y_test, y_pred_test, f"model_{h_dim}_confusion_matrix_test.pdf")

    print("\n")

    time.sleep(2)  # Wait for 2 seconds before proceeding to the next h_dim

"""## **3.2 (Model 2)**

### Method1 (Val = Test, like Paper)
"""

import pandas as pd
import numpy as np
import re
import nltk
import tensorflow as tf
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import time
import warnings
from tensorflow.keras.utils import to_categorical

# Ignore all warnings
# (process-wide filter: silences every Python warning from here on)
warnings.filterwarnings("ignore")

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Set hyperparameters
h_dimensions = [25, 50, 75, 100]  # LSTM unit counts, one model trained per value
epochs = 50
batch_size = 64

# Function to create the LSTM model
def create_lstm_model(input_length, h_dim):
    """Build and compile the two-headed LSTM classifier.

    A frozen embedding (module-level ``vocab_size``, ``embedding_dim``,
    ``embedding_matrix``) feeds one LSTM. The 'main_output' head reads the
    LSTM's final hidden state; the 'sub_output' head reads the last time step
    of the returned sequence. Head widths come from ``encoder_main`` and
    ``encoder_sub``.

    Args:
        input_length: Length of each padded input sequence.
        h_dim: Number of LSTM units.

    Returns:
        A compiled ``Model`` with outputs ``[main_output, sub_output]``, both
        trained with sparse categorical cross-entropy (integer class ids).
    """
    token_ids = Input(shape=(input_length,))
    embedded = Embedding(
        vocab_size,
        embedding_dim,
        weights=[embedding_matrix],
        trainable=False,
    )(token_ids)

    # return_state=True yields (sequence outputs, hidden state, cell state).
    recurrent = LSTM(h_dim, activation='tanh', return_sequences=True, return_state=True)
    per_step_outputs, final_hidden, final_cell = recurrent(embedded)

    main_output = Dense(
        len(encoder_main.classes_), activation='softmax', name='main_output'
    )(final_hidden)
    sub_output = Dense(
        len(encoder_sub.classes_), activation='softmax', name='sub_output'
    )(per_step_outputs[:, -1, :])

    network = Model(inputs=token_ids, outputs=[main_output, sub_output])
    network.compile(
        optimizer=Adam(learning_rate=0.001),
        loss={
            'main_output': 'sparse_categorical_crossentropy',
            'sub_output': 'sparse_categorical_crossentropy',
        },
        metrics=['accuracy'],
    )
    return network

# Function to save plots to PDF
def save_plots_to_pdf(filename, history, h_dim):
    """Save a 2x2 grid of training/validation curves for both heads to a PDF.

    Top row: main-head accuracy and loss. Bottom row: sub-head accuracy and loss.

    Args:
        filename: Path of the PDF file to write.
        history: Object returned by ``model.fit``; its ``.history`` dict must
            contain the per-output metric keys and their ``val_`` variants.
        h_dim: LSTM size of the run, shown in every panel title.
    """
    curves = history.history

    with PdfPages(filename) as pdf:
        fig, axs = plt.subplots(2, 2, figsize=(10, 8))

        # (axes, training key, validation key, y-axis label, panel title)
        panels = (
            (axs[0, 0], 'main_output_accuracy', 'val_main_output_accuracy',
             'Accuracy', f'Main Accuracy (h_dim = {h_dim})'),
            (axs[0, 1], 'main_output_loss', 'val_main_output_loss',
             'Loss', f'Main Loss (h_dim = {h_dim})'),
            (axs[1, 0], 'sub_output_accuracy', 'val_sub_output_accuracy',
             'Accuracy', f'Sub Accuracy (h_dim = {h_dim})'),
            (axs[1, 1], 'sub_output_loss', 'val_sub_output_loss',
             'Loss', f'Sub Loss (h_dim = {h_dim})'),
        )
        for ax, train_key, val_key, y_label, heading in panels:
            ax.plot(curves[train_key], label='Training')
            ax.plot(curves[val_key], label='Validation')
            ax.set_xlabel('Epoch')
            ax.set_ylabel(y_label)
            ax.legend()
            ax.set_title(heading)

        # Extra vertical room so the titles of the bottom row do not collide
        # with the x-axis labels of the top row.
        plt.subplots_adjust(hspace=0.5)

        pdf.savefig(bbox_inches='tight')

        # Also render the figure in the notebook output.
        plt.show()

    plt.close()

# Function to save confusion matrix to PDF
def save_confusion_matrix_to_pdf(y_true, y_pred, filename, label_type):
    """Save a row-normalized confusion-matrix heatmap with decoded class names.

    Only classes that occur in ``y_true`` are shown. Predictions of a class
    that never occurs in ``y_true`` are therefore left out of the matrix, so a
    row may sum to less than 1.

    Args:
        y_true: Encoded (integer) true labels.
        y_pred: Encoded (integer) predicted labels.
        filename: Output path handed to ``plt.savefig``.
        label_type: ``'main'`` (case-insensitive) to decode tick labels with
            ``encoder_main``; any other value decodes with ``encoder_sub``.
    """
    unique_classes = np.unique(y_true)
    num_classes = len(unique_classes)

    cm = confusion_matrix(y_true, y_pred, labels=unique_classes)
    # Divide each row by its total so cells show the fraction of that true class.
    # Every row total is >= 1 because the rows are exactly the classes in y_true.
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    # Calculate the appropriate figure size based on the number of classes
    figsize = max(10, num_classes / 2)

    plt.figure(figsize=(figsize, figsize))
    sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', cbar=False)

    # Scale the tick-label font with the class count (never below 6 pt).
    tick_fontsize = max(6, 120 // num_classes)
    plt.tick_params(axis='both', labelsize=tick_fontsize)

    # Map the class indices back to their names with the encoder that produced
    # them. Always using encoder_sub put fine-label names on the coarse matrix.
    encoder = encoder_main if str(label_type).lower() == 'main' else encoder_sub
    class_labels = encoder.inverse_transform(unique_classes)

    # Heatmap cells are centred at integer + 0.5.
    plt.xticks(np.arange(num_classes) + 0.5, class_labels, rotation='vertical')
    plt.yticks(np.arange(num_classes) + 0.5, class_labels, rotation='horizontal')

    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.title(f'Normalized Confusion Matrix ({label_type})')
    plt.savefig(filename, bbox_inches='tight')

    # Show the plot
    plt.show()
    plt.close()

# Function to print evaluation metrics
def print_evaluation_metrics(y_true, y_pred, label_type):
    """Print accuracy and macro-averaged precision, recall and F1.

    Args:
        y_true: True class ids.
        y_pred: Predicted class ids.
        label_type: Tag shown in the header line of the report.
    """
    macro = {'average': 'macro'}

    accuracy = np.mean(y_true == y_pred)
    precision = precision_score(y_true, y_pred, **macro)
    recall = recall_score(y_true, y_pred, **macro)
    f1 = f1_score(y_true, y_pred, **macro)

    report_lines = [
        f'Evaluation metrics ({label_type}):',
        f'Accuracy: {accuracy:.4f}',
        f'Precision: {precision:.4f}',
        f'Recall: {recall:.4f}',
        f'F1-score: {f1:.4f}',
    ]
    print(*report_lines, sep='\n')

# Load dataset
# Re-reads the raw CSVs from the working directory, replacing any
# train_data / test_data frames (and derived columns) built by earlier cells.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Define function for text normalization
def normalize_text(text):
    """Return `text` lower-cased with non-alphanumeric runs collapsed.

    Every run of characters other than ASCII letters, digits and '?' is
    replaced by a single space; the result is then lower-cased.
    """
    return re.sub(r'[^a-zA-Z0-9\?]+', ' ', text).lower()

# Normalize text (overwrites the raw 'text' column in place)
train_data['text'] = train_data['text'].apply(normalize_text)
test_data['text'] = test_data['text'].apply(normalize_text)

# Tokenize the text; the vocabulary is learned from the training split only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])

# Convert tokens to sequences of vocabulary indices
train_sequences = tokenizer.texts_to_sequences(train_data['text'])
test_sequences = tokenizer.texts_to_sequences(test_data['text'])

# Pad sequences to the longest *training* sequence; test sequences are
# padded/truncated to that same length by pad_sequences' defaults
maxlen = max(len(seq) for seq in train_sequences)
train_padded_sequences = pad_sequences(train_sequences, maxlen=maxlen)
test_padded_sequences = pad_sequences(test_sequences, maxlen=maxlen)

# Encode main labels (encoder fitted on the training labels only)
encoder_main = LabelEncoder()
encoder_main.fit(train_data['label-coarse'])
train_encoded_main_labels = encoder_main.transform(train_data['label-coarse'])
test_encoded_main_labels = encoder_main.transform(test_data['label-coarse'])

# Encode sub labels (encoder fitted on the training labels only)
encoder_sub = LabelEncoder()
encoder_sub.fit(train_data['label-fine'])
train_encoded_sub_labels = encoder_sub.transform(train_data['label-fine'])
test_encoded_sub_labels = encoder_sub.transform(test_data['label-fine'])

# Convert main labels to one-hot vectors.
# num_classes is passed explicitly so train and test arrays always have the
# same width; otherwise to_categorical infers it from the largest id present,
# which can be smaller on the test split.
# The training loop below feeds the integer labels to the model, not these.
num_main_classes = len(encoder_main.classes_)
y_main_train = to_categorical(train_encoded_main_labels, num_classes=num_main_classes)
y_main_test = to_categorical(test_encoded_main_labels, num_classes=num_main_classes)

# Convert sub labels to one-hot vectors (same explicit width as above)
num_sub_classes = len(encoder_sub.classes_)
y_sub_train = to_categorical(train_encoded_sub_labels, num_classes=num_sub_classes)
y_sub_test = to_categorical(test_encoded_sub_labels, num_classes=num_sub_classes)

# Load GloVe embeddings into a dictionary: word -> float32 vector.
# The encoding is given explicitly so that reading the file does not depend
# on the platform's default locale.
embeddings_index = {}
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create an embedding matrix.
# Row i holds the vector of the word with tokenizer index i; row 0 and the
# rows of words missing from GloVe stay all-zero.
embedding_dim = 300
vocab_size = len(tokenizer.word_index) + 1  # +1 because tokenizer indices start at 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Iterate over h_dimensions and train the models
for h_dim in h_dimensions:
    print(f"Training model with h_dim = {h_dim}")
    model = create_lstm_model(maxlen, h_dim)

    # Train on the full training set; the test set doubles as validation data.
    history = model.fit(
        x=train_padded_sequences,
        y={
            'main_output': train_encoded_main_labels,
            'sub_output': train_encoded_sub_labels,
        },
        validation_data=(
            test_padded_sequences,
            {
                'main_output': test_encoded_main_labels,
                'sub_output': test_encoded_sub_labels,
            },
        ),
        epochs=epochs,
        batch_size=batch_size,
        verbose=1,
    )

    # The timestamp keeps the output files of repeated runs from colliding.
    timestamp = str(int(time.time()))
    save_plots_to_pdf(f'plots_{h_dim}_{timestamp}.pdf', history, h_dim)

    # predict() yields one probability array per head (main first, then sub);
    # argmax turns each into predicted class ids. Test data only.
    main_output_predictions, sub_output_predictions = (
        np.argmax(probabilities, axis=1)
        for probabilities in model.predict(test_padded_sequences)
    )

    save_confusion_matrix_to_pdf(test_encoded_main_labels, main_output_predictions,
                                 f'confusion_matrix_test_main_{h_dim}_{timestamp}.pdf', 'main')
    save_confusion_matrix_to_pdf(test_encoded_sub_labels, sub_output_predictions,
                                 f'confusion_matrix_test_sub_{h_dim}_{timestamp}.pdf', 'sub')

    print_evaluation_metrics(test_encoded_main_labels, main_output_predictions, 'Main')
    print_evaluation_metrics(test_encoded_sub_labels, sub_output_predictions, 'Sub')

"""### Method1 (Val = Test, like Paper) + L2Norm"""

import pandas as pd
import numpy as np
import re
import nltk
import tensorflow as tf
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import time
import warnings
from tensorflow.keras.utils import to_categorical

# Ignore all warnings
# (process-wide filter: silences every Python warning from here on)
warnings.filterwarnings("ignore")

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Set hyperparameters
h_dimensions = [25, 50, 75, 100]  # LSTM unit counts, one model trained per value
epochs = 100
batch_size = 32

# Function to create the LSTM model
from keras import regularizers
def create_lstm_model(input_length, h_dim):
    """Build and compile the two-headed LSTM classifier with L2-regularized heads.

    A frozen embedding (module-level ``vocab_size``, ``embedding_dim``,
    ``embedding_matrix``) feeds one LSTM. The 'main_output' head reads the
    LSTM's final hidden state; the 'sub_output' head reads the last time step
    of the returned sequence. Both Dense kernels carry an L2 penalty of 0.001.

    Args:
        input_length: Length of each padded input sequence.
        h_dim: Number of LSTM units.

    Returns:
        A compiled ``Model`` with outputs ``[main_output, sub_output]``, both
        trained with sparse categorical cross-entropy (integer class ids).
    """
    token_ids = Input(shape=(input_length,))
    embedded = Embedding(
        vocab_size,
        embedding_dim,
        weights=[embedding_matrix],
        trainable=False,
    )(token_ids)

    # return_state=True yields (sequence outputs, hidden state, cell state).
    recurrent = LSTM(h_dim, activation='tanh', return_sequences=True, return_state=True)
    per_step_outputs, final_hidden, final_cell = recurrent(embedded)

    main_output = Dense(
        len(encoder_main.classes_),
        activation='softmax',
        name='main_output',
        kernel_regularizer=regularizers.l2(0.001),
    )(final_hidden)
    sub_output = Dense(
        len(encoder_sub.classes_),
        activation='softmax',
        name='sub_output',
        kernel_regularizer=regularizers.l2(0.001),
    )(per_step_outputs[:, -1, :])

    network = Model(inputs=token_ids, outputs=[main_output, sub_output])
    network.compile(
        optimizer=Adam(learning_rate=0.001),
        loss={
            'main_output': 'sparse_categorical_crossentropy',
            'sub_output': 'sparse_categorical_crossentropy',
        },
        metrics=['accuracy'],
    )
    return network

# Function to save plots to PDF
def save_plots_to_pdf(filename, history, h_dim):
    """Save a 2x2 grid of training/testing curves for both heads to a PDF.

    Top row: main-head accuracy and loss. Bottom row: sub-head accuracy and
    loss. The ``val_`` curves are labelled 'Testing'.

    Args:
        filename: Path of the PDF file to write.
        history: Object returned by ``model.fit``; its ``.history`` dict must
            contain the per-output metric keys and their ``val_`` variants.
        h_dim: LSTM size of the run, shown in every panel title.
    """
    curves = history.history

    with PdfPages(filename) as pdf:
        fig, axs = plt.subplots(2, 2, figsize=(10, 8))

        # (axes, training key, validation key, y-axis label, panel title)
        panels = (
            (axs[0, 0], 'main_output_accuracy', 'val_main_output_accuracy',
             'Accuracy', f'Main Accuracy (h_dim = {h_dim})'),
            (axs[0, 1], 'main_output_loss', 'val_main_output_loss',
             'Loss', f'Main Loss (h_dim = {h_dim})'),
            (axs[1, 0], 'sub_output_accuracy', 'val_sub_output_accuracy',
             'Accuracy', f'Sub Accuracy (h_dim = {h_dim})'),
            (axs[1, 1], 'sub_output_loss', 'val_sub_output_loss',
             'Loss', f'Sub Loss (h_dim = {h_dim})'),
        )
        for ax, train_key, val_key, y_label, heading in panels:
            ax.plot(curves[train_key], label='Training')
            ax.plot(curves[val_key], label='Testing')
            ax.set_xlabel('Epoch')
            ax.set_ylabel(y_label)
            ax.legend()
            ax.set_title(heading)

        # Extra vertical room so the titles of the bottom row do not collide
        # with the x-axis labels of the top row.
        plt.subplots_adjust(hspace=0.5)

        pdf.savefig(bbox_inches='tight')

        # Also render the figure in the notebook output.
        plt.show()

    plt.close()

# Function to save confusion matrix to PDF
def save_confusion_matrix_to_pdf(y_true, y_pred, filename, label_type):
    """Save a row-normalized confusion-matrix heatmap with decoded class names.

    Only classes that occur in ``y_true`` are shown. Predictions of a class
    that never occurs in ``y_true`` are therefore left out of the matrix, so a
    row may sum to less than 1.

    Args:
        y_true: Encoded (integer) true labels.
        y_pred: Encoded (integer) predicted labels.
        filename: Output path handed to ``plt.savefig``.
        label_type: ``'main'`` (case-insensitive) to decode tick labels with
            ``encoder_main``; any other value decodes with ``encoder_sub``.
    """
    unique_classes = np.unique(y_true)
    num_classes = len(unique_classes)

    cm = confusion_matrix(y_true, y_pred, labels=unique_classes)
    # Divide each row by its total so cells show the fraction of that true class.
    # Every row total is >= 1 because the rows are exactly the classes in y_true.
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    # Calculate the appropriate figure size based on the number of classes
    figsize = max(10, num_classes / 2)

    plt.figure(figsize=(figsize, figsize))
    sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', cbar=False)

    # Scale the tick-label font with the class count (never below 6 pt).
    tick_fontsize = max(6, 120 // num_classes)
    plt.tick_params(axis='both', labelsize=tick_fontsize)

    # Map the class indices back to their names with the encoder that produced
    # them. Always using encoder_sub put fine-label names on the coarse matrix.
    encoder = encoder_main if str(label_type).lower() == 'main' else encoder_sub
    class_labels = encoder.inverse_transform(unique_classes)

    # Heatmap cells are centred at integer + 0.5.
    plt.xticks(np.arange(num_classes) + 0.5, class_labels, rotation='vertical')
    plt.yticks(np.arange(num_classes) + 0.5, class_labels, rotation='horizontal')

    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.title(f'Normalized Confusion Matrix ({label_type})')
    plt.savefig(filename, bbox_inches='tight')

    # Show the plot
    plt.show()
    plt.close()

# Function to print evaluation metrics
def print_evaluation_metrics(y_true, y_pred, label_type):
    """Print accuracy and macro-averaged precision, recall and F1.

    Args:
        y_true: True class ids.
        y_pred: Predicted class ids.
        label_type: Tag shown in the header line of the report.
    """
    macro = {'average': 'macro'}

    accuracy = np.mean(y_true == y_pred)
    precision = precision_score(y_true, y_pred, **macro)
    recall = recall_score(y_true, y_pred, **macro)
    f1 = f1_score(y_true, y_pred, **macro)

    report_lines = [
        f'Evaluation metrics ({label_type}):',
        f'Accuracy: {accuracy:.4f}',
        f'Precision: {precision:.4f}',
        f'Recall: {recall:.4f}',
        f'F1-score: {f1:.4f}',
    ]
    print(*report_lines, sep='\n')

# Load dataset
# Re-reads the raw CSVs from the working directory, replacing any
# train_data / test_data frames (and derived columns) built by earlier cells.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Define function for text normalization
def normalize_text(text):
    """Return `text` lower-cased with non-alphanumeric runs collapsed.

    Every run of characters other than ASCII letters, digits and '?' is
    replaced by a single space; the result is then lower-cased.
    """
    return re.sub(r'[^a-zA-Z0-9\?]+', ' ', text).lower()

# Normalize text (overwrites the raw 'text' column in place)
train_data['text'] = train_data['text'].apply(normalize_text)
test_data['text'] = test_data['text'].apply(normalize_text)

# Tokenize the text; the vocabulary is learned from the training split only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])

# Convert tokens to sequences of vocabulary indices
train_sequences = tokenizer.texts_to_sequences(train_data['text'])
test_sequences = tokenizer.texts_to_sequences(test_data['text'])

# Pad sequences to the longest *training* sequence; test sequences are
# padded/truncated to that same length by pad_sequences' defaults
maxlen = max(len(seq) for seq in train_sequences)
train_padded_sequences = pad_sequences(train_sequences, maxlen=maxlen)
test_padded_sequences = pad_sequences(test_sequences, maxlen=maxlen)

# Encode main labels (encoder fitted on the training labels only)
encoder_main = LabelEncoder()
encoder_main.fit(train_data['label-coarse'])
train_encoded_main_labels = encoder_main.transform(train_data['label-coarse'])
test_encoded_main_labels = encoder_main.transform(test_data['label-coarse'])

# Encode sub labels (encoder fitted on the training labels only)
encoder_sub = LabelEncoder()
encoder_sub.fit(train_data['label-fine'])
train_encoded_sub_labels = encoder_sub.transform(train_data['label-fine'])
test_encoded_sub_labels = encoder_sub.transform(test_data['label-fine'])

# Convert main labels to one-hot vectors.
# num_classes is passed explicitly so train and test arrays always have the
# same width; otherwise to_categorical infers it from the largest id present,
# which can be smaller on the test split.
# The training loop below feeds the integer labels to the model, not these.
num_main_classes = len(encoder_main.classes_)
y_main_train = to_categorical(train_encoded_main_labels, num_classes=num_main_classes)
y_main_test = to_categorical(test_encoded_main_labels, num_classes=num_main_classes)

# Convert sub labels to one-hot vectors (same explicit width as above)
num_sub_classes = len(encoder_sub.classes_)
y_sub_train = to_categorical(train_encoded_sub_labels, num_classes=num_sub_classes)
y_sub_test = to_categorical(test_encoded_sub_labels, num_classes=num_sub_classes)

# Load GloVe embeddings into a dictionary: word -> float32 vector.
# The encoding is given explicitly so that reading the file does not depend
# on the platform's default locale.
embeddings_index = {}
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create an embedding matrix.
# Row i holds the vector of the word with tokenizer index i; row 0 and the
# rows of words missing from GloVe stay all-zero.
embedding_dim = 300
vocab_size = len(tokenizer.word_index) + 1  # +1 because tokenizer indices start at 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Iterate over h_dimensions and train the models
for h_dim in h_dimensions:
    print(f"Training model with h_dim = {h_dim}")
    model = create_lstm_model(maxlen, h_dim)

    # Train on the full training set; the test set doubles as validation data.
    history = model.fit(
        x=train_padded_sequences,
        y={
            'main_output': train_encoded_main_labels,
            'sub_output': train_encoded_sub_labels,
        },
        validation_data=(
            test_padded_sequences,
            {
                'main_output': test_encoded_main_labels,
                'sub_output': test_encoded_sub_labels,
            },
        ),
        epochs=epochs,
        batch_size=batch_size,
        verbose=1,
    )

    # The timestamp keeps the output files of repeated runs from colliding.
    timestamp = str(int(time.time()))
    save_plots_to_pdf(f'plots_{h_dim}_{timestamp}.pdf', history, h_dim)

    # predict() yields one probability array per head (main first, then sub);
    # argmax turns each into predicted class ids. Test data only.
    main_output_predictions, sub_output_predictions = (
        np.argmax(probabilities, axis=1)
        for probabilities in model.predict(test_padded_sequences)
    )

    save_confusion_matrix_to_pdf(test_encoded_main_labels, main_output_predictions,
                                 f'confusion_matrix_test_main_{h_dim}_{timestamp}.pdf', 'main')
    save_confusion_matrix_to_pdf(test_encoded_sub_labels, sub_output_predictions,
                                 f'confusion_matrix_test_sub_{h_dim}_{timestamp}.pdf', 'sub')

    print_evaluation_metrics(test_encoded_main_labels, main_output_predictions, 'Main')
    print_evaluation_metrics(test_encoded_sub_labels, sub_output_predictions, 'Sub')

"""### Method1 (Val = Val)"""

import pandas as pd
import numpy as np
import re
import nltk
import tensorflow as tf
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import time
import warnings
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Ignore all warnings
# (process-wide filter: silences every Python warning from here on)
warnings.filterwarnings("ignore")

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Set hyperparameters
h_dimensions = [25, 50, 75, 100]  # LSTM unit counts, one model trained per value
epochs = 50
batch_size = 64

# Function to create the LSTM model
def create_lstm_model(input_length, h_dim):
    """Build and compile the two-headed LSTM classifier.

    A frozen embedding (module-level ``vocab_size``, ``embedding_dim``,
    ``embedding_matrix``) feeds one LSTM. The 'main_output' head reads the
    LSTM's final hidden state; the 'sub_output' head reads the last time step
    of the returned sequence. Head widths come from ``encoder_main`` and
    ``encoder_sub``.

    Args:
        input_length: Length of each padded input sequence.
        h_dim: Number of LSTM units.

    Returns:
        A compiled ``Model`` with outputs ``[main_output, sub_output]``, both
        trained with sparse categorical cross-entropy (integer class ids).
    """
    token_ids = Input(shape=(input_length,))
    embedded = Embedding(
        vocab_size,
        embedding_dim,
        weights=[embedding_matrix],
        trainable=False,
    )(token_ids)

    # return_state=True yields (sequence outputs, hidden state, cell state).
    recurrent = LSTM(h_dim, activation='tanh', return_sequences=True, return_state=True)
    per_step_outputs, final_hidden, final_cell = recurrent(embedded)

    main_output = Dense(
        len(encoder_main.classes_), activation='softmax', name='main_output'
    )(final_hidden)
    sub_output = Dense(
        len(encoder_sub.classes_), activation='softmax', name='sub_output'
    )(per_step_outputs[:, -1, :])

    network = Model(inputs=token_ids, outputs=[main_output, sub_output])
    network.compile(
        optimizer=Adam(learning_rate=0.001),
        loss={
            'main_output': 'sparse_categorical_crossentropy',
            'sub_output': 'sparse_categorical_crossentropy',
        },
        metrics=['accuracy'],
    )
    return network

# Function to save plots to PDF
def save_plots_to_pdf(filename, history, h_dim):
    """Save a 2x2 grid of training/validation curves for both heads to a PDF.

    Top row: main-head accuracy and loss. Bottom row: sub-head accuracy and loss.

    Args:
        filename: Path of the PDF file to write.
        history: Object returned by ``model.fit``; its ``.history`` dict must
            contain the per-output metric keys and their ``val_`` variants.
        h_dim: LSTM size of the run, shown in every panel title.
    """
    curves = history.history

    with PdfPages(filename) as pdf:
        fig, axs = plt.subplots(2, 2, figsize=(10, 8))

        # (axes, training key, validation key, y-axis label, panel title)
        panels = (
            (axs[0, 0], 'main_output_accuracy', 'val_main_output_accuracy',
             'Accuracy', f'Main Accuracy (h_dim = {h_dim})'),
            (axs[0, 1], 'main_output_loss', 'val_main_output_loss',
             'Loss', f'Main Loss (h_dim = {h_dim})'),
            (axs[1, 0], 'sub_output_accuracy', 'val_sub_output_accuracy',
             'Accuracy', f'Sub Accuracy (h_dim = {h_dim})'),
            (axs[1, 1], 'sub_output_loss', 'val_sub_output_loss',
             'Loss', f'Sub Loss (h_dim = {h_dim})'),
        )
        for ax, train_key, val_key, y_label, heading in panels:
            ax.plot(curves[train_key], label='Training')
            ax.plot(curves[val_key], label='Validation')
            ax.set_xlabel('Epoch')
            ax.set_ylabel(y_label)
            ax.legend()
            ax.set_title(heading)

        # Extra vertical room so the titles of the bottom row do not collide
        # with the x-axis labels of the top row.
        plt.subplots_adjust(hspace=0.5)

        pdf.savefig(bbox_inches='tight')

        # Also render the figure in the notebook output.
        plt.show()

    plt.close()

# Function to save confusion matrix to PDF
def save_confusion_matrix_to_pdf(y_true, y_pred, filename, label_type):
    """Save a row-normalized confusion-matrix heatmap with decoded class names.

    Only classes that occur in ``y_true`` are shown. Predictions of a class
    that never occurs in ``y_true`` are therefore left out of the matrix, so a
    row may sum to less than 1.

    Args:
        y_true: Encoded (integer) true labels.
        y_pred: Encoded (integer) predicted labels.
        filename: Output path handed to ``plt.savefig``.
        label_type: ``'main'`` (case-insensitive) to decode tick labels with
            ``encoder_main``; any other value decodes with ``encoder_sub``.
    """
    unique_classes = np.unique(y_true)
    num_classes = len(unique_classes)

    cm = confusion_matrix(y_true, y_pred, labels=unique_classes)
    # Divide each row by its total so cells show the fraction of that true class.
    # Every row total is >= 1 because the rows are exactly the classes in y_true.
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    # Calculate the appropriate figure size based on the number of classes
    figsize = max(10, num_classes / 2)

    plt.figure(figsize=(figsize, figsize))
    sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', cbar=False)

    # Scale the tick-label font with the class count (never below 6 pt).
    tick_fontsize = max(6, 120 // num_classes)
    plt.tick_params(axis='both', labelsize=tick_fontsize)

    # Map the class indices back to their names with the encoder that produced
    # them. Always using encoder_sub put fine-label names on the coarse matrix.
    encoder = encoder_main if str(label_type).lower() == 'main' else encoder_sub
    class_labels = encoder.inverse_transform(unique_classes)

    # Heatmap cells are centred at integer + 0.5.
    plt.xticks(np.arange(num_classes) + 0.5, class_labels, rotation='vertical')
    plt.yticks(np.arange(num_classes) + 0.5, class_labels, rotation='horizontal')

    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.title(f'Normalized Confusion Matrix ({label_type})')
    plt.savefig(filename, bbox_inches='tight')

    # Show the plot
    plt.show()
    plt.close()

# Function to print evaluation metrics
def print_evaluation_metrics(y_true, y_pred, label_type):
    """Print accuracy and macro-averaged precision, recall and F1.

    Args:
        y_true: True class ids.
        y_pred: Predicted class ids.
        label_type: Tag shown in the header line of the report.
    """
    macro = {'average': 'macro'}

    accuracy = np.mean(y_true == y_pred)
    precision = precision_score(y_true, y_pred, **macro)
    recall = recall_score(y_true, y_pred, **macro)
    f1 = f1_score(y_true, y_pred, **macro)

    report_lines = [
        f'Evaluation metrics ({label_type}):',
        f'Accuracy: {accuracy:.4f}',
        f'Precision: {precision:.4f}',
        f'Recall: {recall:.4f}',
        f'F1-score: {f1:.4f}',
    ]
    print(*report_lines, sep='\n')

# Load dataset
# Re-reads the raw CSVs from the working directory, replacing any
# train_data / test_data frames (and derived columns) built by earlier cells.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Define function for text normalization
def normalize_text(text):
    """Replace every run of characters other than ASCII letters, digits and
    '?' with a single space, then lowercase the result."""
    collapsed = re.sub(r'[^a-zA-Z0-9\?]+', ' ', text)
    return collapsed.lower()

# Clean the question text of both splits with the shared normalizer
train_data['text'] = train_data['text'].map(normalize_text)
test_data['text'] = test_data['text'].map(normalize_text)

# Build the vocabulary from the training questions only, then turn every
# question into a list of integer word ids.
# NOTE(review): Keras' Tokenizer applies its own character filter by default;
# confirm whether the '?' kept by normalize_text survives tokenization.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split['text']) for split in (train_data, test_data)
)

# Bring every sequence to maxlen, the length of the longest training sequence
maxlen = max(map(len, train_sequences))
train_padded_sequences = pad_sequences(train_sequences, maxlen=maxlen)
test_padded_sequences = pad_sequences(test_sequences, maxlen=maxlen)

# Coarse ("main") labels -> integer ids, fitted on the training split
encoder_main = LabelEncoder().fit(train_data['label-coarse'])
train_encoded_main_labels = encoder_main.transform(train_data['label-coarse'])
test_encoded_main_labels = encoder_main.transform(test_data['label-coarse'])

# Fine ("sub") labels -> integer ids, fitted on the training split
encoder_sub = LabelEncoder().fit(train_data['label-fine'])
train_encoded_sub_labels = encoder_sub.transform(train_data['label-fine'])
test_encoded_sub_labels = encoder_sub.transform(test_data['label-fine'])

# One-hot versions of the integer label ids
y_main_train, y_main_test = (
    to_categorical(train_encoded_main_labels),
    to_categorical(test_encoded_main_labels),
)
y_sub_train, y_sub_test = (
    to_categorical(train_encoded_sub_labels),
    to_categorical(test_encoded_sub_labels),
)

# Load GloVe embeddings into a dictionary: word -> float32 vector
embeddings_index = {}
# The GloVe text file is UTF-8; state the encoding explicitly so parsing does
# not depend on the platform's default locale encoding.
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]  # first field is the token, the rest are its coefficients
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create an embedding matrix whose row i holds the GloVe vector of the word
# with tokenizer index i. Rows for words absent from GloVe stay all-zero.
embedding_dim = 300
vocab_size = len(tokenizer.word_index) + 1  # +1 presumably reserves index 0 for padding
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Split train data into train and validation sets
# 20% of the padded training sequences, together with their coarse and fine
# label ids, are held out for validation; random_state fixes the shuffle.
train_padded_sequences, val_padded_sequences, train_encoded_main_labels, val_encoded_main_labels, train_encoded_sub_labels, val_encoded_sub_labels = train_test_split(
    train_padded_sequences,
    train_encoded_main_labels,
    train_encoded_sub_labels,
    test_size=0.2,
    random_state=42
)

# Iterate over h_dimensions and train the models
# A new model is created and trained for every hidden size in h_dimensions.
for h_dim in h_dimensions:
    print(f"Training model with h_dim = {h_dim}")
    model = create_lstm_model(maxlen, h_dim)

    # Fit the model
    # Targets (and validation targets) are integer label ids keyed by the
    # output names 'main_output' and 'sub_output'.
    history = model.fit(
        train_padded_sequences,
        {'main_output': train_encoded_main_labels, 'sub_output': train_encoded_sub_labels},
        validation_data=(val_padded_sequences, {'main_output': val_encoded_main_labels, 'sub_output': val_encoded_sub_labels}),
        epochs=epochs,
        batch_size=batch_size,
        verbose=1
    )

    # Save plots and confusion matrices
    # The whole-second Unix timestamp is shared by every file written for this h_dim.
    timestamp = str(int(time.time()))
    save_plots_to_pdf(f'plots_{h_dim}_{timestamp}.pdf', history, h_dim)

    # Only plot confusion matrices for the test data
    # predict() yields one array per model output; argmax over axis 1 turns
    # each row into the index of its highest-scoring class.
    main_output_predictions, sub_output_predictions = model.predict(test_padded_sequences)
    main_output_predictions = np.argmax(main_output_predictions, axis=1)
    sub_output_predictions = np.argmax(sub_output_predictions, axis=1)

    save_confusion_matrix_to_pdf(
        test_encoded_main_labels,
        main_output_predictions,
        f'confusion_matrix_test_main_{h_dim}_{timestamp}.pdf',
        'main'
    )
    save_confusion_matrix_to_pdf(
        test_encoded_sub_labels,
        sub_output_predictions,
        f'confusion_matrix_test_sub_{h_dim}_{timestamp}.pdf',
        'sub'
    )

    # Report accuracy / precision / recall / F1 on the test set for both outputs
    print_evaluation_metrics(test_encoded_main_labels, main_output_predictions, 'Main')
    print_evaluation_metrics(test_encoded_sub_labels, sub_output_predictions, 'Sub')

"""### Method1 (Val = Val) + L2Norm"""

import pandas as pd
import numpy as np
import re
import nltk
import tensorflow as tf
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import time
import warnings
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Silence every warning for the rest of the run
warnings.filterwarnings(action="ignore")

# Seed the TensorFlow and NumPy random generators with the same fixed value
tf.random.set_seed(42)
np.random.seed(42)

# Sweep settings: batch size, epochs per model, and LSTM hidden sizes to try
batch_size = 64
epochs = 100
h_dimensions = [25 * step for step in range(1, 5)]

# Function to create the LSTM model
from keras import regularizers
def create_lstm_model(input_length, h_dim):
    """Build and compile the two-headed LSTM classifier.

    The input passes through a frozen embedding initialised from the
    module-level ``embedding_matrix`` and an LSTM with ``h_dim`` units. The
    'main_output' head reads the LSTM's returned hidden state; the
    'sub_output' head reads the last timestep of the LSTM's sequence output.
    Both heads are softmax Dense layers with an L2 kernel penalty of 0.001.

    Args:
        input_length: Length of the integer input sequences.
        h_dim: Number of LSTM units.

    Returns:
        The compiled Keras model with outputs [main_output, sub_output].
    """
    tokens_in = Input(shape=(input_length,))
    frozen_embedding = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)
    embedded = frozen_embedding(tokens_in)

    recurrent = LSTM(h_dim, activation='tanh', return_sequences=True, return_state=True)
    sequence_out, final_hidden, _final_cell = recurrent(embedded)

    main_head = Dense(
        len(encoder_main.classes_),
        activation='softmax',
        name='main_output',
        kernel_regularizer=regularizers.l2(0.001),
    )
    main_probs = main_head(final_hidden)

    sub_head = Dense(
        len(encoder_sub.classes_),
        activation='softmax',
        name='sub_output',
        kernel_regularizer=regularizers.l2(0.001),
    )
    sub_probs = sub_head(sequence_out[:, -1, :])

    # Labels are fed as integer ids, hence the sparse loss on both heads
    per_output_loss = {
        'main_output': 'sparse_categorical_crossentropy',
        'sub_output': 'sparse_categorical_crossentropy',
    }
    network = Model(inputs=tokens_in, outputs=[main_probs, sub_probs])
    network.compile(
        optimizer=Adam(learning_rate=0.001),
        loss=per_output_loss,
        metrics=['accuracy'],
    )
    return network

# Function to save plots to PDF
def save_plots_to_pdf(filename, history, h_dim):
    """Write a 2x2 grid of training/validation curves to a PDF and show it.

    Args:
        filename: Path of the PDF to create.
        history: Object whose ``history`` dict holds the per-epoch series for
            both outputs (training and ``val_`` keys).
        h_dim: Hidden size, shown in each panel title.
    """
    # grid position -> (training key, validation key, y-axis label, title prefix)
    panels = {
        (0, 0): ('main_output_accuracy', 'val_main_output_accuracy', 'Accuracy', 'Main Accuracy'),
        (0, 1): ('main_output_loss', 'val_main_output_loss', 'Loss', 'Main Loss'),
        (1, 0): ('sub_output_accuracy', 'val_sub_output_accuracy', 'Accuracy', 'Sub Accuracy'),
        (1, 1): ('sub_output_loss', 'val_sub_output_loss', 'Loss', 'Sub Loss'),
    }

    with PdfPages(filename) as pdf:
        fig, axs = plt.subplots(2, 2, figsize=(10, 8))

        for position, (train_key, val_key, y_label, title_prefix) in panels.items():
            panel = axs[position]
            panel.plot(history.history[train_key], label='Training')
            panel.plot(history.history[val_key], label='Validation')
            panel.set_xlabel('Epoch')
            panel.set_ylabel(y_label)
            panel.legend()
            panel.set_title(f'{title_prefix} (h_dim = {h_dim})')

        # Extra vertical room so the titles of the lower row do not collide
        plt.subplots_adjust(hspace=0.5)

        pdf.savefig(bbox_inches='tight')

        # Display the figure after it has been added to the PDF
        plt.show()

    plt.close()

# Function to save confusion matrix to PDF
def save_confusion_matrix_to_pdf(y_true, y_pred, filename, label_type):
    """Plot a row-normalized confusion matrix, save it to a file and show it.

    Args:
        y_true: Encoded ground-truth class indices.
        y_pred: Encoded predicted class indices.
        filename: Output path handed to ``plt.savefig``.
        label_type: Tag shown in the plot title. ``'main'`` (case-insensitive)
            selects ``encoder_main`` for decoding the tick labels; any other
            value selects ``encoder_sub``.
    """
    unique_classes = np.unique(y_true)
    num_classes = len(unique_classes)

    cm = confusion_matrix(y_true, y_pred, labels=unique_classes)
    # Normalize each row by its total. A row can sum to zero when every
    # prediction for that class falls outside `unique_classes`; leave such
    # rows at 0 instead of dividing by zero and plotting NaNs.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_normalized = np.divide(
        cm.astype('float'),
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )

    # Calculate the appropriate figure size based on the number of classes
    figsize = max(10, num_classes / 2)

    plt.figure(figsize=(figsize, figsize))
    sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', cbar=False)

    # Scale the tick-label font size with the number of classes
    annot_fontsize = max(6, 120 // num_classes)
    plt.tick_params(axis='both', labelsize=annot_fontsize)

    # Decode class indices with the encoder that produced them: coarse labels
    # must go through encoder_main, fine labels through encoder_sub.
    label_encoder = encoder_main if str(label_type).lower() == 'main' else encoder_sub
    class_labels = label_encoder.inverse_transform(unique_classes)

    plt.xticks(np.arange(num_classes) + 0.5, class_labels, rotation='vertical')
    plt.yticks(np.arange(num_classes) + 0.5, class_labels, rotation='horizontal')

    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.title(f'Normalized Confusion Matrix ({label_type})')
    plt.savefig(filename, bbox_inches='tight')

    # Show the plot
    plt.show()
    plt.close()

# Function to print evaluation metrics
def print_evaluation_metrics(y_true, y_pred, label_type):
    """Print accuracy plus macro precision, recall and F1 for one label set.

    Args:
        y_true: Ground-truth class indices.
        y_pred: Predicted class indices.
        label_type: Tag printed in the report header.
    """
    scores = (
        ('Accuracy', np.mean(y_true == y_pred)),
        ('Precision', precision_score(y_true, y_pred, average='macro')),
        ('Recall', recall_score(y_true, y_pred, average='macro')),
        ('F1-score', f1_score(y_true, y_pred, average='macro')),
    )

    print(f'Evaluation metrics ({label_type}):')
    for score_name, score_value in scores:
        print(f'{score_name}: {score_value:.4f}')

# Read the training and test splits from their CSV files
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Define function for text normalization
def normalize_text(text):
    """Collapse each run of characters that are not ASCII letters, digits or
    '?' into one space and return the lowercased string."""
    return re.sub(r'[^a-zA-Z0-9\?]+', ' ', text).lower()

# Clean the question text of both splits with the shared normalizer
train_data['text'] = train_data['text'].map(normalize_text)
test_data['text'] = test_data['text'].map(normalize_text)

# Build the vocabulary from the training questions only, then turn every
# question into a list of integer word ids.
# NOTE(review): Keras' Tokenizer applies its own character filter by default;
# confirm whether the '?' kept by normalize_text survives tokenization.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split['text']) for split in (train_data, test_data)
)

# Bring every sequence to maxlen, the length of the longest training sequence
maxlen = max(map(len, train_sequences))
train_padded_sequences = pad_sequences(train_sequences, maxlen=maxlen)
test_padded_sequences = pad_sequences(test_sequences, maxlen=maxlen)

# Coarse ("main") labels -> integer ids, fitted on the training split
encoder_main = LabelEncoder().fit(train_data['label-coarse'])
train_encoded_main_labels = encoder_main.transform(train_data['label-coarse'])
test_encoded_main_labels = encoder_main.transform(test_data['label-coarse'])

# Fine ("sub") labels -> integer ids, fitted on the training split
encoder_sub = LabelEncoder().fit(train_data['label-fine'])
train_encoded_sub_labels = encoder_sub.transform(train_data['label-fine'])
test_encoded_sub_labels = encoder_sub.transform(test_data['label-fine'])

# One-hot versions of the integer label ids
y_main_train, y_main_test = (
    to_categorical(train_encoded_main_labels),
    to_categorical(test_encoded_main_labels),
)
y_sub_train, y_sub_test = (
    to_categorical(train_encoded_sub_labels),
    to_categorical(test_encoded_sub_labels),
)

# Load GloVe embeddings into a dictionary: word -> float32 vector
embeddings_index = {}
# The GloVe text file is UTF-8; state the encoding explicitly so parsing does
# not depend on the platform's default locale encoding.
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]  # first field is the token, the rest are its coefficients
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create an embedding matrix whose row i holds the GloVe vector of the word
# with tokenizer index i. Rows for words absent from GloVe stay all-zero.
embedding_dim = 300
vocab_size = len(tokenizer.word_index) + 1  # +1 presumably reserves index 0 for padding
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Split train data into train and validation sets
# 20% of the padded training sequences, together with their coarse and fine
# label ids, are held out for validation; random_state fixes the shuffle.
train_padded_sequences, val_padded_sequences, train_encoded_main_labels, val_encoded_main_labels, train_encoded_sub_labels, val_encoded_sub_labels = train_test_split(
    train_padded_sequences,
    train_encoded_main_labels,
    train_encoded_sub_labels,
    test_size=0.2,
    random_state=42
)

# Iterate over h_dimensions and train the models
# A new model is created and trained for every hidden size in h_dimensions.
for h_dim in h_dimensions:
    print(f"Training model with h_dim = {h_dim}")
    model = create_lstm_model(maxlen, h_dim)

    # Fit the model
    # Targets (and validation targets) are integer label ids keyed by the
    # output names 'main_output' and 'sub_output'.
    history = model.fit(
        train_padded_sequences,
        {'main_output': train_encoded_main_labels, 'sub_output': train_encoded_sub_labels},
        validation_data=(val_padded_sequences, {'main_output': val_encoded_main_labels, 'sub_output': val_encoded_sub_labels}),
        epochs=epochs,
        batch_size=batch_size,
        verbose=1
    )

    # Save plots and confusion matrices
    # The whole-second Unix timestamp is shared by every file written for this h_dim.
    timestamp = str(int(time.time()))
    save_plots_to_pdf(f'plots_{h_dim}_{timestamp}.pdf', history, h_dim)

    # Only plot confusion matrices for the test data
    # predict() yields one array per model output; argmax over axis 1 turns
    # each row of softmax scores into the index of its highest-scoring class.
    main_output_predictions, sub_output_predictions = model.predict(test_padded_sequences)
    main_output_predictions = np.argmax(main_output_predictions, axis=1)
    sub_output_predictions = np.argmax(sub_output_predictions, axis=1)

    save_confusion_matrix_to_pdf(
        test_encoded_main_labels,
        main_output_predictions,
        f'confusion_matrix_test_main_{h_dim}_{timestamp}.pdf',
        'main'
    )
    save_confusion_matrix_to_pdf(
        test_encoded_sub_labels,
        sub_output_predictions,
        f'confusion_matrix_test_sub_{h_dim}_{timestamp}.pdf',
        'sub'
    )

    # Report accuracy / precision / recall / F1 on the test set for both outputs
    print_evaluation_metrics(test_encoded_main_labels, main_output_predictions, 'Main')
    print_evaluation_metrics(test_encoded_sub_labels, sub_output_predictions, 'Sub')

"""# **4. Prototype Responder**"""

import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense

# Load the question/answer table; the CSV is decoded as Latin-1
qa_data = pd.read_csv('QA_data.csv', encoding='latin1')

# Define function for text normalization
def normalize_text(text):
    """Replace every run of characters other than ASCII letters, digits and
    '?' with a single space, then lowercase the result."""
    spaced = re.sub(r'[^a-zA-Z0-9\?]+', ' ', text)
    return spaced.lower()

# Clean the question text with the shared normalizer
qa_data['text'] = qa_data['text'].map(normalize_text)

# Fit a fresh vocabulary on the QA questions and encode them as id sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(qa_data['text'])
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(qa_data['text'])

# Every question is brought to the length of the longest one
max_sequence_length = max(map(len, sequences))
X = pad_sequences(sequences, maxlen=max_sequence_length)

# Integer-encode the answers: each distinct answer string gets the index of
# its position in the sorted unique answers (these are class ids, not one-hot).
answers = qa_data['answer']
answer_labels = np.unique(answers)
num_classes = len(answer_labels)
label_to_index = dict(zip(answer_labels, range(num_classes)))
answers_encoded = np.array([label_to_index[answer_text] for answer_text in answers])

# Define the Responder model
# The tokenizer was just refit on the QA questions, so word ids no longer match
# the rows of the embedding_matrix built earlier for the train.csv vocabulary.
# Rebuild the GloVe lookup for the current word_index so each id maps to the
# vector of its own word (words missing from GloVe keep an all-zero row).
qa_vocab_size = len(word_index) + 1
qa_embedding_matrix = np.zeros((qa_vocab_size, embedding_dim))
for qa_word, qa_word_id in word_index.items():
    qa_vector = embeddings_index.get(qa_word)
    if qa_vector is not None:
        qa_embedding_matrix[qa_word_id] = qa_vector

input_layer = Input(shape=(max_sequence_length,))
# Frozen embedding: weights come from the rebuilt matrix and are not trained
embedding_layer = Embedding(qa_vocab_size, embedding_dim, weights=[qa_embedding_matrix], trainable=False)(input_layer)
# return_state=True exposes the LSTM's final hidden AND cell states; slicing
# the last timestep of the sequence output would only give the hidden state.
lstm_layer, lstm_last_hidden_state, lstm_last_cell_state = LSTM(100, return_sequences=True, return_state=True)(embedding_layer)
# Both BiLSTM directions start from the first LSTM's final (hidden, cell)
# pair; the list order is [forward_h, forward_c, backward_h, backward_c].
# (Alternative tried by the author: feed embedding_layer, not lstm_layer, here.)
bilstm_layer = Bidirectional(LSTM(100, return_sequences=False), merge_mode='concat')(lstm_layer, initial_state=[lstm_last_hidden_state, lstm_last_cell_state, lstm_last_hidden_state, lstm_last_cell_state])
output_layer = Dense(num_classes, activation='softmax')(bilstm_layer)
model = Model(inputs=input_layer, outputs=output_layer)

# Answers are integer class ids, hence the sparse categorical loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Fit the responder on the full QA table
model.fit(X, answers_encoded, batch_size=8, epochs=100)

# Table 3 questions, normalized the same way as the training text
test_questions = list(map(normalize_text, (
    'How many people speak French?',
    'What day is today?',
    'Who will win the war?',
    'Who is Italian first minister?',
    'When World War II ended?',
    'When Gandhi was assassinated?',
)))

# Encode and pad the questions exactly like the training inputs
test_sequences = tokenizer.texts_to_sequences(test_questions)
test_X = pad_sequences(test_sequences, maxlen=max_sequence_length)

# Highest-scoring class per question, mapped back to its answer text
predictions = model.predict(test_X)
predicted_labels = [answer_labels[best] for best in np.argmax(predictions, axis=1)]

# Print each question followed by its predicted answer and a blank line
for question, answer in zip(test_questions, predicted_labels):
    print(f'Question: {question}', f'Predicted Answer: {answer}\n', sep='\n')

import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense

# Load the question/answer table; the CSV is decoded as Latin-1
qa_data = pd.read_csv('QA_data.csv', encoding='latin1')

# Define function for text normalization
def normalize_text(text):
    """Collapse each run of characters that are not ASCII letters, digits or
    '?' into one space and return the lowercased string."""
    return re.sub(r'[^a-zA-Z0-9\?]+', ' ', text).lower()

# Clean the question text with the shared normalizer
qa_data['text'] = qa_data['text'].map(normalize_text)

# Fit a fresh vocabulary on the QA questions and encode them as id sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(qa_data['text'])
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(qa_data['text'])

# Every question is brought to the length of the longest one
max_sequence_length = max(map(len, sequences))
X = pad_sequences(sequences, maxlen=max_sequence_length)

# Integer-encode the answers: each distinct answer string gets the index of
# its position in the sorted unique answers (these are class ids, not one-hot).
answers = qa_data['answer']
answer_labels = np.unique(answers)
num_classes = len(answer_labels)
label_to_index = dict(zip(answer_labels, range(num_classes)))
answers_encoded = np.array([label_to_index[answer_text] for answer_text in answers])

# Load GloVe embeddings into a dictionary: word -> float32 vector
embeddings_index = {}
# The GloVe text file is UTF-8; state the encoding explicitly so parsing does
# not depend on the platform's default locale encoding.
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]  # first field is the token, the rest are its coefficients
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create an embedding matrix whose row i holds the GloVe vector of the word
# with index i in word_index. Rows for words absent from GloVe stay all-zero.
embedding_dim = 300
vocab_size = len(word_index) + 1  # +1 presumably reserves index 0 for padding
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Define the Responder model
input_layer = Input(shape=(max_sequence_length,))
# Frozen embedding: weights come from embedding_matrix and are not trained
embedding_layer = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(input_layer)
# return_state=True exposes the LSTM's final hidden AND cell states; slicing
# the last timestep of the sequence output would only give the hidden state,
# which must not be passed off as the cell state.
lstm_layer, lstm_last_hidden_state, lstm_last_cell_state = LSTM(100, return_sequences=True, return_state=True)(embedding_layer)
# Both BiLSTM directions start from the first LSTM's final (hidden, cell)
# pair; the list order is [forward_h, forward_c, backward_h, backward_c].
bilstm_layer = Bidirectional(LSTM(100, return_sequences=False), merge_mode='concat')(lstm_layer, initial_state=[lstm_last_hidden_state, lstm_last_cell_state, lstm_last_hidden_state, lstm_last_cell_state])
output_layer = Dense(num_classes, activation='softmax')(bilstm_layer)
model = Model(inputs=input_layer, outputs=output_layer)

# Answers are integer class ids, hence the sparse categorical loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Fit the responder on the full QA table and keep the per-epoch history
history = model.fit(X, answers_encoded, batch_size=8, epochs=100)


def _plot_training_curves(run_history, pdf_path):
    """Draw per-epoch loss and accuracy side by side, save to pdf_path, show."""
    plt.figure(figsize=(12, 4))
    for slot, (metric_key, caption) in enumerate((('loss', 'Loss'), ('accuracy', 'Accuracy')), start=1):
        plt.subplot(1, 2, slot)
        plt.plot(run_history.history[metric_key])
        plt.title(caption)
        plt.xlabel('Epochs')
        plt.ylabel(caption)
    plt.tight_layout()
    plt.savefig(pdf_path)
    plt.show()


# Plot loss and accuracy
_plot_training_curves(history, 'lossaccuracyplot.pdf')

# Table 3 questions, normalized the same way as the training text
test_questions = list(map(normalize_text, (
    'How many people speak French?',
    'What day is today?',
    'Who will win the war?',
    'Who is Italian first minister?',
    'When World War II ended?',
    'When Gandhi was assassinated?',
)))

# Encode and pad the questions exactly like the training inputs
test_sequences = tokenizer.texts_to_sequences(test_questions)
test_X = pad_sequences(test_sequences, maxlen=max_sequence_length)

# Highest-scoring class per question, mapped back to its answer text
predictions = model.predict(test_X)
predicted_labels = [answer_labels[best] for best in np.argmax(predictions, axis=1)]

# Persist question/answer pairs as a CSV without the index column
results = pd.DataFrame({'Question': test_questions, 'Predicted Answer': predicted_labels})
results.to_csv('predictions.csv', index=False)

# Print each question followed by its predicted answer and a blank line
for question, answer in zip(test_questions, predicted_labels):
    print(f'Question: {question}', f'Predicted Answer: {answer}\n', sep='\n')

import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense

# Load the question/answer table; the CSV is decoded as Latin-1
qa_data = pd.read_csv('QA_data.csv', encoding='latin1')

# Define function for text normalization
def normalize_text(text):
    """Replace every run of characters other than ASCII letters, digits and
    '?' with a single space, then lowercase the result."""
    cleaned = re.sub(r'[^a-zA-Z0-9\?]+', ' ', text)
    return cleaned.lower()

# Clean the question text with the shared normalizer
qa_data['text'] = qa_data['text'].map(normalize_text)

# Fit a fresh vocabulary on the QA questions and encode them as id sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(qa_data['text'])
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(qa_data['text'])

# Every question is brought to the length of the longest one
max_sequence_length = max(map(len, sequences))
X = pad_sequences(sequences, maxlen=max_sequence_length)

# Integer-encode the answers: each distinct answer string gets the index of
# its position in the sorted unique answers (these are class ids, not one-hot).
answers = qa_data['answer']
answer_labels = np.unique(answers)
num_classes = len(answer_labels)
label_to_index = dict(zip(answer_labels, range(num_classes)))
answers_encoded = np.array([label_to_index[answer_text] for answer_text in answers])

# Load GloVe embeddings into a dictionary: word -> float32 vector
embeddings_index = {}
# The GloVe text file is UTF-8; state the encoding explicitly so parsing does
# not depend on the platform's default locale encoding.
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]  # first field is the token, the rest are its coefficients
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create an embedding matrix whose row i holds the GloVe vector of the word
# with index i in word_index. Rows for words absent from GloVe stay all-zero.
embedding_dim = 300
vocab_size = len(word_index) + 1  # +1 presumably reserves index 0 for padding
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Define the Responder model
input_layer = Input(shape=(max_sequence_length,))
# Frozen embedding: weights come from embedding_matrix and are not trained
embedding_layer = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(input_layer)
# return_state=True exposes the LSTM's final hidden AND cell states; slicing
# the last timestep of the sequence output would only give the hidden state,
# which must not be passed off as the cell state.
lstm_layer, lstm_last_hidden_state, lstm_last_cell_state = LSTM(100, return_sequences=True, return_state=True)(embedding_layer)
# Both BiLSTM directions start from the first LSTM's final (hidden, cell)
# pair; the list order is [forward_h, forward_c, backward_h, backward_c].
# Alternatives tried by the author and left disabled:
#   * feeding embedding_layer (instead of lstm_layer) into the BiLSTM;
#   * a Sequential stack: Embedding -> LSTM(100, sequences) ->
#     Bidirectional(LSTM(100, sequences)) -> Bidirectional(LSTM(100)) -> Dense softmax.
bilstm_layer = Bidirectional(LSTM(100, return_sequences=False), merge_mode='concat')(lstm_layer, initial_state=[lstm_last_hidden_state, lstm_last_cell_state, lstm_last_hidden_state, lstm_last_cell_state])
output_layer = Dense(num_classes, activation='softmax')(bilstm_layer)
model = Model(inputs=input_layer, outputs=output_layer)

# Answers are integer class ids, hence the sparse categorical loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Fit the responder on the full QA table and keep the per-epoch history
history = model.fit(X, answers_encoded, batch_size=8, epochs=100)


def _plot_training_curves(run_history, pdf_path):
    """Draw per-epoch loss and accuracy side by side, save to pdf_path, show."""
    plt.figure(figsize=(12, 4))
    for slot, (metric_key, caption) in enumerate((('loss', 'Loss'), ('accuracy', 'Accuracy')), start=1):
        plt.subplot(1, 2, slot)
        plt.plot(run_history.history[metric_key])
        plt.title(caption)
        plt.xlabel('Epochs')
        plt.ylabel(caption)
    plt.tight_layout()
    plt.savefig(pdf_path)
    plt.show()


# Plot loss and accuracy
_plot_training_curves(history, 'lossaccuracyplot2.pdf')

# Table 3 questions, normalized the same way as the training text
test_questions = list(map(normalize_text, (
    'How many people speak French?',
    'What day is today?',
    'Who will win the war?',
    'Who is Italian first minister?',
    'When World War II ended?',
    'When Gandhi was assassinated?',
)))

# Encode and pad the questions exactly like the training inputs
test_sequences = tokenizer.texts_to_sequences(test_questions)
test_X = pad_sequences(test_sequences, maxlen=max_sequence_length)

# Highest-scoring class per question, mapped back to its answer text
predictions = model.predict(test_X)
predicted_labels = [answer_labels[best] for best in np.argmax(predictions, axis=1)]

# Persist question/answer pairs as a CSV without the index column.
# NOTE(review): the plot of this experiment is saved with a "2" suffix, but
# the predictions go to the unsuffixed 'predictions.csv', which another
# experiment in this notebook also writes; confirm the overwrite is intended.
results = pd.DataFrame({'Question': test_questions, 'Predicted Answer': predicted_labels})
results.to_csv('predictions.csv', index=False)

# Print each question followed by its predicted answer and a blank line
for question, answer in zip(test_questions, predicted_labels):
    print(f'Question: {question}', f'Predicted Answer: {answer}\n', sep='\n')

import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense

# Load the question/answer table; the CSV is decoded as Latin-1
qa_data = pd.read_csv('QA_data.csv', encoding='latin1')

# Define function for text normalization
def normalize_text(text):
    """Collapse each run of characters that are not ASCII letters, digits or
    '?' into one space and return the lowercased string."""
    return re.sub(r'[^a-zA-Z0-9\?]+', ' ', text).lower()

# Clean the question text with the shared normalizer
qa_data['text'] = qa_data['text'].map(normalize_text)

# Fit a fresh vocabulary on the QA questions and encode them as id sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(qa_data['text'])
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(qa_data['text'])

# Every question is brought to the length of the longest one
max_sequence_length = max(map(len, sequences))
X = pad_sequences(sequences, maxlen=max_sequence_length)

# Integer-encode the answers: each distinct answer string gets the index of
# its position in the sorted unique answers (these are class ids, not one-hot).
answers = qa_data['answer']
answer_labels = np.unique(answers)
num_classes = len(answer_labels)
label_to_index = dict(zip(answer_labels, range(num_classes)))
answers_encoded = np.array([label_to_index[answer_text] for answer_text in answers])

# Responder variant with a trainable 100-dimensional embedding (no GloVe
# weights): one LSTM followed by two stacked bidirectional LSTMs and a
# softmax layer over the answer classes.
model = Sequential([
    Embedding(len(word_index) + 1, 100, input_length=max_sequence_length),
    LSTM(100, return_sequences=True),
    Bidirectional(LSTM(100, return_sequences=True)),
    Bidirectional(LSTM(100)),
    Dense(num_classes, activation='softmax'),
])

# Answers are integer class ids, hence the sparse categorical loss
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit the responder on the full QA table
model.fit(X, answers_encoded, batch_size=8, epochs=100)

# Table 3 questions, normalized the same way as the training text
test_questions = list(map(normalize_text, (
    'How many people speak French?',
    'What day is today?',
    'Who will win the war?',
    'Who is Italian first minister?',
    'When World War II ended?',
    'When Gandhi was assassinated?',
)))

# Encode and pad the questions exactly like the training inputs
test_sequences = tokenizer.texts_to_sequences(test_questions)
test_X = pad_sequences(test_sequences, maxlen=max_sequence_length)

# Highest-scoring class per question, mapped back to its answer text
predictions = model.predict(test_X)
predicted_labels = [answer_labels[best] for best in np.argmax(predictions, axis=1)]

# Print each question followed by its predicted answer and a blank line
for question, answer in zip(test_questions, predicted_labels):
    print(f'Question: {question}', f'Predicted Answer: {answer}\n', sep='\n')

import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense

# Load the question/answer table; the CSV is decoded as Latin-1
qa_data = pd.read_csv('QA_data.csv', encoding='latin1')

# Define function for text normalization
def normalize_text(text):
    """Replace every run of characters other than ASCII letters, digits and
    '?' with a single space, then lowercase the result."""
    squeezed = re.sub(r'[^a-zA-Z0-9\?]+', ' ', text)
    return squeezed.lower()

# Clean the question text with the shared normalizer
qa_data['text'] = qa_data['text'].map(normalize_text)

# Fit a fresh vocabulary on the QA questions and encode them as id sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(qa_data['text'])
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(qa_data['text'])

# Every question is brought to the length of the longest one
max_sequence_length = max(map(len, sequences))
X = pad_sequences(sequences, maxlen=max_sequence_length)

# Integer-encode the answers: each distinct answer string gets the index of
# its position in the sorted unique answers (these are class ids, not one-hot).
answers = qa_data['answer']
answer_labels = np.unique(answers)
num_classes = len(answer_labels)
label_to_index = dict(zip(answer_labels, range(num_classes)))
answers_encoded = np.array([label_to_index[answer_text] for answer_text in answers])

# Load GloVe embeddings into a dictionary: word -> float32 vector
embeddings_index = {}
# The GloVe text file is UTF-8; state the encoding explicitly so parsing does
# not depend on the platform's default locale encoding.
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]  # first field is the token, the rest are its coefficients
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create an embedding matrix whose row i holds the GloVe vector of the word
# with index i in word_index. Rows for words absent from GloVe stay all-zero.
embedding_dim = 300
vocab_size = len(word_index) + 1  # +1 presumably reserves index 0 for padding
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Define the Responder model
input_layer = Input(shape=(max_sequence_length,))
# Frozen embedding: weights come from embedding_matrix and are not trained
embedding_layer = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(input_layer)
# return_state=True exposes the LSTM's final hidden AND cell states; slicing
# the last timestep of the sequence output would only give the hidden state,
# which must not be passed off as the cell state.
lstm_layer, lstm_last_hidden_state, lstm_last_cell_state = LSTM(100, return_sequences=True, return_state=True)(embedding_layer)
# Both BiLSTM directions start from the first LSTM's final (hidden, cell)
# pair; the list order is [forward_h, forward_c, backward_h, backward_c].
bilstm_layer = Bidirectional(LSTM(100, return_sequences=False), merge_mode='concat')(lstm_layer, initial_state=[lstm_last_hidden_state, lstm_last_cell_state, lstm_last_hidden_state, lstm_last_cell_state])
output_layer = Dense(num_classes, activation='softmax')(bilstm_layer)
model = Model(inputs=input_layer, outputs=output_layer)

# Answers are integer class ids, hence the sparse categorical loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Fit the responder on the full QA table (150 epochs in this run) and keep
# the per-epoch history
history = model.fit(X, answers_encoded, batch_size=8, epochs=150)


def _plot_training_curves(run_history, pdf_path):
    """Draw per-epoch loss and accuracy side by side, save to pdf_path, show."""
    plt.figure(figsize=(12, 4))
    for slot, (metric_key, caption) in enumerate((('loss', 'Loss'), ('accuracy', 'Accuracy')), start=1):
        plt.subplot(1, 2, slot)
        plt.plot(run_history.history[metric_key])
        plt.title(caption)
        plt.xlabel('Epochs')
        plt.ylabel(caption)
    plt.tight_layout()
    plt.savefig(pdf_path)
    plt.show()


# Plot loss and accuracy
_plot_training_curves(history, 'lossaccuracyplot4.pdf')

# Table 3 questions, normalized the same way as the training text
test_questions = list(map(normalize_text, (
    'How many people speak French?',
    'What day is today?',
    'Who will win the war?',
    'Who is Italian first minister?',
    'When World War II ended?',
    'When Gandhi was assassinated?',
)))

# Encode and pad the questions exactly like the training inputs
test_sequences = tokenizer.texts_to_sequences(test_questions)
test_X = pad_sequences(test_sequences, maxlen=max_sequence_length)

# Highest-scoring class per question, mapped back to its answer text
predictions = model.predict(test_X)
predicted_labels = [answer_labels[best] for best in np.argmax(predictions, axis=1)]

# Persist question/answer pairs as a CSV without the index column
results = pd.DataFrame({'Question': test_questions, 'Predicted Answer': predicted_labels})
results.to_csv('predictions4.csv', index=False)

# Print each question followed by its predicted answer and a blank line
for question, answer in zip(test_questions, predicted_labels):
    print(f'Question: {question}', f'Predicted Answer: {answer}\n', sep='\n')

# Import necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Read the training and test splits from their CSV files
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# The QA table is opened as UTF-8 text with errors='replace', so undecodable
# bytes are substituted instead of raising; pandas then parses the open handle.
with open('QA_data.csv', mode='r', errors='replace', encoding='utf-8') as file:
    qa_data = pd.read_csv(file)

# Define function for text normalization
def normalize_text(text):
    """Return *text* lower-cased with non-alphanumeric runs collapsed.

    Every run of characters other than ASCII letters, digits and '?' is
    replaced by a single space; the result is then converted to lowercase.
    Leading/trailing spaces produced by the substitution are kept.
    """
    return re.sub(r'[^a-zA-Z0-9\?]+', ' ', text).lower()

# Normalize the 'text' column of every dataset in place
for _frame in (train_data, test_data, qa_data):
    _frame['text'] = _frame['text'].apply(normalize_text)

# Tokenize the normalized text into a new 'tokens' column
for _frame in (train_data, test_data, qa_data):
    _frame['tokens'] = _frame['text'].apply(word_tokenize)

# Load GloVe embeddings into a dictionary mapping word -> float32 vector.
# Each non-empty line is split on whitespace: the first field is the word,
# the remaining fields are its vector components.
embeddings_index = {}
# The encoding is given explicitly: without it open() falls back to the
# locale's default, which fails or mis-decodes the non-ASCII tokens in the
# file on systems whose default is not UTF-8.
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines (e.g. a trailing newline) instead of
            # failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Prepare tokenizer: the vocabulary is fitted on the training tokens only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['tokens'])

# Convert tokens to integer sequences for every dataset using that vocabulary
for _frame in (train_data, test_data, qa_data):
    _frame['sequences'] = tokenizer.texts_to_sequences(_frame['tokens'])

# Pad sequences: all datasets are padded to the longest training sequence
maxlen = max(train_data['sequences'].map(len))
train_padded_sequences, test_padded_sequences, qa_padded_sequences = (
    pad_sequences(_source['sequences'], maxlen=maxlen)
    for _source in (train_data, test_data, qa_data)
)

# Encode labels: the encoder learns its classes from the training
# 'label-coarse' column and is then applied to every dataset
encoder = LabelEncoder()
encoder.fit(train_data['label-coarse'])
for _frame in (train_data, test_data, qa_data):
    _frame['encoded_labels'] = encoder.transform(_frame['label-coarse'])

# Create an embedding matrix with one row per tokenizer index; the matrix
# has len(word_index) + 1 rows so that every index in word_index is a valid
# row. Rows for words missing from embeddings_index stay all-zero.
embedding_dim = 300
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for _token, _row in tokenizer.word_index.items():
    if _token in embeddings_index:
        embedding_matrix[_row] = embeddings_index[_token]