From 501b2b52b52867e252a05f637276b94ec585f3b2 Mon Sep 17 00:00:00 2001
From: Mikhail Vyrodov
Date: Tue, 21 May 2024 13:25:20 +0300
Subject: [PATCH] Review fixes

---
 README.md                                    | 86 +++++++++++++---
 goat/backend/Dockerfile                      | 14 +++
 goat/backend/__init__.py                     |  0
 .../backend/add_results.py                   | 36 ++++---
 goat/backend/docker-compose.yml              | 19 ++++
 goat/backend/eval.py                         | 24 +++++
 goat/backend/requirements.txt                |  9 ++
 goat/database/bd_init_script.sql             | 38 +++++++
 goat/frontend/__init__.py                    |  0
 .../leaderboard_web => goat/frontend}/app.py | 21 ++--
 .../utils.py => goat/frontend/precision.py   |  0
 goat/frontend/requirements.txt               |  3 +
 .../frontend/src_display.css                 | 10 --
 goat/parser/__init__.py                      |  0
 goat/{ => parser}/dataset_demonstration.py   |  0
 goat/{ => parser}/dataset_utils.py           |  0
 goat/{ => parser}/items.py                   |  0
 goat/{ => parser}/middlewares.py             |  0
 goat/{ => parser}/pipelines.py               |  0
 .../parser/requirements.txt                  |  0
 goat/{ => parser}/settings.py                |  6 +-
 goat/{ => parser}/spider_utils.py            |  0
 goat/{ => parser}/spiders/__init__.py        |  0
 goat/{ => parser}/spiders/sdamgia_spider.py  |  4 +-
 goat/utils/__init__.py                       |  0
 .../utils}/database_helper.py                | 53 +++++++---
 scrapy.cfg                                   |  4 +-
 scripts/leaderboard_backend/build_script.sh  |  4 -
 scripts/leaderboard_backend/eval.py          | 14 ---
 scripts/leaderboard_backend/eval_script.sh   | 10 --
 scripts/leaderboard_web/database_helper.py   | 98 -------------------
 31 files changed, 251 insertions(+), 202 deletions(-)
 create mode 100644 goat/backend/Dockerfile
 create mode 100644 goat/backend/__init__.py
 rename scripts/leaderboard_backend/add_result.py => goat/backend/add_results.py (79%)
 create mode 100644 goat/backend/docker-compose.yml
 create mode 100644 goat/backend/eval.py
 create mode 100644 goat/backend/requirements.txt
 create mode 100644 goat/database/bd_init_script.sql
 create mode 100644 goat/frontend/__init__.py
 rename {scripts/leaderboard_web => goat/frontend}/app.py (77%)
 rename scripts/leaderboard_web/utils.py => goat/frontend/precision.py (100%)
 create mode 100644 goat/frontend/requirements.txt
 rename scripts/leaderboard_web/src_display_css_html_js.py => goat/frontend/src_display.css (86%)
 create mode 100644 goat/parser/__init__.py
 rename goat/{ => parser}/dataset_demonstration.py (100%)
 rename goat/{ => parser}/dataset_utils.py (100%)
 rename goat/{ => parser}/items.py (100%)
 rename goat/{ => parser}/middlewares.py (100%)
 rename goat/{ => parser}/pipelines.py (100%)
 rename requirements.txt => goat/parser/requirements.txt (100%)
 rename goat/{ => parser}/settings.py (96%)
 rename goat/{ => parser}/spider_utils.py (100%)
 rename goat/{ => parser}/spiders/__init__.py (100%)
 rename goat/{ => parser}/spiders/sdamgia_spider.py (98%)
 create mode 100644 goat/utils/__init__.py
 rename {scripts/leaderboard_backend => goat/utils}/database_helper.py (62%)
 delete mode 100644 scripts/leaderboard_backend/build_script.sh
 delete mode 100644 scripts/leaderboard_backend/eval.py
 delete mode 100644 scripts/leaderboard_backend/eval_script.sh
 delete mode 100644 scripts/leaderboard_web/database_helper.py

diff --git a/README.md b/README.md
index 2661aa3..32e708c 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,18 @@
-# Parser for tasks from "Sdamgia"
+# GOAT
 
-This parser is using scrapy lib.
+This project consists of three subprojects:
+- A parser for tasks from the Russian Unified State Exam (USE);
+- A script for validating HF models on the GOAT dataset;
+- A web app that shows the leaderboard of models validated on the GOAT dataset.
+
+
+## Parser
+This parser was used to gather tasks for the GOAT dataset. It is built with the Scrapy library.
 Currently, program parses tests from the Unified State Exam (EGE or OGE)
 from the [sdamgia](https://sdamgia.ru/?redir=1) website.
 
-## Structure
+### Structure
 
 Program takes exam subject, exam type, test id and the desired output file name as command-line arguments.
 The parsing result is supposed to be stored in a jsonl file.
@@ -15,16 +22,23 @@
-Additionally, in the *goat* folder, there is a script called **dataset_demonstration.py**.
+Additionally, in the *goat/parser* folder, there is a script called **dataset_demonstration.py**.
 After you run it (instructions on how to run it are provided below),
 it will display one task of each type from the parsed test in the console.
 
-## Usage
-
-First, you need to install the necessary libraries. To do this, run the following command from the root folder:
+### Usage
+First, install the necessary libraries. To do this, run the following commands:
 
-`pip install -r requirements.txt`
+```bash
+cd goat/parser
+pip install -r requirements.txt
+```
 
-To run the parser, navigate to the goat directory
-and run the following command in the console:
+To run the parser, run the following command from the goat/parser directory:
 
-`scrapy crawl sdamgia -a subject='your exam subject' -a exam_type='your exam type' -a test_id='your test id' -O <output file>`
+```bash
+scrapy crawl sdamgia \
+    -a subject='your exam subject' \
+    -a exam_type='your exam type' \
+    -a test_id='your test id' \
+    -O <output file>
+```
 
 *your exam subject* indicates which subject the exam is in. Currently acceptable subject values are 'soc' and 'lit'.
@@ -34,8 +48,56 @@ and run the following command in the console:
 
 *output file* is file name that parser will generate or overwrite with parsing output. For example - ege_data.jsonl.
 
-To run the dataset_demonstration.py script, execute the following command in the root directory:
+To run the dataset_demonstration.py script, execute the following command from the root directory:
 
-`python .\goat\dataset_demonstration.py -f <parser output file name>`
+`python goat/parser/dataset_demonstration.py -f <parser output file name>`
 
 where *parser output file name* is the name of the jsonl file that parser has generated.
+
+## Leaderboard frontend
+
+### Structure
+The leaderboard follows a structure similar to the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
+It is a Gradio web app meant to run in a HuggingFace Space. Database connection info is taken from environment variables.
+
+### Usage
+First, install the necessary libraries. To do this, run the following commands:
+
+```bash
+cd goat/frontend
+pip install -r requirements.txt
+```
+
+In this app you can submit a validation request for your model to the
+backend database; once validation finishes, the result will appear
+in the leaderboard after reloading the app.
+
+To run the leaderboard web app, execute this command from the root directory
+(assuming you have set all environment variables needed for the database connection):
+
+`python -m goat.frontend.app`
+
+## Leaderboard backend
+
+### Structure
+After receiving a new validation request, the leaderboard backend validates
+the requested model on the GOAT dataset using the modified
+[LM Evaluation Harness benchmark](https://github.com/deepvk/lm-evaluation-harness/tree/goat) from the deepvk repository.
+After finishing validation, it adds the resulting scores to the leaderboard.
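+
+Under the hood, a validation request is simply a new row in the `eval_requests` table:
+the `trigger1` trigger from `goat/database/bd_init_script.sql` fires on insert and calls
+`pg_notify`, which wakes up the listening backend. As a minimal sketch (the model name
+below is a placeholder, and the `POSTGRES_*` environment variables must be set), a
+request can also be enqueued directly, which is what the frontend's submit button does
+through `DatabaseHelper.add_eval_request`:
+
+```python
+from goat.utils.database_helper import DatabaseHelper
+
+db = DatabaseHelper()
+# Inserting into eval_requests fires the pg_notify trigger that the
+# backend listens to, so validation starts without further action.
+db.add_eval_request("org/your-model", "float16")  # placeholder model name
+db.end_connection()
+```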
+
+### Usage
+First, install the necessary libraries. To do this, run the following commands:
+
+```bash
+cd goat/backend
+pip install -r requirements.txt
+```
+
+To run the leaderboard backend, execute this command from the root directory
+(assuming you have set all environment variables needed for the database connection):
+
+`python -m goat.backend.eval`
+
+Once started, the script listens for new validation requests in the database.
+After receiving a request, it validates the requested model on the GOAT dataset.
+Once validation completes, it adds the results to the leaderboard table in the database.
diff --git a/goat/backend/Dockerfile b/goat/backend/Dockerfile
new file mode 100644
index 0000000..7f36119
--- /dev/null
+++ b/goat/backend/Dockerfile
+FROM nvcr.io/nvidia/pytorch:24.02-py3
+
+WORKDIR /app
+COPY ./requirements.txt /app
+
+RUN apt-get update && apt-get install -y libpq-dev
+
+RUN pip install -r requirements.txt
+RUN pip uninstall -y flash-attn
+RUN pip install flash-attn==2.5.8
+
+WORKDIR /leaderboard_eval
+ENTRYPOINT ["python", "eval.py"]
diff --git a/goat/backend/__init__.py b/goat/backend/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/leaderboard_backend/add_result.py b/goat/backend/add_results.py
similarity index 79%
rename from scripts/leaderboard_backend/add_result.py
rename to goat/backend/add_results.py
index 1f0a5c1..b66b676 100644
--- a/scripts/leaderboard_backend/add_result.py
+++ b/goat/backend/add_results.py
@@ -1,10 +1,10 @@
 # type: ignore
-import argparse
 import json
 
-from database_helper import DatabaseHelper, EvalResult
 from datasets import get_dataset_config_names, load_dataset
 
+from goat.utils.database_helper import DatabaseHelper, EvalResult
+
 
 def get_datasets_len(tasks):
     datasets_len = dict()
@@ -24,17 +24,7 @@ def get_datasets_len(tasks):
     return datasets_len
 
 
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="This program redacts dataset -_-")
-    parser.add_argument("eval_result_path", type=str, help="Path to evaluation result")
-
-    args = parser.parse_args()
-    with open(args.eval_result_path, "r") as j:
-        contents = json.loads(j.read())
-    evaluation = contents["results"]
-    tasks = get_dataset_config_names("deepvk/goat")
-
-    datasets_len = get_datasets_len(tasks)
+def get_metrics_values(tasks, evaluation, datasets_len):
     metrics = [
         "multi_choice_em_unordered,get-answer",
         "word_in_set,none",
@@ -62,12 +52,19 @@
     multiple_choice_score /= datasets_len["multiple_choice"]
     word_gen_score /= datasets_len["word_gen"]
 
-    model_params = contents["config"]["model_args"].split(",")
-    model_name = None
-    for param in model_params:
-        if "pretrained" in param:
-            model_name = param[11:]
-            break
+    return single_choice_score, multiple_choice_score, word_gen_score
+
+
+def add_results(input_path):
+    with open(input_path, "r") as j:
+        contents = json.loads(j.read())
+    evaluation = contents["results"]
+    tasks = get_dataset_config_names("deepvk/goat")
+
+    datasets_len = get_datasets_len(tasks)
+    single_choice_score, multiple_choice_score, word_gen_score = get_metrics_values(tasks, evaluation, datasets_len)
+
+    model_name = contents["config"]["model"]
 
     eval_result = EvalResult(
         model=model_name,
@@ -78,3 +75,4 @@
 
     db = DatabaseHelper()
     db.add_eval_result(eval_result)
+    db.end_connection()
diff --git a/goat/backend/docker-compose.yml b/goat/backend/docker-compose.yml
new file mode 100644
index 0000000..7a1e07d
--- /dev/null
+++ b/goat/backend/docker-compose.yml
+services:
+  leaderboard:
+    image: leaderboard_eval:latest
+    volumes:
+      - /home/m.vyrodov/leaderboards/backend:/leaderboard_eval
+    stdin_open: true
+    tty: true
+    environment:
+      POSTGRES_IP: ${POSTGRES_IP}
+      POSTGRES_PORT: ${POSTGRES_PORT}  # required by goat/utils/database_helper.py
+      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
+      POSTGRES_USER: ${POSTGRES_USER}
+      POSTGRES_DB: ${POSTGRES_DB}
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
diff --git a/goat/backend/eval.py b/goat/backend/eval.py
new file mode 100644
index 0000000..6cc89ac
--- /dev/null
+++ b/goat/backend/eval.py
+import json
+import os
+
+from lm_eval import evaluator
+from lm_eval.models.huggingface import HFLM
+
+from goat.backend.add_results import add_results
+from goat.utils.database_helper import DatabaseHelper
+
+
+def eval(model_name: str, precision: str):
+    lm = HFLM(pretrained=model_name, dtype=precision)
+    taskname = "goat"
+    results = evaluator.simple_evaluate(model=lm, tasks=[taskname])
+
+    filename = model_name.replace("/", "__")
+    os.makedirs("results", exist_ok=True)  # make sure the output directory exists
+    with open(f"results/{filename}.json", "w", encoding="utf-8") as f:
+        json.dump(results, f, ensure_ascii=False)
+
+    add_results(input_path=f"results/{filename}.json")
+
+
+if __name__ == "__main__":
+    db_helper = DatabaseHelper()
+    db_helper.listen_to_new_requests(eval)
diff --git a/goat/backend/requirements.txt b/goat/backend/requirements.txt
new file mode 100644
index 0000000..6bf15e0
--- /dev/null
+++ b/goat/backend/requirements.txt
+flash-attn==2.5.8
+lm_eval @ git+https://github.com/deepvk/lm-evaluation-harness@goat
+psycopg2==2.9.9
+SQLAlchemy==2.0.29
+torch==2.2.0
+torchdata==0.7.1
+torchtext==0.17.0
+torchvision==0.17.0
+transformer_engine==0.0.0
diff --git a/goat/database/bd_init_script.sql b/goat/database/bd_init_script.sql
new file mode 100644
index 0000000..d9285a3
--- /dev/null
+++ b/goat/database/bd_init_script.sql
+create table if not exists public.leaderboard
+(
+    model           varchar not null
+        primary key,
+    single_choice   double precision,
+    multiple_choice double precision,
+    word_gen        double precision
+);
+
+alter table public.leaderboard
+    owner to habrpguser;
+
+create table if not exists public.eval_requests
+(
+    id serial
+        constraint eval_requests_pk
+            primary key,
+    model_name varchar not null,
+    precision  varchar not null
+);
+
+alter table public.eval_requests
+    owner to habrpguser;
+
+create or replace function notify_id_trigger()
+returns trigger as $$
+begin
+    perform pg_notify('id'::text, NEW."id"::text);
+    return new;
+end;
+$$ language plpgsql;
+
+create trigger trigger1
+after insert or update on public."eval_requests"
+for each row execute procedure notify_id_trigger();
diff --git a/goat/frontend/__init__.py b/goat/frontend/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/leaderboard_web/app.py b/goat/frontend/app.py
similarity index 77%
rename from scripts/leaderboard_web/app.py
rename to goat/frontend/app.py
index 5c8ce15..0f0a0e3 100644
--- a/scripts/leaderboard_web/app.py
+++ b/goat/frontend/app.py
@@ -1,19 +1,21 @@
 # type: ignore
 import gradio as gr
-from database_helper import DatabaseHelper
-from src_display_css_html_js import custom_css
-from utils import Precision
+
+from goat.frontend.precision import Precision
+from goat.utils.database_helper import DatabaseHelper
 
 TITLE = "Goat leaderboard"
 INTRODUCTION_TEXT = "This is really nice introduction text!!!"
EVALUATION_QUEUE_TEXT = "there is evaluation queue" + db_helper = DatabaseHelper() leaderboard_df = db_helper.get_leaderboard_df() -demo = gr.Blocks(css=custom_css) +demo = gr.Blocks(css="src_display.css") with demo: gr.HTML(TITLE) gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text") @@ -43,20 +45,11 @@ value="float16", interactive=True, ) - num_fewshot = gr.Number( - label="Fewshot number", - minimum=0, - maximum=5, - step=1, - value=5, - interactive=True, - ) - submit_button = gr.Button("Submit Eval") submission_result = gr.Markdown() submit_button.click( db_helper.add_eval_request, - [model_name, model_precision, num_fewshot], + [model_name, model_precision], submission_result, ) diff --git a/scripts/leaderboard_web/utils.py b/goat/frontend/precision.py similarity index 100% rename from scripts/leaderboard_web/utils.py rename to goat/frontend/precision.py diff --git a/goat/frontend/requirements.txt b/goat/frontend/requirements.txt new file mode 100644 index 0000000..341cfe5 --- /dev/null +++ b/goat/frontend/requirements.txt @@ -0,0 +1,3 @@ +gradio==4.31.3 +psycopg2==2.9.9 +SQLAlchemy==2.0.29 diff --git a/scripts/leaderboard_web/src_display_css_html_js.py b/goat/frontend/src_display.css similarity index 86% rename from scripts/leaderboard_web/src_display_css_html_js.py rename to goat/frontend/src_display.css index 4b0511c..cc385f1 100644 --- a/scripts/leaderboard_web/src_display_css_html_js.py +++ b/goat/frontend/src_display.css @@ -1,4 +1,3 @@ -custom_css = """ /* Hides the final AutoEvalColumn */ #llm-benchmark-tab-table table td:last-child, #llm-benchmark-tab-table table th:last-child { @@ -86,12 +85,3 @@ #box-filter > .form{ border: 0 } -""" - -get_window_url_params = """ - function(url_params) { - const params = new URLSearchParams(window.location.search); - url_params = Object.fromEntries(params); - return url_params; - } - """ diff --git a/goat/parser/__init__.py b/goat/parser/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/goat/dataset_demonstration.py b/goat/parser/dataset_demonstration.py similarity index 100% rename from goat/dataset_demonstration.py rename to goat/parser/dataset_demonstration.py diff --git a/goat/dataset_utils.py b/goat/parser/dataset_utils.py similarity index 100% rename from goat/dataset_utils.py rename to goat/parser/dataset_utils.py diff --git a/goat/items.py b/goat/parser/items.py similarity index 100% rename from goat/items.py rename to goat/parser/items.py diff --git a/goat/middlewares.py b/goat/parser/middlewares.py similarity index 100% rename from goat/middlewares.py rename to goat/parser/middlewares.py diff --git a/goat/pipelines.py b/goat/parser/pipelines.py similarity index 100% rename from goat/pipelines.py rename to goat/parser/pipelines.py diff --git a/requirements.txt b/goat/parser/requirements.txt similarity index 100% rename from requirements.txt rename to goat/parser/requirements.txt diff --git a/goat/settings.py b/goat/parser/settings.py similarity index 96% rename from goat/settings.py rename to goat/parser/settings.py index 46556e2..fbd56ab 100644 --- a/goat/settings.py +++ b/goat/parser/settings.py @@ -7,10 +7,10 @@ # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html -BOT_NAME = "goat" +BOT_NAME = "parser" -SPIDER_MODULES = ["goat.spiders"] -NEWSPIDER_MODULE = "goat.spiders" +SPIDER_MODULES = ["goat.parser.spiders"] +NEWSPIDER_MODULE = "goat.parser.spiders" # Crawl responsibly by identifying yourself (and your website) on 
the user-agent diff --git a/goat/spider_utils.py b/goat/parser/spider_utils.py similarity index 100% rename from goat/spider_utils.py rename to goat/parser/spider_utils.py diff --git a/goat/spiders/__init__.py b/goat/parser/spiders/__init__.py similarity index 100% rename from goat/spiders/__init__.py rename to goat/parser/spiders/__init__.py diff --git a/goat/spiders/sdamgia_spider.py b/goat/parser/spiders/sdamgia_spider.py similarity index 98% rename from goat/spiders/sdamgia_spider.py rename to goat/parser/spiders/sdamgia_spider.py index bbf70eb..93de7b0 100644 --- a/goat/spiders/sdamgia_spider.py +++ b/goat/parser/spiders/sdamgia_spider.py @@ -3,8 +3,8 @@ import scrapy from scrapy.http import Response -from goat.items import SdamgiaTaskItem -from goat.spider_utils import * +from goat.parser.items import SdamgiaTaskItem +from goat.parser.spider_utils import * class SdamgiaSpider(scrapy.Spider): diff --git a/goat/utils/__init__.py b/goat/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/leaderboard_backend/database_helper.py b/goat/utils/database_helper.py similarity index 62% rename from scripts/leaderboard_backend/database_helper.py rename to goat/utils/database_helper.py index c099107..4b3f963 100644 --- a/scripts/leaderboard_backend/database_helper.py +++ b/goat/utils/database_helper.py @@ -1,4 +1,5 @@ # type: ignore +import os import select from dataclasses import dataclass @@ -8,6 +9,12 @@ from sqlalchemy import MetaData, Table, create_engine, insert, text from sqlalchemy.orm import sessionmaker +POSTGRES_USER = os.environ.get("POSTGRES_USER") +POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD") +POSTGRES_DB = os.environ.get("POSTGRES_DB") +POSTGRES_IP = os.environ.get("POSTGRES_IP") +POSTGRES_PORT = os.environ.get("POSTGRES_PORT") + @dataclass class EvalResult: @@ -17,14 +24,20 @@ class EvalResult: word_gen: float +@dataclass +class EvalRequest: + model_name: str + precision: str + + class DatabaseHelper: def __init__(self): self.engine = create_engine( - "postgresql+psycopg2://{username}:{passwd}@{ip}:5432/{db}", + f"postgresql+psycopg2://{POSTGRES_USER}:{POSTGRES_PASSWORD}@{POSTGRES_IP}:{POSTGRES_PORT}/{POSTGRES_DB}", echo=True, ) self.engine.connect() - conn_string = "dbname={db} user={username} password={passwd} port='5432' host={ip}" + conn_string = f"dbname='{POSTGRES_DB}' user='{POSTGRES_USER}' password='{POSTGRES_PASSWORD}' port='{POSTGRES_PORT}' host='{POSTGRES_IP}'" self.connection = psycopg2.connect(conn_string) self.connection.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) Session = sessionmaker(bind=self.engine) @@ -32,6 +45,22 @@ def __init__(self): metadata = MetaData() metadata.reflect(bind=self.engine) self.leaderboard = Table("leaderboard", metadata, autoload_with=self.engine) + self.eval_requests = Table("eval_requests", metadata, autoload_with=self.engine) + + def add_eval_request(self, model_name, precision): + request = insert(self.eval_requests).values(model_name=model_name, precision=precision) + self.session.execute(request) + self.session.commit() + + def add_eval_result(self, eval_result): + stmt = insert(self.leaderboard).values( + model=eval_result.model, + single_choice=eval_result.single_choice, + multiple_choice=eval_result.multiple_choice, + word_gen=eval_result.word_gen, + ) + self.session.execute(stmt) + self.session.commit() def listen_to_new_requests(self, action): cur = self.connection.cursor() @@ -43,19 +72,15 @@ def listen_to_new_requests(self, action): notify = self.connection.notifies.pop() query = 
"SELECT * FROM eval_requests" df = pd.DataFrame(self.engine.connect().execute(text(query))) - model, precision, num_fewshot = ( + model, precision = ( df.loc[df["id"] == int(notify.payload)]["model_name"].to_string(index=False), df.loc[df["id"] == int(notify.payload)]["precision"].to_string(index=False), - df.loc[df["id"] == int(notify.payload)]["num_fewshot"].to_string(index=False), ) - action(model, precision, num_fewshot) + action(model, precision) - def add_eval_result(self, eval_result): - stmt = insert(self.leaderboard).values( - model=eval_result.model, - single_choice=eval_result.single_choice, - multiple_choice=eval_result.multiple_choice, - word_gen=eval_result.word_gen, - ) - self.session.execute(stmt) - self.session.commit() + def get_leaderboard_df(self): + df = pd.read_sql_table("leaderboard", self.engine) + return df + + def end_connection(self): + self.connection.close() diff --git a/scrapy.cfg b/scrapy.cfg index de75b48..60cf7e0 100644 --- a/scrapy.cfg +++ b/scrapy.cfg @@ -4,8 +4,8 @@ # https://scrapyd.readthedocs.io/en/latest/deploy.html [settings] -default = goat.settings +default = goat.parser.settings [deploy] #url = http://localhost:6800/ -project = goat +project = parser diff --git a/scripts/leaderboard_backend/build_script.sh b/scripts/leaderboard_backend/build_script.sh deleted file mode 100644 index 17f70a5..0000000 --- a/scripts/leaderboard_backend/build_script.sh +++ /dev/null @@ -1,4 +0,0 @@ -#! /usr/bin/bash - -cd ../lm-evaluation-harness -pip install -e . diff --git a/scripts/leaderboard_backend/eval.py b/scripts/leaderboard_backend/eval.py deleted file mode 100644 index 46b5173..0000000 --- a/scripts/leaderboard_backend/eval.py +++ /dev/null @@ -1,14 +0,0 @@ -# type: ignore -import subprocess - -from database_helper import DatabaseHelper - - -def eval_model(model_name: str, precision: str, num_fewshot: str): - subprocess.run(["./eval_script.sh", model_name, precision, num_fewshot]) - - -if __name__ == "__main__": - db_helper = DatabaseHelper() - subprocess.run(["./build_script.sh"]) - db_helper.listen_to_new_requests(eval_model) diff --git a/scripts/leaderboard_backend/eval_script.sh b/scripts/leaderboard_backend/eval_script.sh deleted file mode 100644 index 6a74ec0..0000000 --- a/scripts/leaderboard_backend/eval_script.sh +++ /dev/null @@ -1,10 +0,0 @@ -#! 
/usr/bin/bash - -model_name=$1 -replaced_model_name=${model_name//\//__} - -cd ../lm-evaluation-harness -lm_eval --model hf --model_args pretrained="$model_name",dtype="$2" --num_fewshot "$3" --tasks goat --device cuda --output_path "results/$replaced_model_name" --log_samples - -cd ../leaderboard_evaluation -python add_result.py "../lm-evaluation-harness/results/$replaced_model_name/results.json" diff --git a/scripts/leaderboard_web/database_helper.py b/scripts/leaderboard_web/database_helper.py deleted file mode 100644 index 64cdab1..0000000 --- a/scripts/leaderboard_web/database_helper.py +++ /dev/null @@ -1,98 +0,0 @@ -# type: ignore -import select - -import pandas as pd -import psycopg2 -from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT -from sqlalchemy import Column, Float, Integer, String, create_engine -from sqlalchemy.orm import declarative_base, sessionmaker - -Base = declarative_base() - - -class Leaderboard(Base): - __tablename__ = "leaderboard" - - model_name = Column("model", String, primary_key=True) - single_choice = Column("single_choice", Float) - multiple_choice = Column("multiple_choice", Float) - word_gen = Column("word_gen", Float) - - def __init__( - self, - model_name, - single_choice_score, - mult_choice_score, - word_gen_score, - ): - self.model_name = model_name - self.single_choice = single_choice_score - self.multiple_choice = mult_choice_score - self.word_gen = word_gen_score - - def __repr__(self): - return ( - f"{self.model_name}:\n" - f"{self.single_choice} acc on single choice tasks;\n" - f"{self.multiple_choice} metric score on multiple choice tasks;\n" - f"{self.word_gen} metric score on word generation tasks." - ) - - -class EvalRequest(Base): - __tablename__ = "eval_requests" - - id = Column(Integer, primary_key=True) - model_name = Column("model_name", String) - precision = Column("precision", String) - num_fewshot = Column("num_fewshot", Integer) - - def __init__(self, model_name, precision, num_fewshot): - self.model_name = model_name - self.precision = precision - self.num_fewshot = num_fewshot - - def __repr__(self): - return ( - f"Evaluation request on model {self.model_name}\n" - f"with {self.precision} precision and {self.num_fewshot}-shot prompt." - ) - - -class DatabaseHelper: - def __init__(self): - self.engine = create_engine( - "postgresql+psycopg2://{username}:{passwd}@{ip}:5432/{db}", - echo=True, - ) - self.engine.connect() - conn_string = "dbname={db} user={username} password={passwd} port='5432' host={ip}" - self.connection = psycopg2.connect(conn_string) - self.connection.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) - Base.metadata.create_all(bind=self.engine) - Session = sessionmaker(bind=self.engine) - self.session = Session() - - def add_eval_request(self, model_name, precision, num_fewshot): - request = EvalRequest(model_name, precision, num_fewshot) - self.session.add(request) - self.session.commit() - - def listen_to(self): - cur = self.connection.cursor() - cur.execute("LISTEN id;") - while True: - select.select([self.connection], [], []) - self.connection.poll() - while self.connection.notifies: - notify = self.connection.notifies.pop() - print("Got NOTIFY:", notify.pid, notify.channel, notify.payload) - - def get_leaderboard_df(self): - df = pd.read_sql_table("leaderboard", self.engine) - # For proper displaying - df["useless"] = 0 - return df - - def end_connection(self): - self.connection.close()