From 501b2b52b52867e252a05f637276b94ec585f3b2 Mon Sep 17 00:00:00 2001
From: Mikhail Vyrodov
Date: Tue, 21 May 2024 13:25:20 +0300
Subject: [PATCH] Review fixes

---
 README.md                                    | 86 +++++++++++++---
 goat/backend/Dockerfile                      | 14 +++
 goat/backend/__init__.py                     |  0
 .../backend/add_results.py                   | 36 ++++---
 goat/backend/docker-compose.yml              | 19 ++++
 goat/backend/eval.py                         | 24 +++++
 goat/backend/requirements.txt                |  9 ++
 goat/database/bd_init_script.sql             | 38 +++++++
 goat/frontend/__init__.py                    |  0
 .../leaderboard_web => goat/frontend}/app.py | 21 ++--
 .../utils.py => goat/frontend/precision.py   |  0
 goat/frontend/requirements.txt               |  3 +
 .../frontend/src_display.css                 | 10 --
 goat/parser/__init__.py                      |  0
 goat/{ => parser}/dataset_demonstration.py   |  0
 goat/{ => parser}/dataset_utils.py           |  0
 goat/{ => parser}/items.py                   |  0
 goat/{ => parser}/middlewares.py             |  0
 goat/{ => parser}/pipelines.py               |  0
 .../parser/requirements.txt                  |  0
 goat/{ => parser}/settings.py                |  6 +-
 goat/{ => parser}/spider_utils.py            |  0
 goat/{ => parser}/spiders/__init__.py        |  0
 goat/{ => parser}/spiders/sdamgia_spider.py  |  4 +-
 goat/utils/__init__.py                       |  0
 .../utils}/database_helper.py                | 53 +++++++---
 scrapy.cfg                                   |  4 +-
 scripts/leaderboard_backend/build_script.sh  |  4 -
 scripts/leaderboard_backend/eval.py          | 14 ---
 scripts/leaderboard_backend/eval_script.sh   | 10 --
 scripts/leaderboard_web/database_helper.py   | 98 -------------------
 31 files changed, 251 insertions(+), 202 deletions(-)
 create mode 100644 goat/backend/Dockerfile
 create mode 100644 goat/backend/__init__.py
 rename scripts/leaderboard_backend/add_result.py => goat/backend/add_results.py (79%)
 create mode 100644 goat/backend/docker-compose.yml
 create mode 100644 goat/backend/eval.py
 create mode 100644 goat/backend/requirements.txt
 create mode 100644 goat/database/bd_init_script.sql
 create mode 100644 goat/frontend/__init__.py
 rename {scripts/leaderboard_web => goat/frontend}/app.py (77%)
 rename scripts/leaderboard_web/utils.py => goat/frontend/precision.py (100%)
 create mode 100644 goat/frontend/requirements.txt
 rename scripts/leaderboard_web/src_display_css_html_js.py => goat/frontend/src_display.css (86%)
 create mode 100644 goat/parser/__init__.py
 rename goat/{ => parser}/dataset_demonstration.py (100%)
 rename goat/{ => parser}/dataset_utils.py (100%)
 rename goat/{ => parser}/items.py (100%)
 rename goat/{ => parser}/middlewares.py (100%)
 rename goat/{ => parser}/pipelines.py (100%)
 rename requirements.txt => goat/parser/requirements.txt (100%)
 rename goat/{ => parser}/settings.py (96%)
 rename goat/{ => parser}/spider_utils.py (100%)
 rename goat/{ => parser}/spiders/__init__.py (100%)
 rename goat/{ => parser}/spiders/sdamgia_spider.py (98%)
 create mode 100644 goat/utils/__init__.py
 rename {scripts/leaderboard_backend => goat/utils}/database_helper.py (62%)
 delete mode 100644 scripts/leaderboard_backend/build_script.sh
 delete mode 100644 scripts/leaderboard_backend/eval.py
 delete mode 100644 scripts/leaderboard_backend/eval_script.sh
 delete mode 100644 scripts/leaderboard_web/database_helper.py

diff --git a/README.md b/README.md
index 2661aa3..32e708c 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,18 @@
-# Parser for tasks from "Sdamgia"
+# GOAT
 
-This parser is using scrapy lib.
+This project consists of three subprojects:
+- A parser for tasks from the Russian Unified State Exam (USE);
+- A script for validating HF models on the GOAT dataset;
+- A web app that shows the leaderboard of models validated on the GOAT dataset.
+
+
+## Parser
+This parser was used to gather tasks for the GOAT dataset. It is built with the Scrapy library.
 Currently, program parses tests from the Unified State Exam (EGE or OGE)
 from the [sdamgia](https://sdamgia.ru/?redir=1) website.
 
-## Structure
+### Structure
 
 Program takes exam subject, exam type, test id and the desired output file name as command-line arguments.
 The parsing result is supposed to be stored in a jsonl file.
@@ -15,16 +22,23 @@
-Additionally, in the *goat* folder, there is a script called **dataset_demonstration.py**.
+Additionally, in the *goat/parser* folder, there is a script called **dataset_demonstration.py**.
 After you run it (instructions on how to run it are provided below),
 it will display one task of each type from the parsed test in the console.
 
-## Usage
-
-First, you need to install the necessary libraries. To do this, run the following command from the root folder:
+### Usage
+First, install the necessary libraries. To do this, run the following commands:
 
-`pip install -r requirements.txt`
+```bash
+cd goat/parser
+pip install -r requirements.txt
+```
 
-To run the parser, navigate to the goat directory
-and run the following command in the console:
+To run the parser, run the following command from the goat/parser directory:
 
-`scrapy crawl sdamgia -a subject='your exam subject' -a exam_type='your exam type' -a test_id='your test id' -O <output file>`
+```bash
+scrapy crawl sdamgia \
+    -a subject='your exam subject' \
+    -a exam_type='your exam type' \
+    -a test_id='your test id' \
+    -O <output file>
+```
 
 *your exam subject* indicates which subject the exam is in. Currently acceptable subject values are 'soc' and 'lit'.
@@ -34,8 +48,56 @@ and run the following command in the console:
 
 *output file* is file name that parser will generate or overwrite with parsing output. For example - ege_data.jsonl.
 
-To run the dataset_demonstration.py script, execute the following command in the root directory:
+To run the dataset_demonstration.py script, execute the following command from the root directory:
 
-`python .\goat\dataset_demonstration.py -f <parser output file name>`
+`python goat/parser/dataset_demonstration.py -f <parser output file name>`
 
 where *parser output file name* is the name of the jsonl file that parser has generated.
+
+## Leaderboard frontend
+
+### Structure
+The leaderboard follows a structure similar to the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
+It is a Gradio web app meant to run in a HuggingFace Space. Database connection info is taken from environment variables.
+
+### Usage
+First, install the necessary libraries. To do this, run the following commands:
+
+```bash
+cd goat/frontend
+pip install -r requirements.txt
+```
+
+In this app you can submit a validation request for your model to the
+backend database; once validation finishes, the result will appear
+in the leaderboard after reloading the app.
+
+To run the leaderboard web app, execute this command from the root directory
+(assuming you have set all environment variables needed for the database connection):
+
+`python -m goat.frontend.app`
+
+## Leaderboard backend
+
+### Structure
+After receiving a new validation request, the leaderboard backend validates
+the requested model on the GOAT dataset using the modified
+[LM Evaluation Harness benchmark](https://github.com/deepvk/lm-evaluation-harness/tree/goat) from the deepvk repository.
+After finishing validation, it adds the resulting scores to the leaderboard.
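+
+Under the hood, a validation request is simply a new row in the `eval_requests` table:
+the `trigger1` trigger from `goat/database/bd_init_script.sql` fires on insert and calls
+`pg_notify`, which wakes up the listening backend. As a minimal sketch (the model name
+below is a placeholder, and the `POSTGRES_*` environment variables must be set), a
+request can also be enqueued directly, which is what the frontend's submit button does
+through `DatabaseHelper.add_eval_request`:
+
+```python
+from goat.utils.database_helper import DatabaseHelper
+
+db = DatabaseHelper()
+# Inserting into eval_requests fires the pg_notify trigger that the
+# backend listens to, so validation starts without further action.
+db.add_eval_request("org/your-model", "float16")  # placeholder model name
+db.end_connection()
+```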
+
+### Usage
+First, install the necessary libraries. To do this, run the following commands:
+
+```bash
+cd goat/backend
+pip install -r requirements.txt
+```
+
+To run the leaderboard backend, execute this command from the root directory
+(assuming you have set all environment variables needed for the database connection):
+
+`python -m goat.backend.eval`
+
+Once started, the script listens for new validation requests in the database.
+After receiving a request, it validates the requested model on the GOAT dataset.
+Once validation completes, it adds the results to the leaderboard table in the database.
diff --git a/goat/backend/Dockerfile b/goat/backend/Dockerfile
new file mode 100644
index 0000000..7f36119
--- /dev/null
+++ b/goat/backend/Dockerfile
+FROM nvcr.io/nvidia/pytorch:24.02-py3
+
+WORKDIR /app
+COPY ./requirements.txt /app
+
+RUN apt-get update && apt-get install -y libpq-dev
+
+RUN pip install -r requirements.txt
+RUN pip uninstall -y flash-attn
+RUN pip install flash-attn==2.5.8
+
+WORKDIR /leaderboard_eval
+ENTRYPOINT ["python", "eval.py"]
diff --git a/goat/backend/__init__.py b/goat/backend/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/leaderboard_backend/add_result.py b/goat/backend/add_results.py
similarity index 79%
rename from scripts/leaderboard_backend/add_result.py
rename to goat/backend/add_results.py
index 1f0a5c1..b66b676 100644
--- a/scripts/leaderboard_backend/add_result.py
+++ b/goat/backend/add_results.py
@@ -1,10 +1,10 @@
 # type: ignore
-import argparse
 import json
 
-from database_helper import DatabaseHelper, EvalResult
 from datasets import get_dataset_config_names, load_dataset
 
+from goat.utils.database_helper import DatabaseHelper, EvalResult
+
 
 def get_datasets_len(tasks):
     datasets_len = dict()
@@ -24,17 +24,7 @@ def get_datasets_len(tasks):
     return datasets_len
 
 
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="This program redacts dataset -_-")
-    parser.add_argument("eval_result_path", type=str, help="Path to evaluation result")
-
-    args = parser.parse_args()
-    with open(args.eval_result_path, "r") as j:
-        contents = json.loads(j.read())
-    evaluation = contents["results"]
-    tasks = get_dataset_config_names("deepvk/goat")
-
-    datasets_len = get_datasets_len(tasks)
+def get_metrics_values(tasks, evaluation, datasets_len):
     metrics = [
         "multi_choice_em_unordered,get-answer",
         "word_in_set,none",
@@ -62,12 +52,19 @@
     multiple_choice_score /= datasets_len["multiple_choice"]
     word_gen_score /= datasets_len["word_gen"]
 
-    model_params = contents["config"]["model_args"].split(",")
-    model_name = None
-    for param in model_params:
-        if "pretrained" in param:
-            model_name = param[11:]
-            break
+    return single_choice_score, multiple_choice_score, word_gen_score
+
+
+def add_results(input_path):
+    with open(input_path, "r") as j:
+        contents = json.loads(j.read())
+    evaluation = contents["results"]
+    tasks = get_dataset_config_names("deepvk/goat")
+
+    datasets_len = get_datasets_len(tasks)
+    single_choice_score, multiple_choice_score, word_gen_score = get_metrics_values(tasks, evaluation, datasets_len)
+
+    model_name = contents["config"]["model"]
 
     eval_result = EvalResult(
         model=model_name,
@@ -78,3 +75,4 @@
 
     db = DatabaseHelper()
     db.add_eval_result(eval_result)
+    db.end_connection()
diff --git a/goat/backend/docker-compose.yml b/goat/backend/docker-compose.yml
new file mode 100644
index 0000000..7a1e07d
--- /dev/null
+++ b/goat/backend/docker-compose.yml
+services:
+  leaderboard:
+    image: leaderboard_eval:latest
+    volumes:
+      - /home/m.vyrodov/leaderboards/backend:/leaderboard_eval
+    stdin_open: true
+    tty: true
+    environment:
+      POSTGRES_IP: ${POSTGRES_IP}
+      POSTGRES_PORT: ${POSTGRES_PORT}  # required by goat/utils/database_helper.py
+      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
+      POSTGRES_USER: ${POSTGRES_USER}
+      POSTGRES_DB: ${POSTGRES_DB}
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
diff --git a/goat/backend/eval.py b/goat/backend/eval.py
new file mode 100644
index 0000000..6cc89ac
--- /dev/null
+++ b/goat/backend/eval.py
+import json
+import os
+
+from lm_eval import evaluator
+from lm_eval.models.huggingface import HFLM
+
+from goat.backend.add_results import add_results
+from goat.utils.database_helper import DatabaseHelper
+
+
+def eval(model_name: str, precision: str):
+    lm = HFLM(pretrained=model_name, dtype=precision)
+    taskname = "goat"
+    results = evaluator.simple_evaluate(model=lm, tasks=[taskname])
+
+    filename = model_name.replace("/", "__")
+    os.makedirs("results", exist_ok=True)  # make sure the output directory exists
+    with open(f"results/{filename}.json", "w", encoding="utf-8") as f:
+        json.dump(results, f, ensure_ascii=False)
+
+    add_results(input_path=f"results/{filename}.json")
+
+
+if __name__ == "__main__":
+    db_helper = DatabaseHelper()
+    db_helper.listen_to_new_requests(eval)
diff --git a/goat/backend/requirements.txt b/goat/backend/requirements.txt
new file mode 100644
index 0000000..6bf15e0
--- /dev/null
+++ b/goat/backend/requirements.txt
+flash-attn==2.5.8
+lm_eval @ git+https://github.com/deepvk/lm-evaluation-harness@goat
+psycopg2==2.9.9
+SQLAlchemy==2.0.29
+torch==2.2.0
+torchdata==0.7.1
+torchtext==0.17.0
+torchvision==0.17.0
+transformer_engine==0.0.0
diff --git a/goat/database/bd_init_script.sql b/goat/database/bd_init_script.sql
new file mode 100644
index 0000000..d9285a3
--- /dev/null
+++ b/goat/database/bd_init_script.sql
+create table if not exists public.leaderboard
+(
+    model           varchar not null
+        primary key,
+    single_choice   double precision,
+    multiple_choice double precision,
+    word_gen        double precision
+);
+
+alter table public.leaderboard
+    owner to habrpguser;
+
+create table if not exists public.eval_requests
+(
+    id serial
+        constraint eval_requests_pk
+            primary key,
+    model_name varchar not null,
+    precision  varchar not null
+);
+
+alter table public.eval_requests
+    owner to habrpguser;
+
+create or replace function notify_id_trigger()
+returns trigger as $$
+begin
+    perform pg_notify('id'::text, NEW."id"::text);
+    return new;
+end;
+$$ language plpgsql;
+
+create trigger trigger1
+after insert or update on public."eval_requests"
+for each row execute procedure notify_id_trigger();
diff --git a/goat/frontend/__init__.py b/goat/frontend/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/leaderboard_web/app.py b/goat/frontend/app.py
similarity index 77%
rename from scripts/leaderboard_web/app.py
rename to goat/frontend/app.py
index 5c8ce15..0f0a0e3 100644
--- a/scripts/leaderboard_web/app.py
+++ b/goat/frontend/app.py
@@ -1,19 +1,21 @@
 # type: ignore
 import gradio as gr
-from database_helper import DatabaseHelper
-from src_display_css_html_js import custom_css
-from utils import Precision
+
+from goat.frontend.precision import Precision
+from goat.utils.database_helper import DatabaseHelper
 
 TITLE = "Goat leaderboard"
 INTRODUCTION_TEXT = "This is really nice introduction text!!!"
EVALUATION_QUEUE_TEXT = "there is evaluation queue" + db_helper = DatabaseHelper() leaderboard_df = db_helper.get_leaderboard_df() -demo = gr.Blocks(css=custom_css) +demo = gr.Blocks(css="src_display.css") with demo: gr.HTML(TITLE) gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text") @@ -43,20 +45,11 @@ value="float16", interactive=True, ) - num_fewshot = gr.Number( - label="Fewshot number", - minimum=0, - maximum=5, - step=1, - value=5, - interactive=True, - ) - submit_button = gr.Button("Submit Eval") submission_result = gr.Markdown() submit_button.click( db_helper.add_eval_request, - [model_name, model_precision, num_fewshot], + [model_name, model_precision], submission_result, ) diff --git a/scripts/leaderboard_web/utils.py b/goat/frontend/precision.py similarity index 100% rename from scripts/leaderboard_web/utils.py rename to goat/frontend/precision.py diff --git a/goat/frontend/requirements.txt b/goat/frontend/requirements.txt new file mode 100644 index 0000000..341cfe5 --- /dev/null +++ b/goat/frontend/requirements.txt @@ -0,0 +1,3 @@ +gradio==4.31.3 +psycopg2==2.9.9 +SQLAlchemy==2.0.29 diff --git a/scripts/leaderboard_web/src_display_css_html_js.py b/goat/frontend/src_display.css similarity index 86% rename from scripts/leaderboard_web/src_display_css_html_js.py rename to goat/frontend/src_display.css index 4b0511c..cc385f1 100644 --- a/scripts/leaderboard_web/src_display_css_html_js.py +++ b/goat/frontend/src_display.css @@ -1,4 +1,3 @@ -custom_css = """ /* Hides the final AutoEvalColumn */ #llm-benchmark-tab-table table td:last-child, #llm-benchmark-tab-table table th:last-child { @@ -86,12 +85,3 @@ #box-filter > .form{ border: 0 } -""" - -get_window_url_params = """ - function(url_params) { - const params = new URLSearchParams(window.location.search); - url_params = Object.fromEntries(params); - return url_params; - } - """ diff --git a/goat/parser/__init__.py b/goat/parser/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/goat/dataset_demonstration.py b/goat/parser/dataset_demonstration.py similarity index 100% rename from goat/dataset_demonstration.py rename to goat/parser/dataset_demonstration.py diff --git a/goat/dataset_utils.py b/goat/parser/dataset_utils.py similarity index 100% rename from goat/dataset_utils.py rename to goat/parser/dataset_utils.py diff --git a/goat/items.py b/goat/parser/items.py similarity index 100% rename from goat/items.py rename to goat/parser/items.py diff --git a/goat/middlewares.py b/goat/parser/middlewares.py similarity index 100% rename from goat/middlewares.py rename to goat/parser/middlewares.py diff --git a/goat/pipelines.py b/goat/parser/pipelines.py similarity index 100% rename from goat/pipelines.py rename to goat/parser/pipelines.py diff --git a/requirements.txt b/goat/parser/requirements.txt similarity index 100% rename from requirements.txt rename to goat/parser/requirements.txt diff --git a/goat/settings.py b/goat/parser/settings.py similarity index 96% rename from goat/settings.py rename to goat/parser/settings.py index 46556e2..fbd56ab 100644 --- a/goat/settings.py +++ b/goat/parser/settings.py @@ -7,10 +7,10 @@ # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html -BOT_NAME = "goat" +BOT_NAME = "parser" -SPIDER_MODULES = ["goat.spiders"] -NEWSPIDER_MODULE = "goat.spiders" +SPIDER_MODULES = ["goat.parser.spiders"] +NEWSPIDER_MODULE = "goat.parser.spiders" # Crawl responsibly by identifying yourself (and your website) on 
the user-agent diff --git a/goat/spider_utils.py b/goat/parser/spider_utils.py similarity index 100% rename from goat/spider_utils.py rename to goat/parser/spider_utils.py diff --git a/goat/spiders/__init__.py b/goat/parser/spiders/__init__.py similarity index 100% rename from goat/spiders/__init__.py rename to goat/parser/spiders/__init__.py diff --git a/goat/spiders/sdamgia_spider.py b/goat/parser/spiders/sdamgia_spider.py similarity index 98% rename from goat/spiders/sdamgia_spider.py rename to goat/parser/spiders/sdamgia_spider.py index bbf70eb..93de7b0 100644 --- a/goat/spiders/sdamgia_spider.py +++ b/goat/parser/spiders/sdamgia_spider.py @@ -3,8 +3,8 @@ import scrapy from scrapy.http import Response -from goat.items import SdamgiaTaskItem -from goat.spider_utils import * +from goat.parser.items import SdamgiaTaskItem +from goat.parser.spider_utils import * class SdamgiaSpider(scrapy.Spider): diff --git a/goat/utils/__init__.py b/goat/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/leaderboard_backend/database_helper.py b/goat/utils/database_helper.py similarity index 62% rename from scripts/leaderboard_backend/database_helper.py rename to goat/utils/database_helper.py index c099107..4b3f963 100644 --- a/scripts/leaderboard_backend/database_helper.py +++ b/goat/utils/database_helper.py @@ -1,4 +1,5 @@ # type: ignore +import os import select from dataclasses import dataclass @@ -8,6 +9,12 @@ from sqlalchemy import MetaData, Table, create_engine, insert, text from sqlalchemy.orm import sessionmaker +POSTGRES_USER = os.environ.get("POSTGRES_USER") +POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD") +POSTGRES_DB = os.environ.get("POSTGRES_DB") +POSTGRES_IP = os.environ.get("POSTGRES_IP") +POSTGRES_PORT = os.environ.get("POSTGRES_PORT") + @dataclass class EvalResult: @@ -17,14 +24,20 @@ class EvalResult: word_gen: float +@dataclass +class EvalRequest: + model_name: str + precision: str + + class DatabaseHelper: def __init__(self): self.engine = create_engine( - "postgresql+psycopg2://{username}:{passwd}@{ip}:5432/{db}", + f"postgresql+psycopg2://{POSTGRES_USER}:{POSTGRES_PASSWORD}@{POSTGRES_IP}:{POSTGRES_PORT}/{POSTGRES_DB}", echo=True, ) self.engine.connect() - conn_string = "dbname={db} user={username} password={passwd} port='5432' host={ip}" + conn_string = f"dbname='{POSTGRES_DB}' user='{POSTGRES_USER}' password='{POSTGRES_PASSWORD}' port='{POSTGRES_PORT}' host='{POSTGRES_IP}'" self.connection = psycopg2.connect(conn_string) self.connection.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) Session = sessionmaker(bind=self.engine) @@ -32,6 +45,22 @@ def __init__(self): metadata = MetaData() metadata.reflect(bind=self.engine) self.leaderboard = Table("leaderboard", metadata, autoload_with=self.engine) + self.eval_requests = Table("eval_requests", metadata, autoload_with=self.engine) + + def add_eval_request(self, model_name, precision): + request = insert(self.eval_requests).values(model_name=model_name, precision=precision) + self.session.execute(request) + self.session.commit() + + def add_eval_result(self, eval_result): + stmt = insert(self.leaderboard).values( + model=eval_result.model, + single_choice=eval_result.single_choice, + multiple_choice=eval_result.multiple_choice, + word_gen=eval_result.word_gen, + ) + self.session.execute(stmt) + self.session.commit() def listen_to_new_requests(self, action): cur = self.connection.cursor() @@ -43,19 +72,15 @@ def listen_to_new_requests(self, action): notify = self.connection.notifies.pop() query = 
"SELECT * FROM eval_requests" df = pd.DataFrame(self.engine.connect().execute(text(query))) - model, precision, num_fewshot = ( + model, precision = ( df.loc[df["id"] == int(notify.payload)]["model_name"].to_string(index=False), df.loc[df["id"] == int(notify.payload)]["precision"].to_string(index=False), - df.loc[df["id"] == int(notify.payload)]["num_fewshot"].to_string(index=False), ) - action(model, precision, num_fewshot) + action(model, precision) - def add_eval_result(self, eval_result): - stmt = insert(self.leaderboard).values( - model=eval_result.model, - single_choice=eval_result.single_choice, - multiple_choice=eval_result.multiple_choice, - word_gen=eval_result.word_gen, - ) - self.session.execute(stmt) - self.session.commit() + def get_leaderboard_df(self): + df = pd.read_sql_table("leaderboard", self.engine) + return df + + def end_connection(self): + self.connection.close() diff --git a/scrapy.cfg b/scrapy.cfg index de75b48..60cf7e0 100644 --- a/scrapy.cfg +++ b/scrapy.cfg @@ -4,8 +4,8 @@ # https://scrapyd.readthedocs.io/en/latest/deploy.html [settings] -default = goat.settings +default = goat.parser.settings [deploy] #url = http://localhost:6800/ -project = goat +project = parser diff --git a/scripts/leaderboard_backend/build_script.sh b/scripts/leaderboard_backend/build_script.sh deleted file mode 100644 index 17f70a5..0000000 --- a/scripts/leaderboard_backend/build_script.sh +++ /dev/null @@ -1,4 +0,0 @@ -#! /usr/bin/bash - -cd ../lm-evaluation-harness -pip install -e . diff --git a/scripts/leaderboard_backend/eval.py b/scripts/leaderboard_backend/eval.py deleted file mode 100644 index 46b5173..0000000 --- a/scripts/leaderboard_backend/eval.py +++ /dev/null @@ -1,14 +0,0 @@ -# type: ignore -import subprocess - -from database_helper import DatabaseHelper - - -def eval_model(model_name: str, precision: str, num_fewshot: str): - subprocess.run(["./eval_script.sh", model_name, precision, num_fewshot]) - - -if __name__ == "__main__": - db_helper = DatabaseHelper() - subprocess.run(["./build_script.sh"]) - db_helper.listen_to_new_requests(eval_model) diff --git a/scripts/leaderboard_backend/eval_script.sh b/scripts/leaderboard_backend/eval_script.sh deleted file mode 100644 index 6a74ec0..0000000 --- a/scripts/leaderboard_backend/eval_script.sh +++ /dev/null @@ -1,10 +0,0 @@ -#! 
/usr/bin/bash - -model_name=$1 -replaced_model_name=${model_name//\//__} - -cd ../lm-evaluation-harness -lm_eval --model hf --model_args pretrained="$model_name",dtype="$2" --num_fewshot "$3" --tasks goat --device cuda --output_path "results/$replaced_model_name" --log_samples - -cd ../leaderboard_evaluation -python add_result.py "../lm-evaluation-harness/results/$replaced_model_name/results.json" diff --git a/scripts/leaderboard_web/database_helper.py b/scripts/leaderboard_web/database_helper.py deleted file mode 100644 index 64cdab1..0000000 --- a/scripts/leaderboard_web/database_helper.py +++ /dev/null @@ -1,98 +0,0 @@ -# type: ignore -import select - -import pandas as pd -import psycopg2 -from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT -from sqlalchemy import Column, Float, Integer, String, create_engine -from sqlalchemy.orm import declarative_base, sessionmaker - -Base = declarative_base() - - -class Leaderboard(Base): - __tablename__ = "leaderboard" - - model_name = Column("model", String, primary_key=True) - single_choice = Column("single_choice", Float) - multiple_choice = Column("multiple_choice", Float) - word_gen = Column("word_gen", Float) - - def __init__( - self, - model_name, - single_choice_score, - mult_choice_score, - word_gen_score, - ): - self.model_name = model_name - self.single_choice = single_choice_score - self.multiple_choice = mult_choice_score - self.word_gen = word_gen_score - - def __repr__(self): - return ( - f"{self.model_name}:\n" - f"{self.single_choice} acc on single choice tasks;\n" - f"{self.multiple_choice} metric score on multiple choice tasks;\n" - f"{self.word_gen} metric score on word generation tasks." - ) - - -class EvalRequest(Base): - __tablename__ = "eval_requests" - - id = Column(Integer, primary_key=True) - model_name = Column("model_name", String) - precision = Column("precision", String) - num_fewshot = Column("num_fewshot", Integer) - - def __init__(self, model_name, precision, num_fewshot): - self.model_name = model_name - self.precision = precision - self.num_fewshot = num_fewshot - - def __repr__(self): - return ( - f"Evaluation request on model {self.model_name}\n" - f"with {self.precision} precision and {self.num_fewshot}-shot prompt." - ) - - -class DatabaseHelper: - def __init__(self): - self.engine = create_engine( - "postgresql+psycopg2://{username}:{passwd}@{ip}:5432/{db}", - echo=True, - ) - self.engine.connect() - conn_string = "dbname={db} user={username} password={passwd} port='5432' host={ip}" - self.connection = psycopg2.connect(conn_string) - self.connection.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) - Base.metadata.create_all(bind=self.engine) - Session = sessionmaker(bind=self.engine) - self.session = Session() - - def add_eval_request(self, model_name, precision, num_fewshot): - request = EvalRequest(model_name, precision, num_fewshot) - self.session.add(request) - self.session.commit() - - def listen_to(self): - cur = self.connection.cursor() - cur.execute("LISTEN id;") - while True: - select.select([self.connection], [], []) - self.connection.poll() - while self.connection.notifies: - notify = self.connection.notifies.pop() - print("Got NOTIFY:", notify.pid, notify.channel, notify.payload) - - def get_leaderboard_df(self): - df = pd.read_sql_table("leaderboard", self.engine) - # For proper displaying - df["useless"] = 0 - return df - - def end_connection(self): - self.connection.close()