
Leaderboard #3

Merged
merged 11 commits into from
Jun 24, 2024
80 changes: 80 additions & 0 deletions scripts/leaderboard_backend/add_result.py
@@ -0,0 +1,80 @@
# type: ignore
import argparse
import json

from database_helper import DatabaseHelper, EvalResult
from datasets import get_dataset_config_names, load_dataset


def get_datasets_len(tasks):
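# Count the test examples in every GOAT task and accumulate the totals per
# category (single_choice / multiple_choice / word_gen) for weighting the scores.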
datasets_len = dict()
datasets_len["single_choice"] = 0
datasets_len["multiple_choice"] = 0
datasets_len["word_gen"] = 0

for task in tasks:
dataset = load_dataset("deepvk/goat", task, split="test")
datasets_len[task] = len(dataset)
if "single_choice" in task:
datasets_len["single_choice"] += len(dataset)
elif "multiple_choice" in task:
datasets_len["multiple_choice"] += len(dataset)
elif "word_gen" in task:
datasets_len["word_gen"] += len(dataset)
return datasets_len


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Parse an lm-evaluation-harness results file and add its scores to the leaderboard database.")
parser.add_argument("eval_result_path", type=str, help="Path to evaluation result")

args = parser.parse_args()
with open(args.eval_result_path, "r") as j:
contents = json.loads(j.read())
evaluation = contents["results"]
tasks = get_dataset_config_names("deepvk/goat")

datasets_len = get_datasets_len(tasks)
metrics = [
    "multi_choice_em_unordered,get-answer",
    "word_in_set,none",
    "acc,none",
]

single_choice_score = 0
multiple_choice_score = 0
word_gen_score = 0
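# Each task's metric is weighted by its test-set size; the sums are normalized per category below.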

for task in tasks:
for metric in metrics:
if metric in evaluation[task].keys():
if "single_choice" in task:
single_choice_score += datasets_len[task] * evaluation[task][metric]
elif "multiple_choice" in task:
multiple_choice_score += datasets_len[task] * evaluation[task][metric]
elif "word_gen" in task:
word_gen_score += datasets_len[task] * evaluation[task][metric]
print(evaluation[task][metric])
break

single_choice_score /= datasets_len["single_choice"]
multiple_choice_score /= datasets_len["multiple_choice"]
word_gen_score /= datasets_len["word_gen"]

model_params = contents["config"]["model_args"].split(",")
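# model_args has the form "pretrained=<model>,dtype=<dtype>,..."; extract the model name.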
model_name = None
for param in model_params:
if "pretrained" in param:
model_name = param.split("=", 1)[1]
break

eval_result = EvalResult(
model=model_name,
single_choice=single_choice_score,
multiple_choice=multiple_choice_score,
word_gen=word_gen_score,
)

db = DatabaseHelper()
db.add_eval_result(eval_result)
4 changes: 4 additions & 0 deletions scripts/leaderboard_backend/build_script.sh
@@ -0,0 +1,4 @@
#! /usr/bin/bash
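# Install the sibling lm-evaluation-harness checkout in editable mode so lm_eval is available.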

cd ../lm-evaluation-harness
pip install -e .
61 changes: 61 additions & 0 deletions scripts/leaderboard_backend/database_helper.py
@@ -0,0 +1,61 @@
# type: ignore
import select
from dataclasses import dataclass

import pandas as pd
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
from sqlalchemy import MetaData, Table, create_engine, insert, text
from sqlalchemy.orm import sessionmaker


@dataclass
class EvalResult:
model: str
single_choice: float
multiple_choice: float
word_gen: float


class DatabaseHelper:
def __init__(self):
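# {username}/{passwd}/{ip}/{db} below are literal placeholders; substitute real
# connection settings before running.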
self.engine = create_engine(
"postgresql+psycopg2://{username}:{passwd}@{ip}:5432/{db}",
echo=True,
)
self.engine.connect()
conn_string = "dbname={db} user={username} password={passwd} port='5432' host={ip}"
self.connection = psycopg2.connect(conn_string)
self.connection.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
Session = sessionmaker(bind=self.engine)
self.session = Session()
metadata = MetaData()
metadata.reflect(bind=self.engine)
self.leaderboard = Table("leaderboard", metadata, autoload_with=self.engine)

def listen_to_new_requests(self, action):
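# Block on Postgres LISTEN/NOTIFY: the "id" channel is expected to be notified
# (e.g. by an INSERT trigger on eval_requests) with the new request's id as
# payload; `action` is then called with that request's model, precision and num_fewshot.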
cur = self.connection.cursor()
cur.execute("LISTEN id;")
while True:
select.select([self.connection], [], [])
self.connection.poll()
while self.connection.notifies:
notify = self.connection.notifies.pop()
query = "SELECT * FROM eval_requests"
df = pd.DataFrame(self.engine.connect().execute(text(query)))
model, precision, num_fewshot = (
df.loc[df["id"] == int(notify.payload)]["model_name"].to_string(index=False),
df.loc[df["id"] == int(notify.payload)]["precision"].to_string(index=False),
df.loc[df["id"] == int(notify.payload)]["num_fewshot"].to_string(index=False),
)
action(model, precision, num_fewshot)

def add_eval_result(self, eval_result):
stmt = insert(self.leaderboard).values(
model=eval_result.model,
single_choice=eval_result.single_choice,
multiple_choice=eval_result.multiple_choice,
word_gen=eval_result.word_gen,
)
self.session.execute(stmt)
self.session.commit()
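Note that `listen_to_new_requests` only listens; the NOTIFY itself has to come from the database side. Below is a minimal sketch (an assumption, not part of this PR) of a trigger that would produce it, using the same eval_requests table and assuming the connection placeholders are filled in:

# Sketch only: create a trigger that notifies the "id" channel with the new
# row's id whenever the web app inserts into eval_requests.
import psycopg2

conn = psycopg2.connect("dbname={db} user={username} password={passwd} host={ip} port=5432")
conn.autocommit = True
with conn.cursor() as cur:
    cur.execute(
        """
        CREATE OR REPLACE FUNCTION notify_new_request() RETURNS trigger AS $$
        BEGIN
            PERFORM pg_notify('id', NEW.id::text);
            RETURN NEW;
        END;
        $$ LANGUAGE plpgsql;

        DROP TRIGGER IF EXISTS eval_requests_notify ON eval_requests;
        CREATE TRIGGER eval_requests_notify
            AFTER INSERT ON eval_requests
            FOR EACH ROW EXECUTE FUNCTION notify_new_request();
        """
    )
conn.close()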
14 changes: 14 additions & 0 deletions scripts/leaderboard_backend/eval.py
@@ -0,0 +1,14 @@
# type: ignore
import subprocess

from database_helper import DatabaseHelper


def eval_model(model_name: str, precision: str, num_fewshot: str):
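# Delegate to eval_script.sh, which runs lm_eval and stores the result via add_result.py.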
subprocess.run(["./eval_script.sh", model_name, precision, num_fewshot])


if __name__ == "__main__":
db_helper = DatabaseHelper()
subprocess.run(["./build_script.sh"])
db_helper.listen_to_new_requests(eval_model)
10 changes: 10 additions & 0 deletions scripts/leaderboard_backend/eval_script.sh
@@ -0,0 +1,10 @@
#! /usr/bin/bash
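# Usage: ./eval_script.sh <model_name> <dtype> <num_fewshot>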

model_name=$1
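# "/" in the HF model id is replaced with "__" so it can be used in a path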
replaced_model_name=${model_name//\//__}

cd ../lm-evaluation-harness
lm_eval --model hf --model_args pretrained="$model_name",dtype="$2" --num_fewshot "$3" --tasks goat --device cuda --output_path "results/$replaced_model_name" --log_samples

cd ../leaderboard_evaluation
python add_result.py "../lm-evaluation-harness/results/$replaced_model_name/results.json"
63 changes: 63 additions & 0 deletions scripts/leaderboard_web/app.py
@@ -0,0 +1,63 @@
# type: ignore
import gradio as gr
from database_helper import DatabaseHelper
from src_display_css_html_js import custom_css
from utils import Precision

TITLE = "Goat leaderboard"
INTRODUCTION_TEXT = "Leaderboard for language models evaluated on the GOAT benchmark."
EVALUATION_QUEUE_TEXT = "Submitted models are added to the evaluation queue and scored automatically."

db_helper = DatabaseHelper()

leaderboard_df = db_helper.get_leaderboard_df()


demo = gr.Blocks(css=custom_css)
with demo:
gr.HTML(TITLE)
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

with gr.Tabs(elem_classes="tab-buttons") as tabs:
with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=1):
leaderboard_table = gr.components.Dataframe(
value=leaderboard_df,
headers=["Model", "GOAT"],
interactive=False,
)
with gr.TabItem("🚀 Submit ", elem_id="llm-benchmark-tab-table", id=5):
with gr.Column():
with gr.Row():
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

with gr.Row():
gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")

with gr.Row():
with gr.Column():
model_name = gr.Textbox(label="Model name on HF")
model_precision = gr.Dropdown(
choices=[i.value for i in Precision if i != Precision.Unknown],
label="Precision",
multiselect=False,
value="float16",
interactive=True,
)
num_fewshot = gr.Number(
label="Fewshot number",
minimum=0,
maximum=5,
step=1,
value=5,
interactive=True,
)

submit_button = gr.Button("Submit Eval")
submission_result = gr.Markdown()
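# The click handler writes a row to eval_requests; the backend listener in
# scripts/leaderboard_backend picks it up via Postgres LISTEN/NOTIFY and runs the evaluation.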
submit_button.click(
db_helper.add_eval_request,
[model_name, model_precision, num_fewshot],
submission_result,
)

demo.queue(default_concurrency_limit=40).launch()
98 changes: 98 additions & 0 deletions scripts/leaderboard_web/database_helper.py
@@ -0,0 +1,98 @@
# type: ignore
import select

import pandas as pd
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
from sqlalchemy import Column, Float, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()


class Leaderboard(Base):
__tablename__ = "leaderboard"

model_name = Column("model", String, primary_key=True)
single_choice = Column("single_choice", Float)
multiple_choice = Column("multiple_choice", Float)
word_gen = Column("word_gen", Float)

def __init__(
self,
model_name,
single_choice_score,
mult_choice_score,
word_gen_score,
):
self.model_name = model_name
self.single_choice = single_choice_score
self.multiple_choice = mult_choice_score
self.word_gen = word_gen_score

def __repr__(self):
return (
f"{self.model_name}:\n"
f"{self.single_choice} acc on single choice tasks;\n"
f"{self.multiple_choice} metric score on multiple choice tasks;\n"
f"{self.word_gen} metric score on word generation tasks."
)


class EvalRequest(Base):
__tablename__ = "eval_requests"

id = Column(Integer, primary_key=True)
model_name = Column("model_name", String)
precision = Column("precision", String)
num_fewshot = Column("num_fewshot", Integer)

def __init__(self, model_name, precision, num_fewshot):
self.model_name = model_name
self.precision = precision
self.num_fewshot = num_fewshot

def __repr__(self):
return (
f"Evaluation request on model {self.model_name}\n"
f"with {self.precision} precision and {self.num_fewshot}-shot prompt."
)


class DatabaseHelper:
def __init__(self):
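# The {username}/{passwd}/{ip}/{db} placeholders in the connection strings below
# are literal and need to be filled in with real database credentials.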
self.engine = create_engine(
"postgresql+psycopg2://{username}:{passwd}@{ip}:5432/{db}",
echo=True,
)
self.engine.connect()
conn_string = "dbname={db} user={username} password={passwd} port='5432' host={ip}"
self.connection = psycopg2.connect(conn_string)
self.connection.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
Base.metadata.create_all(bind=self.engine)
Session = sessionmaker(bind=self.engine)
self.session = Session()

def add_eval_request(self, model_name, precision, num_fewshot):
request = EvalRequest(model_name, precision, num_fewshot)
self.session.add(request)
self.session.commit()

def listen_to(self):
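# Debug helper: prints every NOTIFY received on the "id" channel.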
cur = self.connection.cursor()
cur.execute("LISTEN id;")
while True:
select.select([self.connection], [], [])
self.connection.poll()
while self.connection.notifies:
notify = self.connection.notifies.pop()
print("Got NOTIFY:", notify.pid, notify.channel, notify.payload)

def get_leaderboard_df(self):
df = pd.read_sql_table("leaderboard", self.engine)
# Extra dummy column so the table displays properly
df["useless"] = 0
return df

def end_connection(self):
self.connection.close()