
Leaderboard #3

Merged
merged 11 commits into from
Jun 24, 2024
80 changes: 80 additions & 0 deletions scripts/leaderboard_backend/add_result.py
@@ -0,0 +1,80 @@
# type: ignore
import argparse
import json

from database_helper import DatabaseHelper, EvalResult
from datasets import get_dataset_config_names, load_dataset


def get_datasets_len(tasks):
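# Count the test examples in every GOAT task and accumulate the totals per
# category (single_choice / multiple_choice / word_gen) for weighting the scores.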
datasets_len = dict()
datasets_len["single_choice"] = 0
datasets_len["multiple_choice"] = 0
datasets_len["word_gen"] = 0

for task in tasks:
dataset = load_dataset("deepvk/goat", task, split="test")
datasets_len[task] = len(dataset)
if "single_choice" in task:
datasets_len["single_choice"] += len(dataset)
elif "multiple_choice" in task:
datasets_len["multiple_choice"] += len(dataset)
elif "word_gen" in task:
datasets_len["word_gen"] += len(dataset)
return datasets_len


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Parse an lm-evaluation-harness results file and add its scores to the leaderboard database.")
parser.add_argument("eval_result_path", type=str, help="Path to evaluation result")

args = parser.parse_args()
with open(args.eval_result_path, "r") as j:
contents = json.loads(j.read())
evaluation = contents["results"]
tasks = get_dataset_config_names("deepvk/goat")

datasets_len = get_datasets_len(tasks)
metrics = [
    "multi_choice_em_unordered,get-answer",
    "word_in_set,none",
    "acc,none",
]

single_choice_score = 0
multiple_choice_score = 0
word_gen_score = 0
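# Each task's metric is weighted by its test-set size; the sums are normalized per category below.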

for task in tasks:
for metric in metrics:
if metric in evaluation[task].keys():
if "single_choice" in task:
single_choice_score += datasets_len[task] * evaluation[task][metric]
elif "multiple_choice" in task:
multiple_choice_score += datasets_len[task] * evaluation[task][metric]
elif "word_gen" in task:
word_gen_score += datasets_len[task] * evaluation[task][metric]
print(evaluation[task][metric])
break

single_choice_score /= datasets_len["single_choice"]
multiple_choice_score /= datasets_len["multiple_choice"]
word_gen_score /= datasets_len["word_gen"]

model_params = contents["config"]["model_args"].split(",")
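# model_args has the form "pretrained=<model>,dtype=<dtype>,..."; extract the model name.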
model_name = None
for param in model_params:
if "pretrained" in param:
model_name = param.split("=", 1)[1]
break

eval_result = EvalResult(
model=model_name,
single_choice=single_choice_score,
multiple_choice=multiple_choice_score,
word_gen=word_gen_score,
)

db = DatabaseHelper()
db.add_eval_result(eval_result)
4 changes: 4 additions & 0 deletions scripts/leaderboard_backend/build_script.sh
@@ -0,0 +1,4 @@
#! /usr/bin/bash
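# Install the sibling lm-evaluation-harness checkout in editable mode so lm_eval is available.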

cd ../lm-evaluation-harness
pip install -e .
61 changes: 61 additions & 0 deletions scripts/leaderboard_backend/database_helper.py
@@ -0,0 +1,61 @@
# type: ignore
import select
from dataclasses import dataclass

import pandas as pd
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
from sqlalchemy import MetaData, Table, create_engine, insert, text
from sqlalchemy.orm import sessionmaker


@dataclass
class EvalResult:
model: str
single_choice: float
multiple_choice: float
word_gen: float


class DatabaseHelper:
def __init__(self):
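# {username}/{passwd}/{ip}/{db} below are literal placeholders; substitute real
# connection settings before running.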
self.engine = create_engine(
"postgresql+psycopg2://{username}:{passwd}@{ip}:5432/{db}",
echo=True,
)
self.engine.connect()
conn_string = "dbname={db} user={username} password={passwd} port='5432' host={ip}"
self.connection = psycopg2.connect(conn_string)
self.connection.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
Session = sessionmaker(bind=self.engine)
self.session = Session()
metadata = MetaData()
metadata.reflect(bind=self.engine)
self.leaderboard = Table("leaderboard", metadata, autoload_with=self.engine)

def listen_to_new_requests(self, action):
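# Block on Postgres LISTEN/NOTIFY: the "id" channel is expected to be notified
# (e.g. by an INSERT trigger on eval_requests) with the new request's id as
# payload; `action` is then called with that request's model, precision and num_fewshot.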
cur = self.connection.cursor()
cur.execute("LISTEN id;")
while True:
select.select([self.connection], [], [])
self.connection.poll()
while self.connection.notifies:
notify = self.connection.notifies.pop()
query = "SELECT * FROM eval_requests"
df = pd.DataFrame(self.engine.connect().execute(text(query)))
model, precision, num_fewshot = (
df.loc[df["id"] == int(notify.payload)]["model_name"].to_string(index=False),
df.loc[df["id"] == int(notify.payload)]["precision"].to_string(index=False),
df.loc[df["id"] == int(notify.payload)]["num_fewshot"].to_string(index=False),
)
action(model, precision, num_fewshot)

def add_eval_result(self, eval_result):
stmt = insert(self.leaderboard).values(
model=eval_result.model,
single_choice=eval_result.single_choice,
multiple_choice=eval_result.multiple_choice,
word_gen=eval_result.word_gen,
)
self.session.execute(stmt)
self.session.commit()
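Note that `listen_to_new_requests` only listens; the NOTIFY itself has to come from the database side. Below is a minimal sketch (an assumption, not part of this PR) of a trigger that would produce it, using the same eval_requests table and assuming the connection placeholders are filled in:

# Sketch only: create a trigger that notifies the "id" channel with the new
# row's id whenever the web app inserts into eval_requests.
import psycopg2

conn = psycopg2.connect("dbname={db} user={username} password={passwd} host={ip} port=5432")
conn.autocommit = True
with conn.cursor() as cur:
    cur.execute(
        """
        CREATE OR REPLACE FUNCTION notify_new_request() RETURNS trigger AS $$
        BEGIN
            PERFORM pg_notify('id', NEW.id::text);
            RETURN NEW;
        END;
        $$ LANGUAGE plpgsql;

        DROP TRIGGER IF EXISTS eval_requests_notify ON eval_requests;
        CREATE TRIGGER eval_requests_notify
            AFTER INSERT ON eval_requests
            FOR EACH ROW EXECUTE FUNCTION notify_new_request();
        """
    )
conn.close()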
14 changes: 14 additions & 0 deletions scripts/leaderboard_backend/eval.py
@@ -0,0 +1,14 @@
# type: ignore
import subprocess

from database_helper import DatabaseHelper


def eval_model(model_name: str, precision: str, num_fewshot: str):
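# Delegate to eval_script.sh, which runs lm_eval and stores the result via add_result.py.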
subprocess.run(["./eval_script.sh", model_name, precision, num_fewshot])


if __name__ == "__main__":
db_helper = DatabaseHelper()
subprocess.run(["./build_script.sh"])
db_helper.listen_to_new_requests(eval_model)
10 changes: 10 additions & 0 deletions scripts/leaderboard_backend/eval_script.sh
@@ -0,0 +1,10 @@
#! /usr/bin/bash
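# Usage: ./eval_script.sh <model_name> <dtype> <num_fewshot>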

model_name=$1
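# "/" in the HF model id is replaced with "__" so it can be used in a path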
replaced_model_name=${model_name//\//__}

cd ../lm-evaluation-harness
lm_eval --model hf --model_args pretrained="$model_name",dtype="$2" --num_fewshot "$3" --tasks goat --device cuda --output_path "results/$replaced_model_name" --log_samples

cd ../leaderboard_evaluation
python add_result.py "../lm-evaluation-harness/results/$replaced_model_name/results.json"
63 changes: 63 additions & 0 deletions scripts/leaderboard_web/app.py
@@ -0,0 +1,63 @@
# type: ignore
import gradio as gr
from database_helper import DatabaseHelper
from src_display_css_html_js import custom_css
from utils import Precision

TITLE = "Goat leaderboard"
INTRODUCTION_TEXT = "Leaderboard for language models evaluated on the GOAT benchmark."
EVALUATION_QUEUE_TEXT = "Submitted models are added to the evaluation queue and scored automatically."

db_helper = DatabaseHelper()

leaderboard_df = db_helper.get_leaderboard_df()


demo = gr.Blocks(css=custom_css)
with demo:
gr.HTML(TITLE)
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

with gr.Tabs(elem_classes="tab-buttons") as tabs:
with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=1):
leaderboard_table = gr.components.Dataframe(
value=leaderboard_df,
headers=["Model", "GOAT"],
interactive=False,
)
with gr.TabItem("🚀 Submit ", elem_id="llm-benchmark-tab-table", id=5):
with gr.Column():
with gr.Row():
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

with gr.Row():
gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")

with gr.Row():
with gr.Column():
model_name = gr.Textbox(label="Model name on HF")
model_precision = gr.Dropdown(
choices=[i.value for i in Precision if i != Precision.Unknown],
label="Precision",
multiselect=False,
value="float16",
interactive=True,
)
num_fewshot = gr.Number(
label="Fewshot number",
minimum=0,
maximum=5,
step=1,
value=5,
interactive=True,
)

submit_button = gr.Button("Submit Eval")
submission_result = gr.Markdown()
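# The click handler writes a row to eval_requests; the backend listener in
# scripts/leaderboard_backend picks it up via Postgres LISTEN/NOTIFY and runs the evaluation.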
submit_button.click(
db_helper.add_eval_request,
[model_name, model_precision, num_fewshot],
submission_result,
)

demo.queue(default_concurrency_limit=40).launch()
98 changes: 98 additions & 0 deletions scripts/leaderboard_web/database_helper.py
@@ -0,0 +1,98 @@
# type: ignore
import select

import pandas as pd
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
from sqlalchemy import Column, Float, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()


class Leaderboard(Base):
__tablename__ = "leaderboard"

model_name = Column("model", String, primary_key=True)
single_choice = Column("single_choice", Float)
multiple_choice = Column("multiple_choice", Float)
word_gen = Column("word_gen", Float)

def __init__(
self,
model_name,
single_choice_score,
mult_choice_score,
word_gen_score,
):
self.model_name = model_name
self.single_choice = single_choice_score
self.multiple_choice = mult_choice_score
self.word_gen = word_gen_score

def __repr__(self):
return (
f"{self.model_name}:\n"
f"{self.single_choice} acc on single choice tasks;\n"
f"{self.multiple_choice} metric score on multiple choice tasks;\n"
f"{self.word_gen} metric score on word generation tasks."
)


class EvalRequest(Base):
__tablename__ = "eval_requests"

id = Column(Integer, primary_key=True)
model_name = Column("model_name", String)
precision = Column("precision", String)
num_fewshot = Column("num_fewshot", Integer)

def __init__(self, model_name, precision, num_fewshot):
self.model_name = model_name
self.precision = precision
self.num_fewshot = num_fewshot

def __repr__(self):
return (
f"Evaluation request on model {self.model_name}\n"
f"with {self.precision} precision and {self.num_fewshot}-shot prompt."
)


class DatabaseHelper:
def __init__(self):
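# The {username}/{passwd}/{ip}/{db} placeholders in the connection strings below
# are literal and need to be filled in with real database credentials.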
self.engine = create_engine(
"postgresql+psycopg2://{username}:{passwd}@{ip}:5432/{db}",
echo=True,
)
self.engine.connect()
conn_string = "dbname={db} user={username} password={passwd} port='5432' host={ip}"
self.connection = psycopg2.connect(conn_string)
self.connection.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
Base.metadata.create_all(bind=self.engine)
Session = sessionmaker(bind=self.engine)
self.session = Session()

def add_eval_request(self, model_name, precision, num_fewshot):
request = EvalRequest(model_name, precision, num_fewshot)
self.session.add(request)
self.session.commit()

def listen_to(self):
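# Debug helper: prints every NOTIFY received on the "id" channel.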
cur = self.connection.cursor()
cur.execute("LISTEN id;")
while True:
select.select([self.connection], [], [])
self.connection.poll()
while self.connection.notifies:
notify = self.connection.notifies.pop()
print("Got NOTIFY:", notify.pid, notify.channel, notify.payload)

def get_leaderboard_df(self):
df = pd.read_sql_table("leaderboard", self.engine)
# Extra dummy column so the table displays properly
df["useless"] = 0
return df

def end_connection(self):
self.connection.close()