Commit
Review fixes
VyrodovMikhail committed May 21, 2024
1 parent c6273a2 commit 501b2b5
Showing 31 changed files with 251 additions and 202 deletions.
86 changes: 74 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
# Parser for tasks from "Sdamgia"
# GOAT

This parser is using scrapy lib.
This project consists of three subprojects:
- a parser for tasks from the Russian Unified State Exam (USE);
- a script for validating HF models on the GOAT dataset;
- a web app with a leaderboard of models validated on the GOAT dataset.


## Parser
This parser was used to gather tasks for the GOAT dataset. It uses the Scrapy library.

Currently, the program parses tests for the Unified State Exam (EGE) and the Basic State Exam (OGE)
from the [sdamgia](https://sdamgia.ru/?redir=1) website.

## Structure
### Structure

The program takes the exam subject, exam type, test id, and desired output file
name as command-line arguments. The parsing result is stored in a jsonl file.
@@ -15,16 +22,23 @@ Additionally, in the *goat* folder, there is a script called **dataset_demonstration.py**
After you run it (instructions on how to run it are provided below), it will display one task of each type
from the parsed test in the console.

## Usage

First, you need to install the necessary libraries. To do this, run the following command from the root folder:
### Usage
First, install the necessary libraries by running the following commands:

`pip install -r requirements.txt`
```bash
cd goat/parser
pip install -r requirements.txt
```

To run the parser, navigate to the goat directory
and run the following command in the console:
To run the parser, execute the following command from the goat/parser directory:

`scrapy crawl sdamgia -a subject='your exam subject' -a exam_type='your exam type' -a test_id='your test id' -O <output file>`
```bash
scrapy crawl sdamgia \
-a subject='your exam subject' \
-a exam_type='your exam type' \
-a test_id='your test id' \
-O <output file>
```

*your exam subject* indicates which subject the exam is in. Currently acceptable subject values are 'soc' and 'lit'.

@@ -34,8 +48,56 @@ and run the following command in the console:

*output file* is the name of the file that the parser will create or overwrite with the parsing output, for example ege_data.jsonl.

To run the dataset_demonstration.py script, execute the following command in the root directory:
To run the dataset_demonstration.py script, execute the following command from the root directory:

`python .\goat\dataset_demonstration.py -f <parser output file name>`
`python goat/parser/dataset_demonstration.py -f <parser output file name>`

where *parser output file name* is the name of the jsonl file that the parser has generated.
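Each line of the resulting jsonl file is a standalone JSON object, so it can be inspected with a few lines of Python. This is a sketch; the actual task schema is defined by the parser's `SdamgiaTaskItem`:

```python
import json


def read_jsonl(path):
    """Yield one task dict per non-empty line of a jsonl file."""
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)


# Example (file name illustrative):
# tasks = list(read_jsonl("ege_data.jsonl"))
```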

## Leaderboard frontend

### Structure
The leaderboard follows a structure similar to the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
It is a Gradio web app hosted in a HuggingFace space. Database connection info is stored in environment variables.

### Usage
First, install the necessary libraries by running the following commands:

```bash
cd goat/frontend
pip install -r requirements.txt
```

In this app you can submit a validation request for your model to the
backend database; after some time, the validation result for your model will appear
in the leaderboard when you reload the app.

To run the leaderboard web app, execute this command from the root directory
(assuming you have set all environment variables needed for the database connection):

`python -m goat.frontend.app`
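The app's `DatabaseHelper` presumably reads the `POSTGRES_*` variables that `goat/backend/docker-compose.yml` declares. A minimal sketch of assembling them into a SQLAlchemy-style connection URL (the helper's real logic may differ):

```python
import os


def database_url() -> str:
    """Build a PostgreSQL connection URL from the POSTGRES_* environment
    variables used elsewhere in this repository."""
    user = os.environ["POSTGRES_USER"]
    password = os.environ["POSTGRES_PASSWORD"]
    host = os.environ["POSTGRES_IP"]
    db = os.environ["POSTGRES_DB"]
    return f"postgresql://{user}:{password}@{host}/{db}"
```

psycopg2-based code could instead pass these values individually to `psycopg2.connect`.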

## Leaderboard backend

### Structure
After receiving a new validation request, the leaderboard backend validates
the requested model on the GOAT dataset using a modified
[LM Evaluation Harness benchmark](https://github.com/deepvk/lm-evaluation-harness/tree/goat) from the deepvk repository.
When validation finishes, it adds the resulting scores to the leaderboard.

### Usage
First, install the necessary libraries by running the following commands:

```bash
cd goat/backend
pip install -r requirements.txt
```

To run the leaderboard backend, execute this command from the root directory
(assuming you have set all environment variables needed for the database connection):

`python -m goat.backend.app`

Once running, the script listens for new validation requests in the database.
When a request arrives, it validates the requested model on the GOAT dataset
and then adds the results to the leaderboard table in the database.
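The listen/validate/publish flow described above can be sketched with the evaluation and publishing steps injected as callables; all names here are illustrative rather than the project's actual API:

```python
def process_requests(requests, evaluate, publish):
    """Drain an iterable of (model_name, precision) validation requests:
    run the evaluator on each request and publish the resulting scores."""
    for model_name, precision in requests:
        scores = evaluate(model_name, precision)
        publish(model_name, scores)
```

In the real backend the requests arrive from a PostgreSQL queue, `evaluate` runs the LM Evaluation Harness, and `publish` writes to the leaderboard table.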
14 changes: 14 additions & 0 deletions goat/backend/Dockerfile
@@ -0,0 +1,14 @@
FROM nvcr.io/nvidia/pytorch:24.02-py3

WORKDIR /app
COPY ./requirements.txt /app

RUN apt-get update && apt-get install -y libpq-dev

RUN pip install -r requirements.txt
RUN pip uninstall -y flash-attn
RUN pip install flash-attn==2.5.8

WORKDIR /leaderboard_eval
ENTRYPOINT ["python", "eval.py"]
Empty file added goat/backend/__init__.py
Empty file.
@@ -1,10 +1,10 @@
# type: ignore
import argparse
import json

from database_helper import DatabaseHelper, EvalResult
from datasets import get_dataset_config_names, load_dataset

from goat.utils.database_helper import DatabaseHelper, EvalResult


def get_datasets_len(tasks):
    datasets_len = dict()
@@ -24,17 +24,7 @@ def get_datasets_len(tasks):
    return datasets_len


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="This program redacts dataset -_-")
parser.add_argument("eval_result_path", type=str, help="Path to evaluation result")

args = parser.parse_args()
with open(args.eval_result_path, "r") as j:
contents = json.loads(j.read())
evaluation = contents["results"]
tasks = get_dataset_config_names("deepvk/goat")

datasets_len = get_datasets_len(tasks)
def get_metrics_values(tasks, evaluation, datasets_len):
    metrics = [
        "multi_choice_em_unordered,get-answer",
        "word_in_set,none",
@@ -62,12 +52,19 @@ def get_datasets_len(tasks):
    multiple_choice_score /= datasets_len["multiple_choice"]
    word_gen_score /= datasets_len["word_gen"]

    model_params = contents["config"]["model_args"].split(",")
    model_name = None
    for param in model_params:
        if "pretrained" in param:
            model_name = param[11:]
            break
    return single_choice_score, multiple_choice_score, word_gen_score


def add_results(input_path):
    with open(input_path, "r") as j:
        contents = json.loads(j.read())
    evaluation = contents["results"]
    tasks = get_dataset_config_names("deepvk/goat")

    datasets_len = get_datasets_len(tasks)
    single_choice_score, multiple_choice_score, word_gen_score = get_metrics_values(tasks, evaluation, datasets_len)

    model_name = contents["config"]["model"]

    eval_result = EvalResult(
        model=model_name,
@@ -78,3 +75,4 @@ def get_datasets_len(tasks):

    db = DatabaseHelper()
    db.add_eval_result(eval_result)
    db.end_connection()
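In `get_metrics_values` above, each category's accumulated score is divided by the number of datasets in that category. Isolated as a hypothetical helper (an illustration, not the project's code), the averaging step amounts to:

```python
def average_by_category(score_sums, datasets_len):
    """Divide each category's summed score by its dataset count,
    e.g. for the single_choice / multiple_choice / word_gen categories."""
    return {cat: total / datasets_len[cat] for cat, total in score_sums.items()}
```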
19 changes: 19 additions & 0 deletions goat/backend/docker-compose.yml
@@ -0,0 +1,19 @@
services:
  leaderboard:
    image: leaderboard_eval:latest
    volumes:
      - /home/m.vyrodov/leaderboards/backend:/leaderboard_eval
    stdin_open: true
    tty: true
    environment:
      POSTGRES_IP: ${POSTGRES_IP}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
      POSTGRES_USER: ${POSTGRES_USER}
      POSTGRES_DB: ${POSTGRES_DB}
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
24 changes: 24 additions & 0 deletions goat/backend/eval.py
@@ -0,0 +1,24 @@
import json

from lm_eval import evaluator
from lm_eval.models.huggingface import HFLM

from goat.backend.add_results import add_results
from goat.utils.database_helper import DatabaseHelper


def eval(model_name: str, precision: str):
    lm = HFLM(pretrained=model_name, dtype=precision)
    taskname = "goat"
    results = evaluator.simple_evaluate(model=lm, tasks=[taskname])

    filename = model_name.replace("/", "__")
    with open(f"results/{filename}.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False)

    add_results(input_path=f"results/{filename}.json")


if __name__ == "__main__":
    db_helper = DatabaseHelper()
    db_helper.listen_to_new_requests(eval)
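`eval.py` above flattens the Hugging Face model id into a safe file name before writing results, since ids like `org/model` contain a slash. The path construction amounts to this small helper (a restatement for illustration, not part of the project):

```python
def results_path(model_name: str) -> str:
    """Replace the slash in a HF model id so results land in a flat
    results/ directory instead of creating a subdirectory."""
    return f"results/{model_name.replace('/', '__')}.json"
```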
9 changes: 9 additions & 0 deletions goat/backend/requirements.txt
@@ -0,0 +1,9 @@
flash-attn==2.5.8
lm_eval @ git+https://github.com/deepvk/lm-evaluation-harness@goat
psycopg2==2.9.9
SQLAlchemy==2.0.29
torch==2.2.0
torchdata==0.7.1
torchtext==0.17.0
torchvision==0.17.0
transformer_engine==0.0.0
38 changes: 38 additions & 0 deletions goat/database/bd_init_script.sql
@@ -0,0 +1,38 @@
create table if not exists public.leaderboard
(
    model varchar not null
        primary key,
    single_choice double precision,
    multiple_choice double precision,
    word_gen double precision
);

alter table public.leaderboard
owner to habrpguser;

create table if not exists public.eval_requests
(
    id serial
        constraint eval_requests_pk
            primary key,
    model_name varchar not null,
    precision varchar not null
);

alter table public.eval_requests
owner to habrpguser;

create or replace function notify_id_trigger()
    returns trigger as $$
begin
    perform pg_notify('id'::text, NEW."id"::text);
    return new;
end;
$$ language plpgsql;

create trigger trigger1
    after insert or update on public."eval_requests"
    for each row execute procedure notify_id_trigger();
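A Python consumer of this trigger would LISTEN on the `id` channel and receive `NEW.id` as a text payload. Only the payload-parsing helper below is runnable as-is; the commented loop is a sketch of the standard psycopg2 notification pattern, not the project's actual `listen_to_new_requests` implementation:

```python
def request_id_from_payload(payload: str) -> int:
    """The trigger casts NEW.id to text before pg_notify; parse it back."""
    return int(payload)


# Sketch of the listening side (requires a live PostgreSQL connection):
#
# import select
# import psycopg2
# import psycopg2.extensions
#
# conn = psycopg2.connect(...)  # params from the POSTGRES_* env vars
# conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
# cur = conn.cursor()
# cur.execute("LISTEN id;")
# while True:
#     if select.select([conn], [], [], 5.0)[0]:
#         conn.poll()
#         while conn.notifies:
#             note = conn.notifies.pop(0)
#             handle(request_id_from_payload(note.payload))  # handle() is hypothetical
```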
Empty file added goat/frontend/__init__.py
Empty file.
21 changes: 7 additions & 14 deletions scripts/leaderboard_web/app.py β†’ goat/frontend/app.py
@@ -1,19 +1,21 @@
# type: ignore
import gradio as gr
from database_helper import DatabaseHelper
from src_display_css_html_js import custom_css
from utils import Precision

from goat.frontend.precision import Precision

from ..utils.database_helper import DatabaseHelper, EvalRequest

TITLE = "Goat leaderboard"
INTRODUCTION_TEXT = "This is really nice introduction text!!!"
EVALUATION_QUEUE_TEXT = "there is evaluation queue"


db_helper = DatabaseHelper()

leaderboard_df = db_helper.get_leaderboard_df()


demo = gr.Blocks(css=custom_css)
demo = gr.Blocks(css="src_display.css")
with demo:
gr.HTML(TITLE)
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
@@ -43,20 +45,11 @@
value="float16",
interactive=True,
)
num_fewshot = gr.Number(
label="Fewshot number",
minimum=0,
maximum=5,
step=1,
value=5,
interactive=True,
)

submit_button = gr.Button("Submit Eval")
submission_result = gr.Markdown()
submit_button.click(
db_helper.add_eval_request,
[model_name, model_precision, num_fewshot],
[model_name, model_precision],
submission_result,
)

File renamed without changes.
3 changes: 3 additions & 0 deletions goat/frontend/requirements.txt
@@ -0,0 +1,3 @@
gradio==4.31.3
psycopg2==2.9.9
SQLAlchemy==2.0.29
@@ -1,4 +1,3 @@
custom_css = """
/* Hides the final AutoEvalColumn */
#llm-benchmark-tab-table table td:last-child,
#llm-benchmark-tab-table table th:last-child {
@@ -86,12 +85,3 @@
#box-filter > .form{
border: 0
}
"""

get_window_url_params = """
function(url_params) {
const params = new URLSearchParams(window.location.search);
url_params = Object.fromEntries(params);
return url_params;
}
"""
Empty file added goat/parser/__init__.py
Empty file.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
6 changes: 3 additions & 3 deletions goat/settings.py β†’ goat/parser/settings.py
@@ -7,10 +7,10 @@
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "goat"
BOT_NAME = "parser"

SPIDER_MODULES = ["goat.spiders"]
NEWSPIDER_MODULE = "goat.spiders"
SPIDER_MODULES = ["goat.parser.spiders"]
NEWSPIDER_MODULE = "goat.parser.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
File renamed without changes.
File renamed without changes.
@@ -3,8 +3,8 @@
import scrapy
from scrapy.http import Response

from goat.items import SdamgiaTaskItem
from goat.spider_utils import *
from goat.parser.items import SdamgiaTaskItem
from goat.parser.spider_utils import *


class SdamgiaSpider(scrapy.Spider):
Empty file added goat/utils/__init__.py
Empty file.