Skip to content

Commit

Permalink
Remove validate_big_tasks option and add review fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
VyrodovMikhail committed Jun 18, 2024
1 parent 477ff3b commit e32b355
Show file tree
Hide file tree
Showing 7 changed files with 70 additions and 53 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,6 @@ jobs:
- name: "isort"
run: isort . --check --diff
- name: "mypy"
run: mypy --ignore-missing-imports
run: mypy
- name: "pytests"
run: pytest
17 changes: 15 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,12 @@ After you run it (instructions on how to run it are provided below), it will dis
from the parsed test in the console.

### Usage
Firstly, you need to install parser dependencies. To do this, run the following command:

```bash
pip install -e ".[parser]"
```

To run the parser, run the following command from goat/parser directory:

```bash
Expand Down Expand Up @@ -61,6 +67,12 @@ My leaderboard follows similar structure that [Open LLM Leaderboard](https://hug
It is a gradio web app that is used in a HuggingFace space. Database info is stored in environment variables.

### Usage
Firstly, you need to install frontend dependencies. To do this, run the following command:

```bash
pip install -e ".[frontend]"
```

In this app you can submit a validation request for your model to the
backend database; after validation completes, the results for your model
will appear in the leaderboard once you reload the app.
Expand All @@ -75,13 +87,14 @@ To run leaderboard web app execute this command from root directory
### Structure
After receiving a new validation request, the leaderboard backend validates
the model from the request on the GOAT dataset using the modified
[LM Evaluation Harness benchmark](https://github.com/deepvk/lm-evaluation-harness/tree/goat) from deepvk repository.
[LM Evaluation Harness benchmark](https://github.com/deepvk/lm-evaluation-harness/tree/goat) and [FastChat LLM-as-judge benchmark](https://github.com/deepvk/FastChat/tree/goat/fastchat/llm_judge) from deepvk repositories.
After finishing validation it adds the resulting scores in the leaderboard.

### Usage
Firstly, you need to install one additional library to run leaderboard backend. To do this, run the following commands:
Firstly, you need to install backend dependencies. To do this, run the following commands:

```bash
pip install -e ".[backend]"
pip install -U wheel
pip install flash-attn==2.5.8
```
Expand Down
39 changes: 19 additions & 20 deletions goat/backend/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,29 +18,28 @@ def eval(model_name: str, precision: str, generate_fastchat: bool) -> None:

model_id = model_name.replace("/", "__")
Path(f"goat/backend/results/{model_id}").mkdir(exist_ok=True)
lm_eval_output_file = f"goat/backend/results/{model_id + '_lm_eval'}.json"
lm_eval_output_file = f"goat/backend/results/{model_id}/{model_id + '_lm_eval'}.json"
with open(lm_eval_output_file, "w", encoding="utf-8") as f:
json.dump(results, f, ensure_ascii=False)

if generate_fastchat:
fastchat_filename = os.path.join("goat/backend/results", model_id + "_fastchat.jsonl")
question_file = "goat/backend/data/question.jsonl"

run_eval(
model_path=model_name,
model_id=model_id,
answer_file=fastchat_filename,
question_file=question_file,
question_begin=None,
question_end=None,
max_new_token=1024,
num_choices=1,
num_gpus_per_model=1,
num_gpus_total=1,
max_gpu_memory=None,
dtype=str_to_torch_dtype(precision),
revision="main",
)
fastchat_filename = os.path.join(f"goat/backend/results/{model_id}", model_id + "_fastchat.jsonl")
question_file = "goat/backend/data/question.jsonl"

run_eval(
model_path=model_name,
model_id=model_id,
answer_file=fastchat_filename,
question_file=question_file,
question_begin=None,
question_end=None,
max_new_token=1024,
num_choices=1,
num_gpus_per_model=1,
num_gpus_total=1,
max_gpu_memory=None,
dtype=str_to_torch_dtype(precision),
revision="main",
)

add_results(input_path=lm_eval_output_file)

Expand Down
3 changes: 1 addition & 2 deletions goat/database/bd_init_script.sql
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@ create table if not exists public.eval_requests
constraint eval_requests_pk
primary key,
model_name varchar not null,
precision varchar not null,
validate_big_tasks boolean not null
precision varchar not null
);

alter table public.eval_requests
Expand Down
6 changes: 1 addition & 5 deletions goat/frontend/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,11 @@
value="float16",
interactive=True,
)
validate_big_tasks = gr.Checkbox(
label="Validate on big text tasks",
info="Do you need to validate your model on tasks that require large text answer?",
)
submit_button = gr.Button("Submit Eval")
submission_result = gr.Markdown()
submit_button.click(
db_helper.add_eval_request,
[model_name, model_precision, validate_big_tasks],
[model_name, model_precision],
submission_result,
)

Expand Down
33 changes: 18 additions & 15 deletions goat/utils/database_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,26 @@ class EvalRequest:


def postgres_str_to_bool(val: str) -> bool:
    """Convert Postgres' textual rendering of a boolean to a Python bool.

    Only the exact string "True" maps to True; every other value
    (including "true", "t", "1", or an empty string) maps to False.
    """
    # Direct comparison replaces the old if/else; the duplicated diff
    # residue (old branch plus new return) has been collapsed to one line.
    return val == "True"


def get_env_var(name: str) -> str:
    """Return the value of environment variable *name*.

    Args:
        name: name of the environment variable to read.

    Returns:
        The variable's value as a string.

    Raises:
        RuntimeError: if the variable is not set. A specific exception
            type (still caught by any existing ``except Exception``)
            replaces the original bare ``Exception``.
    """
    value = os.environ.get(name)
    if value is None:
        raise RuntimeError(
            f"Environment variable {name} is not set. "
            "This variable is required for database connection."
        )
    return value


class DatabaseHelper:
def __init__(self) -> None:
vars = ["POSTGRES_USER", "POSTGRES_PASSWORD", "POSTGRES_IP", "POSTGRES_PORT", "POSTGRES_DB"]
env_vars = {var: get_env_var(var) for var in vars}
self.engine = create_engine(
f"postgresql+psycopg2://{POSTGRES_USER}:{POSTGRES_PASSWORD}@{POSTGRES_IP}:{POSTGRES_PORT}/{POSTGRES_DB}",
f"postgresql+psycopg2://{env_vars["POSTGRES_USER"]}:{env_vars["POSTGRES_PASSWORD"]}@{env_vars["POSTGRES_IP"]}:{env_vars["POSTGRES_PORT"]}/{env_vars["POSTGRES_DB"]}",
echo=True,
)
self.engine.connect()
conn_string = f"dbname='{POSTGRES_DB}' user='{POSTGRES_USER}' password='{POSTGRES_PASSWORD}' port='{POSTGRES_PORT}' host='{POSTGRES_IP}'"
conn_string = f"dbname='{env_vars["POSTGRES_DB"]}' user='{env_vars["POSTGRES_USER"]}' password='{env_vars["POSTGRES_PASSWORD"]}' port='{env_vars["POSTGRES_PORT"]}' host='{env_vars["POSTGRES_IP"]}'"
self.connection = psycopg2.connect(conn_string)
self.connection.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
Session = sessionmaker(bind=self.engine)
Expand All @@ -55,9 +61,9 @@ def __init__(self) -> None:
self.leaderboard = Table("leaderboard", metadata, autoload_with=self.engine)
self.eval_requests = Table("eval_requests", metadata, autoload_with=self.engine)

def add_eval_request(self, model_name: str, precision: str) -> None:
    """Insert a new evaluation request into the eval_requests table.

    The duplicated diff-residue lines (old three-parameter signature and
    old ``values(...)`` call with ``validate_big_tasks``) are removed;
    only the final two-parameter form remains.

    Args:
        model_name: model identifier to evaluate (presumably a
            HuggingFace "org/model" id — confirm against callers).
        precision: requested evaluation precision (e.g. "float16").
    """
    request = insert(self.eval_requests).values(
        model_name=model_name, precision=precision
    )
    self.session.execute(request)
    # Commit immediately so the request is visible to the backend listener.
    self.session.commit()
Expand All @@ -72,7 +78,7 @@ def add_eval_result(self, eval_result: EvalResult) -> None:
self.session.execute(stmt)
self.session.commit()

def listen_to_new_requests(self, action: Callable[[str, str, bool], None]) -> None:
def listen_to_new_requests(self, action: Callable[[str, str], None]) -> None:
cur = self.connection.cursor()
cur.execute("LISTEN id;")
while True:
Expand All @@ -82,14 +88,11 @@ def listen_to_new_requests(self, action: Callable[[str, str, bool], None]) -> No
notify = self.connection.notifies.pop()
query = "SELECT * FROM eval_requests"
df = pd.DataFrame(self.engine.connect().execute(text(query)))
model, precision, validate_big_tasks = (
model, precision = (
df.loc[df["id"] == int(notify.payload)]["model_name"].to_string(index=False),
df.loc[df["id"] == int(notify.payload)]["precision"].to_string(index=False),
postgres_str_to_bool(
df.loc[df["id"] == int(notify.payload)]["validate_big_tasks"].to_string(index=False)
),
df.loc[df["id"] == int(notify.payload)]["precision"].to_string(index=False)
)
action(model, precision, validate_big_tasks)
action(model, precision)

def get_leaderboard_df(self) -> pd.DataFrame:
query = "SELECT * FROM leaderboard"
Expand Down
23 changes: 15 additions & 8 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,12 @@ readme = "README.md"
license = { "text" = "Apache-2.0" }
requires-python = ">=3.8"
dependencies = [
"beautifulsoup4>=4.12.2",
"packaging",
"numpy",
"pandas",
"requests",
"typer<0.10.0",
"Scrapy>=2.11.0",
"gradio",
]

[project.optional-dependencies]
frontend = ["gradio", "psycopg2>=2.9.9", "SQLAlchemy>=2.0.29",]
backend = [
"psycopg2>=2.9.9",
"SQLAlchemy>=2.0.29",
"torch==2.2.0",
Expand All @@ -30,8 +28,17 @@ dependencies = [
"torchvision==0.17.0",
"transformer_engine==0.0.0",
"lm_eval@git+https://github.com/deepvk/lm-evaluation-harness@goat",
"fschat[model_worker,llm_judge]@git+https://github.com/deepvk/FastChat/@goat",
"fschat[model_worker,llm_judge]@git+https://github.com/deepvk/FastChat",
]
parser = [
"beautifulsoup4>=4.12.2",
"packaging",
"numpy",
"pandas",
"requests",
"Scrapy>=2.11.0",
]
all = ["goat[frontend]", "goat[backend]", "goat[parser]"]

[tool.isort]
profile = "black"
Expand Down

0 comments on commit e32b355

Please sign in to comment.