Skip to content

Commit

Permalink
Remove validate_big_tasks option and add review fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
VyrodovMikhail committed Jun 18, 2024
1 parent 477ff3b commit e32b355
Show file tree
Hide file tree
Showing 7 changed files with 70 additions and 53 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,6 @@ jobs:
- name: "isort"
run: isort . --check --diff
- name: "mypy"
run: mypy --ignore-missing-imports
run: mypy
- name: "pytests"
run: pytest
17 changes: 15 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,12 @@ After you run it (instructions on how to run it are provided below), it will dis
from the parsed test in the console.

### Usage
Firstly, you need to install parser dependencies. To do this, run the following command:

```bash
pip install -e ".[parser]"
```

To run the parser, run the following command from goat/parser directory:

```bash
Expand Down Expand Up @@ -61,6 +67,12 @@ My leaderboard follows similar structure that [Open LLM Leaderboard](https://hug
It is a gradio web app that is used in a HuggingFace space. Database info is stored in environment variables.

### Usage
Firstly, you need to install frontend dependencies. To do this, run the following command:

```bash
pip install -e ".[frontend]"
```

In this app you can submit a validation request for your model to the
backend database; after validation completes, the results for your model
will appear in the leaderboard once you reload the app.
Expand All @@ -75,13 +87,14 @@ To run leaderboard web app execute this command from root directory
### Structure
After receiving a new validation request, the leaderboard backend validates
the model from the request on the GOAT dataset using the modified
[LM Evaluation Harness benchmark](https://github.com/deepvk/lm-evaluation-harness/tree/goat) from deepvk repository.
[LM Evaluation Harness benchmark](https://github.com/deepvk/lm-evaluation-harness/tree/goat) and [FastChat LLM-as-judge benchmark](https://github.com/deepvk/FastChat/tree/goat/fastchat/llm_judge) from deepvk repositories.
After finishing validation it adds the resulting scores in the leaderboard.

### Usage
Firstly, you need to install one additional library to run leaderboard backend. To do this, run the following commands:
Firstly, you need to install backend dependencies. To do this, run the following commands:

```bash
pip install -e ".[backend]"
pip install -U wheel
pip install flash-attn==2.5.8
```
Expand Down
39 changes: 19 additions & 20 deletions goat/backend/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,29 +18,28 @@ def eval(model_name: str, precision: str, generate_fastchat: bool) -> None:

model_id = model_name.replace("/", "__")
Path(f"goat/backend/results/{model_id}").mkdir(exist_ok=True)
lm_eval_output_file = f"goat/backend/results/{model_id + '_lm_eval'}.json"
lm_eval_output_file = f"goat/backend/results/{model_id}/{model_id + '_lm_eval'}.json"
with open(lm_eval_output_file, "w", encoding="utf-8") as f:
json.dump(results, f, ensure_ascii=False)

if generate_fastchat:
fastchat_filename = os.path.join("goat/backend/results", model_id + "_fastchat.jsonl")
question_file = "goat/backend/data/question.jsonl"

run_eval(
model_path=model_name,
model_id=model_id,
answer_file=fastchat_filename,
question_file=question_file,
question_begin=None,
question_end=None,
max_new_token=1024,
num_choices=1,
num_gpus_per_model=1,
num_gpus_total=1,
max_gpu_memory=None,
dtype=str_to_torch_dtype(precision),
revision="main",
)
fastchat_filename = os.path.join(f"goat/backend/results/{model_id}", model_id + "_fastchat.jsonl")
question_file = "goat/backend/data/question.jsonl"

run_eval(
model_path=model_name,
model_id=model_id,
answer_file=fastchat_filename,
question_file=question_file,
question_begin=None,
question_end=None,
max_new_token=1024,
num_choices=1,
num_gpus_per_model=1,
num_gpus_total=1,
max_gpu_memory=None,
dtype=str_to_torch_dtype(precision),
revision="main",
)

add_results(input_path=lm_eval_output_file)

Expand Down
3 changes: 1 addition & 2 deletions goat/database/bd_init_script.sql
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@ create table if not exists public.eval_requests
constraint eval_requests_pk
primary key,
model_name varchar not null,
precision varchar not null,
validate_big_tasks boolean not null
precision varchar not null
);

alter table public.eval_requests
Expand Down
6 changes: 1 addition & 5 deletions goat/frontend/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,11 @@
value="float16",
interactive=True,
)
validate_big_tasks = gr.Checkbox(
label="Validate on big text tasks",
info="Do you need to validate your model on tasks that require large text answer?",
)
submit_button = gr.Button("Submit Eval")
submission_result = gr.Markdown()
submit_button.click(
db_helper.add_eval_request,
[model_name, model_precision, validate_big_tasks],
[model_name, model_precision],
submission_result,
)

Expand Down
33 changes: 18 additions & 15 deletions goat/utils/database_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,26 @@ class EvalRequest:


def postgres_str_to_bool(val: str) -> bool:
    """Convert Postgres' textual rendering of a boolean to a Python bool.

    Only the exact string "True" maps to True; every other value
    (including "true", "t", "1", or an empty string) maps to False.
    """
    # Direct comparison replaces the old if/else; the duplicated diff
    # residue (old branch plus new return) has been collapsed to one line.
    return val == "True"


def get_env_var(name: str) -> str:
    """Return the value of environment variable *name*.

    Args:
        name: name of the environment variable to read.

    Returns:
        The variable's value as a string.

    Raises:
        RuntimeError: if the variable is not set. A specific exception
            type (still caught by any existing ``except Exception``)
            replaces the original bare ``Exception``.
    """
    value = os.environ.get(name)
    if value is None:
        raise RuntimeError(
            f"Environment variable {name} is not set. "
            "This variable is required for database connection."
        )
    return value


class DatabaseHelper:
def __init__(self) -> None:
vars = ["POSTGRES_USER", "POSTGRES_PASSWORD", "POSTGRES_IP", "POSTGRES_PORT", "POSTGRES_DB"]
env_vars = {var: get_env_var(var) for var in vars}
self.engine = create_engine(
f"postgresql+psycopg2://{POSTGRES_USER}:{POSTGRES_PASSWORD}@{POSTGRES_IP}:{POSTGRES_PORT}/{POSTGRES_DB}",
f"postgresql+psycopg2://{env_vars["POSTGRES_USER"]}:{env_vars["POSTGRES_PASSWORD"]}@{env_vars["POSTGRES_IP"]}:{env_vars["POSTGRES_PORT"]}/{env_vars["POSTGRES_DB"]}",
echo=True,
)
self.engine.connect()
conn_string = f"dbname='{POSTGRES_DB}' user='{POSTGRES_USER}' password='{POSTGRES_PASSWORD}' port='{POSTGRES_PORT}' host='{POSTGRES_IP}'"
conn_string = f"dbname='{env_vars["POSTGRES_DB"]}' user='{env_vars["POSTGRES_USER"]}' password='{env_vars["POSTGRES_PASSWORD"]}' port='{env_vars["POSTGRES_PORT"]}' host='{env_vars["POSTGRES_IP"]}'"
self.connection = psycopg2.connect(conn_string)
self.connection.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
Session = sessionmaker(bind=self.engine)
Expand All @@ -55,9 +61,9 @@ def __init__(self) -> None:
self.leaderboard = Table("leaderboard", metadata, autoload_with=self.engine)
self.eval_requests = Table("eval_requests", metadata, autoload_with=self.engine)

def add_eval_request(self, model_name: str, precision: str) -> None:
    """Insert a new evaluation request into the eval_requests table.

    The duplicated diff-residue lines (old three-parameter signature and
    old ``values(...)`` call with ``validate_big_tasks``) are removed;
    only the final two-parameter form remains.

    Args:
        model_name: model identifier to evaluate (presumably a
            HuggingFace "org/model" id — confirm against callers).
        precision: requested evaluation precision (e.g. "float16").
    """
    request = insert(self.eval_requests).values(
        model_name=model_name, precision=precision
    )
    self.session.execute(request)
    # Commit immediately so the request is visible to the backend listener.
    self.session.commit()
Expand All @@ -72,7 +78,7 @@ def add_eval_result(self, eval_result: EvalResult) -> None:
self.session.execute(stmt)
self.session.commit()

def listen_to_new_requests(self, action: Callable[[str, str, bool], None]) -> None:
def listen_to_new_requests(self, action: Callable[[str, str], None]) -> None:
cur = self.connection.cursor()
cur.execute("LISTEN id;")
while True:
Expand All @@ -82,14 +88,11 @@ def listen_to_new_requests(self, action: Callable[[str, str, bool], None]) -> No
notify = self.connection.notifies.pop()
query = "SELECT * FROM eval_requests"
df = pd.DataFrame(self.engine.connect().execute(text(query)))
model, precision, validate_big_tasks = (
model, precision = (
df.loc[df["id"] == int(notify.payload)]["model_name"].to_string(index=False),
df.loc[df["id"] == int(notify.payload)]["precision"].to_string(index=False),
postgres_str_to_bool(
df.loc[df["id"] == int(notify.payload)]["validate_big_tasks"].to_string(index=False)
),
df.loc[df["id"] == int(notify.payload)]["precision"].to_string(index=False)
)
action(model, precision, validate_big_tasks)
action(model, precision)

def get_leaderboard_df(self) -> pd.DataFrame:
query = "SELECT * FROM leaderboard"
Expand Down
23 changes: 15 additions & 8 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,12 @@ readme = "README.md"
license = { "text" = "Apache-2.0" }
requires-python = ">=3.8"
dependencies = [
"beautifulsoup4>=4.12.2",
"packaging",
"numpy",
"pandas",
"requests",
"typer<0.10.0",
"Scrapy>=2.11.0",
"gradio",
]

[project.optional-dependencies]
frontend = ["gradio", "psycopg2>=2.9.9", "SQLAlchemy>=2.0.29",]
backend = [
"psycopg2>=2.9.9",
"SQLAlchemy>=2.0.29",
"torch==2.2.0",
Expand All @@ -30,8 +28,17 @@ dependencies = [
"torchvision==0.17.0",
"transformer_engine==0.0.0",
"lm_eval@git+https://github.com/deepvk/lm-evaluation-harness@goat",
"fschat[model_worker,llm_judge]@git+https://github.com/deepvk/FastChat/@goat",
"fschat[model_worker,llm_judge]@git+https://github.com/deepvk/FastChat",
]
parser = [
"beautifulsoup4>=4.12.2",
"packaging",
"numpy",
"pandas",
"requests",
"Scrapy>=2.11.0",
]
all = ["goat[frontend]", "goat[backend]", "goat[parser]"]

[tool.isort]
profile = "black"
Expand Down

0 comments on commit e32b355

Please sign in to comment.