mixeval evaluator #106 (Open)
wants to merge 13 commits into base: main
Changes from 5 commits
189 changes: 189 additions & 0 deletions mttl/evaluators/mixeval.py
@@ -0,0 +1,189 @@
import json
import os
import threading

try:
    from mix_eval.api.registry import register_model
    from mix_eval.evaluate import compute_metrics_p, eval, parse_args
    from mix_eval.models.base import ChatModel

    mixeval_available = True

except ImportError:
    mixeval_available = False
    # Fallbacks so the module can still be imported without mix_eval installed;
    # MixEvalEvaluator raises a descriptive error below if it is actually used.
    register_model = lambda name: lambda cls: cls
    ChatModel = object


from copy import deepcopy
from dataclasses import dataclass

import torch
from transformers import AutoTokenizer

from mttl.datamodule.utils import get_tokenizer_with_args
from mttl.evaluators.base import GenerativeEvaluator
from mttl.models.expert_model import MultiExpertModel, MultiExpertModelConfig
from mttl.models.library.expert_library import ExpertLibrary


@dataclass
class MixEvalConfig:
    batch_size: int = 16
    model_name: str = "mix_eval_expert_adapter"
    benchmark: str = "mixeval_hard"
    data_path: str = None
    free_form_parser: str = "model"
    multi_choice_parser: str = "model"
    multichoice_judge: str = "gpt-3.5-turbo-0125"
    freeform_judge: str = "gpt-3.5-turbo-0125"
    extract_base_model_response: bool = False
    compute_score_from_judged_file: bool = False
    version: str = "2024-08-11"
    split: str = None
    output_dir: str = None
    verbose: bool = False
Collaborator comment:

I guess we still need an `--api_base_url`.
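
One way this could look, sketched below; it is not part of this diff. The field name follows the reviewer's suggestion, and whether mix_eval picks the endpoint up from its parsed args or from an environment variable such as `OPENAI_BASE_URL` depends on the mix_eval version, so the env-var hand-off is an assumption.

import os
from dataclasses import dataclass

from mttl.evaluators.mixeval import MixEvalConfig

@dataclass
class MixEvalConfigWithEndpoint(MixEvalConfig):
    # hypothetical field from the review: OpenAI-compatible endpoint for the judge models
    api_base_url: str = None

def export_judge_endpoint(config: MixEvalConfigWithEndpoint) -> None:
    # assumption: hand the endpoint to mix_eval's judge client via a common env var
    if config.api_base_url:
        os.environ.setdefault("OPENAI_BASE_URL", config.api_base_url)

Under this sketch, export_judge_endpoint would be called at the top of MixEvalEvaluator.evaluate, before eval(self.config) runs.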



@register_model("mix_eval_expert_adapter")
class MultiExpertAdapter(ChatModel):
    # thread-local context used to inject the MTTL model into this wrapper class
    model_context = threading.local()

    def chunk_generate(
        self,
        inputs,
        model,
        tok,
        max_tokens: int,
        sliding_window: int = 128 * 1024,
        chunk_size: int = 2500,
        verbose: bool = False,
        chunked: bool = False,
        **kwargs,
    ):
        if chunked:
            raise ValueError("Chunked is not supported.")

        with torch.no_grad():
            input_ids = inputs.input_ids  # (b, n)
            attention_mask = inputs.attention_mask  # (b, n)

            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_tokens,
                **kwargs,
            )
            generated_ids = [
                output_ids[len(in_ids) :] for in_ids, output_ids in zip(input_ids, outputs)
            ]
            responses = tok.batch_decode(generated_ids, skip_special_tokens=True)
            return responses

    def __init__(self, args):
        super().__init__(args)

        self.model = self.model_context.model
        self.tokenizer = get_tokenizer_with_args(
            model_name=self.model.base_model_name_or_path,
            model_family="gpt",
            padding_side="left",
            truncation_side="left",
            for_generation=True,
        )

        self.SYSTEM_MESSAGE = {
            "role": "system",
            "content": "You are a helpful assistant.",
        }  # set to None if no system message
        self.USER_MESSAGE_TEMPLATE = lambda x: {"role": "user", "content": x}
        self.ASSISTANT_MESSAGE_TEMPLATE = lambda x: {"role": "assistant", "content": x}

        self.model_max_len = self.model.max_position_embeddings
        self.max_input_length_closeend = (
            min(self.model_max_len, self.max_input_length)
            - self.closeended_max_new_tokens
        )
        self.max_input_length_openend = (
            min(self.model_max_len, self.max_input_length)
            - self.openended_max_new_tokens
        )


class MixEvalEvaluator(GenerativeEvaluator):
    def __init__(self, config: MixEvalConfig = None):
        super().__init__(config=config or MixEvalConfig())

        if not mixeval_available:
            raise ValueError(
                "MixEval is not installed. Please install it using `pip install mix-eval`."
            )

        self.download_data()

    def download_data(self):
        import shutil
        import subprocess

        import mix_eval

        repo_url = "https://github.com/Psycoy/MixEval.git"
        data_folder = "mix_eval/data"
        temp_dir = "/tmp/mixeval_repo"
        target_dir = os.path.join(os.path.dirname(mix_eval.__file__), "data")

        self.config.data_path = target_dir

        if os.path.exists(target_dir):
            return

        # Clone the repository
        subprocess.run(["git", "clone", repo_url, temp_dir], check=True)

        # Copy the data folder to the target directory
        shutil.copytree(
            os.path.join(temp_dir, data_folder), target_dir, dirs_exist_ok=True
        )

        # Clean up the temporary directory
        shutil.rmtree(temp_dir)

    def evaluate(
        self,
        model,
        split=None,
        shuffle=False,
        subsample=-1,
        output_path=None,
        verbose=False,
        **kwargs,
    ):
        # inject the model into MultiExpertAdapter via its thread-local context
        MultiExpertAdapter.model_context.model = model

        # propagate runtime options into the config
        self.config.verbose = verbose

        if split is not None:
            self.config.split = split

        if output_path is not None:
            self.config.output_dir = output_path
        else:
            raise ValueError("Output path is required for evaluation.")

        eval(self.config)
        compute_metrics_p(self.config)

        with open(os.path.join(self.config.output_dir, "score.json"), "r") as f:
            score = json.load(f)
        return score[self.config.model_name]["overall"]


if __name__ == "__main__":
    evaluator = MixEvalEvaluator()
    model = MultiExpertModel(
        MultiExpertModelConfig(base_model="microsoft/Phi-3-mini-4k-instruct"),
        device_map="cuda:0",
    )
    evaluator.evaluate(model, output_path="/tmp/mixeval/")
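
Because both judges default to gpt-3.5-turbo-0125, running this entry point requires that mix_eval can reach the judge models; the exact environment variable mix_eval reads for the key is version-dependent, so `OPENAI_API_KEY` in the sketch below is an assumption rather than something this PR configures. A minimal driver script might look like this:

import os

from mttl.evaluators.mixeval import MixEvalEvaluator
from mttl.models.expert_model import MultiExpertModel, MultiExpertModelConfig

# assumption: the judge models read an OpenAI-style API key from the environment
os.environ.setdefault("OPENAI_API_KEY", "<your-key>")

evaluator = MixEvalEvaluator()
model = MultiExpertModel(
    MultiExpertModelConfig(base_model="microsoft/Phi-3-mini-4k-instruct"),
    device_map="cuda:0",
)
# runs generation and judging, then returns the "overall" entry from score.json
overall = evaluator.evaluate(model, output_path="/tmp/mixeval/")
print(f"MixEval overall score: {overall}")
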
17 changes: 17 additions & 0 deletions mttl/models/base_model.py
@@ -5,6 +5,7 @@

import torch
from huggingface_hub import hf_hub_download
from transformers import PreTrainedModel
from transformers.modeling_outputs import CausalLMOutput

from mttl.logging import logger
@@ -64,6 +65,10 @@ def __init__(
            if model_object is None
            else model_object
        )
        if not isinstance(self.model, PreTrainedModel):
            raise ValueError(
                f"Model is not a subclass of PreTrainedModel. Got {type(self.model)}."
            )

        if model_object:
            logger.warning(
@@ -73,6 +78,18 @@
        self.config = config
        self.loading_kwargs = loading_kwargs

    @property
    def base_model_name_or_path(self) -> str:
        return self.config.base_model

    @property
    def max_position_embeddings(self) -> int:
        return self.base_model.config.max_position_embeddings

    @property
    def base_model(self) -> PreTrainedModel:
        return self.model

    def _delete_non_trainable_params(
        self, state_dict: Dict[str, torch.Tensor]
    ) -> Dict[str, torch.Tensor]: