update government benchmark

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>
kubeedge · Sep 9, 2024 · 189f36a · 189f36a
1 parent 171d59e
commit 189f36a
Show file tree

Hide file tree

Showing 15 changed files with 9 additions and 1,685 deletions.
diff --git a/dataset/government/objective/test_data/data.jsonl b/dataset/government/objective/test_data/data.jsonl
diff --git a/dataset/government/objective/test_data/data_info.json b/dataset/government/objective/test_data/data_info.json
diff --git a/dataset/government/objective/test_data/prompts.json b/dataset/government/objective/test_data/prompts.json
diff --git a/dataset/government/subjective/test_data/data.jsonl b/dataset/government/subjective/test_data/data.jsonl
diff --git a/dataset/government/subjective/test_data/data_full.jsonl b/dataset/government/subjective/test_data/data_full.jsonl
diff --git a/dataset/government/subjective/test_data/data_info.json b/dataset/government/subjective/test_data/data_info.json
diff --git a/dataset/government/subjective/test_data/prompts.json b/dataset/government/subjective/test_data/prompts.json
diff --git a/dataset/llm_simple_qa/test_data/data.jsonl b/dataset/llm_simple_qa/test_data/data.jsonl
diff --git a/dataset/llm_simple_qa/train_data/data.jsonl b/dataset/llm_simple_qa/train_data/data.jsonl
diff --git a/examples/government/singletask_learning_bench/README.md b/examples/government/singletask_learning_bench/README.md
@@ -15,6 +15,8 @@ This Benchmark consists of two parts: subjective evaluation data and objective e
 
 ## Prepare Datasets
 
+You can download the dataset from [Kaggle](https://www.kaggle.com/datasets/hsj576/government-bench-master).
+
 ```
 dataset/government
 ├── objective

diff --git a/examples/government/singletask_learning_bench/objective/testalgorithms/gen/basemodel.py b/examples/government/singletask_learning_bench/objective/testalgorithms/gen/basemodel.py
@@ -63,14 +63,14 @@ def predict(self, data, input_shape=None, **kwargs):
             infer_system_prompt = data.prompts['infer_system_prompt']
 
         answer_list = []
-        for line in tqdm(data.question, desc="Processing", unit="question"):
+        for line in tqdm(data.x, desc="Processing", unit="question"):
             # 3-shot
-            indices = random.sample([i for i, l in enumerate(data.question) if l != line], 3)
+            indices = random.sample([i for i, l in enumerate(data.x) if l != line], 3)
             history = []
             if infer_system_prompt:
                 history.append({"role": "system", "content": infer_system_prompt})
             for idx in indices:
-                history.append({"role": "user", "content": data.question[idx]})
+                history.append({"role": "user", "content": data.x[idx]})
                 history.append({"role": "assistant", "content": data.y[idx]})
             history.append({"role": "user", "content": line})
             response = self._infer(history)

diff --git a/examples/government/singletask_learning_bench/subjective/testalgorithms/gen/basemodel.py b/examples/government/singletask_learning_bench/subjective/testalgorithms/gen/basemodel.py
@@ -63,7 +63,7 @@ def predict(self, data, input_shape=None, **kwargs):
             infer_system_prompt = data.prompts['infer_system_prompt']
 
         answer_list = []
-        for line in tqdm(data.question, desc="Processing", unit="question"):
+        for line in tqdm(data.x, desc="Processing", unit="question"):
             history = []
             query = line.split('||')[0]
             if infer_system_prompt:
@@ -78,7 +78,7 @@ def predict(self, data, input_shape=None, **kwargs):
 
         # evaluate by llm
         for index in tqdm(range(len(answer_list)), desc="Evaluating", ascii=False, ncols=75):
-            prompt = data.prompts['eval_user_template'].replace('{question}', data.question[index].split('||')[0]).replace('{reference}', data.question[index].split('||')[1]).replace('{answer}', answer_list[index])
+            prompt = data.prompts['eval_user_template'].replace('{question}', data.x[index].split('||')[0]).replace('{reference}', data.x[index].split('||')[1]).replace('{answer}', answer_list[index])
             print(prompt)
             judgement = self._openai_generate(prompt)
             print(judgement)
@@ -117,7 +117,8 @@ def _infer(self, messages):
 
 
     def _openai_generate(self, user_question, system=None):
-        client = OpenAI(api_key="", base_url="https://api.deepseek.com")
+        key = os.getenv("DEEPSEEK_API_KEY")
+        client = OpenAI(api_key=key, base_url="https://api.deepseek.com")
 
         messages = []
         if system:

diff --git a/examples/resources/opencompass-0.2.5-py3-none-any.whl b/examples/resources/opencompass-0.2.5-py3-none-any.whl
diff --git a/examples/resources/sedna-jsondatainfo.zip b/examples/resources/sedna-jsondatainfo.zip
diff --git a/examples/resources/sedna-with-jsonl.zip b/examples/resources/sedna-with-jsonl.zip