add government benchmark

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>
kubeedge · Sep 9, 2024 · 171d59e · 171d59e
1 parent 147ce57
commit 171d59e
Show file tree

Hide file tree

Showing 31 changed files with 2,318 additions and 10 deletions.
diff --git a/core/testcasecontroller/algorithm/paradigm/singletask_learning/singletask_learning.py b/core/testcasecontroller/algorithm/paradigm/singletask_learning/singletask_learning.py
@@ -84,5 +84,8 @@ def _inference(self, job, trained_model):
         inference_output_dir = os.path.join(self.workspace, "output/inference/")
         os.environ["RESULT_SAVED_URL"] = inference_output_dir
         job.load(trained_model)
-        infer_res = job.predict(inference_dataset.x)
+        if hasattr(inference_dataset, 'need_other_info'):
+            infer_res = job.predict(inference_dataset)
+        else:
+            infer_res = job.predict(inference_dataset.x)
         return infer_res
diff --git a/core/testenvmanager/dataset/dataset.py b/core/testenvmanager/dataset/dataset.py
@@ -16,9 +16,10 @@
 
 import os
 import tempfile
+import json
 
 import pandas as pd
-from sedna.datasources import CSVDataParse, TxtDataParse, JSONDataParse, JsonlDataParse
+from sedna.datasources import CSVDataParse, TxtDataParse, JSONDataParse, JsonlDataParse, JSONDataInfoParse
 
 from core.common import utils
 from core.common.constant import DatasetFormat
@@ -42,6 +43,8 @@ def __init__(self, config):
         self.test_index: str = ""
         self.train_data: str = ""
         self.test_data: str = ""
+        self.train_data_info: str = ""
+        self.test_data_info: str = ""
         self.label: str = ""
         self._parse_config(config)
 
@@ -54,6 +57,10 @@ def _check_fields(self):
             self._check_dataset_url(self.train_data)
         if self.test_data:
             self._check_dataset_url(self.test_data)
+        if self.train_data_info:
+            self._check_dataset_url(self.train_data_info)
+        if self.test_data_info:
+            self._check_dataset_url(self.test_data_info)
 
     def _parse_config(self, config):
         for attr, value in config.items():
@@ -120,6 +127,13 @@ def _process_data_file(self, file_url):
 
         return None
 
+    def _process_data_info_file(self, file_url):
+        """Return file_url as-is if utils.get_file_format reports JSON, else None."""
+        if utils.get_file_format(file_url) != DatasetFormat.JSON.value:
+            return None
+
+        return file_url
+
     def process_dataset(self):
         """
         process dataset:
@@ -130,13 +144,24 @@ def process_dataset(self):
         """
         if self.train_index:
             self.train_url = self._process_index_file(self.train_index)
-        else:
+        elif self.train_data:
             self.train_url = self._process_data_file(self.train_data)
+        elif self.train_data_info:
+            self.train_url = self._process_data_info_file(self.train_data_info)
+            # raise NotImplementedError('to be done')
+        else:
+            raise NotImplementedError('not one of train_index/train_data/train_data_info')
 
         if self.test_index:
             self.test_url = self._process_index_file(self.test_index)
-        else:
+        elif self.test_data:
             self.test_url = self._process_data_file(self.test_data)
+        elif self.test_data_info:
+            self.test_url = self._process_data_info_file(self.test_data_info)
+            # raise NotImplementedError('to be done')
+        else:
+            raise NotImplementedError('not one of test_index/test_data/test_data_info')
+
 
     # pylint: disable=too-many-arguments
     def split_dataset(self, dataset_url, dataset_format, ratio, method="default",
@@ -411,6 +436,14 @@ def load_data(cls, file: str, data_type: str, label=None, use_raw=False, feature
             e.g.: TxtDataParse, CSVDataParse.
 
         """
+        print("file:")
+        print(file)
+        if file.split('/')[-1] == "data_info.json":
+            print('This is data_info.json')
+            data = JSONDataInfoParse(data_type=data_type, func=feature_process)
+            data.parse(file)
+            return data
+
         data_format = utils.get_file_format(file)
 
         data = None

diff --git a/dataset/government/objective/test_data/data.jsonl b/dataset/government/objective/test_data/data.jsonl
diff --git a/dataset/government/objective/test_data/data_info.json b/dataset/government/objective/test_data/data_info.json
@@ -0,0 +1,4 @@
+{
+    "keys": ["question", "A", "B", "C", "D", "answer"],
+    "answer_key": "answer"
+}
diff --git a/dataset/government/objective/test_data/prompts.json b/dataset/government/objective/test_data/prompts.json
@@ -0,0 +1,5 @@
+{
+    "infer_system_prompt": "你是一个中国的政务大模型助手，需要结合中国政务的一些知识来回答下面的问题。",
+    "infer_user_template": "Question:{question}\n请从下面四个选项中选出正确的选项:\nA:{A}\nB:{B}\nC:{C}\nD:{D}\n",
+    "infer_answer_template": "Answer:{answer}\n"
+}
diff --git a/dataset/government/subjective/test_data/data.jsonl b/dataset/government/subjective/test_data/data.jsonl
@@ -0,0 +1,5 @@
+{"question": "竞业限制适用于哪些员工?", "reference": "用人单位与劳动者可以在劳动合同中约定保守用人单位的商业秘密和与知识产权相关的保密事项。对负有保密义务的劳动者，用人单位可以在劳动合同或者保密协议中与劳动者约定竞业限制条款，并约定在解除或者终止劳动合同后，在竞业限制期限内按月给予劳动者经济补偿。劳动者违反竞业限制约定的，应当按照约定向用人单位支付违约金。 竞业限制的人员限于用人单位的高级管理人员、高级技术人员和其他负有保密义务的人员。"}
+{"question": "职工在多个单位就业，工伤保险费由谁缴纳?", "reference": "职工(包括非全日制从业人员)在两个或者两个以上用人单位同时就业的，各用人单位应当分别为职工缴纳工伤保险费。 职工发生工伤，由职工受到伤害时工作的单位依法承担工伤保险责任。"}
+{"question": "承诺放弃社保还能享受工伤保险待遇吗?", "reference": "根据《工伤保险条例》第二条规定，中华人民共和国境内的企业、事业单位、社会团体、民办非企业单位、基金会、律师事务所、会计师事务所等组织和有雇工的个体工商户应当依照本条例规定参加工伤保险，为本单位全部职工或者雇工缴纳工伤保险费。用人单位的雇工，均有依照本条例的规定享受工伤保险待遇的权利。 工伤保险是社会保险之一，不同于商业保险，属于国家强制性的保险。用人单位为职工参加工伤保险是为了保障职工在工作中受到事故伤害时，能依法从国家和社会获得物质帮助，也是法律明确规定用人单位应履行的义务，并不能由用人单位和职工自由协商决定放弃或免除。《劳动法》第七十二条规定，用人单位和劳动者必须依法参加社会保险，缴纳社会保险费。《社会保险法》第六十条规定，用人单位应当自行申报、按时足额缴纳社会保险费，非因不可抗力等法定事由不得缓缴、减免。"}
+{"question": "工伤职工进行劳动能力鉴定需符合什么条件？", "reference": "职工发生工伤，经治疗伤情相对稳定后存在残疾、影响劳动能力的，应当进行劳动能力鉴定。具体应符合以下条件： 一是经过治疗后，伤情处于相对稳定状态，这样便于劳动能力鉴定机构聘请的医疗卫生专家对伤情进行鉴定； 二是职工经治疗后，确认是因工伤原因造成职工身体上的残疾； 三是工伤职工的残疾将对以后的工作、生活产生直接影响，并且伤残程度已经影响职工本人的劳动能力。"}
+{"question": "工伤认定决定可以口头传达吗？", "reference": "不可以。工伤认定决定是工伤职工能否享受工伤保险待遇的依据，也是当事人进行行政复议和行政诉讼申请的依据。因此，工伤认定决定必须以书面方式送达。"}