Skip to content

Commit

Permalink
add llm-benchmarks proposal
Browse files Browse the repository at this point in the history
Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

add opencompass and llm singletask learning bench

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

update llm single task learning bench readme

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

add government benchmark

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

update government benchmark

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

update llm government benchmark implementation

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

update llm government benchmark implementation

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

update government README

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

update llm benchmark format

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

update government benchmark

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

update government benchmark dataset

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

add llm-benchmarks proposal

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

update llm benchmark proposal

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

update llm benchmark proposal

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

update llm benchmark proposal

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

translate llm-benchmark proposal

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

update proposal, add opencompass tutorial

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

update government benchmark sedna package

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

update government benchmark

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

update llm benchmark format

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

update llm benchmark format

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

fix pylint check problem

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

fix pylint check problem

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

fix pylint check problem

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

translate Chinese comments to English

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

add government llm benchmark

Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>

update government llm benchmark

Signed-off-by: IcyFeather <icyfeather@IcyFeatherdeMacBook-Pro.local>
  • Loading branch information
IcyFeather233 committed Oct 28, 2024
1 parent 5c48872 commit 179d39d
Show file tree
Hide file tree
Showing 31 changed files with 2,110 additions and 9 deletions.
1 change: 1 addition & 0 deletions core/common/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class DatasetFormat(Enum):
CSV = "csv"
TXT = "txt"
JSON = "json"
JSONL = "jsonl"


class ParadigmType(Enum):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,5 +84,8 @@ def _inference(self, job, trained_model):
inference_output_dir = os.path.join(self.workspace, "output/inference/")
os.environ["RESULT_SAVED_URL"] = inference_output_dir
job.load(trained_model)
infer_res = job.predict(inference_dataset.x)
if hasattr(inference_dataset, 'need_other_info'):
infer_res = job.predict(inference_dataset)
else:
infer_res = job.predict(inference_dataset.x)
return infer_res
77 changes: 69 additions & 8 deletions core/testenvmanager/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,16 @@

import os
import tempfile

import pandas as pd
from sedna.datasources import CSVDataParse, TxtDataParse, JSONDataParse

# pylint: disable=no-name-in-module
# pylint: disable=too-many-instance-attributes
from sedna.datasources import (
CSVDataParse,
TxtDataParse,
JSONDataParse,
JsonlDataParse,
JSONMetaDataParse,
)
from core.common import utils
from core.common.constant import DatasetFormat

Expand All @@ -38,12 +44,28 @@ class Dataset:
def __init__(self, config):
    """Hold dataset file locations parsed from the benchmark config.

    Every location attribute starts as an empty string; whatever the
    config actually provides is filled in by ``_parse_config``.
    """
    # All supported ways of pointing at train/test data: a resolved URL,
    # an index file, a raw data file, or a data-info (metadata) file,
    # plus the label column name.
    for attr in (
        "train_url", "test_url",
        "train_index", "test_index",
        "train_data", "test_data",
        "train_data_info", "test_data_info",
        "label",
    ):
        setattr(self, attr, "")
    self._parse_config(config)

def _check_fields(self):
    """Validate every configured dataset location.

    ``train_url`` and ``test_url`` are always checked; the index, data
    and data-info locations are optional and only checked when set.
    """
    self._check_dataset_url(self.train_url)
    self._check_dataset_url(self.test_url)
    optional_urls = (
        self.train_index, self.test_index,
        self.train_data, self.test_data,
        self.train_data_info, self.test_data_info,
    )
    for url in optional_urls:
        if url:
            self._check_dataset_url(url)

def _parse_config(self, config):
for attr, value in config.items():
Expand Down Expand Up @@ -103,6 +125,20 @@ def _process_index_file(self, file_url):

return None

def _process_data_file(self, file_url):
    """Return *file_url* when it is a JSONL data file, otherwise None.

    JSONL data files need no index rewriting, so the URL is passed
    through unchanged; any other format is rejected with None.
    """
    is_jsonl = utils.get_file_format(file_url) == DatasetFormat.JSONL.value
    return file_url if is_jsonl else None

def _process_data_info_file(self, file_url):
    """Return *file_url* when it is a JSON data-info file, otherwise None.

    Data-info (metadata) files are consumed as-is; any non-JSON format
    is rejected with None.
    """
    is_json = utils.get_file_format(file_url) == DatasetFormat.JSON.value
    return file_url if is_json else None

def process_dataset(self):
    """
    Resolve the configured train/test locations into usable dataset URLs.

    Exactly one of index / data / data_info must be set for each split:
    * ``*_index``: an index file (e.g. txt) whose file paths are resolved
      by ``_process_index_file``;
    * ``*_data``: a raw data file (jsonl), used as-is;
    * ``*_data_info``: a json metadata file describing the data, used as-is.

    Raises:
        NotImplementedError: if none of the three location kinds is set
            for the train or the test split.
    """
    if self.train_index:
        self.train_url = self._process_index_file(self.train_index)
    elif self.train_data:
        self.train_url = self._process_data_file(self.train_data)
    elif self.train_data_info:
        self.train_url = self._process_data_info_file(self.train_data_info)
    else:
        raise NotImplementedError(
            'not one of train_index/train_data/train_data_info')

    if self.test_index:
        self.test_url = self._process_index_file(self.test_index)
    elif self.test_data:
        self.test_url = self._process_data_file(self.test_data)
    elif self.test_data_info:
        self.test_url = self._process_data_info_file(self.test_data_info)
    else:
        raise NotImplementedError(
            'not one of test_index/test_data/test_data_info')

    # NOTE: the old unconditional re-processing
    #   self.train_url = self._process_index_file(self.train_url)
    #   self.test_url = self._process_index_file(self.test_url)
    # was removed: the URLs are already resolved by the branches above,
    # and _process_index_file would clobber a jsonl/json URL with None
    # since it only recognizes index formats.

# pylint: disable=too-many-arguments
def split_dataset(self, dataset_url, dataset_format, ratio, method="default",
Expand Down Expand Up @@ -388,6 +441,11 @@ def load_data(cls, file: str, data_type: str, label=None, use_raw=False, feature
e.g.: TxtDataParse, CSVDataParse.
"""
if file.split('/')[-1] == "metadata.json":
data = JSONMetaDataParse(data_type=data_type, func=feature_process)
data.parse(file)
return data

data_format = utils.get_file_format(file)

data = None
Expand All @@ -397,11 +455,14 @@ def load_data(cls, file: str, data_type: str, label=None, use_raw=False, feature

if data_format == DatasetFormat.TXT.value:
data = TxtDataParse(data_type=data_type, func=feature_process)
#print(file)
data.parse(file, use_raw=use_raw)

if data_format == DatasetFormat.JSON.value:
data = JSONDataParse(data_type=data_type, func=feature_process)
data.parse(file)

if data_format == DatasetFormat.JSONL.value:
data = JsonlDataParse(data_type=data_type, func=feature_process)
data.parse(file)

return data
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 179d39d

Please sign in to comment.