def _compress(self, trained_model):
    """
    Quantize a model file with the external ``llama-quantize`` executable.

    The tool is invoked as
    ``<llama_quantize_path> <trained_model> <output> <quantization_type>``.
    The output path is the input path with ``_<quantization_type>``
    appended to the file stem, e.g. ``model.gguf`` -> ``model_q8_0.gguf``.

    Parameters
    ----------
    trained_model : str
        Path of the model file to quantize.

    Returns
    -------
    str
        Path of the quantized model when the tool exits with status 0.
        In every failure case (model file missing, tool missing, no
        quantization type configured, tool cannot be started or exits
        non-zero) ``trained_model`` is returned unchanged, so the caller
        always receives the same kind of value and can continue with the
        uncompressed model.
    """
    # Function-scope import: only ``os`` and ``subprocess`` are known to be
    # imported at the top of this module.
    import logging

    logger = logging.getLogger(__name__)

    if not trained_model or not os.path.exists(trained_model):
        logger.warning("Skip compression: model file not found: %s", trained_model)
        return trained_model

    if not self.llama_quantize_path or not os.path.exists(self.llama_quantize_path):
        logger.warning(
            "Skip compression: llama-quantize not found: %s", self.llama_quantize_path
        )
        return trained_model

    # ``not`` (rather than ``is None``) also rejects the empty string.
    if not self.quantization_type:
        logger.warning("Skip compression: no quantization type configured")
        return trained_model

    # Only touch the final extension. A plain str.replace('.gguf', ...) would
    # also rewrite directory names containing '.gguf' and would produce
    # output == input when the file has no '.gguf' suffix at all.
    stem, extension = os.path.splitext(trained_model)
    compressed_model = f"{stem}_{self.quantization_type}{extension or '.gguf'}"

    command = [
        self.llama_quantize_path,
        trained_model,
        compressed_model,
        self.quantization_type,
    ]

    try:
        subprocess.run(command, check=True)
    except (subprocess.CalledProcessError, OSError) as err:
        # CalledProcessError: tool ran and failed; OSError: tool could not be
        # started (e.g. not executable). Fall back to the uncompressed model.
        logger.warning("Compression failed, using uncompressed model: %s", err)
        return trained_model

    return compressed_model
+├── test_data +│ └── data.jsonl +└── train_data + └── data.jsonl +``` + +`train_data/data.jsonl` is empty, and the `test_data/data.jsonl` is as follows: + +``` +{"question": "Which of the following numbers is the smallest prime number?\nA. 0\nB. 1\nC. 2\nD. 4", "answer": "C"} +``` +### prepare env + +```shell +python setup.py install +``` + +### Run Ianvs + + + +```shell +ianvs -f examples/llm-edge-benchmark-suite/single_task_bench/benchmarkingjob.yaml +``` + + +```shell +ianvs -f examples/llm-edge-benchmark-suite/single_task_bench_with_compression/benchmarkingjob.yaml +``` + diff --git a/examples/llm-edge-benchmark-suite/single_task_bench/README.md b/examples/llm-edge-benchmark-suite/single_task_bench/README.md new file mode 100644 index 00000000..3a3835c7 --- /dev/null +++ b/examples/llm-edge-benchmark-suite/single_task_bench/README.md @@ -0,0 +1,2 @@ +Large Language Model Edge Benchmark Suite: Implementation on KubeEdge-lanvs + diff --git a/examples/llm-edge-benchmark-suite/single_task_bench/benchmarkingjob.yaml b/examples/llm-edge-benchmark-suite/single_task_bench/benchmarkingjob.yaml new file mode 100644 index 00000000..ae2b23c6 --- /dev/null +++ b/examples/llm-edge-benchmark-suite/single_task_bench/benchmarkingjob.yaml @@ -0,0 +1,30 @@ +benchmarkingjob: + name: "benchmarkingjob" + workspace: "./workspace" + + testenv: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/testenv.yaml" + + test_object: + type: "algorithms" + algorithms: + - name: "llama-cpp" + url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/algorithm.yaml" + + rank: + sort_by: + - { "latency": "descend" } + - { "throughput": "ascend" } + - { "mem_usage": "ascend" } + - { "prefill_latency": "ascend"} + + visualization: + mode: "selected_only" + method: "print_table" + + selected_dataitem: + paradigms: [ "all" ] + modules: [ "all" ] + hyperparameters: [ "all" ] + metrics: [ "latency", "throughput", "prefill_latency" ] + + 
@ClassFactory.register(ClassType.GENERAL, alias="LlamaCppModel")
class LlamaCppModel:
    """Base model that wraps ``llama_cpp.Llama`` and records, per prompt,
    the total latency, the prefill latency and the process memory usage."""

    def __init__(self, **kwargs):
        """
        init llama-cpp

        Parameters
        ----------
        **kwargs
            ``model_path`` is required. ``n_ctx``, ``n_gpu_layers``, ``seed``,
            ``f16_kv``, ``logits_all``, ``vocab_only``, ``use_mlock`` and
            ``embedding`` are forwarded to ``Llama`` with the defaults shown
            below. ``quantization_type`` is only logged.

        Raises
        ------
        ValueError
            If ``model_path`` is missing or empty.
        """
        model_path = kwargs.get("model_path")
        if not model_path:
            raise ValueError("Model path is required.")
        quantization_type = kwargs.get("quantization_type", None)
        if quantization_type:
            logging.info("Using quantization type: %s", quantization_type)
        # Init LLM model
        self.model = Llama(
            model_path=model_path,
            n_ctx=kwargs.get("n_ctx", 512),
            n_gpu_layers=kwargs.get("n_gpu_layers", 0),
            seed=kwargs.get("seed", -1),
            f16_kv=kwargs.get("f16_kv", True),
            logits_all=kwargs.get("logits_all", False),
            vocab_only=kwargs.get("vocab_only", False),
            use_mlock=kwargs.get("use_mlock", False),
            embedding=kwargs.get("embedding", False),
        )

    def predict(self, data, input_shape=None, **kwargs):
        """
        Run the model on at most the first 10 prompts of ``data``.

        Parameters
        ----------
        data : sequence
            Prompts; only ``data[:10]`` is evaluated.
        input_shape
            Unused.
        **kwargs
            Optional sampling settings (``max_tokens``, ``stop``, ``echo``,
            ``temperature``, ``top_p``, ``top_k``, ``repeat_penalty``).

        Returns
        -------
        dict
            ``{"results": [...]}`` with one dict per prompt holding
            ``generated_text``, ``total_time`` (ms), ``prefill_latency``
            (ms, 0.0 when no timing line was captured) and ``mem_usage``
            (``psutil`` RSS of this process).
        """
        prompts = data[:10]
        process = psutil.Process(os.getpid())
        results = []

        for prompt in prompts:
            # perf_counter is monotonic, so the interval cannot be distorted
            # by wall-clock adjustments during the benchmark.
            started = time.perf_counter()

            captured = io.StringIO()
            # NOTE(review): this only captures writes that go through Python's
            # sys.stderr; confirm the llama.cpp timing lines actually arrive
            # here, otherwise prefill_latency is always 0.0.
            with redirect_stderr(captured):
                output = self.model(
                    prompt=prompt,
                    max_tokens=kwargs.get("max_tokens", 32),
                    stop=kwargs.get("stop", ["Q:", "\n"]),
                    echo=kwargs.get("echo", True),
                    temperature=kwargs.get("temperature", 0.8),
                    top_p=kwargs.get("top_p", 0.95),
                    top_k=kwargs.get("top_k", 40),
                    repeat_penalty=kwargs.get("repeat_penalty", 1.1),
                )

            # parse timing info
            timings = self._parse_timings(captured.getvalue())
            prefill_latency = timings.get('prompt_eval_time', 0.0)  # ms
            generated_text = output['choices'][0]['text']

            elapsed_ms = (time.perf_counter() - started) * 1000  # convert to ms

            results.append({
                "generated_text": generated_text,
                "total_time": elapsed_ms,
                "prefill_latency": prefill_latency,
                "mem_usage": process.memory_info().rss,
            })

        return {"results": results}

    def _parse_timings(self, stdout_output):
        """
        Extract ``llama_print_timings:`` values from captured text.

        Each line of the form ``llama_print_timings: <name> = <number> ms``
        becomes an entry whose key is ``<name>`` lower-cased with spaces
        replaced by underscores and whose value is ``<number>`` as a float.

        Returns
        -------
        dict
            Mapping of timing name to milliseconds; empty when nothing matched.
        """
        import re

        # Compiled once per call instead of once per line.
        pattern = re.compile(r'llama_print_timings:\s*(.+?)\s*=\s*([0-9\.]+)\s*ms')
        timings = {}
        for line in stdout_output.split('\n'):
            match = pattern.match(line)
            if match:
                key = match.group(1).strip().lower().replace(' ', '_')
                timings[key] = float(match.group(2))

        return timings

    def evaluate(self, data, model_path=None, **kwargs):
        """
        evaluate model

        Parameters
        ----------
        data
            Object whose ``x`` attribute holds the prompts.
        model_path : str, optional
            When given, ``self.load(model_path)`` is called first.
        **kwargs
            Must contain ``metric`` as a ``(name, callable)`` pair; the
            remaining entries are forwarded to ``predict``.

        Returns
        -------
        dict
            ``{metric_name: metric_value}``.

        Raises
        ------
        ValueError
            If the data is missing, no metric is supplied, or the metric
            function is not callable.
        """
        if data is None or data.x is None:
            raise ValueError("Evaluation data is None.")

        if model_path:
            self.load(model_path)

        # do predict
        predict_dict = self.predict(data.x, **kwargs)

        # compute metrics
        metric = kwargs.get("metric")
        if metric is None:
            raise ValueError("No metric provided in kwargs.")

        metric_name, metric_func = metric

        if not callable(metric_func):
            raise ValueError(f"Metric function {metric_name} is not callable or not provided.")

        # The metric functions of this example read y_pred.get('results', []),
        # so they need the whole prediction dict, not the bare results list.
        metric_value = metric_func(None, predict_dict)
        return {metric_name: metric_value}

    def save(self, model_path):
        """No-op: nothing is persisted."""
        pass

    def load(self, model_url):
        """No-op: the model is loaded in ``__init__``."""
        pass

    def train(self, train_data, valid_data=None, **kwargs):
        """No-op: this benchmark performs inference only; returns ``None``."""
        return
@ClassFactory.register(ClassType.GENERAL, alias="latency")
def latency(y_true, y_pred):
    """
    Average per-request latency.

    Parameters
    ----------
    y_true
        Unused; kept for the metric-function signature.
    y_pred : dict
        Prediction dict whose ``'results'`` list holds one dict per request,
        each with a ``'total_time'`` entry.

    Returns
    -------
    float
        Mean of ``total_time`` over all requests, in the unit the values were
        recorded in. ``0.0`` when there are no results, instead of raising
        ``ZeroDivisionError``.
    """
    results_list = y_pred.get('results', [])
    if not results_list:
        return 0.0
    total_latency = sum(result['total_time'] for result in results_list)
    return total_latency / len(results_list)
@ClassFactory.register(ClassType.GENERAL, alias="prefill_latency")
def prefill_latency(y_true, y_pred):
    """
    Average per-request prefill (prompt evaluation) latency.

    Parameters
    ----------
    y_true
        Unused; kept for the metric-function signature.
    y_pred : dict
        Prediction dict whose ``'results'`` list holds one dict per request,
        each with a ``'prefill_latency'`` entry.

    Returns
    -------
    float
        Mean of ``prefill_latency`` over all requests. ``0.0`` when there are
        no results, instead of raising ``ZeroDivisionError``.
    """
    results_list = y_pred.get('results', [])
    if not results_list:
        return 0.0
    total_prefill_latency = sum(result['prefill_latency'] for result in results_list)
    return total_prefill_latency / len(results_list)
"mem_usage" + url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/mem_usage.py" diff --git a/examples/llm-edge-benchmark-suite/single_task_bench/testenv/throughput.py b/examples/llm-edge-benchmark-suite/single_task_bench/testenv/throughput.py new file mode 100644 index 00000000..3ad7a05a --- /dev/null +++ b/examples/llm-edge-benchmark-suite/single_task_bench/testenv/throughput.py @@ -0,0 +1,30 @@ +# Copyright 2023 The KubeEdge Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
@ClassFactory.register(ClassType.GENERAL, alias="throughput")
def throughput(y_true, y_pred):
    """
    Requests completed per unit of summed request time.

    Parameters
    ----------
    y_true
        Unused; kept for the metric-function signature.
    y_pred : dict
        Prediction dict whose ``'results'`` list holds one dict per request,
        each with a ``'total_time'`` entry.

    Returns
    -------
    float
        ``len(results) / sum(total_time)``; the unit is therefore requests
        per unit of ``total_time``. ``0.0`` when there are no results or the
        summed time is zero, instead of raising ``ZeroDivisionError``.
    """
    results_list = y_pred.get('results', [])
    total_latency = sum(result['total_time'] for result in results_list)
    if not results_list or not total_latency:
        return 0.0
    return len(results_list) / total_latency
"throughput", "prefill_latency" ] + + save_mode: "selected_and_all" \ No newline at end of file diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/algorithm.yaml b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/algorithm.yaml new file mode 100644 index 00000000..1fdd5d5b --- /dev/null +++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/algorithm.yaml @@ -0,0 +1,18 @@ +algorithm: + paradigm_type: "singletasklearning_with_compression" + mode: "with_compression" + initial_model_url: "models/qwen/qwen_1_5_0_5b.gguf" + quantization_type: "q8_0" + llama_quantize_path: "llama.cpp/llama-quantize" + modules: + - type: "basemodel" + name: "LlamaCppModel" + url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/basemodel.py" + + hyperparameters: + - model_path: + values: + - "models/qwen/qwen_1_5_0_5b.gguf" + - n_ctx: + values: + - 2048 \ No newline at end of file diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/basemodel.py b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/basemodel.py new file mode 100644 index 00000000..4ad1634b --- /dev/null +++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/basemodel.py @@ -0,0 +1,129 @@ +from sedna.common.class_factory import ClassFactory, ClassType +from llama_cpp import Llama +from contextlib import redirect_stderr +import os +import psutil +import time +import io +import statistics + +@ClassFactory.register(ClassType.GENERAL, alias="LlamaCppModel") +class LlamaCppModel: + def __init__(self, **kwargs): + """ + init llama-cpp + """ + model_path = kwargs.get("model_path") + if not model_path: + raise ValueError("Model path is required.") + quantization_type = kwargs.get("quantization_type", None) + # Init LLM model + self.model = Llama( + model_path=model_path, 
+ n_ctx=kwargs.get("n_ctx", 512), + n_gpu_layers=kwargs.get("n_gpu_layers", 0), + seed=kwargs.get("seed", -1), + f16_kv=kwargs.get("f16_kv", True), + logits_all=kwargs.get("logits_all", False), + vocab_only=kwargs.get("vocab_only", False), + use_mlock=kwargs.get("use_mlock", False), + embedding=kwargs.get("embedding", False), + ) + + def predict(self, data, input_shape=None, **kwargs): + data = data[:10] + process = psutil.Process(os.getpid()) + start_time = time.time() + + results = [] + total_times = [] + prefill_latencies = [] + mem_usages = [] + + for prompt in data: + prompt_start_time = time.time() + + f = io.StringIO() + with redirect_stderr(f): + output = self.model( + prompt=prompt, + max_tokens=kwargs.get("max_tokens", 32), + stop=kwargs.get("stop", ["Q:", "\n"]), + echo=kwargs.get("echo", True), + temperature=kwargs.get("temperature", 0.8), + top_p=kwargs.get("top_p", 0.95), + top_k=kwargs.get("top_k", 40), + repeat_penalty=kwargs.get("repeat_penalty", 1.1), + ) + stdout_output = f.getvalue() + + # parse timing info + timings = self._parse_timings(stdout_output) + prefill_latency = timings.get('prompt_eval_time', 0.0) # ms + generated_text = output['choices'][0]['text'] + + prompt_end_time = time.time() + prompt_total_time = (prompt_end_time - prompt_start_time) * 1000 # convert to ms + + result_with_time = { + "generated_text": generated_text, + "total_time": prompt_total_time, + "prefill_latency": prefill_latency, + "mem_usage":process.memory_info().rss, + } + + results.append(result_with_time) + + predict_dict = { + "results": results, + } + + return predict_dict + + def _parse_timings(self, stdout_output): + import re + timings = {} + for line in stdout_output.split('\n'): + match = re.match(r'llama_print_timings:\s*(.+?)\s*=\s*([0-9\.]+)\s*ms', line) + if match: + key = match.group(1).strip() + value = float(match.group(2)) + + key = key.lower().replace(' ', '_') + timings[key] = value + return timings + + def evaluate(self, data, model_path=None, 
def download_model(model_id, revision, local_dir):
    """
    Download a model snapshot from ModelScope.

    Parameters
    ----------
    model_id : str
        ModelScope model ID.
    revision : str
        Model revision to fetch.
    local_dir : str
        Directory passed to ``snapshot_download`` as ``cache_dir``.

    Returns
    -------
    str or None
        The directory returned by ``snapshot_download``, or ``None`` when the
        download raised any exception (the error is logged with traceback).
    """
    try:
        model_dir = snapshot_download(model_id, revision=revision, cache_dir=local_dir)
    except Exception as e:  # best-effort helper: report every failure, never raise
        # Error level plus traceback; an INFO line is easy to miss and loses
        # the cause of the failure.
        logging.exception("Error downloading model: %s", e)
        return None
    logging.info(f"Model successfully downloaded to: {model_dir}")
    return model_dir


if __name__ == "__main__":
    import sys

    parser = argparse.ArgumentParser(description="Download a model from ModelScope")
    parser.add_argument("--model_id", type=str, required=True, help="ModelScope model ID")
    parser.add_argument("--revision", type=str, default="master", help="Model revision")
    parser.add_argument("--local_dir", type=str, required=True, help="Local directory to save the model")

    args = parser.parse_args()

    # Report a failed download through the exit status so shell scripts and CI
    # steps calling this file can detect it.
    if download_model(args.model_id, args.revision, args.local_dir) is None:
        sys.exit(1)
@ClassFactory.register(ClassType.GENERAL, alias="mem_usage")
def mem_usage(y_true, y_pred):
    """
    Average per-request memory usage.

    Parameters
    ----------
    y_true
        Unused; kept for the metric-function signature.
    y_pred : dict
        Prediction dict whose ``'results'`` list holds one dict per request,
        each with a ``'mem_usage'`` entry.

    Returns
    -------
    float
        Mean of ``mem_usage`` over all requests, in the unit the values were
        recorded in. ``0.0`` when there are no results, instead of raising
        ``ZeroDivisionError``.
    """
    results_list = y_pred.get('results', [])
    if not results_list:
        return 0.0
    total_mem_usage = sum(result['mem_usage'] for result in results_list)
    return total_mem_usage / len(results_list)
total_prefill_latency = 0.0 + for result in results_list: + total_prefill_latency += result['prefill_latency'] + avg_prefill_latency = total_prefill_latency / num_requests + return avg_prefill_latency \ No newline at end of file diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/testenv.yaml b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/testenv.yaml new file mode 100644 index 00000000..69de256f --- /dev/null +++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/testenv.yaml @@ -0,0 +1,14 @@ +testenv: + dataset: + train_data: "ianvs/government/objective/train_data/data.jsonl" + test_data: "ianvs/government/objective/test_data/data.jsonl" + use_gpu: false + metrics: + - name: "latency" + url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/latency.py" + - name: "throughput" + url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/throughput.py" + - name: "prefill_latency" + url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/prefill_latency.py" + - name: "mem_usage" + url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/mem_usage.py" diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/throughput.py b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/throughput.py new file mode 100644 index 00000000..3ad7a05a --- /dev/null +++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/throughput.py @@ -0,0 +1,30 @@ +# Copyright 2023 The KubeEdge Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
@ClassFactory.register(ClassType.GENERAL, alias="throughput")
def throughput(y_true, y_pred):
    """
    Requests completed per unit of summed request time.

    Parameters
    ----------
    y_true
        Unused; kept for the metric-function signature.
    y_pred : dict
        Prediction dict whose ``'results'`` list holds one dict per request,
        each with a ``'total_time'`` entry.

    Returns
    -------
    float
        ``len(results) / sum(total_time)``; the unit is therefore requests
        per unit of ``total_time``. ``0.0`` when there are no results or the
        summed time is zero, instead of raising ``ZeroDivisionError``.
    """
    results_list = y_pred.get('results', [])
    num_requests = len(results_list)
    total_latency = sum(result['total_time'] for result in results_list)
    # Guard both degenerate cases: nothing was measured, or every measured
    # duration was zero.
    if num_requests == 0 or total_latency == 0:
        return 0.0
    return num_requests / total_latency