diff --git a/examples/llm-edge-benchmark-suite/README.md b/examples/llm-edge-benchmark-suite/README.md index 9499a005..8ef4ae97 100644 --- a/examples/llm-edge-benchmark-suite/README.md +++ b/examples/llm-edge-benchmark-suite/README.md @@ -18,16 +18,7 @@ The data of llm-edge-benchmark-suite example structure is: `train_data/data.jsonl` is empty, and the `test_data/data.jsonl` is as follows: ``` -{"question": "如果小明有5个苹果,他给了小华3个,那么小明还剩下多少个苹果?\nA. 2个\nB. 3个\nC. 4个\nD. 5个", "answer": "A"} -{"question": "下列哪个数是最小的质数?\nA. 0\nB. 1\nC. 2\nD. 4", "answer": "C"} -{"question": "一个长方形的长是10厘米,宽是5厘米,它的周长是多少厘米?\nA. 20厘米\nB. 30厘米\nC. 40厘米\nD. 50厘米", "answer": "B"} -{"question": "下列哪个分数是最接近1的?\nA. 1/2\nB. 3/4\nC. 4/5\nD. 5/6", "answer": "D"} -{"question": "如果一个数加上10等于30,那么这个数是多少?\nA. 20\nB. 21\nC. 22\nD. 23", "answer": "A"} -{"question": "下列哪个算式的结果最大?\nA. 3 + 4\nB. 5 - 2\nC. 6 * 2\nD. 7 ÷ 2", "answer": "C"} -{"question": "一个班级有24个学生,如果每个学生都带了2本书,那么总共有多少本书?\nA. 48本\nB. 36本\nC. 24本\nD. 12本", "answer": "A"} -{"question": "下列哪个是正确的乘法口诀?\nA. 三三得七\nB. 四四十六\nC. 五五二十五\nD. 六六三十六", "answer": "B"} -{"question": "如果一个数是另一个数的3倍,并且这个数是15,那么另一个数是多少?\nA. 5\nB. 10\nC. 15\nD. 45", "answer": "A"} -{"question": "下列哪个图形的周长最长?\nA. 正方形\nB. 长方形\nC. 圆形\nD. 三角形", "answer": "C"} +{"question": "Which of the following numbers is the smallest prime number?\nA. 0\nB. 1\nC. 2\nD. 4", "answer": "C"} ``` ### prepare env diff --git a/examples/llm-edge-benchmark-suite/single_task_bench/testalgorithms/basemodel.py b/examples/llm-edge-benchmark-suite/single_task_bench/testalgorithms/basemodel.py index d38fc2f8..06b45875 100644 --- a/examples/llm-edge-benchmark-suite/single_task_bench/testalgorithms/basemodel.py +++ b/examples/llm-edge-benchmark-suite/single_task_bench/testalgorithms/basemodel.py @@ -88,9 +88,6 @@ def predict(self, data, input_shape=None, **kwargs): def _parse_timings(self, stdout_output): import re timings = {} - print("================================") - print(stdout_output) - print("================================") for line in stdout_output.split('\n'): match = re.match(r'llama_print_timings:\s*(.+?)\s*=\s*([0-9\.]+)\s*ms', line) if match: @@ -99,9 +96,6 @@ def _parse_timings(self, stdout_output): key = key.lower().replace(' ', '_') timings[key] = value - print(f"Captured timing: {key} = {value}") - else: - print("No match found for this line.") return timings diff --git a/examples/llm-edge-benchmark-suite/single_task_bench/testenv/prefill_latency.py b/examples/llm-edge-benchmark-suite/single_task_bench/testenv/prefill_latency.py index 15e94f35..b7743577 100644 --- a/examples/llm-edge-benchmark-suite/single_task_bench/testenv/prefill_latency.py +++ b/examples/llm-edge-benchmark-suite/single_task_bench/testenv/prefill_latency.py @@ -4,9 +4,6 @@ @ClassFactory.register(ClassType.GENERAL, alias="prefill_latency") def prefill_latency(y_true, y_pred): - # avg_prefill_latency = y_pred.get('avg_prefill_latency', []) - # return avg_prefill_latency - #TODO 前面所有歌predict_dict 的结果,可以通过下面拿出来,我想把计算平均的过程放在这里,帮我实现 results_list = y_pred.get('results', []) num_requests = len(results_list) total_prefill_latency = 0.0 diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/algorithm.yaml b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/algorithm.yaml index 749868a1..1fdd5d5b 100644 --- a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/algorithm.yaml +++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/algorithm.yaml @@ -1,6 +1,6 @@ algorithm: paradigm_type: "singletasklearning_with_compression" - + mode: "with_compression" initial_model_url: "models/qwen/qwen_1_5_0_5b.gguf" quantization_type: "q8_0" llama_quantize_path: "llama.cpp/llama-quantize" diff --git a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/testenv.yaml b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/testenv.yaml index e4a6e88a..69de256f 100644 --- a/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/testenv.yaml +++ b/examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/testenv.yaml @@ -2,7 +2,7 @@ testenv: dataset: train_data: "ianvs/government/objective/train_data/data.jsonl" test_data: "ianvs/government/objective/test_data/data.jsonl" - use_gpu: true + use_gpu: false metrics: - name: "latency" url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/latency.py"