Commit 6824017

add: example for LLM joint inference

Signed-off-by: Yu Fan <fany@buaa.edu.cn>
FuryMartin committed Aug 2, 2024 (1 parent: 4ac560b)

Showing 13 changed files with 560 additions and 0 deletions.

54 changes: 54 additions & 0 deletions examples/cloud-edge-collaborative-inference-for-llm/README.md
# README

## Simple QA

### Prepare Data

The data directory of the simple-qa example is structured as follows:

```
.
├── test_data
│   └── data.jsonl
└── train_data
    └── data.jsonl
```

`train_data/data.jsonl` is empty, and `test_data/data.jsonl` contains the following:

```
{"question": "如果小明有5个苹果,他给了小华3个,那么小明还剩下多少个苹果?\nA. 2个\nB. 3个\nC. 4个\nD. 5个", "answer": "A"}
{"question": "下列哪个数是最小的质数?\nA. 0\nB. 1\nC. 2\nD. 4", "answer": "C"}
{"question": "一个长方形的长是10厘米,宽是5厘米,它的周长是多少厘米?\nA. 20厘米\nB. 30厘米\nC. 40厘米\nD. 50厘米", "answer": "B"}
{"question": "下列哪个分数是最接近1的?\nA. 1/2\nB. 3/4\nC. 4/5\nD. 5/6", "answer": "D"}
{"question": "如果一个数加上10等于30,那么这个数是多少?\nA. 20\nB. 21\nC. 22\nD. 23", "answer": "A"}
{"question": "下列哪个算式的结果最大?\nA. 3 + 4\nB. 5 - 2\nC. 6 * 2\nD. 7 ÷ 2", "answer": "C"}
{"question": "一个班级有24个学生,如果每个学生都带了2本书,那么总共有多少本书?\nA. 48本\nB. 36本\nC. 24本\nD. 12本", "answer": "A"}
{"question": "下列哪个是正确的乘法口诀?\nA. 三三得七\nB. 四四十六\nC. 五五二十五\nD. 六六三十六", "answer": "B"}
{"question": "如果一个数是另一个数的3倍,并且这个数是15,那么另一个数是多少?\nA. 5\nB. 10\nC. 15\nD. 45", "answer": "A"}
{"question": "下列哪个图形的周长最长?\nA. 正方形\nB. 长方形\nC. 圆形\nD. 三角形", "answer": "C"}
```
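
Each line is a standalone JSON object with a `question` field and an `answer` field. For illustration, here is a minimal sketch of how such a file can be read in Python (presumably the kind of parsing that the `JsonlDataParse` class mentioned below provides):

```
import json

def load_jsonl(path):
    # Each line of data.jsonl is an independent JSON object.
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

samples = load_jsonl("test_data/data.jsonl")
print(samples[0]["question"], samples[0]["answer"])
```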

### Prepare Environment

You need to install a modified sedna package, which adds `JsonlDataParse` to `sedna.datasources`.

Replace the installed package at `yourpath/anaconda3/envs/ianvs/lib/python3.x/site-packages/sedna` with the contents of `examples/resources/sedna-with-jsonl.zip`.
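
For reference, a minimal sketch of that replacement step in Python (the paths are placeholders for your own environment, and it assumes the archive unpacks to a top-level `sedna/` directory):

```
import shutil
import zipfile

# Placeholders: adjust to your conda environment and your checkout of ianvs.
site_pkgs = "yourpath/anaconda3/envs/ianvs/lib/python3.x/site-packages"
archive = "examples/resources/sedna-with-jsonl.zip"

shutil.rmtree(f"{site_pkgs}/sedna", ignore_errors=True)  # remove the stock package
with zipfile.ZipFile(archive) as zf:
    zf.extractall(site_pkgs)  # unpack the patched sedna in its place
```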


### Run Ianvs

Run the following command:

`ianvs -f examples/llm/singletask_learning_bench/simple_qa/benchmarkingjob.yaml`

## OpenCompass Evaluation

### Prepare Environment

`pip install examples/resources/opencompass-0.2.5-py3-none-any.whl`

### Run Evaluation

`python run_op.py examples/llm/singletask_learning_bench/simple_qa/testalgorithms/gen/op_eval.py`

73 changes: 73 additions & 0 deletions benchmarkingjob.yaml
benchmarkingjob:
  # job name of benchmarking; string type;
  name: "benchmarkingjob"
  # the url address of the job workspace that will reserve the output of tests; string type;
  # "~/" cannot be identified, so it must be a relative or absolute path
  workspace: "./workspace"

  # the url address of the test environment configuration file; string type;
  # the file format supports yaml/yml;
  testenv: "./examples/cloud-edge-collaborative-inference-for-llm/testenv/testenv.yaml"

  # the configuration of the test object
  test_object:
    # test type; string type;
    # currently the only supported value is "algorithms"; others will be added in succession.
    type: "algorithms"
    # test algorithm configuration files; list type;
    algorithms:
      # algorithm name; string type;
      - name: "query-routing"
        # the url address of the test algorithm configuration file; string type;
        # the file format supports yaml/yml;
        url: "./examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/test_queryrouting.yaml"

  # the configuration of the ranking leaderboard
  rank:
    # rank the leaderboard by the metrics of each test case's evaluation, in the given order; list type;
    # the sorting priority is based on the sequence of metrics in the list, from front to back;
    sort_by: [ { "acc": "descend" } ]

    # visualization configuration
    visualization:
      # mode of visualization in the leaderboard; string type;
      # There are quite a few possible dataitems in the leaderboard. Not all of them can be shown simultaneously on the screen.
      # In the leaderboard, we provide the "selected_only" mode for the user to configure what is shown or is not shown.
      mode: "selected_only"
      # method of visualization for selected dataitems; string type;
      # currently the options of value are as follows:
      #   1> "print_table": print selected dataitems;
      method: "print_table"

    # selected dataitem configuration
    # The user can add his/her interested dataitems in terms of "paradigms", "modules", "hyperparameters" and "metrics",
    # so that the selected columns will be shown.
    selected_dataitem:
      # currently the options of value are as follows:
      #   1> "all": select all paradigms in the leaderboard;
      #   2> paradigms in the leaderboard, e.g., "singletasklearning"
      paradigms: [ "all" ]
      # currently the options of value are as follows:
      #   1> "all": select all modules in the leaderboard;
      #   2> modules in the leaderboard, e.g., "basemodel"
      modules: [ "all" ]
      # currently the options of value are as follows:
      #   1> "all": select all hyperparameters in the leaderboard;
      #   2> hyperparameters in the leaderboard, e.g., "momentum"
      hyperparameters: [ "all" ]
      # currently the options of value are as follows:
      #   1> "all": select all metrics in the leaderboard;
      #   2> metrics in the leaderboard, e.g., "f1_score"
      metrics: [ "acc", "latency", "throughput", "bandwidth" ]

    # mode of saving selected and all dataitems in the workspace; string type;
    # currently the options of value are as follows:
    #   1> "selected_and_all": save selected and all dataitems;
    #   2> "selected_only": save selected dataitems;
    save_mode: "selected_and_all"
51 changes: 51 additions & 0 deletions (the cloud model, registered with alias "CloudModel")
# Copyright 2022 The KubeEdge Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import, division, print_function

import logging

from sedna.common.class_factory import ClassType, ClassFactory

from models import APIBasedLLM

logging.disable(logging.WARNING)

__all__ = ["BaseModel"]

@ClassFactory.register(ClassType.GENERAL, alias="CloudModel")
class BaseModel:
    """
    This is actually the Cloud Model.
    """
    def __init__(self, **kwargs):
        # The API KEY and API URL are confidential data and should not be
        # written in the yaml; APIBasedLLM reads them from the environment.
        self.model = APIBasedLLM(
            model_name=kwargs.get("model_name", "gpt-4o-mini"),
            config=kwargs.get("config", None),
        )

    def inference(self, data, input_shape=None, **kwargs):
        # BaseLLM.inference() already iterates over the samples,
        # so the whole batch is passed through in one call.
        return self.model.inference(data)
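
# Usage sketch (illustration only, not part of the benchmark flow):
# requires OPENAI_API_KEY and OPENAI_BASE_URL to be set in the environment.
if __name__ == "__main__":
    cloud = BaseModel(model_name="gpt-4o-mini")
    print(cloud.inference(["What is the smallest prime number?"]))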
74 changes: 74 additions & 0 deletions (the edge model, registered with alias "EdgeModel")
# Copyright 2022 The KubeEdge Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import, division, print_function

import os
import logging

from sedna.common.class_factory import ClassType, ClassFactory

from models import HuggingfaceLLM, VllmLLM

logging.disable(logging.WARNING)

__all__ = ["BaseModel"]

@ClassFactory.register(ClassType.GENERAL, alias="EdgeModel")
class BaseModel:
    """
    This is actually the Edge Model.
    """
    def __init__(self, **kwargs):
        self.kwargs = kwargs
        self.model_url = kwargs.get("model_name", None)
        # 'backend' is the serving framework: "huggingface" or "vllm"
        self.backend = kwargs.get("backend", "huggingface")
        # 'quantization' is the quantization mode: "full", "4-bit" or "8-bit"
        self.quantization = kwargs.get("quantization", "full")
        self._set_config()

    def _set_config(self):
        # Some parameters are passed to Sedna through environment variables.
        # For the EdgeModel URL, see
        # https://github.com/kubeedge/sedna/blob/ac623ab32dc37caa04b9e8480dbe1a8c41c4a6c2/lib/sedna/core/base.py#L132
        os.environ["MODEL_URL"] = self.model_url

    def load(self, model_url=None):
        if self.backend == "huggingface":
            self.model = HuggingfaceLLM(model_url, self.quantization)
        elif self.backend == "vllm":
            self.model = VllmLLM(model_url, self.quantization)
        else:
            raise Exception(f"Backend {self.backend} is not supported")

        self.model.load(model_url=model_url)

    # TODO: the cloud service must be configured in JointInference

    def predict(self, data, input_shape=None, **kwargs):
        # BaseLLM.inference() already iterates over the samples,
        # so the whole batch is passed through in one call.
        return self.model.inference(data)
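
# Usage sketch (illustration only): "Qwen/Qwen2-1.5B-Instruct" is just a
# placeholder model id; any local path or hub checkpoint works the same way.
if __name__ == "__main__":
    edge = BaseModel(model_name="Qwen/Qwen2-1.5B-Instruct", backend="huggingface")
    edge.load(edge.model_url)
    print(edge.predict(["What is 3 + 4?"]))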
64 changes: 64 additions & 0 deletions (hard example mining filters)
# Copyright 2021 The KubeEdge Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Hard Example Mining Algorithms"""

import abc
import math
import random
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from sedna.common.class_factory import ClassFactory, ClassType

__all__ = ('ThresholdFilter', 'CrossEntropyFilter', 'IBTFilter')

class BaseFilter(metaclass=abc.ABCMeta):
"""The base class to define unified interface."""

def __call__(self, infer_result=None):
"""
predict function, judge the sample is hard or not.
Parameters
----------
infer_result : array_like
prediction result
Returns
-------
is_hard_sample : bool
`True` means hard sample, `False` means not.
"""
raise NotImplementedError

@classmethod
def data_check(cls, data):
"""Check the data in [0,1]."""
return 0 <= float(data) <= 1


@ClassFactory.register(ClassType.HEM, alias="BERT")
class BERTFilter(BaseFilter, abc.ABC):
def __init__(self, model_path, **kwargs):
self.classifier = pipeline(
"text-classification",
model=model_path,
trust_remote_code=True
)

def _predict(self, data):
result = self.classifier(data)
return result

def __call__(self, data=None) -> bool:
return self._predict(data)
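
# Usage sketch (illustration only): "path/to/hard-sample-classifier" is a
# placeholder for a fine-tuned text-classification checkpoint.
if __name__ == "__main__":
    hem = BERTFilter("path/to/hard-sample-classifier")
    print(hem("Which fraction is closest to 1?"))  # True means: route to cloud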
3 changes: 3 additions & 0 deletions models/__init__.py
from .api_llm import APIBasedLLM
from .huggingface_llm import HuggingfaceLLM
from .vllm_llm import VllmLLM
38 changes: 38 additions & 0 deletions models/api_llm.py
import os

from openai import OpenAI

from .base_llm import BaseLLM


class APIBasedLLM(BaseLLM):
    def __init__(self, model_name, **kwargs) -> None:
        super().__init__(**kwargs)

        # The API key and base url are confidential data and are read
        # from the environment rather than from the yaml.
        api_key = os.environ.get("OPENAI_API_KEY")
        base_url = os.environ.get("OPENAI_BASE_URL")

        self.model = model_name
        self.client = OpenAI(
            api_key=api_key,
            base_url=base_url
        )

    def _infer(self, prompt, system=None):
        if system:
            messages = [
                {"role": "system", "content": system},
                {"role": "user", "content": prompt}
            ]
        else:
            messages = [
                {"role": "user", "content": prompt}
            ]

        chat_completion = self.client.chat.completions.create(
            messages=messages,
            model=self.model,
        )

        return chat_completion.choices[0].message.content
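
# Usage sketch (illustration only): requires OPENAI_API_KEY and
# OPENAI_BASE_URL to be set in the environment.
if __name__ == "__main__":
    llm = APIBasedLLM(model_name="gpt-4o-mini")
    answer = llm._infer("What is the smallest prime number?",
                        system="Answer with a single number.")
    print(answer)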

17 changes: 17 additions & 0 deletions models/base_llm.py
class BaseLLM:
    def __init__(self, **kwargs) -> None:
        self.quantization = kwargs.get("quantization", "full")

    def load(self):
        raise NotImplementedError

    def inference(self, data):
        # Run _infer() on each sample and collect the responses.
        answer_list = []
        for line in data:
            response = self._infer(line)
            answer_list.append(response)
        return answer_list

    def _infer(self, data):
        raise NotImplementedError
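
# Minimal usage sketch (illustration only): subclasses implement _infer(),
# and inference() batches over it, which is the pattern APIBasedLLM follows.
if __name__ == "__main__":
    class EchoLLM(BaseLLM):
        def _infer(self, data):
            return f"echo: {data}"

    print(EchoLLM().inference(["hello", "world"]))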