Merge remote-tracking branch 'origin/habana_main' into private/kzawora/insert_or_update_cache_opt
kzawora-intel committed Sep 25, 2024
2 parents 8a90083 + 9111a80 commit a334539
Showing 874 changed files with 89,752 additions and 21,822 deletions.
35 changes: 21 additions & 14 deletions .buildkite/check-wheel-size.py
@@ -1,36 +1,43 @@
 import os
+import sys
 import zipfile
 
-MAX_SIZE_MB = 250
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB
+VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250))
 
 
 def print_top_10_largest_files(zip_file):
+    """Print the top 10 largest files in the given zip file."""
     with zipfile.ZipFile(zip_file, 'r') as z:
         file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
         file_sizes.sort(key=lambda x: x[1], reverse=True)
         for f, size in file_sizes[:10]:
-            print(f"{f}: {size/(1024*1024)} MBs uncompressed.")
+            print(f"{f}: {size / (1024 * 1024):.2f} MBs uncompressed.")
 
 
 def check_wheel_size(directory):
+    """Check the size of .whl files in the given directory."""
     for root, _, files in os.walk(directory):
-        for f in files:
-            if f.endswith(".whl"):
-                wheel_path = os.path.join(root, f)
-                wheel_size = os.path.getsize(wheel_path)
-                wheel_size_mb = wheel_size / (1024 * 1024)
-                if wheel_size_mb > MAX_SIZE_MB:
-                    print(
-                        f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
-                        f"compare to the allowed size ({MAX_SIZE_MB} MB).")
+        for file_name in files:
+            if file_name.endswith(".whl"):
+                wheel_path = os.path.join(root, file_name)
+                wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
+                if wheel_size_mb > VLLM_MAX_SIZE_MB:
+                    print(f"Not allowed: Wheel {wheel_path} is larger "
+                          f"({wheel_size_mb:.2f} MB) than the limit "
+                          f"({VLLM_MAX_SIZE_MB} MB).")
                     print_top_10_largest_files(wheel_path)
                     return 1
                 else:
                     print(f"Wheel {wheel_path} is within the allowed size "
-                          f"({wheel_size_mb} MB).")
+                          f"({wheel_size_mb:.2f} MB).")
     return 0
 
 
 if __name__ == "__main__":
-    import sys
-    sys.exit(check_wheel_size(sys.argv[1]))
+    if len(sys.argv) < 2:
+        print("Usage: python check-wheel-size.py <directory>")
+        sys.exit(1)
+
+    directory = sys.argv[1]
+    sys.exit(check_wheel_size(directory))
@@ -9,3 +9,4 @@ tasks:
     value: 0.664
 limit: 1000
 num_fewshot: 5
+trust_remote_code: True
11 changes: 11 additions & 0 deletions .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
+model_name: "HandH1998/QQQ-Llama-3-8b-g128"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.419
+  - name: "exact_match,flexible-extract"
+    value: 0.416
+limit: 1000
+num_fewshot: 5
@@ -1,11 +1,11 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nvidia/Minitron-4B-Base -b auto -l 1000 -f 5 -t 1
-model_name: "nvidia/Minitron-4B-Base"
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
+model_name: "mgoin/Minitron-4B-Base-FP8"
 tasks:
 - name: "gsm8k"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.252
+    value: 0.233
   - name: "exact_match,flexible-extract"
-    value: 0.252
+    value: 0.236
 limit: 1000
 num_fewshot: 5
4 changes: 2 additions & 2 deletions .buildkite/lm-eval-harness/configs/models-small.txt
@@ -1,9 +1,9 @@
 Meta-Llama-3-8B-Instruct.yaml
-Meta-Llama-3-8B-Instruct-FP8.yaml
 Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
-Minitron-4B-Base.yaml
+Minitron-4B-Base-FP8.yaml
 Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
 Qwen2-1.5B-Instruct-FP8W8.yaml
+Meta-Llama-3-8B-QQQ.yaml
7 changes: 5 additions & 2 deletions .buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -14,7 +14,7 @@
 import numpy
 import yaml
 
-RTOL = 0.02
+RTOL = 0.05
 TEST_DATA_FILE = os.environ.get(
     "LM_EVAL_TEST_DATA_FILE",
     ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
@@ -23,9 +23,12 @@


 def launch_lm_eval(eval_config):
+    trust_remote_code = eval_config.get('trust_remote_code', False)
+
     model_args = f"pretrained={eval_config['model_name']}," \
                  f"tensor_parallel_size={TP_SIZE}," \
-                 f"add_bos_token=true"
+                 f"add_bos_token=true," \
+                 f"trust_remote_code={trust_remote_code}"
 
     results = lm_eval.simple_evaluate(
         model="vllm",
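As a side note (not part of the commit): a minimal sketch of the `model_args` string that the updated `launch_lm_eval` assembles, assuming a config that sets `trust_remote_code: True`. The model name and `TP_SIZE` value below are illustrative placeholders.

# Illustration only: how the new trust_remote_code setting flows into the
# lm_eval model_args string built by launch_lm_eval.
eval_config = {
    "model_name": "HandH1998/QQQ-Llama-3-8b-g128",  # example config values
    "trust_remote_code": True,
}
TP_SIZE = 1  # in the harness this comes from its own environment/config

trust_remote_code = eval_config.get('trust_remote_code', False)
model_args = f"pretrained={eval_config['model_name']}," \
             f"tensor_parallel_size={TP_SIZE}," \
             f"add_bos_token=true," \
             f"trust_remote_code={trust_remote_code}"

print(model_args)
# pretrained=HandH1998/QQQ-Llama-3-8b-g128,tensor_parallel_size=1,add_bos_token=true,trust_remote_code=True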
9 changes: 5 additions & 4 deletions .buildkite/nightly-benchmarks/README.md
@@ -34,17 +34,18 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performan

 Performance benchmark will be triggered when:
 - A PR being merged into vllm.
-- Every commit for those PRs with `perf-benchmarks` label.
+- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
 
 Nightly benchmark will be triggered when:
-- Every commit for those PRs with `nightly-benchmarks` label.
+- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
 
 
 
 
 ## Performance benchmark details
 
-See [descriptions.md](tests/descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
+
+See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
 
 
 #### Latency test
@@ -68,7 +69,7 @@ Here is an example of one test inside `latency-tests.json`:

 In this example:
 - The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
-- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-benchmarks-suite.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
+- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
 
 Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
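The example entry that the README refers to is collapsed in this view. A plausible sketch of one entry in `tests/latency-tests.json`, reconstructed from the parameters described above (the file is a JSON list of such entries; the `test_name` value here is illustrative):

[
  {
    "test_name": "latency_llama8B_tp1",
    "parameters": {
      "model": "meta-llama/Meta-Llama-3-8B",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "num_iters_warmup": 5,
      "num_iters": 15
    }
  }
]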

37 changes: 18 additions & 19 deletions .buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -8,8 +8,7 @@ steps:
           containers:
           - image: badouralix/curl-jq
             command:
-            - sh
-            - .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+            - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
   - wait
   - label: "A100"
     agents:
@@ -21,7 +20,7 @@
             containers:
             - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
               command:
-              - bash .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
+              - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
               resources:
                 limits:
                   nvidia.com/gpu: 8
@@ -42,20 +41,20 @@
             - name: devshm
               emptyDir:
                 medium: Memory
-  - label: "H100"
-    agents:
-      queue: H100
-    plugins:
-      - docker#v5.11.0:
-          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-          command:
-          - bash
-          - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
-          mount-buildkite-agent: true
-          propagate-environment: true
-          ipc: host
-          gpus: all
-          environment:
-            - VLLM_USAGE_SOURCE
-            - HF_TOKEN
+  # - label: "H100"
+  #   agents:
+  #     queue: H100
+  #   plugins:
+  #     - docker#v5.11.0:
+  #         image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+  #         command:
+  #         - bash
+  #         - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
+  #         mount-buildkite-agent: true
+  #         propagate-environment: true
+  #         ipc: host
+  #         gpus: all
+  #         environment:
+  #           - VLLM_USAGE_SOURCE
+  #           - HF_TOKEN
 
@@ -1,47 +1,42 @@

 ## Latency tests
 
-This test suite aims to test vllm's end-to-end latency under a controlled setup.
-
 - Input length: 32 tokens.
 - Output length: 128 tokens.
 - Batch size: fixed (8).
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).
 
 ### Latency benchmarking results
 
 {latency_tests_markdown_table}
 
-## Throughput tests
-
-This test suite aims to test vllm's throughput.
+## Throughput tests
 
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm to achieve maximum throughput.
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: throughput.
 
 ### Throughput benchmarking results
 
 {throughput_tests_markdown_table}
 
-## Serving tests
-
-This test suite aims to test vllm's real serving metrics.
+## Serving tests
 
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
 - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - We also added a speculative decoding test for llama-3 70B, under QPS 2
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
 
 ### Serving benchmarking results
 
 {serving_tests_markdown_table}
 
 
 ## json version of the benchmarking tables
 
 This section contains the data of the markdown tables above in JSON format.
@@ -174,8 +174,8 @@ def results_to_json(latency, throughput, serving):
     # document the result
     with open(results_folder / "benchmark_results.md", "w") as f:
 
-        results = read_markdown(
-            "../.buildkite/nightly-benchmarks/tests/descriptions.md")
+        results = read_markdown("../.buildkite/nightly-benchmarks/" +
+                                "performance-benchmarks-descriptions.md")
         results = results.format(
             latency_tests_markdown_table=latency_md_table,
             throughput_tests_markdown_table=throughput_md_table,
