From ce06b118f2dbb36db5ab7affe03b0c4b24e1b02a Mon Sep 17 00:00:00 2001 From: Jason Andrews Date: Fri, 13 Dec 2024 12:53:08 -0600 Subject: [PATCH 1/2] New vLLM Learning Path --- .../vLLM/_index.md | 42 +++++++ .../vLLM/_next-steps.md | 28 +++++ .../vLLM/_review.md | 44 +++++++ .../vLLM/vllm-run.md | 110 ++++++++++++++++++ .../vLLM/vllm-server.md | 83 +++++++++++++ .../vLLM/vllm-setup.md | 107 +++++++++++++++++ 6 files changed, 414 insertions(+) create mode 100644 content/learning-paths/servers-and-cloud-computing/vLLM/_index.md create mode 100644 content/learning-paths/servers-and-cloud-computing/vLLM/_next-steps.md create mode 100644 content/learning-paths/servers-and-cloud-computing/vLLM/_review.md create mode 100644 content/learning-paths/servers-and-cloud-computing/vLLM/vllm-run.md create mode 100644 content/learning-paths/servers-and-cloud-computing/vLLM/vllm-server.md create mode 100644 content/learning-paths/servers-and-cloud-computing/vLLM/vllm-setup.md diff --git a/content/learning-paths/servers-and-cloud-computing/vLLM/_index.md b/content/learning-paths/servers-and-cloud-computing/vLLM/_index.md new file mode 100644 index 000000000..b1fa0e378 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/vLLM/_index.md @@ -0,0 +1,42 @@ +--- +title: Large language models (LLMs) on Arm servers with vLLM + +draft: true +cascade: + draft: true + +minutes_to_complete: 45 + +who_is_this_for: This is an introductory topic for software developers and AI engineers interested in learning how to use vLLM (Virtual Large Language Model) on Arm servers. + +learning_objectives: + - Build vLLM from source on an Arm server. + - Download a Qwen LLM from Hugging Face. + - Run local batch inference using vLLM. + - Create and interact with an OpenAI compatible server provided by vLLM on your Arm server.. + +prerequisites: + - An [Arm-based instance](/learning-paths/servers-and-cloud-computing/csp/) from a cloud service provider or a local Arm Linux computer with at least 8 CPUs and 16 GB RAM. + +author_primary: Jason Andrews + +### Tags +skilllevels: Introductory +subjects: ML +armips: + - Neoverse +operatingsystems: + - Linux +tools_software_languages: + - vLLM + - LLM + - GenAI + - Python + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/vLLM/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/vLLM/_next-steps.md new file mode 100644 index 000000000..c7b3dd91b --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/vLLM/_next-steps.md @@ -0,0 +1,28 @@ +--- +next_step_guidance: > + Thank you for completing this learning path on how to build and run vLLM on Arm servers. You might be interested in learning how to further optimize and benchmark LLM performance on Arm-based platforms. 
+ +recommended_path: "/learning-paths/servers-and-cloud-computing/benchmark-nlp/" + +further_reading: + - resource: + title: vLLM Documentation + link: https://docs.vllm.ai/ + type: documentation + - resource: + title: vLLM GitHub Repository + link: https://github.com/vllm-project/vllm + type: github + - resource: + title: Hugging Face Model Hub + link: https://huggingface.co/models + type: website + + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +weight: 21 # set to always be larger than the content in this path, and one more than 'review' +title: "Next Steps" # Always the same +layout: "learningpathall" # All files under learning paths have this same wrapper +--- diff --git a/content/learning-paths/servers-and-cloud-computing/vLLM/_review.md b/content/learning-paths/servers-and-cloud-computing/vLLM/_review.md new file mode 100644 index 000000000..28fb7bbdb --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/vLLM/_review.md @@ -0,0 +1,44 @@ + +--- +review: + - questions: + question: > + What is the primary purpose of vLLM? + answers: + - "Operating System Development" + - "Large Language Model Inference and Serving" + - "Database Management" + correct_answer: 2 + explanation: > + vLLM is designed for fast and efficient Large Language Model inference and serving. + + - questions: + question: > + In addition to Python, which extra programming languages are required by the vLLM build system? + answers: + - "Java" + - "Rust" + - "C++" + - "Rust and C++" + correct_answer: 4 + explanation: > + The vLLM build system requires the Rust toolchain and GCC for its compilation. + + - questions: + question: > + What is the VLLM_TARGET_DEVICE environment variable set to for building vLLM for Arm CPUs? + answers: + - "cuda" + - "gpu" + - "cpu" + correct_answer: 3 + explanation: > + The VLLM_TARGET_DEVICE environment variable needs to be set to cpu to target the Arm processor. + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +title: "Review" # Always the same title +weight: 20 # Set to always be larger than the content in this path +layout: "learningpathall" # All files under learning paths have this same wrapper +--- diff --git a/content/learning-paths/servers-and-cloud-computing/vLLM/vllm-run.md b/content/learning-paths/servers-and-cloud-computing/vLLM/vllm-run.md new file mode 100644 index 000000000..07a29dcd8 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/vLLM/vllm-run.md @@ -0,0 +1,110 @@ +--- +title: Run batch inference using vLLM +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Use a model from Hugging Face + +vLLM is designed to work seamlessly with models from the Hugging Face Hub, + +The first time you run vLLM it downloads the required model. This means you don't have to explicitly download any models. + +If you want to use a model that requires you to request access or accept terms, you need to log in to Hugging Face using a token. + +```bash +huggingface-cli login +``` + +Enter your Hugging Face token. You can generate a token from [Hugging Face Hub](https://huggingface.co/) by clicking your profile on the top right corner and selecting `Access Tokens`. 
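If you prefer to authenticate from Python rather than the command line, for example inside a script or notebook, you can use the `huggingface_hub` library directly. The short sketch below is only one way to do it: the `HF_TOKEN` environment variable name is an example, and it assumes the `huggingface_hub` package is available in your virtual environment (it is normally pulled in by the vLLM requirements).

```python
import os

# Programmatic alternative to `huggingface-cli login`.
# Assumes your access token is stored in an environment variable named HF_TOKEN.
from huggingface_hub import login

login(token=os.environ["HF_TOKEN"])
```

Whichever login method you use, gated models still require you to accept their terms on the model page, as described next.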
+ +You also need to visit the Hugging Face link printed in the login output and accept the terms by clicking the "Agree and access repository" button or filling out the request for access form (depending on the model). + +To run batched inference without the need for a login, you can use the `Qwen/Qwen2.5-0.5B-Instruct` model. + +## Create a batch script + +To run inference with multiple prompts you can create a simple Python script to load a model and run the prompts. + +Use a text editor to save the Python script below in a file called `batch.py`. + +```python +import json +from vllm import LLM, SamplingParams + +# Sample prompts. +prompts = [ + "Write a hello world program in C", + "Write a hello world program in Java", + "Write a hello world program in Rust", +] + +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256) + +# Create an LLM. +llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", dtype="bfloat16") + +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) + +# Print the outputs. +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + result = { + "Prompt": prompt, + "Generated text": generated_text + } + print(json.dumps(result, indent=4)) +``` + +The script uses `bfloat16` precision. + +You can also change the length of the output using the `max_tokens` value. + +Run the Python script: + +```bash +python ./batch.py +``` + +The output shows vLLM starting, the model loading, and the batch processing of the 3 prompts: + +```output +INFO 12-12 22:52:57 config.py:441] This model supports multiple tasks: {'generate', 'reward', 'embed', 'score', 'classify'}. Defaulting to 'generate'. +WARNING 12-12 22:52:57 config.py:567] Async output processing is not supported on the current platform type cpu. +WARNING 12-12 22:52:57 cpu.py:56] CUDA graph is not supported on CPU, fallback to the eager mode. +WARNING 12-12 22:52:57 cpu.py:68] Environment variable VLLM_CPU_KVCACHE_SPACE (GB) for CPU backend is not set, using 4 by default. +INFO 12-12 22:52:57 importing.py:15] Triton not installed or not compatible; certain GPU-related functions will not be available. 
+INFO 12-12 22:52:57 llm_engine.py:250] Initializing an LLM engine (v0.6.4.post2.dev322+g72ff3a96) with config: VllmConfig(model_config=, cache_config=, parallel_config=ParallelConfig(pipeline_parallel_size=1, tensor_parallel_size=1, worker_use_ray=False, max_parallel_loading_workers=None, disable_custom_all_reduce=False, tokenizer_pool_config=None, ray_workers_use_nsight=False, placement_group=None, distributed_executor_backend=None, worker_cls='vllm.worker.cpu_worker.CPUWorker', sd_worker_cls='auto', world_size=1, rank=0), scheduler_config=SchedulerConfig(runner_type='generate', max_num_batched_tokens=32768, max_num_seqs=256, max_model_len=32768, num_lookahead_slots=0, delay_factor=0.0, enable_chunked_prefill=False, is_multimodal_model=False, preemption_mode=None, num_scheduler_steps=1, multi_step_stream_outputs=True, send_delta_data=False, policy='fcfs', chunked_prefill_enabled=False), device_config=, load_config=LoadConfig(load_format=, download_dir=None, model_loader_extra_config=None, ignore_patterns=['original/**/*']), lora_config=None, speculative_config=None, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), prompt_adapter_config=None, quant_config=None, compilation_config=CompilationConfig(level=0, debug_dump_path='', backend='', custom_ops=[], splitting_ops=['vllm.unified_attention', 'vllm.unified_attention_with_output'], use_inductor=True, candidate_compile_sizes=[], inductor_compile_config={}, inductor_passes={}, use_cudagraph=False, cudagraph_num_of_warmups=0, cudagraph_capture_sizes=None, cudagraph_copy_inputs=False, pass_config=PassConfig(dump_graph_stages=[], dump_graph_dir=PosixPath('.'), enable_fusion=True, enable_reshape=True), compile_sizes=[], capture_sizes=[256, 248, 240, 232, 224, 216, 208, 200, 192, 184, 176, 168, 160, 152, 144, 136, 128, 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 4, 2, 1], enabled_custom_ops=Counter(), disabled_custom_ops=Counter(), compilation_time=0.0, static_forward_context={}), kv_transfer_config=None, instance_id='5c715'),use_cached_outputs=False, +INFO 12-12 22:52:58 cpu.py:33] Cannot use _Backend.FLASH_ATTN backend on CPU. +INFO 12-12 22:52:58 selector.py:141] Using Torch SDPA backend. +INFO 12-12 22:52:58 weight_utils.py:243] Using model weights format ['*.safetensors'] +INFO 12-12 22:52:58 weight_utils.py:288] No model.safetensors.index.json found in remote. +Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00\n\nint main() {\n printf(\"Hello, World!\\n\");\n return 0;\n}\n```\n\nThis program does the following:\n\n1. Includes the `` header to use the `printf` function.\n2. Defines a `main` function, which is the entry point of the program.\n3. Uses `printf` to output the message \"Hello, World!\" to the console.\n4. Returns 0 to indicate that the program executed successfully.\n\nWhen you run this program, you should see the output:\n\n```\nHello, World!\n``` \n\nThis is the basic structure of a C program, providing a simple example of how to create, run, and display a basic program. Note that C is a high-level programming language, meaning that it provides low-level operations for users to interact with the hardware, but at the same time, it is a low-level language that needs to be compiled and linked into an executable file (.exe) that the computer's operating system can load and run. 
C, as a compiled language, often requires additional libraries and tools for use. For more information, you can refer to the C Programming Language documentation." +} +{ + "Prompt": "Write a hello world program in Java", + "Generated text": "\n\nCertainly! Below is a simple `HelloWorld.java` file that prints \"Hello, World!\" to the console when you run it:\n\n```java\npublic class HelloWorld {\n public static void main(String[] args) {\n System.out.println(\"Hello, World!\");\n }\n}\n```\n\nTo compile this program, you would use an integrated development environment (IDE) like IntelliJ IDEA, Eclipse, or NetBeans. Here is how you can compile it:\n\n1. Open a terminal or command prompt.\n2. Navigate to the directory where you saved the `HelloWorld.java` file.\n3. Compile the program using the following command:\n ```bash\n javac HelloWorld.java\n ```\n4. Run the compiled program using the following command:\n ```bash\n java HelloWorld\n ```\n\nThis will output:\n```\nHello, World!\n```" +} +{ + "Prompt": "Write a hello world program in Rust", + "Generated text": "\n\nCertainly! Here is a simple example of a `HelloWorld` program in Rust:\n\n```rust\nfn main() {\n println!(\"Hello, world!\");\n}\n```\n\n### Explanation:\n\n- `fn main()`: This is the entry point of the program.\n- `println!`: This function is used to print out the message `Hello, world!` to the console.\n- `println!`: The `println!` macro is used to print messages in Rust.\n\n### How to Run the Program:\n\n1. Make sure you have Rust installed on your system.\n2. Save the above code in a file with a `.rs` extension, e.g., `hello.rs`.\n3. Open a terminal or command prompt and navigate to the directory where the file is saved.\n4. Run the program by typing `rustc hello.rs` (if you're using `rustc`, you don't need to specify the file extension).\n5. After the program runs, it should print the message `Hello, world!` to the console.\n\n### Running in Development:\n\nIf you want to run the program in development mode to see the output in the terminal, you can use the `-o` flag:\n\n```sh\nrustc -o hello-dev hello.rs\n./" +} +``` + +You can try with other prompts and models such as `meta-llama/Llama-3.2-1B`. + +Continue to learn how to setup an OpenAI compatible server. \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/vLLM/vllm-server.md b/content/learning-paths/servers-and-cloud-computing/vLLM/vllm-server.md new file mode 100644 index 000000000..ad6ab472d --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/vLLM/vllm-server.md @@ -0,0 +1,83 @@ +--- +title: Run an OpenAI compatible server +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +Instead of a batch run from Python, you can create an OpenAI compatible server. This allows you to leverage the power of large language models without relying on external APIs. + +Running a local LLM offers several advantages: + +Cost-Effective: Avoids the costs associated with using external APIs, especially for high-usage scenarios.   +Privacy: Keeps your data and prompts within your local environment, enhancing privacy and security. +Offline Capability: Enables operation without an internet connection, making it ideal for scenarios with limited or unreliable network access. + +OpenAI compatibility means you can reuse existing software which was designed to communicate with OpenAI and have it talk to your local vLLM service. 
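For example, once the server you start below is running, a few lines of Python using the official `openai` package are enough to send it a chat request. This is a minimal sketch rather than part of the vLLM instructions themselves: it assumes you install the client with `pip install openai`, that the server is listening on the default port 8000, and that a placeholder API key is accepted, matching the `DUMMY` token used in the `curl` example later on this page.

```python
# Point an OpenAI-style client at the local vLLM server instead of the OpenAI API.
from openai import OpenAI

# Assumptions: server running on localhost:8000, placeholder API key accepted.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="DUMMY")

response = client.chat.completions.create(
    model="Qwen/Qwen2.5-0.5B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write a hello world program in C"},
    ],
)

print(response.choices[0].message.content)
```

Because only the base URL and API key change, existing tools written against the OpenAI API can usually be pointed at your local server without further code changes.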
+ +Run vLLM with the same `Qwen/Qwen2.5-0.5B-Instruct` model: + +```bash +python3 -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2.5-0.5B-Instruct --dtype float16 +``` + +The server output displays that it is ready for requests: + +```output +INFO 12-12 22:54:40 cpu_executor.py:186] # CPU blocks: 21845 +INFO 12-12 22:54:40 llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 0.26 seconds +INFO 12-12 22:54:41 api_server.py:560] Using supplied chat template: +INFO 12-12 22:54:41 api_server.py:560] None +INFO 12-12 22:54:41 launcher.py:19] Available routes are: +INFO 12-12 22:54:41 launcher.py:27] Route: /openapi.json, Methods: HEAD, GET +INFO 12-12 22:54:41 launcher.py:27] Route: /docs, Methods: HEAD, GET +INFO 12-12 22:54:41 launcher.py:27] Route: /docs/oauth2-redirect, Methods: HEAD, GET +INFO 12-12 22:54:41 launcher.py:27] Route: /redoc, Methods: HEAD, GET +INFO 12-12 22:54:41 launcher.py:27] Route: /health, Methods: GET +INFO 12-12 22:54:41 launcher.py:27] Route: /tokenize, Methods: POST +INFO 12-12 22:54:41 launcher.py:27] Route: /detokenize, Methods: POST +INFO 12-12 22:54:41 launcher.py:27] Route: /v1/models, Methods: GET +INFO 12-12 22:54:41 launcher.py:27] Route: /version, Methods: GET +INFO 12-12 22:54:41 launcher.py:27] Route: /v1/chat/completions, Methods: POST +INFO 12-12 22:54:41 launcher.py:27] Route: /v1/completions, Methods: POST +INFO 12-12 22:54:41 launcher.py:27] Route: /v1/embeddings, Methods: POST +INFO 12-12 22:54:41 launcher.py:27] Route: /v1/score, Methods: POST +INFO: Started server process [12905] +INFO: Waiting for application startup. +INFO: Application startup complete. +INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) +``` + +You can submit requests to the server using the `curl` command. + +For example, run the command below using another terminal on the same server: + +```bash +curl http://0.0.0.0:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer DUMMY" \ + -d '{ + "model": "Qwen/Qwen2.5-0.5B-Instruct", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Write a hello world program in C" + } + ] + }' +``` + +The server processes the request and the output prints the results: + +```output +"id":"chatcmpl-6677cb4263b34d18b436b9cb8c6a5a65","object":"chat.completion","created":1734044182,"model":"Qwen/Qwen2.5-0.5B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"Certainly! Here is a simple \"Hello, World!\" program in C:\n\n```c\n#include \n\nint main() {\n printf(\"Hello, World!\\n\");\n return 0;\n}\n```\n\nThis program defines a function called `main` which contains the body of the program. Inside the `main` function, it calls the `printf` function to display the text \"Hello, World!\" to the console. The `return 0` statement indicates that the program was successful and the program has ended.\n\nTo compile and run this program:\n\n1. Save the code above to a file named `hello.c`.\n2. Open a terminal or command prompt.\n3. Navigate to the directory where you saved the file.\n4. Compile the program using the following command:\n ```\n gcc hello.c -o hello\n ```\n5. 
Run the compiled program using the following command:\n ```\n ./hello\n ```\n Or simply type `hello` in the terminal.\n\nYou should see the output:\n\n```\nHello, World!\n```","tool_calls":[]},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":26,"total_tokens":241,"completion_tokens":215,"prompt_tokens_details":null},"prompt_logprobs":null} +``` + +There are many other experiments you can try. Most Hugging Face models have a `Use this model` button on the top right of the model card with the instructions for vLLM. You can now use these instructions on your Arm Linux computer. + +You can also try out OpenAI compatible chat clients to connect to the served model. diff --git a/content/learning-paths/servers-and-cloud-computing/vLLM/vllm-setup.md b/content/learning-paths/servers-and-cloud-computing/vLLM/vllm-setup.md new file mode 100644 index 000000000..45d2dd727 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/vLLM/vllm-setup.md @@ -0,0 +1,107 @@ +--- +title: Build vLLM from source code +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Before you begin + +You can follow the instructions for this Learning Path using an Arm server running Ubuntu 22.04 or 24.04 LTS with at least 8 cores, 16GB of RAM, and 50GB of disk storage. + +## What is vLLM? + +[vLLM](https://github.com/vllm-project/vllm) stands for Virtual Large Language Model, and is a fast and easy-to-use library for inference and model serving. + +vLLM can be used in batch mode or by running an OpenAI compatible server. + +In this Learning Path, you will learn how to build vLLM from source and run inference on an Arm-based server, highlighting its effectiveness. + +### What software do I need to install to build vLLM? + +First, ensure your system is up-to-date and install the required tools and libraries: + +```bash +sudo apt-get update -y +sudo apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip python3-venv python-is-python3 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 libssl-dev pkg-config +``` + +Set the default GCC to version 12: + +```bash +sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 +``` + +Install Rust, refer to the [Rust install guide](/install-guides/rust/) if necessary. + +```bash +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y +source "$HOME/.cargo/env" +``` + +Four environment variables are required. You can enter these at the command line or add them to your `$HOME/.bashrc` file and source the file. + +To add them at the command line: + +```bash +export CCACHE_DIR=/home/ubuntu/.cache/ccache +export CMAKE_CXX_COMPILER_LAUNCHER=ccache +export VLLM_CPU_DISABLE_AVX512="true" +export LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4" +``` + +Create and activate a Python virtual environment: + +```bash +python -m venv env +source env/bin/activate +``` + +Your command line prompt has `(env)` in front of it indicating you are in the Python virtual environment. + +Update Pip and install Python packages: + +```bash +pip install --upgrade pip +pip install py-cpuinfo +``` + +### How do I download vLLM and build it? + +Clone the vLLM repository from GitHub: + +```bash +git clone https://github.com/vllm-project/vllm.git +cd vllm +git checkout 72ff3a968682e6a3f7620ab59f2baf5e8eb2777b +``` + +{{% notice Note %}} +The Git checkout specifies a specific hash known to work for this example. 
+ +Omit this command to use the latest code on the main branch. +{{% /notice %}} + +Install the Python packages for vLLM: + +```bash +pip install -r requirements-build.txt +pip install -v -r requirements-cpu.txt +``` + +Build vLLM using Pip: + +```bash +VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel +pip install dist/*.whl +``` + +When the build completes, navigate out of the repository: + +```bash +rm -rf dist +cd .. +``` + +You are now ready to download an LLM run vLLM. From 19337dd8890ad92ecbfa7b929810db5da590d042 Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Dec 2024 18:28:08 -0500 Subject: [PATCH 2/2] Update vllm-setup.md --- .../servers-and-cloud-computing/vLLM/vllm-setup.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/learning-paths/servers-and-cloud-computing/vLLM/vllm-setup.md b/content/learning-paths/servers-and-cloud-computing/vLLM/vllm-setup.md index 45d2dd727..c228003e2 100644 --- a/content/learning-paths/servers-and-cloud-computing/vLLM/vllm-setup.md +++ b/content/learning-paths/servers-and-cloud-computing/vLLM/vllm-setup.md @@ -8,7 +8,7 @@ layout: learningpathall ## Before you begin -You can follow the instructions for this Learning Path using an Arm server running Ubuntu 22.04 or 24.04 LTS with at least 8 cores, 16GB of RAM, and 50GB of disk storage. +You can follow the instructions for this Learning Path using an Arm server running Ubuntu 24.04 LTS with at least 8 cores, 16GB of RAM, and 50GB of disk storage. ## What is vLLM?