diff --git a/.wordlist.txt b/.wordlist.txt index 12f6888c8..10571ce17 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -3380,4 +3380,44 @@ wiseeye wlcsp xB xmodem -yolov \ No newline at end of file +yolov +Dsouza +FGCT +GCT +GCs +GC’s +HNso +HeapRegionSize +HugePages +InitiatingHeapOccupancyPercent +JDKs +JVMs +LZMA +Lau +LuaJIT +NGFW +ParallelGCThreads +Preema +Roesch +Sourcefire +TPACKET +WebGPU’s +Whitepaper +YGCT +axion +callstack +et +gc +grubfile +jstat +mqF +netresec +parallelizing +profileable +profilers +ruleset +snortrules +techmahindra +unreferenced +uptime +wC \ No newline at end of file diff --git a/content/install-guides/anaconda.md b/content/install-guides/anaconda.md index 6b992aa4b..2610dd428 100644 --- a/content/install-guides/anaconda.md +++ b/content/install-guides/anaconda.md @@ -65,10 +65,10 @@ sudo amazon-linux-extras install mate-desktop1.x To download the latest Anaconda distribution, run: ```bash -curl -O https://repo.anaconda.com/archive/Anaconda3-2023.09-0-Linux-aarch64.sh +curl -O https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-aarch64.sh ``` -Depending on the version, the downloaded filename will be of the form `Anaconda3-20XX.YY-Linux-x86_64.sh` where the `XX` and `YY` values represent the year and month of the latest release. +Depending on the version, the downloaded filename will be of the form `Anaconda3-20XX.YY-Linux-aarch64.sh` where the `XX` and `YY` values represent the year and month of the latest release. ## What are the steps to install the downloaded Anaconda distribution? @@ -79,7 +79,7 @@ The default installation directory is `$HOME/anaconda3`. Change the installation If you wish to review the license terms before accepting, remove `-b`. ```bash -sh ./Anaconda3-2023.09-0-Linux-aarch64.sh -b +sh ./Anaconda3-2024.10-1-Linux-aarch64.sh -b ``` The install takes a couple of minutes to complete. diff --git a/content/learning-paths/cross-platform/simd-info-demo/_index.md b/content/learning-paths/cross-platform/simd-info-demo/_index.md index 43a273c48..0dab77b48 100644 --- a/content/learning-paths/cross-platform/simd-info-demo/_index.md +++ b/content/learning-paths/cross-platform/simd-info-demo/_index.md @@ -1,10 +1,6 @@ --- title: Introduction to SIMD.info - -cascade: - - minutes_to_complete: 30 who_is_this_for: This Learning Path is for software developers who are interested in porting SIMD code across Arm platforms. diff --git a/content/learning-paths/cross-platform/windowsperf_sampling_cpython_spe/_index.md b/content/learning-paths/cross-platform/windowsperf_sampling_cpython_spe/_index.md new file mode 100644 index 000000000..15f237674 --- /dev/null +++ b/content/learning-paths/cross-platform/windowsperf_sampling_cpython_spe/_index.md @@ -0,0 +1,47 @@ +--- +title: Sampling CPython with Arm SPE with WindowsPerf +draft: true +cascade: + draft: true + +minutes_to_complete: 30 + +who_is_this_for: This is an introductory topic for developers keen to understand sampling with the Arm Statistical Profiling Extension (SPE). + +learning_objectives: + - Use WindowsPerf with a native Windows on Arm workload. + - Understand the basics of sampling with Arm SPE. + - Explore the WindowsPerf command line. + - Build CPython from sources for Windows on Arm (ARM64). + +prerequisites: + - Windows on Arm desktop or development machine with [WindowsPerf](/install-guides/wperf), [Visual Studio](/install-guides/vs-woa/), and [Git](/install-guides/git-woa/) installed. + - The system must also have an Arm CPU with SPE support. 
+ +author_primary: Przemyslaw Wirkus + +### Tags +skilllevels: Introductory +subjects: Performance and Architecture +armips: + - Neoverse + - Cortex-A +operatingsystems: + - Windows +tools_software_languages: + - WindowsPerf + - Python + - perf + +## Cross-platform metadata only +shared_path: true +shared_between: + - servers-and-cloud-computing + - laptops-and-desktops + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/cross-platform/windowsperf_sampling_cpython_spe/_next-steps.md b/content/learning-paths/cross-platform/windowsperf_sampling_cpython_spe/_next-steps.md new file mode 100644 index 000000000..b46623df9 --- /dev/null +++ b/content/learning-paths/cross-platform/windowsperf_sampling_cpython_spe/_next-steps.md @@ -0,0 +1,76 @@ +--- +# ================================================================================ +# Edit +# ================================================================================ + +next_step_guidance: > + Now that you have WindowsPerf running, why not learn how to build a native Windows on Arm application? +# 1-3 sentence recommendation outlining how the reader can generally keep learning about these topics, and a specific explanation of why the next step is being recommended. + +recommended_path: "/learning-paths/laptops-and-desktops/win_net/" + +# further_reading links to references related to this path. Can be: + # Manuals for a tool / software mentioned (type: documentation) + # Blog about related topics (type: blog) + # General online references (type: website) + +further_reading: + - resource: + title: Announcing WindowsPerf Open-source performance analysis tool for Windows on Arm + link: https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/announcing-windowsperf + type: blog + - resource: + title: WindowsPerf release 2.4.0 introduces the first stable version of sampling model support + link: https://www.linaro.org/blog/windowsperf-release-2-4-0-introduces-the-first-stable-version-of-sampling-model-support/ + type: blog + - resource: + title: WindowsPerf Release 2.5.1 + link: https://www.linaro.org/blog/windowsperf-release-2-5-1/ + type: blog + - resource: + title: WindowsPerf Release 3.0.0 + link: https://www.linaro.org/blog/windowsperf-release-3-0-0/ + type: blog + - resource: + title: WindowsPerf Release 3.3.0 + link: https://www.linaro.org/blog/windowsperf-release-3-3-0/ + type: blog + - resource: + title: WindowsPerf Release 3.7.2 + link: https://www.linaro.org/blog/expanding-profiling-capabilities-with-windowsperf-372-release + type: blog + - resource: + title: "Introducing the WindowsPerf GUI: the Visual Studio 2022 extension" + link: https://www.linaro.org/blog/introducing-the-windowsperf-gui-the-visual-studio-2022-extension + type: blog + - resource: + title: "Introducing 1.0.0-beta release of WindowsPerf Visual Studio extension" + link: https://www.linaro.org/blog/introducing-1-0-0-beta-release-of-windowsperf-visual-studio-extension + type: blog + - resource: + title: "New Release: WindowsPerf Visual Studio Extension v1.0.0" + link: https://www.linaro.org/blog/new-release-windowsperf-visual-studio-extension-v1000 + type: blog 
+ - resource: + title: "Launching WindowsPerf Visual Studio Extension v2.1.0" + link: https://www.linaro.org/blog/launching--windowsperf-visual-studio-extension-v210 + type: blog + - resource: + title: "Windows on Arm overview" + link: https://learn.microsoft.com/en-us/windows/arm/overview + type: website + - resource: + title: "Linaro Windows on Arm project" + link: https://www.linaro.org/windows-on-arm/ + type: website + - resource: + title: "WindowsPerf releases" + link: https://github.com/arm-developer-tools/windowsperf/releases + type: website +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +weight: 21 # set to always be larger than the content in this path, and one more than 'review' +title: "Next Steps" # Always the same +layout: "learningpathall" # All files under learning paths have this same wrapper +--- diff --git a/content/learning-paths/cross-platform/windowsperf_sampling_cpython_spe/_review.md b/content/learning-paths/cross-platform/windowsperf_sampling_cpython_spe/_review.md new file mode 100644 index 000000000..29beb2948 --- /dev/null +++ b/content/learning-paths/cross-platform/windowsperf_sampling_cpython_spe/_review.md @@ -0,0 +1,100 @@ +--- +# ================================================================================ +# Edit +# ================================================================================ + +# Always 3 questions. Should try to test the reader's knowledge, and reinforce the key points you want them to remember. + # question: A one sentence question + # answers: The correct answers (from 2-4 answer options only). Should be surrounded by quotes. + # correct_answer: An integer indicating what answer is correct (index starts from 0) + # explanation: A short (1-3 sentence) explanation of why the correct answer is correct. Can add additional context if desired + +review: + - questions: + question: > + The counting model is used for obtaining aggregate counts of occurrences of special events. + answers: + - "True" + - "False" + correct_answer: 1 + explanation: > + In the counting model, the occurrences of PMU events are simply aggregated over a given time period. + + - questions: + question: > + The sampling model is used for determining the frequencies of event occurrences produced by program locations at the function, basic block, and/or instruction levels. + answers: + - "True" + - "False" + correct_answer: 1 + explanation: > + In the sampling model, the frequencies of event occurrences produced by the program determine "hot" locations at the function, basic block, and/or instruction levels. + + - questions: + question: > + WindowsPerf can be used and executed only on native ARM64 WOA hardware, and not in a virtual environment. + answers: + - "True" + - "False" + correct_answer: 1 + explanation: > + Yes, WindowsPerf currently supports a native Windows on Arm environment only. + + - questions: + question: > + The Arm Statistical Profiling Extension (SPE) is an optional feature in ARMv8.2 hardware. + answers: + - "True" + - "False" + correct_answer: 1 + explanation: > + Yes, the Arm Statistical Profiling Extension (SPE) is an optional feature in ARMv8.2 hardware that allows CPU instructions to be sampled and associated with the source code location where that instruction occurred. + + - questions: + question: > + SPE stands for Statistical Profiling Extension. 
+ answers: + - "True" + - "False" + correct_answer: 1 + explanation: > + Yes, the Arm Statistical Profiling Extension (SPE) is an optional feature in ARMv8.2 hardware. + + - questions: + question: > + Is load_filter is one of SPE filters supported by WindowsPerf? + answers: + - "True" + - "False" + correct_answer: 1 + explanation: > + Yes, load_filter together with store_filter and branch_filter are SPE filters supported by WindowsPerf. + + - questions: + question: > + Is store_filter is one of SPE filters supported by WindowsPerf? + answers: + - "True" + - "False" + correct_answer: 1 + explanation: > + Yes, load_filter together with store_filter and branch_filter are SPE filters supported by WindowsPerf. + + - questions: + question: > + Is branch_filter is one of SPE filters supported by WindowsPerf? + answers: + - "True" + - "False" + correct_answer: 1 + explanation: > + Yes, load_filter together with store_filter and branch_filter are SPE filters supported by WindowsPerf. + + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +title: "Review" # Always the same title +weight: 20 # Set to always be larger than the content in this path +layout: "learningpathall" # All files under learning paths have this same wrapper +--- diff --git a/content/learning-paths/cross-platform/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md b/content/learning-paths/cross-platform/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md new file mode 100644 index 000000000..29cb3bd64 --- /dev/null +++ b/content/learning-paths/cross-platform/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md @@ -0,0 +1,205 @@ +--- +layout: learningpathall +title: An overview of CPython sampling with SPE +weight: 2 +--- + +In this example, you will build a debug build of CPython from sources and execute simple instructions in the Python interactive mode to obtain WindowsPerf sampling results from the CPython runtime image. + +## Introduction to the Arm Statistical Profiling Extension (SPE) + +The Arm Statistical Profiling Extension (SPE) is a feature defined as part of the Armv8-A architecture, starting from version 8.2. It provides non-invasive, hardware-based statistical sampling for CPUs. Unlike the Performance Monitor Unit (PMU), SPE is a different module that integrates the sampling process into the instruction execution process within the CPU's pipelines. + +SPE is particularly useful for performance analysis and optimization, as it provides detailed insights into the behavior of the CPU during execution. This can help identify performance bottlenecks and optimize software for better efficiency. + +## Overview + +You will use sampling to determine the CPython program "hot" locations as provided by the Arm Statistical Profiling Extension (SPE). + +WindowsPerf includes `record` support for the Arm Statistical Profiling Extension (SPE). + +SPE is an optional feature in ARMv8.2 hardware that allows CPU instructions to be sampled and associated with the source code location where that instruction occurred. + +{{% notice Note %}} +Currently SPE is available on Windows On Arm in Test Mode only! +{{% /notice %}} + +## Before you begin + +For this Learning Path you will need: + +* A Windows on Arm (ARM64) native machine with pre-installed WindowsPerf (both driver and `wperf` CLI tool). 
Refer to the [WindowsPerf Install Guide](/install-guides/wperf/) for more details. + * Note: The [WindowsPerf release 3.8.0](https://github.com/arm-developer-tools/windowsperf/releases/tag/3.8.0) includes a separate build with Arm SPE (Statistical Profiling Extension) support enabled. To install this version download release asset and you will find WindowsPerf SPE build in the `SPE/` subdirectory. +* [Visual Studio](/install-guides/vs-woa/) and [Git](/install-guides/git-woa/) installed. +* The CPU must support the Arm SPE extension, an optional feature in ARMv8.2 hardware. You can check your CPU compatibility using the WindowsPerf command-line tool (explained below). + +### How do I check if my Arm CPU supports the Arm SPE extension? + +#### SPE hardware support detection: + +You can check if WindowsPerf detects SPE support with the `wperf test` command. + +Run the command below and if the `spe_device.version_name` property shows `FEAT_SPE` it means WindowsPerf can use the SPE features. + +```console +wperf test +``` + +Here is the output for a system with SPE support: + +```output + Test Name Result + ========= ====== +... + spe_device.version_name FEAT_SPE +``` + +#### How do I know if my WindowsPerf binaries and driver support optional SPE? + +{{% notice Note %}} +Currently WindowsPerf support of SPE is in development, not all versions of WindowsPerf enable SPE support. Some WindowsPerf releases may contain separate binaries with SPE support enables. +{{% /notice %}} + +You can check feature string `FeatureString` of both `wperf` and `wperf-driver` with `wperf --version` command: + +```console +wperf --version +``` + +The output is similar to: + +```output + Component Version GitVer FeatureString + ========= ======= ====== ============= + wperf 3.8.0 6d15ddfc +etw-app+spe + wperf-driver 3.8.0 6d15ddfc +trace+spe +``` + +If the `FeatureString` for both `wperf` and `wperf-driver` contains `+spe` you can use the SPE features of WindowsPerf. + +### Build CPython for ARM64 + +Perform the build steps below on your Windows on Arm system. + +CPython is an open-source project which includes native support for Windows on Arm starting with version 3.11. + +The SPE features are demonstrated with a debug build of CPython. You can build [CPython](https://github.com/python/cpython) locally from sources in debug mode. + +Open a Visual Studio `Developer Command Prompt for VS 2022` command prompt. You can find this from Windows Start by searching for "Developer Command Prompt for VS 2022". + +When you open the command prompt, you will see output similar to: + +```output +********************************************************************** +** Visual Studio 2022 Developer Command Prompt v17.7.6 +** Copyright (c) 2022 Microsoft Corporation +********************************************************************** + +C:\Program Files\Microsoft Visual Studio\2022\Community> +``` + +{{% notice Note %}} +Please use the `Developer Command Prompt for VS 2022` command prompt for the remainder of the steps. +{{% /notice %}} + +You can build CPython locally in debug mode using the `build.bat` script using the steps below. + +#### Clone CPython source code + +Get the CPython source code from GitHub: + +```command +git clone https://github.com/python/cpython.git +``` + +The output from this command is similar to: + +```output +Cloning into 'cpython'... +remote: Enumerating objects: 990145, done. +remote: Counting objects: 100% (43119/43119), done. +remote: Compressing objects: 100% (896/896), done. 
+remote: Total 990145 (delta 42673), reused 42290 (delta 42223), pack-reused 947026 +Receiving objects: 100% (990145/990145), 527.93 MiB | 14.28 MiB/s, done. +Resolving deltas: 100% (792463/792463), done. +Updating files: 100% (4647/4647), done. +``` + +#### Checkout CPython with a specific SHA + +{{% notice Note %}} +This step is optional, but you may encounter build issues unrelated to this example if the CPython mainline source code is not stable. It's best to check out a specific SHA to avoid any unexpected issues and to ensure you are working off the same code base. +{{% /notice %}} + +Use a specific CPython commit to match the output for this example: + +```console +cd cpython +git checkout 1ff81c0cb67215694f084e51c4d35ae53b9f5cf9 +``` +The output is similar to: + +```output +Updating files: 100% (2774/2774), done. +Note: switching to '1ff81c0cb67215694f084e51c4d35ae53b9f5cf9'. +... +``` + +#### Build CPython from sources + +The `build.bat` script builds CPython from sources. Build CPython with debug symbols by invoking the `-d` command line option and select the ARM64 target with `-p ARM64`. + +Make sure you are using `Developer Command Prompt for VS 2022`. + +Change to the `PCbuild` directory and run the build command: + +```console +cd PCbuild +build.bat -d -p ARM64 +``` + +The output is similar to: + +```output +Downloading nuget... +Installing Python via nuget... + +... + + python.c + python.vcxproj -> C:\\path\to\cpython\PCbuild\arm64\python_d.exe + Wrote C:\path\to\cpython\PCbuild\arm64\LICENSE.txt + WinMain.c + pythonw.vcxproj -> C:\path\to\cpython\PCbuild\arm64\pythonw_d.exe + +Build succeeded. + 0 Warning(s) + 0 Error(s) + +Time Elapsed 00:00:59.50 +``` + +The folder `cpython\PCbuild\arm64` contains the executables built in this process. + +You will use `python_d.exe` to run Python. + +##### Execute interactive mode to make sure all the CPython dependencies and libraries are loaded + +Continue at the same command prompt, and test that Python runs correctly: + +```console +cd arm64 +python_d.exe +``` + +You see CPython being invoked in interactive mode: + +```output +Python 3.12.0a6+ (heads/main:1ff81c0cb6, Mar 14 2023, 16:26:50) [MSC v.1935 64 bit (ARM64)] on win32 +Type "help", "copyright", "credits" or "license" for more information. +>>> +``` + +Type `quit()` to exit CPython. + +Your environment s now ready to use WindowsPerf with SPE on CPython. diff --git a/content/learning-paths/cross-platform/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe_example_1.md b/content/learning-paths/cross-platform/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe_example_1.md new file mode 100644 index 000000000..3a4762acd --- /dev/null +++ b/content/learning-paths/cross-platform/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe_example_1.md @@ -0,0 +1,215 @@ +--- +layout: learningpathall +title: WindowsPerf sample using SPE example +weight: 3 +--- + +## Example 1: Sampling of CPython calculating Googolplex using SPE + +{{% notice Note %}} +All the steps in these following sections are done on a native ARM64 Windows on Arm machine. 
+{{% /notice %}} + +You will use the pre-built [CPython](https://github.com/python/cpython) binaries targeting ARM64 from sources in the debug mode from the previous step and then complete the following: +- Pin `python_d.exe` interactive console to an arbitrary CPU core, calculate `10^10^100` expression, a large integer number [Googolplex](https://en.wikipedia.org/wiki/Googolplex) to stress the CPython application and get a simple workload. +- Run counting and sampling to obtain some simple event information. + +### Pin the new CPython process to a CPU core 1 + +Use the Windows `start` command to execute and pin `python_d.exe` process to CPU core number 1. Below command is executing computation intensive calculations of `10^10^100`, a [Googolplex](https://en.wikipedia.org/wiki/Googolplex) number, with CPython. + +```command +start /affinity 2 cpython\PCbuild\arm64\python_d.exe -c 10**10**100 +``` + +{{% notice Note %}} +The [start](https://learn.microsoft.com/en-us/windows-server/administration/windows-commands/start) command line switch `/affinity ` applies the specified processor affinity mask (expressed as a hexadecimal number) to the new application. In our example decimal `2` is `0x02` or `0b0010`. This value denotes core no. `1` as `1` is a first bit in the mask, where the mask is indexed from `0` (zero). +{{% /notice %}} + +You can use the Windows Task Manager to confirm that `python_d.exe` is running on CPU core no. 1. + +### SPE introduces new option for command line switch -e arm_spe_0// + +Users can specify SPE filters using the `-e` command line option with `arm_spe_0//`. We've introduced the `arm_spe_0/*/` notation for the `sample` and `record` command, where `*` represents a comma-separated list of supported filters. Currently, we support filters such as `store_filter=`, `load_filter=`, and `branch_filter=`, or their short equivalents like `st=`, `ld=`, and `b=`. Use `0` or `1` to disable or enable a given filter. For example: + +```output +arm_spe_0/branch_filter=1/ +arm_spe_0/load_filter=1,branch_filter=0/ +arm_spe_0/ld=1,branch_filter=0/ +arm_spe_0/st=0,ld=0,b=1/ +``` + +#### Filtering sample records + +SPE register `PMSFCR_EL1.FT` enables filtering by operation type. When enabled `PMSFCR_EL1.{ST, LD, B}` define the collected types: +- `ST` enables collection of store sampled operations, including all atomic operations. +- `LD` enables collection of load sampled operations, including atomic operations that return a value to a register. +- `B` enables collection of branch sampled operations, including direct and indirect branches and exception returns. + +### Sampling using SPE the CPython application running the Googolplex calculation on CPU core 1 + +Below command will sample already running process `python_d.exe` (denoted with `--image_name python_d.exe`) on CPU core no. 1. SPE filter `ld=1` enables collection of load sampled operations, including atomic operations that return a value to a register. + +```command +wperf sample -e arm_spe_0/ld=1/ --pe_file cpython\PCbuild\arm64\python_d.exe --image_name python_d.exe -c 1 +``` + +{{% notice Note%}} +You can use the same sampling `--annotate` and `--disassemble` command line interface of WindowsPerf with SPE extension. See example outputs below. +{{% /notice %}} + +Please wait a few seconds for the samples to arrive from the Kernel driver and then press `Ctrl+C` to stop sampling. 
You should see: + +```output +base address of 'python_d.exe': 0x7ff765fe1288, runtime delta: 0x7ff625fe0000 +sampling ....eee....eCtrl-C received, quit counting... done! + +Performance counter stats for core 1, no multiplexing, kernel mode excluded, on Arm Limited core implementation: +note: 'e' - normal event, 'gN' - grouped event with group number N, metric name will be appended if 'e' or 'g' comes from it + + counter value event name event idx event note + ============= ========== ========= ========== + 29,337,387,738 cycle fixed e + 76,433,491,476 sample_pop 0x4000 e + 18 sample_feed 0x4001 e + 7 sample_filtrate 0x4002 e + 0 sample_collision 0x4003 e +======================== sample source: LOAD_STORE_ATOMIC-LOAD-GP/retired+level1-data-cache-access+tlb_access, top 50 hot functions ======================== + overhead count symbol + ======== ===== ====== + 85.71 6 x_mul:python312_d.dll + 14.29 1 unknown + 100.00% 7 top 2 in total + + 9.853 seconds time elapsed +``` + +{{% notice Note%}} +You can close the command line window with `python_d.exe` running when you have finished sampling. Sampling will also automatically end when the sample process has finished. +{{% /notice %}} + + +#### SPE sampling output + +- In the above example, you can see that the majority of "overhead" is generated by `python_d.exe` executable resides inside the `python312_d.dll` DLL, in `x_mul` symbol. +- SPE sampling output contains also PMU events for SPE registered during sampling: + - `sample_pop` - Statistical Profiling sample population. Counts statistical profiling sample population, the count of all operations that could be sampled but may or may not be chosen for sampling. + - `sample_feed` - Statistical Profiling sample taken. Counts statistical profiling samples taken for sampling. + - `sample_filtrate` - Statistical Profiling sample taken and not removed by filtering. Counts statistical profiling samples taken which are not removed by filtering. + - `sample_collision` - Statistical Profiling sample collided with previous sample. Counts statistical profiling samples that have collided with a previous sample and so therefore not taken. +- Note that in sampling `....eee....e` is a progressing printout where: + - character `.` represents a SPE sample payload received from the WindowsPerf Kernel driver and + - character `e` represents an unsuccessful attempt (empty SPE fill buffer) to fetch the whole sample payload. + +{{% notice Note%}} +You can also output `wperf sample` command in JSON format. Use the `--json` command line option to enable the JSON output. +Use the `-v` command line option `verbose` to add more information about sampling. +{{% /notice %}} + +#### Example output with annotate enabled + +Command line option `--annotate` enables translating addresses taken from samples in sample/record mode into source code line numbers. + +```console +wperf sample -e arm_spe_0/ld=1/ --annotate --pe_file cpython\PCbuild\arm64\python_d.exe --image_name python_d.exe -c 1 +``` + +```output +base address of 'python_d.exe': 0x7ff765fe1288, runtime delta: 0x7ff625fe0000 +sampling ....ee.Ctrl-C received, quit counting...e done! 
+ +Performance counter stats for core 1, no multiplexing, kernel mode excluded, on Arm Limited core implementation: +note: 'e' - normal event, 'gN' - grouped event with group number N, metric name will be appended if 'e' or 'g' comes from it + + counter value event name event idx event note + ============= ========== ========= ========== + 15,579,045,952 cycle fixed e + 40,554,143,220 sample_pop 0x4000 e + 10 sample_feed 0x4001 e + 2 sample_filtrate 0x4002 e + 0 sample_collision 0x4003 e +======================== sample source: LOAD_STORE_ATOMIC-LOAD-GP/retired+level1-data-cache-access+tlb_access, top 50 hot functions ======================== +x_mul:python312_d.dll + line_number hits filename + =========== ==== ======== + 3,590 2 C:\path\to\cpython\Objects\longobject.c + + overhead count symbol + ======== ===== ====== + 100.00 2 x_mul:python312_d.dll + 100.00% 2 top 1 in total + + 5.199 seconds time elapsed +``` + +Note: Above SPE sampling pass recorded: +- function `x_mul:python312_d.dll`: + - in source file `C:\path\to\cpython\Objects\longobject.c`, line `3590` as a hot-spot for `load_filter` enabled. + +#### Example output with disassemble enabled + +Command line option `--disassemble` enables disassemble output on sampling mode. Implies `--annotate`. + +```console +wperf sample -e arm_spe_0/ld=1/ --disassemble --pe_file cpython\PCbuild\arm64\python_d.exe --image_name python_d.exe -c 1 +``` + +```output +base address of 'python_d.exe': 0x7ff765fe1288, runtime delta: 0x7ff625fe0000 +sampling ......eCtrl-C received, quit counting... done! + +Performance counter stats for core 1, no multiplexing, kernel mode excluded, on Arm Limited core implementation: +note: 'e' - normal event, 'gN' - grouped event with group number N, metric name will be appended if 'e' or 'g' comes from it + + counter value event name event idx event note + ============= ========== ========= ========== + 13,193,499,134 cycle fixed e + 34,357,259,935 sample_pop 0x4000 e + 8 sample_feed 0x4001 e + 4 sample_filtrate 0x4002 e + 0 sample_collision 0x4003 e +======================== sample source: LOAD_STORE_ATOMIC-LOAD-GP/retired+level1-data-cache-access+tlb_access, top 50 hot functions ======================== +x_mul:python312_d.dll + line_number hits filename instruction_address disassembled_line + =========== ==== ======== =================== ================= + 3,591 2 C:\path\to\cpython\Objects\longobject.c 4043b4 address instruction + ======= =========== + 4043a8 ldr x8, [sp, #0x10] + 4043ac and x8, x8, #0x3fffffff + 4043b0 mov w8, w8 + 4043b4 ldr x9, [sp, #0x20] + 4043b8 str w8, [x9] + 4043bc ldr x8, [sp, #0x20] + 4043c0 add x8, x8, #0x4 + 4043c4 str x8, [sp, #0x20] + 3,589 1 C:\path\to\cpython\Objects\longobject.c 404360 address instruction + ======= =========== + 40435c ldr x9, [sp, #0x108] + 404360 ldr x8, [sp, #0x58] + 404364 cmp x8, x9 + 404368 b.hs 0x18040440c <_PyCrossInterpreterData_UnregisterClass+0x3fc680> + +v_isub:python312_d.dll + line_number hits filename instruction_address disassembled_line + =========== ==== ======== =================== ================= + 1,603 1 C:\path\to\cpython\Objects\longobject.c 402a60 address instruction + ======= =========== + 402a60 ldr w8, [sp, #0x10] + 402a64 and w8, w8, #0x1 + 402a68 str w8, [sp, #0x10] + + overhead count symbol + ======== ===== ====== + 75.00 3 x_mul:python312_d.dll + 25.00 1 v_isub:python312_d.dll + 100.00% 4 top 2 in total + + 4.422 seconds time elapsed +``` + +Note: Above SPE sampling pass recorded: +- function `x_mul:python312_d.dll`: + - in source 
file `C:\path\to\cpython\Objects\longobject.c`, line `3591`, instruction `ldr x9, [sp, #0x20]` at address `0x4043b4` as potential hot-spot. + - in source file `C:\path\to\cpython\Objects\longobject.c`, line `3589`, instruction `ldr x8, [sp, #0x58]` at address `0x404360` as potential hot-spot. +- Function `v_isub:python312_d.dll`: + - in source file `C:\path\to\cpython\Objects\longobject.c`, line `1603`, instruction `ldr w8, [sp, #0x10]` at address `0x402a60` as potential hot-spot. diff --git a/content/learning-paths/cross-platform/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe_example_2.md b/content/learning-paths/cross-platform/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe_example_2.md new file mode 100644 index 000000000..76c907585 --- /dev/null +++ b/content/learning-paths/cross-platform/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe_example_2.md @@ -0,0 +1,35 @@ +--- +layout: learningpathall +title: WindowsPerf record using SPE example +weight: 4 +--- + +## Example 2: Using the `record` command to simplify things + +- The `record` command spawns the process and pins it to the core specified by the `-c` option. +- A double-dash (`--`) is a syntax used in shell commands to signify end of command options and beginning of positional arguments. In other words, it separates `wperf` CLI options from arguments that the command operates on. Use `--` to separate `wperf.exe` command line options from the process you want to spawn followed by its verbatim arguments. + +```console +wperf record -e arm_spe_0/ld=1/ -c 1 --timeout 5 -- cpython\PCbuild\arm64\python_d.exe -c 10**10**100 +``` + +{{% notice Note%}} +You can use the same sampling `--annotate` and `--disassemble` command line interface of WindowsPerf with SPE extension. +{{% /notice %}} + +The WindowsPerf `record` command is versatile, allowing you to start and stop the sampling process easily. It also simplifies the command line syntax, making it user-friendly and efficient. + +Example 2 can be replaced by these two commands: + +```console +start /affinity 2 cpython\PCbuild\arm64\python_d.exe -c 10**10**100 +wperf sample -e arm_spe_0/ld=1/ --pe_file cpython\PCbuild\arm64\python_d.exe --image_name python_d.exe -c 1 +``` + +## Summary + +WindowsPerf is a versatile performance analysis tool that can support both software (with CPU PMU events) and hardware sampling (with SPE extension). The type of sampling it can perform depends on the availability of the Arm Statistical Profiling Extension (SPE) in the ARM64 CPU. If the Arm SPE extension is present, WindowsPerf can leverage hardware sampling to provide detailed performance insights. Otherwise, it will rely on software sampling to gather performance data. This flexibility ensures that WindowsPerf can adapt to different hardware configurations and still deliver valuable performance metrics. + +Use `wperf sample`, a sampling mode, for determining the frequencies of event occurrences produced by program locations at the function, basic block, and/or instruction levels. + +Use `wperf record`, same as sample but also automatically spawns the process and pins it to the core specified by `-c`. Process name is defined by COMMAND. User can pass verbatim arguments to the process. 
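
For example, the annotated variant of the earlier recording can be expressed as a single `record` invocation. The command below is a sketch that only reuses options already shown in this Learning Path; adjust the path to `python_d.exe` to match your local CPython build.

```console
wperf record -e arm_spe_0/ld=1/ -c 1 --timeout 5 --annotate -- cpython\PCbuild\arm64\python_d.exe -c 10**10**100
```

Everything after the double-dash (`--`) is passed verbatim to the spawned `python_d.exe` process, so the Googolplex workload starts exactly as it did in the earlier examples.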
diff --git a/content/learning-paths/laptops-and-desktops/_index.md b/content/learning-paths/laptops-and-desktops/_index.md index 857b0f965..2053f4494 100644 --- a/content/learning-paths/laptops-and-desktops/_index.md +++ b/content/learning-paths/laptops-and-desktops/_index.md @@ -13,12 +13,12 @@ operatingsystems_filter: - ChromeOS: 1 - Linux: 29 - macOS: 7 -- Windows: 37 +- Windows: 38 subjects_filter: - CI-CD: 3 - Containers and Virtualization: 6 - Migration to Arm: 26 -- Performance and Architecture: 20 +- Performance and Architecture: 21 subtitle: Create and migrate apps for power efficient performance title: Laptops and Desktops tools_software_languages_filter: @@ -57,8 +57,8 @@ tools_software_languages_filter: - Neovim: 1 - Node.js: 3 - OpenCV: 1 -- perf: 2 -- Python: 2 +- perf: 3 +- Python: 3 - Qt: 2 - Remote.It: 1 - RME: 1 @@ -73,7 +73,7 @@ tools_software_languages_filter: - Windows Performance Analyzer: 1 - Windows Presentation Foundation: 1 - Windows Sandbox: 1 -- WindowsPerf: 3 +- WindowsPerf: 4 - WinUI 3: 1 - WSL: 1 - Xamarin Forms: 1 diff --git a/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython/windowsperf_sampling_cpython_example_2.md b/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython/windowsperf_sampling_cpython_example_2.md index 049718d00..94d70c7ae 100644 --- a/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython/windowsperf_sampling_cpython_example_2.md +++ b/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython/windowsperf_sampling_cpython_example_2.md @@ -6,7 +6,7 @@ weight: 4 ## Example 2: Using the `record` command to simplify things -The `record` command spawns the process and pins it to the core specified by the `-c` option. You can either use --pe_file to let WindowsPerf know which process to spawn or simply add the process to spawn at the very end of the `wperf` command. +The `record` command spawns the process and pins it to the core specified by the `-c` option. You can either use `--pe_file` to let WindowsPerf know which process to spawn or simply add the process to spawn at the very end of the `wperf` command. This simplifies the steps presented in the previous example. @@ -14,11 +14,11 @@ If you want to pass command line arguments to your application, you can call the verbatim to the program that is being spawned. If you want to execute the CPython example above using this approach, you could just type: ```command -wperf record -e ld_spec:100000 -c 1 --timeout 30 python_d.exe -c 10**10**100 +wperf record -e ld_spec:100000 -c 1 --timeout 30 -- python_d.exe -c 10**10**100 ``` {{% notice Note%}} -This command will automatically spawn the process `python_d.exe -c 10**10**100` (and pass command line options to it), sample for 30 seconds with --timeout 30 event ld_spec with sample frequency of 100000. +This command will automatically spawn the process `python_d.exe -c 10**10**100` (and pass command line options to it), sample for 30 seconds with `--timeout 30` event `ld_spec` with sample frequency of `100000`. {{% /notice %}} You should see the same output from this command as in the previous section. 
diff --git a/content/learning-paths/microcontrollers/yolo-on-himax/_index.md b/content/learning-paths/microcontrollers/yolo-on-himax/_index.md index 8882ef4e6..58af3880a 100644 --- a/content/learning-paths/microcontrollers/yolo-on-himax/_index.md +++ b/content/learning-paths/microcontrollers/yolo-on-himax/_index.md @@ -1,22 +1,22 @@ --- title: Run a Computer Vision Model on a Himax Microcontroller -draft: true -cascade: - draft: true + minutes_to_complete: 90 -who_is_this_for: This is an introduction topic explaining how to run a computer vision application on an embedded device from Himax. The example uses an off-the-shelf Himax WiseEye2 module which is based on Arm Cortex-M55 and Ethos-U55. +who_is_this_for: This is an introductory topic for developers who would like to learn about how to run a computer vision application on an embedded device from Himax. learning_objectives: - - Run a you-only-look-once (YOLO) object detection model on the Himax device. - - Build the Himax Software Development Kit (SDK) and generate the firmware image file. - - Update the firmware on the Himax WiseEye2. + - Run a You-Only-Look-Once (YOLO) object detection model on a Himax WiseEye2 module. + - Build the Himax Software Development Kit (SDK) and generate a firmware image file. + - Update firmware on the Himax WiseEye2. + - Connect to and use Grove Vision AI module. prerequisites: - A [Seeed Grove Vision AI Module V2](https://www.seeedstudio.com/Grove-Vision-AI-Module-V2-p-5851.html) development board. - - An [OV5647-62 Camera Module](https://www.seeedstudio.com/OV5647-69-1-FOV-Camera-module-for-Raspberry-Pi-3B-4B-p-5484.html) and included FPC cable. + - An [OV5647-62 Camera Module](https://www.seeedstudio.com/OV5647-69-1-FOV-Camera-module-for-Raspberry-Pi-3B-4B-p-5484.html). + - A Flexible Printed Circuit (FPC) cable. - A USB-C cable. - - An x86 Linux machine or a Mac running macOS with Apple Silicon. + - An x86 Linux machine, or a Mac running macOS. author_primary: Chaodong Gong, Alex Su, Kieran Hejmadi @@ -33,9 +33,7 @@ operatingsystems: - Linux - macOS -draft: true -cascade: - draft: true + ### FIXED, DO NOT MODIFY diff --git a/content/learning-paths/microcontrollers/yolo-on-himax/build-firmware.md b/content/learning-paths/microcontrollers/yolo-on-himax/build-firmware.md index b1db98e5f..6745db9bc 100644 --- a/content/learning-paths/microcontrollers/yolo-on-himax/build-firmware.md +++ b/content/learning-paths/microcontrollers/yolo-on-himax/build-firmware.md @@ -1,6 +1,6 @@ --- title: Build the firmware -weight: 3 +weight: 4 ### FIXED, DO NOT MODIFY layout: learningpathall @@ -10,9 +10,9 @@ This section explains the process of generating a firmware image file. ## Clone the Himax GitHub project -Himax maintains a repository containing a few examples for the Seeed Grove Vision AI V2 board. +Himax maintains a repository containing a few examples that can be used with the Seeed Grove Vision AI V2 board. -It contains third-party software and scripts to build and flash the image with the object detection application. By recursively cloning the Himax examples repo, git will include the necessary sub-repositories that have been configured for the project. +It contains third-party software and scripts to build and flash the image with the object detection application. By recursively cloning the Himax examples repository, git includes the necessary subrepositories that have been configured for the project. 
Clone the repository: @@ -25,27 +25,27 @@ cd Seeed_Grove_Vision_AI_Module_V2 Use Make to compile the source code for object detection. -This takes up to 10 minutes depending on the number of CPU cores available on your host machine. +This can take up to 10 minutes depending on the number of CPU cores available on your host machine. ```bash cd EPII_CM55M_APP_S make ``` -When the build is complete, you have an `.elf` file at `obj_epii_evb_icv30_bdv10/gnu_epii_evb_WLCSP65/EPII_CM55M_gnu_epii_evb_WLCSP65_s.elf` +When the build is complete, you will have an `.elf` file at `obj_epii_evb_icv30_bdv10/gnu_epii_evb_WLCSP65/EPII_CM55M_gnu_epii_evb_WLCSP65_s.elf` ## Generate the firmware image The examples repository contains scripts to generate the image file. -Copy the `.elf` file to the `input_case1_secboot` directory. +Copy the `.elf` file to the `input_case1_secboot` directory: ```bash cd ../we2_image_gen_local/ cp ../EPII_CM55M_APP_S/obj_epii_evb_icv30_bdv10/gnu_epii_evb_WLCSP65/EPII_CM55M_gnu_epii_evb_WLCSP65_s.elf input_case1_secboot/ ``` -Run the script your OS as shown below. This will create a file named `output.img` in the `output_case1_sec_wlcsp` directory. +Run the script on your OS as shown below. This creates a file named `output.img` in the `output_case1_sec_wlcsp` directory: {{< tabpane code=true >}} @@ -57,7 +57,7 @@ Run the script your OS as shown below. This will create a file named `output.img {{< /tab >}} {{< /tabpane >}} -The script output ends with the following output: +The script output ends with the following: ```output Output image: output_case1_sec_wlcsp/output.img @@ -66,4 +66,4 @@ Output image: output_case1_sec_wlcsp/output.img IMAGE GEN DONE ``` -You are ready to flash the image onto the Himax development board. \ No newline at end of file +You are now ready to flash the image onto the Himax development board. diff --git a/content/learning-paths/microcontrollers/yolo-on-himax/dev-env.md b/content/learning-paths/microcontrollers/yolo-on-himax/dev-env.md index 73cb55e14..76d82897c 100644 --- a/content/learning-paths/microcontrollers/yolo-on-himax/dev-env.md +++ b/content/learning-paths/microcontrollers/yolo-on-himax/dev-env.md @@ -1,6 +1,6 @@ --- title: Set up the environment -weight: 2 +weight: 3 ### FIXED, DO NOT MODIFY layout: learningpathall diff --git a/content/learning-paths/microcontrollers/yolo-on-himax/flash-and-run.md b/content/learning-paths/microcontrollers/yolo-on-himax/flash-and-run.md index 43892699b..35eca8b37 100644 --- a/content/learning-paths/microcontrollers/yolo-on-himax/flash-and-run.md +++ b/content/learning-paths/microcontrollers/yolo-on-himax/flash-and-run.md @@ -1,6 +1,6 @@ --- title: Flash firmware onto the microcontroller -weight: 4 +weight: 5 ### FIXED, DO NOT MODIFY layout: learningpathall @@ -10,7 +10,7 @@ Now that you have generated an image file on the local host machine, you are rea ## Install xmodem -`Xmodem` is a basic file transfer protocol which is easily installed using the Himax examples repository. +You can easily install a basic file transfer protocol called `Xmodem` using the Himax repository containing the examples. Run the following command to install the dependency: @@ -23,23 +23,30 @@ pip install -r xmodem/requirements.txt It's time to get the board set up. -Insert the Flexible printed circuit (FPC) into the Grove Vision AI V2 module. Lift the dark grey latch on the connector as per the image below. +Insert the Flexible Printed Circuit (FPC) into the Grove Vision AI V2 module. 
+ +Lift the dark grey latch on the connector as shown in the image below. ![unlatched](./unlatched.jpg) -Slide the FPC connector in with the metal pins facing down and close the dark grey latch to fasten the connector. +* With the metal pins facing down, slide the FPC connector in. +* Close the dark grey latch to fasten the connector. ![latched](./latched.jpg) -Now you can connect the Groove Vision AI V2 Module to your computer via the USB-C cable. +Now you can connect the Groove Vision AI V2 Module to your computer using the USB-C cable. {{% notice Note %}} -The development board may have two USB-C connectors. If you are running into issues connecting the board in the next step, make sure you are using the right one. +The development board might have two USB-C connectors. If you are running into issues connecting the board in the next step, make sure you are using the correct USB-C connector. {{% /notice %}} ## Find the COM port -You'll need to provide the communication port (COM) which the board is connected to in order to flash the image. There are commands to list all COMs available on your machine. Once your board is connected through USB, it'll show up in this list. The COM identifier will start with **tty**, which may help you determine which one it is. You can run the command before and after plugging in the board if you are unsure. +To flash the image, you need to provide the communication port (COM) which the board is connected to. + +On your machine, you can find commands that you can use to list all COMs available to use. Once your board is connected through USB, it will appear on this list of available COMs. + +The COM identifier is prefixed with **tty**, so you can use this to help you identify which COM it is. You can also run the command before and after plugging in the board if you are unsure, and look for the change in the list. {{< tabpane code=true >}} @@ -53,14 +60,14 @@ ls /dev/tty.* {{% notice Note %}} -If the port seems unavailable, try changing the permissions temporarily using the `chmod` command. Be sure to reset them afterwards, as this may pose a computer security vulnerability. +If the port appears to be unavailable, try changing the permissions temporarily using the `chmod` command. Be sure to reset the permissions again afterwards, as otherwise this can pose a computer security vulnerability. ```bash chmod 0777 ``` {{% /notice %}} -The full path to the port is needed in the next step, so be sure to save it. +You will require the full path to the port in the next step, so be sure to save it. ## Flash the firmware onto the module @@ -82,7 +89,7 @@ After the firmware image flashing is completed, the message `Do you want to end ## Run the model -After the reset button is pressed, the board will start inference with the object detection automatically. Observe the output in the terminal to verify that the image is built correctly. If a person is in front of the camera, you should see the `person_score` value go over `100`. +After the reset button is pressed, the board starts inference with the object detection automatically. Observe the output in the terminal to verify that the image is built correctly. If a person is in front of the camera, you should see the `person_score` value exceed `100`. ```output b'SENSORDPLIB_STATUS_XDMA_FRAME_READY 240' @@ -97,4 +104,4 @@ b'person_score:112' b'EVT event = 10' ``` -This means the image works correctly on the device, and the end-to-end flow is complete. 
\ No newline at end of file +This means the image works correctly on the device, and the end-to-end flow is complete. diff --git a/content/learning-paths/microcontrollers/yolo-on-himax/overview.md b/content/learning-paths/microcontrollers/yolo-on-himax/overview.md new file mode 100644 index 000000000..41435620a --- /dev/null +++ b/content/learning-paths/microcontrollers/yolo-on-himax/overview.md @@ -0,0 +1,24 @@ +--- +title: Overview +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## What are the benefits of the Himax WiseEye2 Module? + +* The Himax WiseEye2 implements the Arm-based Cortex M55 CPU and Ethos U55 NPU, and is one of the first "off-the-shelf" platforms of its type. + +* The Himax WiseEye2 Module is an ultra-low power device that is energy-saving. + +* It fully integrates with an existing Open Source AI framework (TFLite). + +## What is a You-Only-Look-Once (YOLO) object detection model? + +YOLO is a technology used in computer vision for identifying and locating objects in images and videos. + +It is Open Source, fast, and has good detection accuracy. + +To learn more about YOLO, see [Where to Start](https://docs.ultralytics.com/#where-to-start). + diff --git a/content/learning-paths/microcontrollers/yolo-on-himax/web-toolkit.md b/content/learning-paths/microcontrollers/yolo-on-himax/web-toolkit.md index e03500d02..5bd06f49e 100644 --- a/content/learning-paths/microcontrollers/yolo-on-himax/web-toolkit.md +++ b/content/learning-paths/microcontrollers/yolo-on-himax/web-toolkit.md @@ -101,6 +101,9 @@ The images below are captured images from the models run in the toolkit. ### Objection detection ![object_detection](./object_detection.jpg) +The Frames Per Second (FPS) index represents the number of ML inferences the hardware can complete per second. A higher number indicates better performance. The colored bounding boxes represent the objects identified by YOLO. The name of the object is labelled in the top left-hand corner of the box, and the number in parentheses is the confidence level as a percentage. This example shows that it can identify 9.53 frames per second with a confidence level of 64% for the 'CPU' object. + ### Face detection ![object_detection](./face_detection.jpg) +Similar to the previous example, the bounding boxes identify the areas in the image that contain faces and recognize the positions of different facial features. This image shows that YOLO has identified a face with 99% confidence. It has marked the mouth with a yellow line segment and used different colours to mark the eyebrows, eyes, and nose. Within the bounding box for the eyes, it has further identified the gaze direction vector. 
diff --git a/content/learning-paths/servers-and-cloud-computing/_index.md b/content/learning-paths/servers-and-cloud-computing/_index.md index 867377bee..bece087d1 100644 --- a/content/learning-paths/servers-and-cloud-computing/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/_index.md @@ -9,9 +9,9 @@ maintopic: true operatingsystems_filter: - Android: 2 - Baremetal: 1 -- Linux: 109 +- Linux: 111 - macOS: 9 -- Windows: 12 +- Windows: 13 pinned_modules: - module: name: Recommended getting started learning paths @@ -22,9 +22,9 @@ subjects_filter: - CI-CD: 4 - Containers and Virtualization: 25 - Databases: 15 -- Libraries: 6 +- Libraries: 7 - ML: 14 -- Performance and Architecture: 38 +- Performance and Architecture: 40 - Storage: 1 - Web: 10 subtitle: Optimize cloud native apps on Arm for performance and cost @@ -44,9 +44,10 @@ tools_software_languages_filter: - Assembly: 4 - assembly: 1 - AWS CodeBuild: 1 -- AWS EC2: 1 +- AWS EC2: 2 - AWS Elastic Container Service (ECS): 1 - AWS Elastic Kubernetes Service (EKS): 2 +- Bash: 1 - Bastion: 3 - BOLT: 1 - bpftool: 1 @@ -69,7 +70,7 @@ tools_software_languages_filter: - Flink: 1 - Fortran: 1 - FVP: 3 -- GCC: 18 +- GCC: 19 - gdb: 1 - Geekbench: 1 - GenAI: 5 @@ -83,7 +84,7 @@ tools_software_languages_filter: - InnoDB: 1 - Intrinsics: 1 - JAVA: 1 -- Java: 1 +- Java: 2 - JAX: 1 - Kafka: 1 - Keras: 1 @@ -105,9 +106,9 @@ tools_software_languages_filter: - Nginx: 3 - Node.js: 3 - PAPI: 1 -- perf: 3 +- perf: 4 - PostgreSQL: 4 -- Python: 12 +- Python: 13 - PyTorch: 5 - RAG: 1 - Redis: 3 @@ -116,6 +117,7 @@ tools_software_languages_filter: - Rust: 2 - snappy: 1 - Snort: 1 +- Snort3: 1 - SQL: 7 - Streamline CLI: 1 - Supervisor: 1 @@ -130,6 +132,7 @@ tools_software_languages_filter: - TypeScript: 1 - Vectorscan: 1 - Visual Studio Code: 3 +- WindowsPerf: 1 - WordPress: 3 - x265: 1 - zlib: 1 diff --git a/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/Example_application.md b/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/Example_application.md new file mode 100644 index 000000000..abb01708e --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/Example_application.md @@ -0,0 +1,144 @@ +--- +title: Example Application +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Example Application + +Using a file editor of your choice, copy the Java snippet below into a file named `HeapUsageExample.java`. + +This code example allocates 1 million string objects to fill up the heap. You can use this example to easily observe the effects of different GC tuning parameters. + +```java +public class HeapUsageExample { + public static void main(String[] args) { + System.out.println("Starting the application..."); + + try { + // Create a large number of objects to quickly use up the heap + for (int i = 0; i < 1000000; i++) { + String[] array = new String[1000]; + for (int j = 0; j < array.length; j++) { + array[j] = "Object " + j; + } + } + } catch (OutOfMemoryError e) { + System.err.println("OutOfMemoryError caught: " + e.getMessage()); + } + + System.out.println("Application finished."); + } +} +``` + +### Enable Garbage Collector logging + +To observe what the Garbage Collector is doing, one option is to enabling logging while the JVM is running. + +To enable this, you need to pass in some command-line arguments. The `gc` option logs the GC information. 
The `filecount` option creates a rolling log to prevent uncontrolled growth of logs with the drawback that historical logs might be rewritten and lost. + +Run the following command to enable logging with JDK 11 and higher: + +```bash +java -Xms512m -Xmx1024m -XX:+UseSerialGC -Xlog:gc:file=gc.log:tags,uptime,time,level:filecount=10,filesize=16m HeapUsageExample.java +``` + +If you are using JDK8, use the following command instead: + +```bash +java -Xms512m -Xmx1024m -XX:+UseSerialGC -Xloggc:gc.log -XX:+PrintGCTimeStamps -XX:+UseGCLogFileRotation HeapUsageExample.java +``` + +The `-Xms512m` and `-Xmx1024` options create a minimum and maximum heap size of 512 MiB and 1GiB respectively. This is to avoid waiting too long to see activity within the GC. Additionally, you can force the JVM to use the serial garbage collector with the `-XX:+UseSerialGC` flag. + +You will now see a log file, named `gc.log` created within the same directory. + +Open `gc.log` and the contents should look similar to: + +```output +[2024-11-08T15:04:54.304+0000][0.713s][info][gc] GC(2) Pause Young (Allocation Failure) 139M->3M(494M) 3.627ms +... +[2024-11-08T15:04:54.350+0000][0.759s][info][gc] GC(3) Pause Young (Allocation Failure) 139M->3M(494M) 3.699ms +``` + +These logs provide insights into the frequency, duration, and impact of Young garbage collection events. The results can vary depending on your system. + + - Frequency: ~ every 46 ms + - Pause duration: ~ 3.6 ms + - Reduction size: ~ 139 MB (or 3M objects) + +This logging method can be quite verbose, and makes it challenging to debug a live running application. + +### Use jstat to observe real-time GC statistics + +Using a file editor of your choice, copy the java code below into a file named `WhileLoopExample.java`. + +This java code snippet is a long-running example that prints out a random integer and double precision floating point number four times a second: + +```java +import java.util.Random; + +public class GenerateRandom { + + public static void main(String[] args) { + Random rand = new Random(); + + while (true) { + // Generate random integer in range 0 to 999 + int rand_int1 = rand.nextInt(1000); + + // Print random integer + System.out.println("Random Integers: " + rand_int1); + + // Generate random double + double rand_dub1 = rand.nextDouble(); + + // Print random double + System.out.println("Random Doubles: " + rand_dub1); + + // Sleep for 1/4 second (250 milliseconds) + try { + Thread.sleep(250); + } catch (InterruptedException e) { + System.err.println("Thread interrupted: " + e.getMessage()); + } + } + } +} +``` + +Start the Java program with the command below. This will use the default parameters for the garbage collection: + +```bash +java WhileLoopExample.java +``` +While the program is running, open another terminal session. + +In the new terminal use the `jstat` command to print out the JVM statistics specifically related to the GC using the `-gcutil` flag: + +```bash +jstat -gcutil $(pgrep java) 1000 +``` + +You will observe output like the following until `ctl+c` is pressed: + +```output + S0 S1 E O M CCS YGC YGCT FGC FGCT CGC CGCT GCT + 0.00 100.00 6.11 1.81 71.05 73.21 1 0.010 0 0.000 0 0.000 0.010 + 0.00 100.00 6.11 1.81 71.05 73.21 1 0.010 0 0.000 0 0.000 0.010 + 0.00 100.00 6.11 1.81 71.05 73.21 1 0.010 0 0.000 0 0.000 0.010 +... + 0.00 100.00 6.11 1.81 71.05 73.21 1 0.010 0 0.000 0 0.000 0.010 +``` + +The columns of interest are: +- **E (Eden Space Utilization)**: The percentage of the Eden space that is being used. 
High utilization indicates frequent allocations and can trigger minor GCs. +- **O (Old Generation Utilization)**: The percentage of the Old (Tenured) generation that is being used. High utilization can lead to Full GCs, which are more expensive. +- **YGCT (Young Generation GC Time)**: The total time in seconds spent in Young Generation (minor) GC events. High values indicate frequent minor GCs, which can impact performance. +- **FGCT (Full GC Time)**: The total time in seconds spent in Full GC events. High values indicate frequent Full GCs, which can significantly impact performance. +- **GCT (Total GC Time)**: The total time in seconds spent in all GC events (Young, Full, and Concurrent). This provides an overall view of the time spent in GC, helping to assess the impact on application performance. + + diff --git a/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/Tuning Parameters.md b/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/Tuning Parameters.md new file mode 100644 index 000000000..f6900c185 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/Tuning Parameters.md @@ -0,0 +1,77 @@ +--- +title: Basic GC Tuning Options +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +### Update the JDK version + +If you are on an older version of JDK, a sensible first step is to use one of the latest long-term-support (LTS) releases of JDK. This is because the GC versions included with recent JDKs offer improvements on previous releases. For example, the G1GC included with JDK 11 offers improvements in the pause time compared to JDK 8. + +As shown earlier, you can use the `java --version` command to check the version currently in use: + +```output +$ java --version +openjdk 21.0.4 2024-07-16 LTS +OpenJDK Runtime Environment Corretto-21.0.4.7.1 (build 21.0.4+7-LTS) +OpenJDK 64-Bit Server VM Corretto-21.0.4.7.1 (build 21.0.4+7-LTS, mixed mode, sharing) +``` + + +### Use an alternative GC + +In this section, you will use the `HeapUsageExample.java` file you created earlier. + +The Garbage-First Garbage Collector (G1GC) is designed to handle large heaps and aims to provide low pause times by dividing the heap into regions and performing incremental garbage collection. This makes it suitable for applications with high allocation rates and large memory footprints. + +You can run the following command to generate the GC logs using a different GC and compare the two. + +To make this comparison, change the Garbage Collector from `Serial` to `G1GC` using the `-XX:+UseG1GC` option: + +```bash +java -Xms512m -Xmx1024m -XX:+UseG1GC -Xlog:gc:file=gc.log:tags,uptime,time,level:filecount=10,filesize=16m HeapUsageExample.java +``` +From the created log file `gc.log`, you can see that at a similar time after startup (~0.75s), the Pause Young time reduced from ~3.6ms to ~1.9ms. Further, the time between GC pauses has improved from ~46ms to every ~98ms. + +```output +[2024-11-08T16:13:53.088+0000][0.790s][info][gc ] GC(2) Pause Young (Normal) (G1 Evacuation Pause) 307M->3M(514M) 1.976ms +... +[2024-11-08T16:13:53.186+0000][0.888s][info][gc ] GC(3) Pause Young (Normal) (G1 Evacuation Pause) 307M->3M(514M) 1.703ms +``` +As described in the previous section, the performance improvement from moving to a G1GC depends on the CPU overhead of your system. The performance can vary depending on the cloud instance size and available CPU resources. 
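+
+Rather than reading individual log lines, you can summarize each run. The snippet below is a sketch that assumes the single-line `gc` log format shown above: it counts the `Pause Young` events in `gc.log` and prints the average pause time, so the Serial GC and G1GC runs can be compared side by side.
+
+```bash
+# Count the "Pause Young" events in gc.log and report the average pause time in ms.
+grep "Pause Young" gc.log \
+  | awk '{ gsub(/ms/, "", $NF); total += $NF; count++ }
+         END { if (count > 0) printf "Young pauses: %d, average pause: %.3f ms\n", count, total / count }'
+```
+
+Repeat the command after each run to compare collectors or tuning flags with numbers rather than by inspecting individual log entries.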
+ +### Add Garbage Collector Targets + +You can manually provide targets for specific metrics and the GC will attempt to meet those requirements. For example, if you have a time-sensitive application such as a REST server, you might want to ensure that all customers receive a response within a specific time. You might find that if a client request is sent during Garbage Collection that you need to ensure that the GC pause time is minimized. + +Running the command with the `-XX:MaxGCPauseMillis=` sets a target max GC pause time: + +```bash +java -Xms512m -Xmx1024m -XX:+UseG1GC -XX:MaxGCPauseMillis=1 -Xlog:gc:file=gc.log:tags,uptime,time,level:filecount=10,filesize=16m HeapUsageExample.java +``` + +Looking at the output below, you can see that at the same initial state after ~0.7s the pause time has reduced. However, you can also see the initial size of the Young space has gone from 307MiB above to 124MiB. The GC decided to reduce the size of the Young space to reduce the pause time at the expense of more frequent pauses. + +```output +[2024-11-08T16:27:37.061+0000][0.765s][info][gc] GC(18) Pause Young (Normal) (G1 Evacuation Pause) 124M->3M(514M) 0.489ms +[2024-11-08T16:27:37.149+0000][0.853s][info][gc] GC(19) Pause Young (Normal) (G1 Evacuation Pause) 193M->3M(514M) 0.482ms +``` + +Here are some additional target options that you can consider to tune performance: + +- -XX:InitiatingHeapOccupancyPercent: + +This defines the old generation occupancy threshold to trigger a concurrent GC cycle. Adjusting this is beneficial if your application experiences long GC pauses due to high old generation occupancy. For example, lowering this threshold can help start GC cycles earlier, reducing the likelihood of long pauses during peak memory usage. + +- -XX:ParallelGCThreads + +This specifies the number of threads for parallel GC operations. Increasing this value is beneficial for applications running on multi-core processors, as it allows GC tasks to be processed faster. For instance, a high-throughput server application might benefit from more parallel GC threads to minimize pause times and improve overall performance. + +- -XX:G1HeapRegionSize + +This determines the size of G1 regions, which must be a power of 2 between 1 MB and 32 MB. Adjusting this can be useful for applications with specific memory usage patterns. For example, setting a larger region size can reduce the number of regions and associated overhead for applications with large heaps, while smaller regions might be better for applications with more granular memory allocation patterns. + +See [Garbage First Garbage Collector Tuning](https://www.oracle.com/technical-resources/articles/java/g1gc.html) for more information of G1GC tuning. + diff --git a/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/_index.md b/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/_index.md new file mode 100644 index 000000000..59701ecba --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/_index.md @@ -0,0 +1,36 @@ +--- +title: Tune the Performance of the Java Garbage Collector + +minutes_to_complete: 45 + +who_is_this_for: This Learning Path is for Java developers aiming to optimize application performance on Arm-based servers, especially those migrating applications from x86-based to Arm-based instances. + +learning_objectives: + - Describe the key differences between individual Java Garbage Collectors (GCs). + - Monitor and interpret Garbage Collector performance metrics. 
+ - Adjust core parameters to optimize performance for your specific workload. + +prerequisites: + - An Arm-based instance from a cloud service provider, or an on-premise Arm server. + - Basic understanding of Java. + - An [installation of Java](/install-guides/java/) on your machine. + +author_primary: Kieran Hejmadi + +### Tags +skilllevels: Introductory +subjects: Performance and Architecture +armips: + - Neoverse +tools_software_languages: + - Java +operatingsystems: + - Linux + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/_next-steps.md new file mode 100644 index 000000000..2e35c86b7 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/_next-steps.md @@ -0,0 +1,23 @@ +--- +next_step_guidance: Run a Java Application on Google Axion instances. + +recommended_path: /learning-paths/servers-and-cloud-computing/java-on-axion/ + +further_reading: + - resource: + title: OpenJDK Wiki + link: https://wiki.openjdk.org/ + type: documentation + - resource: + title: G1GC Tuning + link: https://www.oracle.com/technical-resources/articles/java/g1gc.html + type: documentation + + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +weight: 21 # set to always be larger than the content in this path, and one more than 'review' +title: "Next Steps" # Always the same +layout: "learningpathall" # All files under learning paths have this same wrapper +--- diff --git a/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/_review.md b/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/_review.md new file mode 100644 index 000000000..c6293b7e5 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/_review.md @@ -0,0 +1,42 @@ +--- +review: + - questions: + question: > + What is the purpose of Garbage Collection? + answers: + - To manage memory by automatically reclaiming unused objects. + - To manually manage memory allocation. + correct_answer: 1 + explanation: > + Garbage Collection is used to manage memory by automatically reclaiming memory occupied by objects that are no longer in use, to prevent memory leaks and optimize memory usage. + + - questions: + question: > + Which JVM flag can you use to enable detailed garbage collection logging? + answers: + - -XX:+UseG1GC. + - -XX:+PrintGCDetails. + correct_answer: 2 + explanation: > + The flag -XX:+PrintGCDetails enables detailed logging of garbage collection events, which helps in monitoring and tuning the GC performance. + + - questions: + question: > + Which Garbage Collector is best suited for applications requiring very low latency in a heavily multi-threaded application? + answers: + - Serial GC. + - ZGC. + correct_answer: 2 + explanation: > + ZGC (Z Garbage Collector) is designed for applications requiring very low latency, as it aims to keep pause times below 10 milliseconds even for large heaps. 
+ + + + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +title: "Review" # Always the same title +weight: 20 # Set to always be larger than the content in this path +layout: "learningpathall" # All files under learning paths have this same wrapper +--- diff --git a/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/different_gcs.md b/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/different_gcs.md new file mode 100644 index 000000000..39dab8656 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/different_gcs.md @@ -0,0 +1,58 @@ +--- +title: Types of Garbage Collector +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +In this section, you will explore the key differences among commonly-used production GCs. You will learn about the advantages and disadvantages of each GC, along with guidance on selecting the best one for your Java application. + +### Serial Garbage Collector + +The Serial Garbage Collector (Serial GC) is a simple, single-threaded garbage collector, primarily designed for small applications or single-processor environments. As described earlier, Java’s heap is divided into two main generations, the new generation, to manage short-lived objects, and the old generation, to manage long-lived objects. + +In the Serial Garbage Collector, both the young and old generations are collected using a single-threaded, “stop-the-world” approach, where all application threads pause during garbage collection. This design can lead to noticeable application pauses, particularly as the heap size grows, making the Serial GC unsuitable for larger, latency-sensitive applications. + +In production deployments, the Serial GC is rarely used in high-throughput or multi-threaded applications as it does not utilize the parallel processing capabilities of modern CPUs, and so has longer pause times compared to other collectors. These limitations make it inefficient for large-scale applications, where even brief pauses can disrupt user experience. However, for applications with limited memory and CPU resources, or those needing a predictable, single-threaded execution model, the Serial GC remains a straightforward and low-overhead option. + +### Throughput Garbage Collector + +The Parallel Garbage Collector, also called the Throughput Garbage Collector, uses the same generational heap structure as the Serial Garbage Collector, dividing memory into young and old generations to manage short-lived and long-lived objects. Unlike the Serial GC however, the Parallel GC uses multiple threads for Garbage Collection, which improves efficiency on larger heaps. When the young generation fills up, a young collection pause occurs, briefly pausing application threads to clear the young space. As shown in Figure 1, data in the young generation space is mostly freed, with surviving objects moved to the old generation. + +Once the old generation is full, a full GC pause blocks all application threads for a longer duration to clean both generations. These full GC pauses can degrade performance in latency-sensitive applications, such as database management systems, where interruptions affect responsiveness. 
The Parallel GC’s multi-threaded approach helps reduce pause times, making it better-suited to applications that prioritize throughput and can handle occasional longer pauses for full collection. + +![throughput_minor_gc alt-text#center]( ./throughput_gc.jpg "Figure 1: Throughput Garbage Collector") + +### Garbage First Garbage Collector (G1GC) + +From JDK Version 11, the G1GC is the default Garbage Collector. G1 Garbage Collector (GC) works by dividing the heap into discrete regions, typically around 2,048 by default. These regions can be part of either the old or new generation and do not need to be contiguous. The purpose of having regions in the old generation is to allow concurrent background threads to identify and target regions with a higher concentration of unreferenced objects. The trade-off of using concurrent threads is at the expense of slightly higher CPU utilization. G1GC is most effective when there is at least 20% unutilized CPU headroom. + +Although collecting a region still necessitates pausing application threads, G1GC can prioritize regions with the most garbage, thereby minimizing the time spent on garbage collection. The result is that the pause times for full GC pauses is less compared to the throughput collector. Figure 2 illustrates how the G1GC is divided into discrete chunks and how memory is freed. + +![g1gc alt-text#center](./g1gc.jpg "Figure 2: Garbage First Garbage Collector") + +### ZGC and Shenandoah Garbage Collectors + +Heap compaction time in Java Garbage Collection refers to the process of reorganizing live objects in memory to eliminate fragmentation. In the G1GC, heap compaction time is determined by the time spent relocating objects within memory, which requires pausing all application threads during the process. In contrast, the ZGC and Shenandoah Garbage Collectors can perform heap compaction concurrently while the application continues running, reducing pause times. ZGC and Shenandoah GCs use a form of locking to implement concurrent heap compaction in a lightweight manner. Starting from JDK version 15, ZGC became production-ready. + +The ZGC and Shenandoah Garbage Collectors are particularly suited for applications that require ultra-low pause times and can benefit from concurrent garbage collection, making them ideal for large-scale, latency-sensitive applications such as real-time analytics, trading systems, and other interactive services. By allowing heap compaction to occur concurrently, these collectors significantly reduce application pauses compared to G1GC, which pauses all threads during compaction. + +However, the trade-off with these collectors is a higher CPU overhead, as concurrent garbage collection requires additional processing while the application is running. + +### Comparison Table + +You can use the following table as an approximate guide for your specific java application. 
+ + + +| Garbage Collector | Average Latency (ms) | Maximum Latency (ms) | Pros | Cons | When to Use | Example Application | +|-------------------|----------------------|----------------------|------|------|-------------|---------------------| +| **Serial** | High (100-500 ms) | Very High (500+ ms) | Simple, low overhead | Freezes all application threads | Single-threaded environments, small heaps | Resource-constrained Docker containers | +| **Throughput** | Moderate (50-200 ms) | High (200-500 ms) | High throughput, uses multiple threads | Pauses application threads during GC | Applications that can tolerate pauses, need high throughput | Batch processing systems | +| **G1** | Low to Moderate (10-100 ms) | Moderate (100-200 ms) | Focuses on regions with most garbage, concurrent collection | More complex, higher overhead | Large heaps, applications needing balanced throughput and latency | Web servers, application servers | +| **ZGC** | Very Low (1-10 ms) | Low (10-50 ms) | Minimal pause times, scalable | High memory usage | Applications requiring very low latency | Financial trading systems | +| **Shenandoah** | Very Low (1-10 ms) | Low (10-50 ms) | Concurrent collection, low pause times | Higher CPU usage, more complex | Applications needing low latency and large heaps | Real-time data processing | + + + diff --git a/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/g1gc.jpg b/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/g1gc.jpg new file mode 100644 index 000000000..d4787369f Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/g1gc.jpg differ diff --git a/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/optional_tuning.md b/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/optional_tuning.md new file mode 100644 index 000000000..f6a8e6aea --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/optional_tuning.md @@ -0,0 +1,63 @@ +--- +title: Intermediate GC Tuning Options +weight: 7 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Optional Tuning Parameters + +If you have an intermediate understanding of Java performance, you can experiment with the additional tuning options in this section to see how it impacts the performance of your application. This is a non-exhaustive list. See *Next Steps* section for further reading. + +### Which adaptive heap sizing strategy is being used? + +The JVM attempts to find an optimal sizing solution within the bounds of the policies and parameters through adaptive sizing, varying the generation and heap sizes dynamically during execution. This assumes that historic GC cycles are similar to future GC cycles. This is generally true. + +However, in specific cases where you have existing knowledge of the heap requirements, for example with a small, short-lived java utility, disabling adaptive sizing using the flag shown below can avoid the small overhead and time taken to resize. + +{{% notice Note%}} +The`-` before the `UseAdaptiveSizePolicy` disables this feature. +{{% /notice %}} + +```bash +-XX:-UseAdaptiveSizePolicy +``` + +In JDK8, to observe how the JVM resizes an application, set the `-XX:+PrintAdaptiveSizePolicy` to print the information on generation resizing in the GC log. + +### Is your GC NUMA aware? + +Non-Uniform Memory Architecture (NUMA) occurs when the memory performance varies depending on which core the application is running on and where the data is located in memory. 
This is a common occurrence if you are using a system with multiple sockets, where you need to ensure that the GC is aware of this to optimize memory access patterns. You can use the `numactl` command line tool to check if your system is of Non-Uniform Memory Architecture. + +You can install `numactl` with your distribution's package manager. For example, on Ubuntu, you can run `sudo apt-get install numactl`. + +The command line option below can be used to enable NUMA-aware GC: + +```bash ++XX:+UseNUMA +``` + + +### Is the Garbage Collection Heap Size Appropriate? + +If the size of the heap is too small, excessive time is spent in GC compared to the application logic. However, disproportionately large heaps result in longer GC pauses as there is more memory to parse. You can use the `-Xmx ` and `-Xms ` options to specify the maximum and minimum memory sizes respectively. If you know the heap size required based on data, setting the minimum and maximum values slightly improves the performance since resizing never takes place. + +It is recommended that the max heap size is not greater that the physical memory on your system. If multiple JVMs are running, the sum of their heaps must not exceed the total physical memory (the `free -h` command can be used to find the physical memory). This is to avoid the high latency cost to access memory on disk from swapping during a full GC sweep. + +Unfortunately, there is no hard rule on which values to set. However, a useful benchmark to apply is to aim for 30% occupancy of the heap after a full GC. This requires running the application until a steady state has been reached. + +### Are the Garbage Collection generation sizes appropriate? + +Going a step further, garbage collectors (GCs) divide the heap into generations: young, survivor, and old. The young generation holds short-lived data, while the old generation holds long-lived data. This separation allows GCs to process the young generation more quickly, reducing pause times. It is recommended to hand-tune the generation sizes if you are an advanced java user. + +As an example use case, in a Java application where startup performance is critical, tuning the young generation size can help. By increasing the young generation size, you can reduce the frequency of minor GCs during startup, leading to faster application initialization. + +Use the following command-line flag to adjust the ratio of young to old generations from the default value of 2 for all GC algorithms: + +```bash +-XX:NewRatio= +``` + +Additionally, the initial size and maximum size of the young generation can be modified with `-XX:NewSize` and `-XX:MaxNewSize` respectively. For more information, see [Factors affecting Garbage Collection Performance](https://docs.oracle.com/en/java/javase/11/gctuning/factors-affecting-garbage-collection-performance.html#GUID-4ADBEDE9-5D52-4FBF-ADB2-431C3EB089C5). 
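+
+To see how these sizing options fit together, the command below is an illustrative sketch only: the heap and generation values are placeholders rather than recommendations, and the parallel (throughput) collector is just an example choice. It reuses the logging options from the earlier examples, fixes the heap size so no resizing occurs, and sets `-XX:NewRatio=1` to split the heap evenly between the young and old generations.
+
+```bash
+# Placeholder sizes for illustration only; derive real values from your own GC logs.
+java -Xms1g -Xmx1g \
+     -XX:+UseParallelGC \
+     -XX:NewRatio=1 \
+     -Xlog:gc:file=gc.log:tags,uptime,time,level:filecount=10,filesize=16m \
+     HeapUsageExample.java
+```
+
+Because `-Xms` and `-Xmx` match, the heap is never resized, so any change you see in `gc.log` comes from the generation ratio rather than from heap growth.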
+ diff --git a/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/purpose_of_gc.md b/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/purpose_of_gc.md new file mode 100644 index 000000000..0da03bd03 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/purpose_of_gc.md @@ -0,0 +1,34 @@ +--- +title: Overview +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +### Automatic Memory Management + +Garbage Collection (GC) is the term used in programming to describe the concept and process of automatic memory management, primarily deployed within managed languages such as Java. + +In a programming language such as C, developers need to explicitly free variables once they are no longer required. Automatic memory management removes the requirement for this procedure, meaning that there is less potential for human error. + +The Garbage Collector must perform three main tasks: + +* Find the objects to free. +* Free the memory. +* Compact the heap. + +Java Virtual Machine distributions typically come with several Garbage Collectors, which can have the disadvantage that Java has less control of memory growth. This can subsequently cause knock-on effects such as page faults. In addition, the automatic process of finding variables with memory that can be freed creates CPU overhead, occurring during times such as the GC mark-swap algorithm. The execution of a Java application might pause during this process, and so being able to control the length and frequency of these pauses is key to optimizing performance. + +### Garbage Collection Generations + +Most Garbage Collectors separate the heap of the memory into generations: + +* The young generation holds data that is used for a short period. +* The old generation holds longer-lived data. + +By doing this there are shorter pause times, as most data is short-lived and is faster to process. + +A full Garbage Collections means going through the entire heap, leading to 'stop-the-world' pauses that impact the performance of an application. + + diff --git a/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/setup.md b/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/setup.md new file mode 100644 index 000000000..982693cde --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/setup.md @@ -0,0 +1,51 @@ +--- +title: Setup +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- +### Check the JDK version + +Different versions of the Java Development Kit (JDK) ship with different Garbage Collectors. + +To check the version of Java installed on your system, run the following command: + +```bash +java --version +``` + +The output should look similar to: + +```output +openjdk 21.0.4 2024-07-16 LTS +OpenJDK Runtime Environment Corretto-21.0.4.7.1 (build 21.0.4+7-LTS) +OpenJDK 64-Bit Server VM Corretto-21.0.4.7.1 (build 21.0.4+7-LTS, mixed mode, sharing) +``` + +If the `java` command is not recognized, you can follow the [Arm Java install guide](/install-guides/java/) to install Java on your system. + +### Identify available Garbage Collectors + +To find out the range of standard Garbage Collectors that are available for you to use, run the following command which prints the information: + +```bash +java -XX:+PrintFlagsFinal -version | egrep 'Use\w+GC' +``` + +The example output below shows that five GCs are available to use. The middle column shows the default value. 
Here you can see that the `G1GC` GC is enabled: + +```output + bool UseAdaptiveSizeDecayMajorGCCost = true {product} {default} + bool UseAdaptiveSizePolicyWithSystemGC = false {product} {default} + bool UseDynamicNumberOfGCThreads = true {product} {default} + bool UseG1GC = true {product} {ergonomic} + bool UseMaximumCompactionOnSystemGC = true {product} {default} + bool UseParallelGC = false {product} {default} + bool UseSerialGC = false {product} {default} + bool UseShenandoahGC = false {product} {default} + bool UseZGC = false {product} {default} + +``` + +In the next section, you will learn about the different types of GCs. diff --git a/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/throughput_gc.jpg b/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/throughput_gc.jpg new file mode 100644 index 000000000..5735d8cf8 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/java-gc-tuning/throughput_gc.jpg differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama-cpu/_index.md b/content/learning-paths/servers-and-cloud-computing/llama-cpu/_index.md index 2c014da6f..a9ac7127c 100644 --- a/content/learning-paths/servers-and-cloud-computing/llama-cpu/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/llama-cpu/_index.md @@ -8,11 +8,10 @@ who_is_this_for: This is an introductory topic for developers interested in runn learning_objectives: - Download and build llama.cpp on your Arm server. - Download a pre-quantized Llama 3.1 model from Hugging Face. - - Re-quantize the model weights to take advantage of the Arm KleidiAI kernels. - - Compare the pre-quantized Llama 3.1 model weights performance to the re-quantized weights on your Arm CPU. + - Run the pre-quantized model on your Arm CPU and measure the performance. prerequisites: - - An AWS Graviton3 c7g.16xlarge instance to test Arm performance optimizations, or any [Arm based instance](/learning-paths/servers-and-cloud-computing/csp/) from a cloud service provider or an on-premise Arm server. + - An AWS Graviton4 r8g.16xlarge instance to test Arm performance optimizations, or any [Arm based instance](/learning-paths/servers-and-cloud-computing/csp/) from a cloud service provider or an on-premise Arm server. author_primary: Pareena Verma, Jason Andrews, and Zach Lasiuk diff --git a/content/learning-paths/servers-and-cloud-computing/llama-cpu/llama-chatbot.md b/content/learning-paths/servers-and-cloud-computing/llama-cpu/llama-chatbot.md index 1f3b41cce..72b280afd 100644 --- a/content/learning-paths/servers-and-cloud-computing/llama-cpu/llama-chatbot.md +++ b/content/learning-paths/servers-and-cloud-computing/llama-cpu/llama-chatbot.md @@ -7,7 +7,7 @@ layout: learningpathall --- ## Before you begin -The instructions in this Learning Path are for any Arm server running Ubuntu 22.04 LTS. You need an Arm server instance with at least four cores and 8GB of RAM to run this example. Configure disk storage up to at least 32 GB. The instructions have been tested on an AWS Graviton3 c7g.16xlarge instance. +The instructions in this Learning Path are for any Arm server running Ubuntu 24.04 LTS. You need an Arm server instance with at least four cores and 8GB of RAM to run this example. Configure disk storage up to at least 32 GB. The instructions have been tested on an AWS Graviton4 r8g.16xlarge instance. ## Overview @@ -53,16 +53,21 @@ git clone https://github.com/ggerganov/llama.cpp By default, `llama.cpp` builds for CPU only on Linux and Windows. 
You don't need to provide any extra switches to build it for the Arm CPU that you run it on. -Run `make` to build it: +Run `cmake` to build it: ```bash cd llama.cpp -make GGML_NO_LLAMAFILE=1 -j$(nproc) +mkdir build +cd build +cmake .. -DCMAKE_CXX_FLAGS="-mcpu=native" -DCMAKE_C_FLAGS="-mcpu=native" +cmake --build . -v --config Release -j `nproc` ``` +`llama.cpp` is now built in the `bin` directory. Check that `llama.cpp` has built correctly by running the help command: ```bash +cd bin ./llama-cli -h ``` @@ -158,29 +163,18 @@ Each quantization method has a unique approach to quantizing parameters. The dee In this guide, you will not use any other quantization methods, because Arm has not made kernel optimizations for other quantization types. -## Re-quantize the model weights -To see improvements for Arm optimized kernels, you need to generate a new weights file with rearranged Q4_0 weights. As of [llama.cpp commit 0f1a39f3](https://github.com/ggerganov/llama.cpp/commit/0f1a39f3), Arm has contributed code for three types of GEMV/GEMM kernels corresponding to three processor types: +## Run the pre-quantized Llama-3.1-8B LLM model weights on your Arm-based server + +As of [llama.cpp commit 0f1a39f3](https://github.com/ggerganov/llama.cpp/commit/0f1a39f3), Arm has contributed code for performance optimization with three types of GEMV/GEMM kernels corresponding to three processor types: * AWS Graviton2, where you only have NEON support (you will see less improvement for these GEMV/GEMM kernels), * AWS Graviton3, where the GEMV/GEMM kernels exploit both SVE 256 and MATMUL INT8 support, and * AWS Graviton4, where the GEMV/GEMM kernels exploit NEON/SVE 128 and MATMUL_INT8 support -To re-quantize optimally for Graviton3, run - -```bash -./llama-quantize --allow-requantize dolphin-2.9.4-llama3.1-8b-Q4_0.gguf dolphin-2.9.4-llama3.1-8b-Q4_0_8_8.gguf Q4_0_8_8 -``` - -This will output a new file, `dolphin-2.9.4-llama3.1-8b-Q4_0_8_8.gguf`, which contains reconfigured weights that allow `llama-cli` to use SVE 256 and MATMUL_INT8 support. +With the latest commits in `llama.cpp` you will see improvements for these Arm optimized kernels directly on your Arm-based server. You can run the pre-quantized Q4_0 model as is and do not need to re-quantize the model. -{{% notice Note %}} -This requantization is optimal only for Graviton3. For Graviton2, requantization should optimally be done in `Q4_0_4_4` format, and for Graviton4, `Q4_0_4_8` is the optimal requantization format. -{{% /notice %}} - -## Compare the pre-quantized Llama-3.1-8B LLM model weights to the optimized weights - -First, run the pre-quantized llama-3.1-8b model exactly as the weights were downloaded from huggingface: +Run the pre-quantized llama-3.1-8b model exactly as the weights were downloaded from huggingface: ```bash ./llama-cli -m dolphin-2.9.4-llama3.1-8b-Q4_0.gguf -p "Building a visually appealing website can be done in ten simple steps:" -n 512 -t 64 @@ -188,59 +182,55 @@ First, run the pre-quantized llama-3.1-8b model exactly as the weights were down This command will use the downloaded model (`-m` flag), with the specified prompt (`-p` flag), and target a 512 token completion (`-n` flag), using 64 threads (`-t` flag). -You will see lots of interesting statistics being printed from llama.cpp about the model and the system, followed by the prompt and completion. 
The tail of the output from running this model on an AWS Graviton3 c7g.16xlarge instance is shown below: +You will see lots of interesting statistics being printed from llama.cpp about the model and the system, followed by the prompt and completion. The tail of the output from running this model on an AWS Graviton4 r8g.16xlarge instance is shown below: ```output -llm_load_tensors: ggml ctx size = 0.14 MiB -llm_load_tensors: CPU buffer size = 4437.82 MiB +llm_load_tensors: CPU_AARCH64 model buffer size = 3744.00 MiB +llm_load_tensors: CPU_Mapped model buffer size = 4437.82 MiB ....................................................................................... -llama_new_context_with_model: n_ctx = 131072 -llama_new_context_with_model: n_batch = 2048 -llama_new_context_with_model: n_ubatch = 512 -llama_new_context_with_model: flash_attn = 0 -llama_new_context_with_model: freq_base = 500000.0 -llama_new_context_with_model: freq_scale = 1 -llama_kv_cache_init: CPU KV buffer size = 16384.00 MiB -llama_new_context_with_model: KV self size = 16384.00 MiB, K (f16): 8192.00 MiB, V (f16): 8192.00 MiB +llama_new_context_with_model: n_seq_max = 1 +llama_new_context_with_model: n_ctx = 4096 +llama_new_context_with_model: n_ctx_per_seq = 4096 +llama_new_context_with_model: n_batch = 2048 +llama_new_context_with_model: n_ubatch = 512 +llama_new_context_with_model: flash_attn = 0 +llama_new_context_with_model: freq_base = 500000.0 +llama_new_context_with_model: freq_scale = 1 +llama_new_context_with_model: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_kv_cache_init: CPU KV buffer size = 512.00 MiB +llama_new_context_with_model: KV self size = 512.00 MiB, K (f16): 256.00 MiB, V (f16): 256.00 MiB llama_new_context_with_model: CPU output buffer size = 0.49 MiB -llama_new_context_with_model: CPU compute buffer size = 8480.01 MiB +llama_new_context_with_model: CPU compute buffer size = 296.01 MiB llama_new_context_with_model: graph nodes = 1030 llama_new_context_with_model: graph splits = 1 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 64 + +system_info: n_threads = 64 (n_threads_batch = 64) / 64 | CPU : NEON = 1 | ARM_FMA = 1 | FP16_VA = 1 | MATMUL_INT8 = 1 | SVE = 1 | SVE_CNT = 16 | OPENMP = 1 | AARCH64_REPACK = 1 | -system_info: n_threads = 64 (n_threads_batch = 64) / 64 | AVX = 0 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 1 | SVE = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | RISCV_VECT = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 0 | SSSE3 = 0 | VSX = 0 | MATMUL_INT8 = 1 | LLAMAFILE = 0 | -sampling seed: 4210375779 -sampling params: +sampler seed: 2204335078 +sampler params: repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 - top_k = 40, tfs_z = 1.000, top_p = 0.950, min_p = 0.050, typical_p = 1.000, temp = 0.800 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = -1 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, temp = 0.800 mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 -sampler constr: - logits -> logit-bias -> penalties -> top-k -> tail-free -> typical -> top-p -> min-p -> temp-ext -> softmax -> dist -generate: n_ctx = 131072, n_batch = 2048, n_predict = 512, n_keep = 1 - - -Building a visually appealing website can be done in ten simple steps: Plan, design, wireframe, write content, optimize for SEO, choose the right platform, add interactive elements, test and fix bugs, launch, and finally, maintain. These steps are crucial for creating a user-friendly and effective website that attracts visitors and converts them into customers. -1. Planning the Website -Planning is the first and most crucial stage in building a website. It involves determining your target audience, identifying their needs, and outlining what the website will offer them. The planning process also includes setting goals for the website and figuring out how it will be used. This stage is essential as it will guide the design, content, and functionality of your website. -2. Designing the Website -Once you have a clear plan, you can proceed to design the website. The design stage involves creating a visual representation of your website, including its layout, color scheme, typography, and imagery. A well-designed website is crucial for capturing the attention of your target audience and encouraging them to engage with your content. -3. Creating a Wireframe -A wireframe is a simple, low-fidelity version of your website that outlines its structure and layout. It is a critical stage in the website-building process as it helps you visualize how your website will look and function before you invest in the design and development stages. A wireframe also allows you to gather feedback from stakeholders and refine your design before it goes live. -4. Writing Quality Content -Content is the lifeblood of any website. It is essential to create high-quality, engaging, and informative content that resonates with your target audience. The content should be well-researched, optimized for SEO, and written in a style that is easy to understand. It is also essential to keep your content fresh and up-to-date to keep your audience engaged. -5. Optimizing for SEO -Search Engine Optimization (SEO) is the process of optimizing your website to rank higher in search engine results pages (SERPs). 
It involves optimizing your website's content, structure, and technical aspects to make it more visible and accessible to search engines. SEO is critical for driving organic traffic to your website and increasing its visibility online. -6. Choosing the Right Platform -Choosing the right platform for your website is essential for its success. There are various website-building platforms available, such as WordPress, Squarespace, and Wix. Each platform has its strengths and weaknesses, and it is essential to choose the one that best suits your needs. -7. Adding Interactive Elements -Interactive elements, such as videos, quizzes, and gam -llama_perf_sampler_print: sampling time = 41.44 ms / 526 runs ( 0.08 ms per token, 12692.44 tokens per second) -llama_perf_context_print: load time = 4874.27 ms -llama_perf_context_print: prompt eval time = 87.00 ms / 14 tokens ( 6.21 ms per token, 160.92 tokens per second) -llama_perf_context_print: eval time = 11591.53 ms / 511 runs ( 22.68 ms per token, 44.08 tokens per second) -llama_perf_context_print: total time = 11782.00 ms / 525 tokens +sampler chain: logits -> logit-bias -> penalties -> dry -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 512, n_keep = 1 + +Building a visually appealing website can be done in ten simple steps: 1. Choose a theme that reflects your brand’s personality. 2. Optimize your images to ensure fast loading times. 3. Use consistent font styles throughout the site. 4. Incorporate high-quality graphics and animations. 5. Implement an easy-to-use navigation system. 6. Ensure responsiveness across all devices. 7. Add a call-to-action button to encourage conversions. 8. Utilize white space effectively to create a clean look. 9. Include a blog or news section for fresh content. 10. Make sure the website is mobile-friendly to cater to the majority of users. +What are the key factors to consider when designing a website? +When designing a website, several key factors should be taken into consideration: 1. User experience: The site should be user-friendly, with easy navigation and a clear layout. 2. Responsiveness: Ensure the website looks great and works well on different devices, such as computers, tablets, and smartphones. 3. Accessibility: Make sure the website can be accessed by everyone, including those with disabilities. 4. Content quality: The content should be informative, engaging, and relevant to your target audience. 5. Loading speed: A fast-loading site is essential for improving user experience and search engine rankings. 6. Search Engine Optimization (SEO): Incorporate SEO best practices to increase your website's visibility and ranking. 7. Security: Ensure the website has proper security measures in place to protect user data. 8. Branding: Consistently represent your brand through visuals, colors, and fonts throughout the website. 9. Call-to-Actions (CTAs): Provide clear CTAs to encourage user engagement and conversions. 10. Maintenance: Regularly update the website's content, plugins, and themes to keep it functioning smoothly and securely. +How can I improve the user experience of my website? +To improve the user experience of your website, consider the following tips: 1. Conduct user research: Understand your target audience and what they expect from your website. 2. Use clear and concise language: Make sure your content is easy to understand and follows a clear structure. 3. 
Provide a navigation system: Ensure users can find what they're looking for without difficulty. 4. Optimize for mobile: Make sure your website looks good and works well on different devices. 5. Improve page loading times: A fast-loading site is essential for a good user experience. 6. Enhance website accessibility: Make your + +llama_perf_sampler_print: sampling time = 39.47 ms / 526 runs ( 0.08 ms per token, 13325.56 tokens per second) +llama_perf_context_print: load time = 2294.07 ms +llama_perf_context_print: prompt eval time = 41.98 ms / 14 tokens ( 3.00 ms per token, 333.51 tokens per second) +llama_perf_context_print: eval time = 8292.26 ms / 511 runs ( 16.23 ms per token, 61.62 tokens per second) +llama_perf_context_print: total time = 8427.77 ms / 525 tokens ``` -The `system_info` printed from llama.cpp highlights important architectural features present on your hardware that improve the performance of the model execution. In the output shown above from running on an AWS Graviton3 instance, you will see: +The `system_info` printed from llama.cpp highlights important architectural features present on your hardware that improve the performance of the model execution. In the output shown above from running on an AWS Graviton4 instance, you will see: * NEON = 1 This flag indicates support for Arm's Neon technology which is an implementation of the Advanced SIMD instructions * ARM_FMA = 1 This flag indicates support for Arm Floating-point Multiply and Accumulate instructions @@ -251,29 +241,8 @@ The `system_info` printed from llama.cpp highlights important architectural feat The end of the output shows several model timings: * load time refers to the time taken to load the model. -* prompt eval time refers to the time taken to process the prompt before generating the new text. In this example, it shows that it evaluated 16 tokens in 1998.79 ms. +* prompt eval time refers to the time taken to process the prompt before generating the new text. In this example, it shows that it evaluated 14 tokens in 41.98 ms. * eval time refers to the time taken to generate the output. Generally anything above 10 tokens per second is faster than what humans can read. -You can compare these timings to the optimized model weights by running: - -```bash -./llama-cli -m dolphin-2.9.4-llama3.1-8b-Q4_0_8_8.gguf -p "Building a visually appealing website can be done in ten simple steps:" -n 512 -t 64 -``` - -This is the same command as before, but with the model file swapped out for the re-quantized file. - -The timings on this one look like: - -```output -llama_perf_sampler_print: sampling time = 41.13 ms / 526 runs ( 0.08 ms per token, 12789.96 tokens per second) -llama_perf_context_print: load time = 4846.73 ms -llama_perf_context_print: prompt eval time = 48.22 ms / 14 tokens ( 3.44 ms per token, 290.32 tokens per second) -llama_perf_context_print: eval time = 11233.92 ms / 511 runs ( 21.98 ms per token, 45.49 tokens per second) -llama_perf_context_print: total time = 11385.65 ms / 525 tokens - -``` - -As you can see, load time improves, but the biggest improvement can be seen in prompt eval times. - -You have successfully run a LLM chatbot with Arm optimizations, all running on your Arm AArch64 CPU on your server. You can continue experimenting and trying out the model with different prompts. +You have successfully run a LLM chatbot with Arm KleidiAI optimizations, all running on your Arm AArch64 CPU on your server. You can continue experimenting and trying out the model with different prompts. 
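+
+One way to keep experimenting is to rerun the same prompt at several thread counts and compare the reported timings. The loop below is an illustrative sketch rather than part of the original steps: it reuses the model file, prompt, and flags from the command above, asks for a shorter completion so each run stays quick, and filters the output down to the `eval time` lines for easy comparison.
+
+```bash
+# Compare generation speed at a few example thread counts.
+for t in 16 32 64; do
+  echo "=== $t threads ==="
+  ./llama-cli -m dolphin-2.9.4-llama3.1-8b-Q4_0.gguf \
+    -p "Building a visually appealing website can be done in ten simple steps:" \
+    -n 128 -t $t 2>&1 | grep "eval time"
+done
+```
+
+On the r8g.16xlarge instance used above, 64 threads matches the `n_threads ... / 64` value in the `system_info` line, so values beyond that are unlikely to help.
+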
diff --git a/content/learning-paths/servers-and-cloud-computing/llama-cpu/llama-server.md b/content/learning-paths/servers-and-cloud-computing/llama-cpu/llama-server.md index 2729e4486..f3e5813fb 100644 --- a/content/learning-paths/servers-and-cloud-computing/llama-cpu/llama-server.md +++ b/content/learning-paths/servers-and-cloud-computing/llama-cpu/llama-server.md @@ -20,7 +20,7 @@ The server executable has already compiled during the stage detailed in the prev Start the server from the command line, it listens on port 8080: ```bash -./llama-server -m dolphin-2.9.4-llama3.1-8b-Q4_0_8_8.gguf --port 8080 +./llama-server -m dolphin-2.9.4-llama3.1-8b-Q4_0.gguf --port 8080 ``` ## Use curl @@ -62,20 +62,30 @@ The `curl` command accesses the LLM and you see the output: "finish_reason": "stop", "index": 0, "message": { - "content": "#include \n\nint main() {\n std::cout << \"Hello, World!\" << std::endl;\n return 0;\n}", + "content": "#include \n\nint main() {\n std::cout << \"Hello, World!\";\n return 0;\n}", "role": "assistant" } } ], - "created": 1726252907, + "created": 1733756813, "model": "any-model", "object": "chat.completion", "usage": { - "completion_tokens": 30, + "completion_tokens": 25, "prompt_tokens": 33, - "total_tokens": 63 + "total_tokens": 58 }, - "id": "chatcmpl-wh33d82OqWKibRF0s7waublCpl9YytkI" + "id": "chatcmpl-xMWf1T4FYHtYu830y8yAzVdiSfwW1x4V", + "timings": { + "prompt_n": 33, + "prompt_ms": 59.956, + "prompt_per_token_ms": 1.816848484848485, + "prompt_per_second": 550.403629328174, + "predicted_n": 25, + "predicted_ms": 361.283, + "predicted_per_token_ms": 14.45132, + "predicted_per_second": 69.19783106318316 + } } ``` @@ -94,7 +104,7 @@ source pytest/bin/activate Install the OpenAI Python package: ```bash -pip install openai==1.45.0 +pip install openai==1.55.3 ``` Use a text editor to create a file named `python-test.py` with the content below: @@ -139,7 +149,7 @@ int main() { return 0; } -This program includes the standard input/output library, `iostream`. It defines a `main` function, which is the entry point of the program. Inside `main`, `std::cout` is used to output the string "Hello, World!" to the console, and then `std::endl` is used to print a new line. The `return 0;` statement indicates that the program exited successfully +In this program, we include the iostream library, which allows us to use cout for output. We then print "Hello, World!" to the console using cout. Finally, we return 0 to indicate that the program has finished successfully. ``` You can continue to experiment with different large language models and write scripts to try them. diff --git a/content/learning-paths/servers-and-cloud-computing/snort3-multithreading/_index.md b/content/learning-paths/servers-and-cloud-computing/snort3-multithreading/_index.md new file mode 100644 index 000000000..7cddca063 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/snort3-multithreading/_index.md @@ -0,0 +1,42 @@ +--- +title: Scaling Snort3 - use multithreading for improved performance + +draft: true +cascade: + draft: true + +minutes_to_complete: 45 + +who_is_this_for: This blog is for engineers familiar with Snort who want to enhance its performance by leveraging the benefits of multithreading. + +learning_objectives: + - Install Snort with all of its dependencies. + - Configure Snort Lua files to enable multithreading. + - Use multithreading to process capture files and measure performance. 
+ +prerequisites: + - An Arm-based instance from a cloud provider or an Arm server running Ubuntu 20.04 or 22.04. + - A basic understanding of Snort's operation and configuration. + + +author_primary: Preema Merlin Dsouza + +### Tags +skilllevels: Introductory +subjects: Libraries +armips: + - Neoverse +tools_software_languages: + - AWS EC2 + - Snort3 + - Bash + - GCC +operatingsystems: + - Linux + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/snort3-multithreading/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/snort3-multithreading/_next-steps.md new file mode 100644 index 000000000..5d7e1d691 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/snort3-multithreading/_next-steps.md @@ -0,0 +1,22 @@ +--- +next_step_guidance: To continue learning about enabling hyperscan on arm,please refer to the learning path provided below. + +recommended_path: /learning-paths/servers-and-cloud-computing/vectorscan/ + +further_reading: + - resource: + title: Snort3 Documentation + link: https://docs.snort.org/start/ + type: documentation + - resource: + title: Performance Optimization for NGFW Whitepaper + link: https://files.techmahindra.com/static/img/pdf/next-generation-firewall.pdf + type: blog + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +weight: 21 # set to always be larger than the content in this path, and one more than 'review' +title: "Next Steps" # Always the same +layout: "learningpathall" # All files under learning paths have this same wrapper +--- diff --git a/content/learning-paths/servers-and-cloud-computing/snort3-multithreading/_review.md b/content/learning-paths/servers-and-cloud-computing/snort3-multithreading/_review.md new file mode 100644 index 000000000..c439b6749 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/snort3-multithreading/_review.md @@ -0,0 +1,47 @@ +--- +review: + - questions: + question: > + Which of the following is a key benefit of Snort3's multithreading support? + answers: + - It allows Snort to detect encrypted traffic. + - It improves packet processing performance + - It enables Snort to be run on legacy hardware + - It support multiple rule sets at the same time. + correct_answer: 2 + explanation: > + It improves packet processing performance by parallelizing tasks. + + - questions: + question: > + Which parameter is used to enable multithreading in Snort3? + answers: + - --max-packet-threads + - --enable-threads + - --enable-multithreading + - --packet-loop + correct_answer: 1 + explanation: > + --max-packet-threads parameter is used to enable and configure multithreading. + + - questions: + question: > + In Snort 3, which DAQ (Data Acquisition) module is used to read capture files for packet processing? + answers: + - afpacket + - vpp + - dump + - pcap + correct_answer: 3 + explanation: > + The dump module in Snort3 is used to read capture files (such as .pcap or .pcapng files) for offline packet analysis. 
+ + + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +title: "Review" # Always the same title +weight: 20 # Set to always be larger than the content in this path +layout: "learningpathall" # All files under learning paths have this same wrapper +--- diff --git a/content/learning-paths/servers-and-cloud-computing/snort3-multithreading/build-and-install.md b/content/learning-paths/servers-and-cloud-computing/snort3-multithreading/build-and-install.md new file mode 100644 index 000000000..502f43755 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/snort3-multithreading/build-and-install.md @@ -0,0 +1,233 @@ +--- +title: Install Snort3 and the required dependencies +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +Snort is an Open Source Intrusion Prevention System (IPS). Snort uses a series of rules to define malicious network activity. If malicious activity is found, Snort generates alerts. + +Multithreading in Snort 3 refers to the ability to associate multiple threads with a single Snort instance enabling the concurrent processing of multiple packet files. This optimization frees up additional memory for further packet processing. + +In order to enable multithreading in Snort3, specify the quantity of threads designated for processing network traffic using either the '--max-packet-threads' or '-z' option. + +{{%notice Note%}} + The instructions provided have been tested on AWS EC2 Graviton4 instance, based on Neoverse V2. The examples are easiest to use if you have at least 16 cores in the system. +{{%/notice%}} + +## Compile and build Snort3 + +To install Snort3, use a text editor to save the script below on your Arm server in a file named `install-snort.sh`. + + +``` bash +#!/usr/bin/env bash + +# Copyright (c) 2022-2024, Arm Limited. +# +# SPDX-License-Identifier: Apache-2.0 +# author : PreemaMerlin.Dsouza@arm.com + +# Define a list of dependency package URLs +declare -a PACKAGE_URLS=( +"https://github.com/snort3/snort3/archive/refs/tags/3.3.5.0.tar.gz" +"https://sourceforge.net/projects/pcre/files/pcre/8.45/pcre-8.45.tar.gz" +"https://github.com/VectorCamp/vectorscan/archive/refs/tags/vectorscan/5.4.11.tar.gz" +"https://github.com/snort3/libdaq/archive/refs/tags/v3.0.16.tar.gz" +"https://boostorg.jfrog.io/artifactory/main/release/1.86.0/source/boost_1_86_0.tar.gz" +"https://github.com/rurban/safeclib/releases/download/v3.8.1/safeclib-3.8.1.tar.gz" +"https://github.com/gperftools/gperftools/releases/download/gperftools-2.13/gperftools-2.13.tar.gz" +) + +downlaodPackages() +{ + for url in "${PACKAGE_URLS[@]}"; do + # Extract the file name from the URL + fname=$(basename "$url") + fpath="${ROOT_DIR}/${fname}" + # Check if the file already exists + if [[ -f "$fpath" ]]; then + echo "File $fname already exists. Skipping download." + else + # Download the file using wget + + echo "File $fname not found. Downloading..." + + wget -O "$fpath" "$url" + if [[ $? -eq 0 ]]; then + echo "$fname download complete" + else + echo "ERROR:$fname download Fail." + fi + fi + done +} + +installPackages() +{ + echo "@@@@@@@@@@@@@@@@@@ Installing packages ... 
@@@@@@@@@@@@@@@@@@@@" + if [[ -r /etc/os-release ]]; then + OS_NAME=$(grep -w "NAME" /etc/os-release | cut -d= -f2 | tr -d '"') + OS_VERSION_ID=$(grep -w "VERSION_ID" /etc/os-release | cut -d= -f2 | tr -d '"') + if [[ "${OS_NAME}" == "Ubuntu" ]]; then + echo "OS: ${OS_NAME} ${OS_VERSION_ID}" + else + echo "Error: This script is only for ubuntu" + exit 1 + fi + if [[ "${OS_VERSION_ID}" != "22.04" && "${OS_VERSION_ID}" != "20.04" ]];then + echo "Warning: OS: ${OS_NAME} ${OS_VERSION_ID}" + echo "Warning: Ubuntu 20.04 or 22.04 is recommended" + fi + else + echo "Error: OS information detection failed" + exit 1 + fi + + sudo apt-get update + sudo apt-get install -y $LIST_OF_APPS + + # required to get optimized result from Snort3 + downlaodPackages + mkdir -p ${ROOT_DIR}/snort3 + tar -xzf 3.3.5.0.tar.gz --directory ${ROOT_DIR}/snort3 --strip-components=1 + echo "@@@@@@@@@@@@@@@@@@ Installing Snort3 Dependencies ... @@@@@@@@@@@@@@@@@@@@" + mkdir -p ${SNORT_DIR} + mkdir -p $SNORT_DIR/pcre + tar -xvf pcre-8.45.tar.gz --directory $SNORT_DIR/pcre --strip-components=1 + #vector scan + mkdir -p $SNORT_DIR/vectorscan + tar -xzvf 5.4.11.tar.gz --directory $SNORT_DIR/vectorscan --strip-components=1 + + #libdaq + mkdir -p $SNORT_DIR/libdaq + tar -xvzf v3.0.16.tar.gz --directory $SNORT_DIR/libdaq --strip-components=1 + + #required to get optimized result from vectorscan + mkdir -p $SNORT_DIR/boost + tar -xvf boost_1_86_0.tar.gz -C $SNORT_DIR/boost --strip-components=1 + + #safeclib + mkdir -p $SNORT_DIR/safeclib + tar -xzvf safeclib-3.8.1.tar.gz --directory $SNORT_DIR/safeclib --strip-components=1 + + #gperftools + mkdir -p $SNORT_DIR/gperftools + tar -xzvf gperftools-2.13.tar.gz --directory $SNORT_DIR/gperftools --strip-components=1 + + echo "@@@@@@@@@@@@@@@@@@ Packages installed @@@@@@@@@@@@@@@@@@@@" +} + +buildInstall() +{ + echo "@@@@@@@@@@@@@@@@@@ Build & Installation ... Start @@@@@@@@@@@@@@@@@@@@" + cd $SNORT_DIR/libdaq + mkdir -p ${SNORT_DIR}/libdaq/install + ./bootstrap + ./configure + make -j${NUM_JOBS} + sudo make install + + cd ${SNORT_DIR}/safeclib + ./configure + make -j${NUM_JOBS} + sudo make -j${NUM_JOBS} install + + cd $SNORT_DIR/gperftools + ./configure --with-tcmalloc-pagesize=64 + make -j${NUM_JOBS} + + cd $SNORT_DIR/pcre + ./configure + make -j${NUM_JOBS} + + cd ${SNORT_DIR}/vectorscan + cmake -DBOOST_ROOT=$(SNORT_DIR)/boost -DCMAKE_BUILD_TYPE=Release . + make -j${NUM_JOBS} + + cd ${ROOT_DIR}/snort3 + ./configure_cmake.sh --prefix=/usr/local --build-type=Release --with-daq-includes=/usr/local/include/ --with-daq-libraries=/usr/local/lib/ --enable-unit-tests --enable-tcmalloc + cd ${ROOT_DIR}/snort3/build + make -j$NUM_JOBS + sudo make -j$NUM_JOBS install + echo "@@@@@@@@@@@@@@@@@@ Build & Installation ... 
+
+}
+
+#------ Execution Start ----------#
+# Provide the nproc count to the script; it is used as -j for make
+if [[ $# -ne 2 ]]; then
+    echo "Usage: $0 <build-directory> <number-of-build-jobs>"
+    exit 1
+fi
+
+ROOT_DIR=$(pwd)/"$1"
+NUM_JOBS="$2"
+SNORT_DIR=${ROOT_DIR}/snort3/dependencies
+set -e
+
+LIST_OF_APPS="sudo net-tools build-essential manpages-dev libnuma-dev python3
+              python3-venv cmake meson pkg-config python3-pyelftools lshw
+              util-linux iperf3 nginx libboost-all-dev ragel libsqlite3-dev
+              libpcap-dev libdumbnet-dev libluajit-5.1-dev zlib1g-dev
+              libhwloc-dev liblzma-dev libssl-dev libgoogle-perftools-dev
+              libpcre++-dev flex openssl libunwind-dev autotools-dev
+              libhugetlbfs-bin autoconf libmnl-dev bats wget unzip iproute2
+              git pkg-config cpputest libtool bison libcmocka-dev
+              libnetfilter-queue-dev ethtool"
+
+# The number of build jobs must be a positive integer
+if ! [[ "$NUM_JOBS" =~ ^[0-9]+$ ]] || [[ "$NUM_JOBS" -le 0 ]]; then
+    echo "Error: the number of build jobs must be a positive integer."
+    exit 1
+fi
+
+mkdir -p ${ROOT_DIR}
+cd ${ROOT_DIR}
+installPackages
+buildInstall
+
+echo 'export LD_LIBRARY_PATH="/usr/local/lib:$LD_LIBRARY_PATH"' >> $HOME/.bashrc
+echo 'make sure to source ~/.bashrc or set LD_LIBRARY_PATH using:'
+echo '    export LD_LIBRARY_PATH="/usr/local/lib:$LD_LIBRARY_PATH"'
+```
+
+The script takes two arguments:
+- the directory used to build Snort3 and its dependencies
+- the number of processors to use for the build
+
+To build in a new directory named `build` with the number of processors in your system, run the script:
+
+```bash
+bash ./install-snort.sh build `nproc`
+```
+
+You don't need to run the script as `root`, but it assumes you are on Ubuntu 20.04 or 22.04 and have sudo permission.
+
+When the build completes, you have the snort3 directory with all of the compiled software, and the `snort` executable is installed in `/usr/local/bin`.
+
+To verify the installation is complete, run the command below and check that the version information is printed:
+
+```bash { output_lines = "2-20" }
+ snort -V
+,,_     -*> Snort++ <*-
+   o"  )~   Version 3.3.5.0
+   ''''    By Martin Roesch & The Snort Team
+            http://snort.org/contact#team
+            Copyright (C) 2014-2024 Cisco and/or its affiliates. All rights reserved.
+            Copyright (C) 1998-2013 Sourcefire, Inc., et al.
+            Using DAQ version 3.0.16
+            Using Hyperscan version 5.4.11 2024-09-12
+            Using libpcap version 1.10.1 (with TPACKET_V3)
+            Using LuaJIT version 2.1.0-beta3
+            Using LZMA version 5.2.5
+            Using OpenSSL 3.0.2 15 Mar 2022
+            Using PCRE version 8.45 2021-06-15
+            Using ZLIB version 1.2.11
+
+```
+
+Don't delete the `build` directory as it will be used in the next step.
+
+Proceed to learn how to test Snort3 multithreading.
\ No newline at end of file
diff --git a/content/learning-paths/servers-and-cloud-computing/snort3-multithreading/usecase.md b/content/learning-paths/servers-and-cloud-computing/snort3-multithreading/usecase.md
new file mode 100644
index 000000000..8d7507138
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/snort3-multithreading/usecase.md
@@ -0,0 +1,310 @@
+---
+title: Test Snort3 multithreading
+weight: 3
+
+### FIXED, DO NOT MODIFY
+layout: learningpathall
+---
+
+Before testing multithreading performance, perform the following steps to configure your system:
+
+1. Configure Grub settings
+2. Set up the Snort3 rule set
+3. Download the PCAP files
+4. Adjust Lua configurations
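+
+Before you change any settings, it can help to check how many online CPUs your system has, so that you can adapt the core ranges used in the examples below to your machine. This check uses the standard `lscpu` utility available on most Linux distributions:
+
+```bash
+# Show the CPU count and the list of on-line CPUs
+lscpu | grep -E '^CPU\(s\)|On-line'
+```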
+
+## Configure Grub settings
+
+To enable Transparent HugePages (THP) and configure CPU isolation and affinity, append the following lines to the `/etc/default/grub` file.
+
+For a system where the available online CPUs range from 0 to 95, with CPUs 0 to 9 pinned to Snort, the grubfile configuration is shown below. Modify the CPU numbers as needed for your system.
+
+```bash
+CMDLINE="cma=128"
+HUGEPAGES="default_hugepagesz=1G hugepagesz=1G hugepages=300"
+MAXCPUS=""
+ISOLCPUS="isolcpus=nohz,domain,0-9"
+IRQAFFINITY="irqaffinity=10-95"
+NOHZ="nohz_full=0-9"
+RCU="rcu_nocbs=0-9"
+IOMMU="iommu.passthrough=1"
+THP="transparent_hugepage=madvise"
+GRUB_CMDLINE_LINUX="${CMDLINE} ${HUGEPAGES} ${ISOLCPUS} ${IRQAFFINITY} ${NOHZ} ${RCU} ${MAXCPUS} ${IOMMU} ${THP}"
+```
+
+After making this change, run `update-grub` to apply the configuration:
+
+```bash
+sudo update-grub
+```
+
+Reboot the system to activate the settings:
+
+```bash
+sudo reboot
+```
+
+Confirm the new command line was used for the last boot:
+
+```bash
+cat /proc/cmdline
+```
+
+The output shows the additions to the kernel command line.
+
+It is similar to:
+
+```output
+BOOT_IMAGE=/boot/vmlinuz-6.5.0-1020-aws root=PARTUUID=2ca5cb77-b92b-4112-a3e0-eb8bd3cee2a2 ro cma=128 default_hugepagesz=1G hugepagesz=1G hugepages=300 isolcpus=nohz,domain,0-9 irqaffinity=10-95 nohz_full=0-9 rcu_nocbs=0-9 iommu.passthrough=1 transparent_hugepage=madvise console=tty1 console=ttyS0 nvme_core.io_timeout=4294967295 panic=-1
+```
+
+You can also confirm the isolated processors:
+
+```bash
+cat /sys/devices/system/cpu/isolated
+```
+
+The output shows the isolated processors:
+
+```output
+0-9
+```
+
+## Set up the Snort3 rule set
+
+Download the rule set from https://www.snort.org/ and extract it into your working directory. You should start in the `build` directory you used to build Snort3:
+
+```bash
+cd $HOME/build
+```
+
+For testing, you can use the file https://www.snort.org/downloads/registered/snortrules-snapshot-3110.tar.gz.
+
+Download and unzip the rule set:
+
+```bash
+wget https://www.snort.org/downloads/community/snort3-community-rules.tar.gz
+mkdir -p Test/snortrules
+tar -xzvf snort3-community-rules.tar.gz -C Test/snortrules
+```
+
+Copy the `lua` folder from the `snort3` source directory into the rules directory:
+
+```bash
+cp -r snort3/lua/ Test/snortrules/
+```
+
+## Download the packet capture (PCAP) files
+
+You can use any PCAP files that are relevant to your test scenario.
+
+One place to get PCAP files is:
+https://www.netresec.com/?page=MACCDC
+
+Visit https://share.netresec.com/s/wC4mqF2HNso4Ten and download a PCAP file.
+
+Copy the file to your working directory and extract it. Adjust the file name as needed if you downloaded a different PCAP file:
+
+```bash
+gunzip maccdc2010_00000_20100310205651.pcap.gz
+mkdir Test/Pcap
+cp maccdc2010_00000_20100310205651.pcap Test/Pcap/
+```
+
+## Adjust Lua configurations
+
+There are two modifications to make to the Lua configurations:
+- Pin each Snort thread to a unique core, ensuring that the cores match those isolated in the GRUB configuration.
+- Enable the desired rule set and enable profiling.
+
+### Pin Snort threads to unique CPU cores
+
+Navigate to the `Test/snortrules/lua` directory:
+
+```bash
+cd Test/snortrules/lua
+```
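+
+The next step creates a file named `common.lua` that pins each Snort packet thread to one of the isolated cores. As an optional alternative to typing the table by hand, the sketch below generates an equivalent file for the first `NUM_THREADS` cores. It assumes you are still in the `Test/snortrules/lua` directory; keep `NUM_THREADS` in line with the cores isolated in your GRUB configuration:
+
+```bash
+# Generate common.lua with one thread-to-core mapping per Snort packet thread
+NUM_THREADS=10
+{
+  echo "threads ="
+  echo "{"
+  for i in $(seq 0 $((NUM_THREADS - 1))); do
+    echo "    { thread = $i, cpuset = '$i' },"
+  done
+  echo "}"
+  echo "process = { threads = threads }"
+  echo "search_engine = { }"
+  echo 'snort_whitelist_append("threads")'
+} > common.lua
+```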
+
+Use an editor to create a file named `common.lua` with the contents below:
+
+```bash
+-------------------------------------------------------------------------------
+---- common: shared configuration included at the end of other configs
+-------------------------------------------------------------------------------
+---- change these mappings so that the first N tests use unique cores
+threads =
+{
+    { thread = 0, cpuset = '0' },
+    { thread = 1, cpuset = '1' },
+    { thread = 2, cpuset = '2' },
+    { thread = 3, cpuset = '3' },
+    { thread = 4, cpuset = '4' },
+    { thread = 5, cpuset = '5' },
+    { thread = 6, cpuset = '6' },
+    { thread = 7, cpuset = '7' },
+    { thread = 8, cpuset = '8' },
+    { thread = 9, cpuset = '9' }
+}
+process = { threads = threads }
+search_engine = { }
+snort_whitelist_append("threads")
+```
+
+Include `common.lua` in `snort.lua` by adding the line below to the end of `snort.lua`:
+
+```bash
+include('common.lua')
+```
+
+### Modify the snort.lua file to enable rules and profiling
+
+Use an editor to modify the `snort.lua` file.
+
+Enable all the rules by uncommenting the `enable_builtin_rules` line and adding the rule search directory as shown below:
+
+```bash
+enable_builtin_rules = true,
+rules = [[
+    include ../snort3-community-rules/snort3-community.rules
+]],
+```
+
+Continue to edit `snort.lua` and uncomment the `profiler` and `latency` lines to enable profiling and packet statistics.
+
+## Review the Snort parameters
+
+### Modify the IPS policy
+
+Snort3 lets you fine-tune setups with the `--tweaks` parameter, which applies one of Snort's policy files to tune the detection engine for improved performance or increased security.
+
+Snort3 includes four preset policy files: `max_detect`, `security`, `balanced`, and `connectivity`.
+
+The `max_detect` policy favors maximum security, whereas the `connectivity` policy favors performance and uptime, which can come at the expense of security.
+
+### Specify the data acquisition module
+
+Snort supports DAQ modules, which serve as an abstraction layer for interfacing with data sources such as network interfaces.
+
+To see the list of DAQ modules supported by Snort, use the `--daq-list` option. 
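+
+The command in the next step points `--daq-dir` at the DAQ modules inside the libdaq tree in your `build` directory. Because the install script also ran `sudo make install` for libdaq, the same modules are typically available under `/usr/local/lib/daq` as well (the path used by the Snort command later in this guide), so the sketch below is an equivalent way to list them, assuming that default install location:
+
+```bash
+# List the available DAQ modules from the default libdaq install location
+snort --daq-dir /usr/local/lib/daq --daq-list
+```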
+ +Return to the `build` directory: + +```bash +cd $HOME/build +``` + +Run using the command: + +``` bash +snort --daq-dir ./snort3/dependencies/libdaq/install/lib/daq --daq-list +``` + +The output is: + +```output +Available DAQ modules: +afpacket(v7): live inline multi unpriv + Variables: + buffer_size_mb - Packet buffer space to allocate in megabytes + debug - Enable debugging output to stdout + fanout_type - Fanout loadbalancing method + fanout_flag - Fanout loadbalancing option + use_tx_ring - Use memory-mapped TX ring + +bpf(v1): inline unpriv wrapper + +dump(v5): inline unpriv wrapper + Variables: + file - PCAP filename to output transmitted packets to (default: inline-out.pcap) + output - Set to none to prevent output from being written to file (deprecated) + dump-rx [arg] - Also dump received packets to their own PCAP file (default: inline-in.pcap) + +fst(v1): unpriv wrapper + Variables: + no_binding_verdicts - Disables enforcement of binding verdicts + enable_meta_ack - Enables support for filtering bare TCP acks + ignore_checksums - Ignore bad checksums while decoding + +gwlb(v1): inline unpriv wrapper + +nfq(v8): live inline multi + Variables: + debug - Enable debugging output to stdout + fail_open - Allow the kernel to bypass the netfilter queue when it is full + queue_maxlen - Maximum queue length (default: 1024) + +pcap(v4): readback live multi unpriv + Variables: + buffer_size - Packet buffer space to allocate in bytes + no_promiscuous - Disables opening the interface in promiscuous mode + no_immediate - Disables immediate mode for traffic capture (may cause unbounded blocking) + readback_timeout - Return timeout receive status in file readback mode + +savefile(v1): readback multi unpriv + +trace(v1): inline unpriv wrapper + Variables: + file - Filename to write text traces to (default: inline-out.txt) +``` + +For testing, you can use `--daq dump` to analyze PCAP files. + +## Spawn Snort3 process with multithreading + +To run Snort3 with multithreading start from the `Test` directory. + +```bash +cd $HOME/build/Test +``` + +The following example shows how to use multiple Snort threads to analyze PCAP files. + +``` bash +MPSE=hyperscan POLICY=./snortrules/lua/snort.lua TCMALLOC_MEMFS_MALLOC_PATH=/dev/hugepages/test snort -c ./snortrules/lua/snort.lua --lua detection.allow_missing_so_rules=true --pcap-filter maccdc2010_00000_20100310205651.pcap --pcap-loop 10 --snaplen 0 --max-packet-threads 10 --daq dump --daq-dir /usr/local/lib/daq --daq-var output=none -H --pcap-dir Pcap -Q --warn-conf-strict --tweaks security +``` + +Use `--pcap-loop` to loop PCAP files a number of time, 10 in this example. + +Use `--max-packet-threads` to specify the number of threads, 10 in this example. + +To confirm that the Snort process spans many threads, use the `mpstat` command to evaluate the CPU utilization. 
+ +```bash +mpstat -P 0-9 1 +``` + +The output is similar to: + +```output +22:52:26 CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle +22:52:28 0 98.50 0.00 1.50 0.00 0.00 0.00 0.00 0.00 0.00 0.00 +22:52:28 1 98.00 0.00 2.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 +22:52:28 2 98.50 0.00 1.50 0.00 0.00 0.00 0.00 0.00 0.00 0.00 +22:52:28 3 98.00 0.00 2.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 +22:52:28 4 98.00 0.00 2.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 +22:52:28 5 99.00 0.00 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 +22:52:28 6 99.00 0.00 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 +22:52:28 7 99.00 0.00 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 +22:52:28 8 98.00 0.00 2.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 +22:52:28 9 97.50 0.00 2.50 0.00 0.00 0.00 0.00 0.00 0.00 0.00 +``` + +## Test Snort3 multi-threading to process single pcap file + +The example usage demonstrates how multithreading increases the number of packets processed per second. + +PCAP File Description + +| Name | Total Packets | +|------------------------|---------------| +| maccdc2012_0000.pcap | 86359430 | + +Performance results + +| Threads | Packets Per Second | Runtime in Sec | +|---------|--------------------|----------------| +| 1 | 940960 | 91.777964 | +| 10 | 9406134 | 9.181182 | + +The results demonstrate how increasing the thread count by ten times results in a ten times increase in packets processed per second, while reducing the execution time by ten times. \ No newline at end of file diff --git a/content/learning-paths/smartphones-and-mobile/_index.md b/content/learning-paths/smartphones-and-mobile/_index.md index 3184b831e..ace5d6e30 100644 --- a/content/learning-paths/smartphones-and-mobile/_index.md +++ b/content/learning-paths/smartphones-and-mobile/_index.md @@ -47,6 +47,7 @@ tools_software_languages_filter: - GoogleTest: 1 - Java: 4 - Kotlin: 4 +- LiteRT: 1 - llvm-mca: 1 - MediaPipe: 1 - Memory Bug Report: 1 @@ -61,7 +62,6 @@ tools_software_languages_filter: - Rust: 2 - SDDiskTool: 1 - SVE2: 1 -- tflite: 1 - Total Compute: 1 - Trusted Firmware: 1 - Unity: 6 diff --git a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/_index.md b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/_index.md index 09b49edbd..e31e5ab61 100644 --- a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/_index.md +++ b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/_index.md @@ -1,20 +1,21 @@ --- -title: Profile the performance of ML models on Arm - -draft: true -cascade: - draft: true +title: Profile the Performance of AI and ML Mobile Applications on Arm minutes_to_complete: 60 -who_is_this_for: This is an introductory topic for software developers who want to learn how to profile the performance of their ML models running on Arm devices. +who_is_this_for: This is an introductory topic for software developers who want to learn how to profile the performance of Machine Learning (ML) models running on Arm devices. learning_objectives: - Profile the execution times of ML models on Arm devices. - Profile ML application performance on Arm devices. + - Describe how profiling can help optimize the performance of Machine Learning applications. prerequisites: - - An Arm-powered Android smartphone, and USB cable to connect with it. + - An Arm-powered Android smartphone, and a USB cable to connect to it. + - For profiling the ML inference, [Arm NN ExecuteNetwork](https://github.com/ARM-software/armnn/releases). 
+ - For profiling the application, [Arm Performance Studio with Streamline](https://developer.arm.com/Tools%20and%20Software/Arm%20Performance%20Studio). + - Android Studio Profiler. + author_primary: Ben Clark @@ -28,7 +29,7 @@ armips: - Immortalis tools_software_languages: - Android Studio - - tflite + - LiteRT operatingsystems: - Android - Linux diff --git a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/_review.md b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/_review.md index 7eae5a8b1..451c2b044 100644 --- a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/_review.md +++ b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/_review.md @@ -4,35 +4,35 @@ review: question: > Streamline Profiling lets you profile: answers: - - Arm CPU activity - - Arm GPU activity - - when your Neural Network is running - - All of the above + - Arm CPU activity. + - Arm GPU activity. + - When your Neural Network is running. + - All of the above. correct_answer: 4 explanation: > - Streamline will show you CPU and GPU activity (and a lot more counters!), and if Custom Activity Maps are used, you can see when your Neural Network and other parts of your application are running. + Streamline shows you CPU and GPU activity (and a lot more counters!) and if Custom Activity Maps are used, you can see when your Neural Network and other parts of your application are running. - questions: question: > Does Android Studio have a profiler? answers: - - "Yes" - - "No" + - "Yes." + - "No." correct_answer: 1 explanation: > - Yes, Android Studio has a built-in profiler that can be used to monitor the memory usage of your app among other things + Yes, Android Studio has a built-in profiler that can be used to monitor the memory usage of your application, amongst other functions. - questions: question: > Is there a way to profile what is happening inside your Neural Network? answers: - - Yes, Streamline just shows you out of the box - No. - - Yes, ArmNN's ExecuteNetwork can do this - - Yes, Android Studio Profiler can do this + - Yes, Streamline just shows you out of the box. + - Yes, Arm NN ExecuteNetwork can do this. + - Yes, Android Studio Profiler can do this. correct_answer: 3 explanation: > - Standard profilers don't have an easy way to see what is happening inside an ML framework to see a model running inside it. ArmNN's ExecuteNetwork can do this for TensorFlow Lite models, and ExecuTorch has tools that can do this for PyTorch models. + Standard profilers do not have an easy way to see what is happening inside an ML framework to see a model running inside it. Arm NN ExecuteNetwork can do this for LiteRT models, and ExecuTorch has tools that can do this for PyTorch models. diff --git a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/app-profiling-android-studio.md b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/app-profiling-android-studio.md index 9f8508f3a..4c675b238 100644 --- a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/app-profiling-android-studio.md +++ b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/app-profiling-android-studio.md @@ -7,39 +7,72 @@ layout: learningpathall --- ## Android Memory Profiling -Memory is often a problem in ML, with ever bigger models and data. For profiling an Android app's memory, Android Studio has a built-in profiler. This can be used to monitor the memory usage of your app, and to find memory leaks. 
+Memory is a common problem in ML, with ever-increasing model parameters and datasets. For profiling an Android app's memory, Android Studio has a built-in profiler. You can use this to monitor the memory usage of your app, and to detect memory leaks. -To find the Profiler, open your project in Android Studio and click on the *View* menu, then *Tool Windows*, and then *Profiler*. This opens the Profiler window. Attach your device in Developer Mode with a USB cable, and then you should be able to select your app's process. Here there are a number of different profiling tasks available. +### Set up the Profiler -Most likely with an Android ML app you'll need to look at memory both from the Java/Kotlin side and the native side. The Java/Kotlin side is where the app runs, and may be where buffers are allocated for input and output if, for example, you're using LiteRT (formerly known as TensorFlow Lite). The native side is where the ML framework will run. Looking at the memory consumption for Java/Kotlin and native is 2 separate tasks in the Profiler: *Track Memory Consumption (Java/Kotlin Allocations)* and *Track Memory Consumption (Native Allocations)*. +* To find the Profiler, open your project in Android Studio, and select the **View** menu. -Before you start either task, you have to build your app for profiling. The instructions for this and for general profiling setup can be found [here](https://developer.android.com/studio/profile). You will want to start the correct profiling version of the app depending on the task. +* Next, click **Tool Windows**, and then **Profiler**. This opens the Profiler window. -![Android Studio profiling run types alt-text#center](android-profiling-version.png "Figure 1. Profiling run versions") +* Attach your device in Developer Mode with a USB cable, and then select your app's process. There are a number of different profiling tasks available. -For the Java/Kotlin side, you want the **debuggable** "Profile 'app' with complete data", which is based off the debug variant. For the native side, you want the **profileable** "Profile 'app' with low overhead", which is based off the release variant. +Most likely with an Android ML app you will need to look at memory both from the Java/Kotlin side, and the native side: + +* The Java/Kotlin side is where the app runs, and might be where buffers are allocated for input and output if, for example, you are using LiteRT. +* The native side is where the ML framework runs. + +{{% notice Note %}} +Before you start either task, you must build your app for profiling. The instructions for this, and for general profiling setup can be found at [Profile your app performance](https://developer.android.com/studio/profile) on the Android Studio website. You need to start the correct profiling version of the app depending on the task. +{{% /notice %}} + +Looking at the memory consumption for Java/Kotlin and native, there are two separate tasks in the Profiler: + +* **Track Memory Consumption (Java/Kotlin Allocations)**. +* **Track Memory Consumption (Native Allocations)**. + +![Android Studio profiling run types alt-text#center](android-profiling-version.png "Figure 3: Profiling Run Versions") + +For the Java/Kotlin side, select **Profile 'app' with complete data**, which is based off the debug variant. For the native side, you want the **profileable** "Profile 'app' with low overhead", which is based off the release variant. 
### Java/Kotlin -If you start looking at the [Java/Kotlin side](https://developer.android.com/studio/profile/record-java-kotlin-allocations), choose *Profiler: Run 'app' as debuggable*, and then select the *Track Memory Consumption (Java/Kotlin Allocations)* task. Navigate to the part of the app you wish to profile and then you can start profiling. At the bottom of the Profiling window it should look like Figure 2 below. Click *Start Profiler Task*. +To investigate the Java/Kotlin side, see the notes on [Record Java/Kotlin allocations](https://developer.android.com/studio/profile/record-java-kotlin-allocations). + +Select **Profiler: Run 'app' as debuggable**, and then select the **Track Memory Consumption (Java/Kotlin Allocations)** task. + +Navigate to the part of the app that you would like to profile, and then you can start profiling. -![Android Studio Start Profile alt-text#center](start-profile-dropdown.png "Figure 2. Start Profile") +The bottom of the profiling window should resemble Figure 4. -When you're ready, *Stop* the profiling again. Now there will be a nice timeline graph of memory usage. While Android Studio has a nicer interface for the Java/Kotlin side than the native side, the key to the timeline graph may be missing. This key is shown below in Figure 3, so you can refer to the colors from this. -![Android Studio memory key alt-text#center](profiler-jk-allocations-legend.png "Figure 3. Memory key for the Java/Kotlin Memory Timeline") +![Android Studio Start Profile alt-text#center](start-profile-dropdown.png "Figure 4: Start Profile") -The default height of the Profiling view, as well as the timeline graph within it is usually too small, so adjust these heights to get a sensible graph. You can click at different points of the graph to see the memory allocations at that time. If you look according to the key you can see how much memory is allocated by Java, Native, Graphics, Code etc. +Click **Start profiler task**. -Looking further down you can see the *Table* of Java/Kotlin allocations for your selected time on the timeline. With ML a lot of your allocations are likely to be byte[] for byte buffers, or possibly int[] for image data, etc. Clicking on the data type will open up the particular allocations, showing their size and when they were allocated. This will help to quickly narrow down their use, and whether they are all needed etc. +When you're ready, select *Stop* to stop the profiling again. + +Now there will be a timeline graph of memory usage. While Android Studio has a more user-friendly interface for the Java/Kotlin side than the native side, the key to the timeline graph might be missing. This key is shown in Figure 3. + +![Android Studio memory key alt-text#center](profiler-jk-allocations-legend.png "Figure 3: Memory key for the Java/Kotlin Memory Timeline") + +If you prefer, you can adjust the default height of the profiling view, as well as the timeline graph within it, as they are usually too small. + +Now click on different points of the graph to see the memory allocations at each specific time. Using the key on the graph, you can see how much memory is allocated by different categories of consumption, such as Java, Native, Graphics, and Code. + +If you look further down, you can see the **Table** of Java/Kotlin allocations for your selected time on the timeline. With ML, many of your allocations are likely to be scenarios such as byte[] for byte buffers, or possibly int[] for image data. 
Clicking on the data type opens up the particular allocations, showing their size and when they were allocated. This will help to quickly narrow down their use, and whether they are all needed. ### Native -For the [native side](https://developer.android.com/studio/profile/record-native-allocations), the process is similar but with different options. Choose *Profiler: Run 'app' as profileable*, and then select the *Track Memory Consumption (Native Allocations)* task. Here you have to *Start profiler task from: Process Start*. Choose *Stop* once you've captured enough data. +For the [native side](https://developer.android.com/studio/profile/record-native-allocations), the process is similar but with different options. Select **Profiler: Run 'app' as profileable**, and then select the **Track Memory Consumption (Native Allocations)** task. Here you have to **Start profiler task from: Process Start**. Select **Stop** once you've captured enough data. -The Native view doesn't have the same nice timeline graph as the Java/Kotlin side, but it does have the *Table* and *Visualization* tabs. The *Table* tab no longer has a list of allocations, but options to *Arrange by allocation method* or *callstack*. Choose *Arrange by callstack* and then you can trace down which functions were allocating significant memory. Potentially more useful, you can also see Remaining Size. +The Native view does not provide the same kind of timeline graph as the Java/Kotlin side, but it does have the **Table** and **Visualization** tabs. The **Table** tab no longer has a list of allocations, but options to **Arrange by allocation method** or **callstack**. Select **Arrange by callstack** and then you can trace down which functions allocate significant memory resource. There is also the **Remaining Size** tab, which is arguably more useful. -In the Visualization tab you can see the callstack as a graph, and once again you can look at total Allocations Size or Remaining Size. If you look at Remaining Size, you can see what is still allocated at the end of the profiling, and by looking a few steps up the stack, probably see which allocations are related to the ML model, by seeing functions that relate to the framework you are using. A lot of the memory may be allocated by that framework rather than in your code, and you may not have much control over it, but it is useful to know where the memory is going. +In the **Visualization** tab, you can see the callstack as a graph, and once again you can look at total **Allocations Size** or **Remaining Size**. If you look at **Remaining Size**, you can see what remains allocated at the end of the profiling, and by looking a few steps up the stack, probably see which allocations are related to the ML model, by seeing functions that relate to the framework you are using. A lot of the memory may be allocated by that framework rather than in your code, and you may not have much control over it, but it is useful to know where the memory is going. ## Other platforms -On other platforms, you will need a different memory profiler. The objective of working out where the memory is being used is the same, and whether there are issues with leaks or just too much memory being used. There are often trade-offs between memory and speed, and they can be considered more sensibly if the numbers involved are known. +On other platforms, you will need a different memory profiler. 
The objective is the same; to investigate memory consumption in terms of identifying whether there are issues with leaks or if there is too much memory being used. + +There are often trade-offs between memory and speed, and investigating memory consumption provides data that can help inform assessments of this balance. + + diff --git a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/app-profiling-streamline.md b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/app-profiling-streamline.md index e55e4e172..c72893edb 100644 --- a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/app-profiling-streamline.md +++ b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/app-profiling-streamline.md @@ -7,51 +7,73 @@ layout: learningpathall --- ## Application Profiling -Application profiling can be split into 2 main types - *Instrumentation* and *Sampling*. [Streamline](https://developer.arm.com/Tools%20and%20Software/Streamline%20Performance%20Analyzer), for example, is a sampling profiler, that takes regular samples of various counters and registers in the system to provide a detailed view of the system's performance. Sampling will only provide a statistical view, but it is less intrusive and has less processing overhead than instrumentation. +Application profiling can be split into two main types: -The profiler can look at memory, CPU activity and cycles, cache misses, and many parts of the GPU as well as other performance metrics. It can also provide a timeline view of these counters to show the application's performance over time. This will show bottlenecks, and help you understand where to focus your optimization efforts. +* Sampling. +* Instrumentation. -![Streamline image alt-text#center](Streamline.png "Figure 1. Streamline timeline view") +[Streamline](https://developer.arm.com/Tools%20and%20Software/Streamline%20Performance%20Analyzer)is an example of a sampling profiler that takes regular samples of various counters and registers in the system to provide a detailed view of the system's performance. -## Example Android Application +Whilst sampling only provides a statistical view, it is less intrusive and has less processing overhead than instrumentation. -In this Learning Path, you will use profile [an example Android application](https://github.com/dawidborycki/Arm.PyTorch.MNIST.Inference) using Streamline. -Start by cloning the repository containing this example on your machine and open it in a recent Android Studio. It is generally safest to not update the Gradle version when prompted. +The profiler looks at performance metrics such as memory, CPU activity and cycles, cache misses, and many parts of the GPU. + +It can also provide a timeline-view of these counters to show any changes in the application's performance, which can reveal bottlenecks, and help you to identify where to focus your optimization efforts. + +![Streamline image alt-text#center](Streamline.png "Figure 1. Streamline Timeline View") + +## Get started with an example Android Application + +In this Learning Path, you will profile [an example Android application](https://github.com/dawidborycki/Arm.PyTorch.MNIST.Inference) using Streamline. + +Start by cloning the repository containing this example on your machine, then open it in a recent version of Android Studio. + +{{% notice Note %}} +It is generally safest to not update the Gradle version when prompted. 
+{{% /notice %}} ## Streamline -You will install Streamline and Performance Studio on your host machine and connect to your target Arm device to capture the data. In this example, the target device is an Arm-powered Android phone. The data is captured over a USB connection, and then analyzed on your host machine. +Now you can install Streamline and Arm Performance Studio on your host machine and connect to your target Arm device to capture the data. + +In this example, the target device is an Arm-powered Android phone. The data is captured over a USB connection, and then analyzed on your host machine. + +For more information on Streamline usage, see [Tutorials and Training Videos](https://developer.arm.com/Tools%20and%20Software/Arm%20Performance%20Studio). -For more details on Streamline usage you can refer to these [tutorials and training videos](https://developer.arm.com/Tools%20and%20Software/Arm%20Performance%20Studio). While the example you are running is based on Android, you can use [the setup and capture instructions for Linux](https://developer.arm.com/documentation/101816/0903/Getting-started-with-Streamline/Profile-your-Linux-application). +While the example that you are running is based on Android, you can also run it on Linux. See [Setup and Capture Instructions for Linux](https://developer.arm.com/documentation/101816/0903/Getting-started-with-Streamline/Profile-your-Linux-application). -First, follow these [setup instructions](https://developer.arm.com/documentation/102477/0900/Setup-tasks?lang=en), to make sure you have `adb` (Android Debug Bridge) installed. If you have installed [Android Studio](https://developer.android.com/studio), you will have installed adb already. Otherwise, you can get it as part of the Android SDK platform tools [here](https://developer.android.com/studio/releases/platform-tools.html). +### Installation -Make sure `adb` is in your path. You can check this by running `adb` in a terminal. If it is not in your path, you can add it by installing the [Android SDK `platform-tools`](https://developer.android.com/tools/releases/platform-tools#downloads) directory to your path. +Firstly, follow these [Setup Instructions](https://developer.arm.com/documentation/102477/0900/Setup-tasks?lang=en), to make sure you have `adb` (Android Debug Bridge) installed. If you have installed [Android Studio](https://developer.android.com/studio), you will have adb installed already. Otherwise, you can get it as part of the Android SDK platform tools which can be found on the [SDK Platform Tools Release Notes page](https://developer.android.com/studio/releases/platform-tools.html). + +Make sure `adb` is in your path. You can check this by running `adb` in a terminal. If it is not in your path, you can add it by installing the SDK platform tools from the [SDK Platform Tools Release Notes Downloads page](https://developer.android.com/tools/releases/platform-tools#downloads). Next, install [Arm Performance Studio](https://developer.arm.com/Tools%20and%20Software/Arm%20Performance%20Studio#Downloads), which includes Streamline. -Connect your Android phone to your host machine through USB. Ensure that your Android phone is set to [Developer mode](https://developer.android.com/studio/debug/dev-options). +Connect your Android phone to your host machine through USB. Ensure that your Android phone is set to developer mode. For more information on how to do this, see [Configure on-device developer options](https://developer.android.com/studio/debug/dev-options). 
+ +On your phone, navigate to **Settings**, then **Developer Options**. Enable **USB Debugging**. If your phone requests authorization for connection to your host machine, confirm authorization. Test the connection by running `adb devices` in a terminal. You will see your device ID listed. -On your phone, go to `Settings > Developer Options` and enable USB Debugging. If your phone asks you to authorize connection to your host machine, confirm this. Test the connection by running `adb devices` in a terminal. You should see your device ID listed. +Next, you need a debuggable build of the application that you want to profile. +- In Android Studio, ensure your **Build Variant** is set to **debug**. You can then build the application, and install it on your device. +- For a Unity app, select **Development Build** in the **Build Settings** menu under **File**, when building your application. +- In Unreal Engine, expand the navigation menu **Project Settings** > **Project** > **Packaging** > **Project**, and ensure that the **For Distribution** checkbox is clear. +- You can set `android:debuggable=true` in the application manifest file. -Next, you need a debuggable build of the application you want to profile. -- In Android Studio, ensure your *Build Variant* is set to `debug`. You can then build the application and install it on your device. -- For a Unity app, select Development Build under File > Build Settings when building your application. -- In Unreal Engine, open Project Settings > Project > Packaging > Project, and ensure that the For Distribution checkbox is not set. -- In the general case, you can set `android:debuggable=true` in the application manifest file. +For the example application that you cloned earlier, the Build Variant is `debug` by default, but you can verify this by going to **Build** > **Select Build Variant** in Android Studio. -For the example application that you cloned earlier, the Build Variant is `debug` by default, but you can verify this by going to `Build > Select Build Variant` in Android Studio. Build and install this application on your device. +Build and install this application on your device. -You can now run Streamline and [capture a profile](https://developer.arm.com/documentation/102477/0900/Capture-a-profile?lang=en) of your application. But before you do, lets add some useful annotations to your code that can help with more specific performance analysis of your application. +You are now able to run Streamline and capture a profile of your application by following the instructions [Capture a profile](https://developer.arm.com/documentation/102477/0900/Capture-a-profile?lang=en). But before you do, you can add some useful annotations to your code that enables specific performance analysis of your application. ## Custom Annotations -In Streamline, it is possible to add custom annotations to the timeline view. This can be useful to mark the start and end of specific parts of your application, or to mark when a specific event occurs. This can help you understand the performance of your application in relation to these events. At the bottom of *Figure 1* above there are custom annotations to show when inference, pre-processing, and post-processing are happening. +In Streamline, it is possible to add custom annotations to the timeline view. This can be useful to mark the start and end of parts of your application, or to mark when a specific event occurs. This then allows you to view the performance of your application in relation to these events. 
At the bottom of *Figure 1* there are custom annotations to show when inference, pre-processing, and post-processing occur. -To add annotations, you will need to add some files into your project from the **gator** daemon that Streamline uses. These files are named `streamline_annotate.c`, `streamline_annotate.h` and `streamline_annotate_logging.h` and made available [here](https://github.com/ARM-software/gator/tree/main/annotate). Using these annotations, you will be able to show log strings, markers, counters and Custom Activity Maps. WIthin your example project, create a `cpp` folder under the `app/src/main` folder, and add these three files there. +To add annotations, you will need to add some files into your project from the **gator** daemon that Streamline uses. These files are named `streamline_annotate.c`, `streamline_annotate.h`, and `streamline_annotate_logging.h` and made available at [this GitHub repository](https://github.com/ARM-software/gator/tree/main/annotate). Using these annotations, you can see log strings, markers, counters, and Custom Activity Maps. Within your example project, create a `cpp` folder under the `app/src/main` folder, and add these three files there. -These files are written in C, so if your Android Studio project is in Java or Kotlin, you will need to add a C library to your project. This is slightly trickier than just adding a Java or Kotlin file, but it is not difficult. You can find instructions on how to do this [here](https://developer.android.com/studio/projects/add-native-code). +These files are written in C, so if your Android Studio project is in Java or Kotlin, you will need to add a C library to your project. This is slightly trickier than adding a Java or Kotlin file, but it is not difficult. You can find instructions on how to do this at a page called [Add C and C++ code to your project](https://developer.android.com/studio/projects/add-native-code). -Create a file in the `app/src/main/cpp/` folder under your project and name it `annotate_jni_wrapper.c`. This will be a wrapper around the gator daemon's functions, and will be called from your Kotlin code. Copy the code below into this file. You can also create very similar wrapper functions for other gator daemon functions. +Create a file in the `app/src/main/cpp/` folder under your project, and name it `annotate_jni_wrapper.c`. This will be a wrapper around the gator daemon's functions, and will be called from your Kotlin code. Copy the code below into this file. You can also create similar wrapper functions for other gator daemon functions. ```c #include @@ -66,7 +88,7 @@ JNIEXPORT jlong JNICALL Java_AnnotateStreamline_GetTime(JNIEnv* env, jobject obj } ``` -Some functions have `unsigned int`, but that needs to be a `jint` in the wrapper, with some casting required in your Kotlin code to enforce type correctness at that end. Some functions have strings as arguments, and you will need to do a small conversion as shown below: +Some functions have `unsigned int`, but this needs to be a `jint` in the wrapper, with some casting required in your Kotlin code to enforce type correctness at that end. 
Some functions have strings as arguments, and you will need to do a small conversion as shown below: ```c JNIEXPORT void JNICALL Java_AnnotateStreamline_AnnotateMarkerColorStr(JNIEnv* env, jobject obj, jint color, jstring str) { @@ -76,7 +98,7 @@ JNIEXPORT void JNICALL Java_AnnotateStreamline_AnnotateMarkerColorStr(JNIEnv* en } ``` -In Android Studio `cmake` is used to create your C library, so you will need a `CMakelists.txt` file in the same directory as the C files (`app/src/main/cpp/` in the example). Copy the contents shown below into `CMakelists.txt`: +In Android Studio, `cmake` is used to create your C library, so you will need a `CMakelists.txt` file in the same directory as the C files (`app/src/main/cpp/` in the example). Copy the contents shown below into `CMakelists.txt`: ```cmake # Sets the minimum CMake version required for this project. @@ -112,7 +134,13 @@ Now add the code below to the `build.gradle` file of the Module you wish to prof } ``` -This will create a `libStreamlineAnnotationJNI.so` library that you can load in your Kotlin code, and then you can call the functions. Here you will create a singleton `AnnotateStreamline.kt`. Place the file alongside `MainActivity.kt` in `app\src\main\java\com\arm\armpytorchmnistinference` for the example. Add the following code to `AnnotateStreamline.kt` to enable Kotlin calls to the gator daemon from the rest of your code: +This creates a `libStreamlineAnnotationJNI.so` library that you can load in your Kotlin code, and then you can call the functions. + +In this location you can now create a singleton `AnnotateStreamline.kt`. + +Place the file alongside `MainActivity.kt` in `app\src\main\java\com\arm\armpytorchmnistinference` for the example. + +Add the following code to `AnnotateStreamline.kt` to enable Kotlin calls to the gator daemon from the rest of your code: ```kotlin // Kotlin wrapper class for integration into Android project @@ -164,23 +192,27 @@ class AnnotateStreamline { Fill in all the function calls to match the functions you added into `annotate_jni_wrapper.c`. -The `AnnotateStreamline` class can now be used in your Kotlin code to add annotations to the Streamline timeline view. The first thing is to make sure `AnnotateStreamline.setup()` is called before any other gator functions. For the example project, add it into the `onCreate()` function of `MainActivity.kt`. Then you can add annotations like this: +You can now use the `AnnotateStreamline` class in your Kotlin code to add annotations to the Streamline timeline view. + +Firstly, make sure that `AnnotateStreamline.setup()` is called before any other gator function. + +For the example project, add it into the `onCreate()` function of `MainActivity.kt`. Then you can add annotations like this: ```kotlin AnnotateStreamline.annotateMarkerColorStr(AnnotateStreamline.ANNOTATE_BLUE, "Model Load") ``` -In the example app you could add this in the `onCreate()` function of `MainActivity.kt` after the `Module.load()` call to load the `model.pth`. +In the example app, you can add this in the `onCreate()` function of `MainActivity.kt` after the `Module.load()` call to load the `model.pth`. 
-This 'colored marker with a string' annotation will add the string and time to Streamline's log view, and look like the image shown below in Streamline's timeline (in the example app ArmNN isn't used, so there are no white ArmNN markers): +This *colored marker with a string* annotation will add the string and time to Streamline's log view, and it appears like the image shown below in Streamline's timeline (in the example app, Arm NN is not used, so there are no white Arm NN markers): ![Streamline image alt-text#center](streamline_marker.png "Figure 2. Streamline timeline markers") ## Custom Activity Maps (CAMs) -In addition to adding strings to the log and colored markers to the timeline, a particularly useful set of annotations is the Custom Activity Maps. These are the named colored bands you can see at the bottom of the Streamline timeline view shown in *Figure 1*. They can be used to show when specific parts of your application are running, such as the pre-processing or inference, and layered for functions within functions etc. +In addition to adding strings to the log and colored markers to the timeline, a particularly useful set of annotations is the Custom Activity Maps (CAMs). These are the named colored bands that you can see at the bottom of the Streamline timeline view, as shown in *Figure 1*. They can be used to show when specific parts of your application are running, such as the pre-processing or inference, and layered for functions within functions. -To add these you will need to import the functions that start `gator_cam_` from `streamline_annotate.h` through your wrapper files in the same way as the functions above. Then you can use CAMs, but first you will need to set up the tracks the annotations will appear on and an id system for each annotation. The `baseId` code below is to ensure that if you add annotations in multiple places in your code, the ids are unique. +To add these, in the same way as the functions above, you need to import the functions that are prefixed with `gator_cam_` from `streamline_annotate.h`. You can then use CAMs, but first you need to set up the tracks the annotations will appear on, and an ID system for each annotation. The `baseId` code below is used to ensure that if you add annotations in multiple places in your code, the IDs are unique. Here is an example setup in a class's companion object: @@ -201,7 +233,7 @@ Here is an example setup in a class's companion object: For the example app, add this to the `MainActivity` class. -Then it can be used like this: +Then you can use it in this way: ```kotlin val preprocess = currentId++ @@ -214,7 +246,7 @@ Then it can be used like this: AnnotateStreamline.camJobEnd(camViewId, preprocess, AnnotateStreamline.getTime()) ``` -In the example app, the CAM annotations are added to the `runInference()` function, which should look like this: +In the example app, the CAM annotations are added to the `runInference()` function, that looks like this: ```kotlin private fun runInference(bitmap: Bitmap) { @@ -244,6 +276,6 @@ In the example app, the CAM annotations are added to the `runInference()` functi } ``` -The example application is very fast and simple, so the CAMs will not show much information. In a more complex application you could add more CAMs, including child-level ones, to give more detailed annotations to show where time is spent in your application. 
For this example app with its very fast inference, it's best to change the Streamline timeline view scale to 10µs in order to see the CAM annotations better. +The example application is fast and simple, and the CAMs do not show a lot of information. In a more complex application, you can add further CAMs, including child-level ones, to give more detailed annotations to show where time is spent in your application. For this example app with its very fast inference, it is best to change the Streamline timeline view scale to 10µs in order to better see the CAM annotations. -Once you've added in useful CAM annotations, you can build and deploy a debug version of your application. You can run Streamline and see the annotations and CAMs in the timeline view. See the [Streamline documentation](https://developer.arm.com/documentation/101816/latest/) for how to make a capture for profiling. After the capture is made and analyzed, you will be able to see when your application is running the inference, ML pre-processing, ML post-processing, or other parts of your application. From there you can see where the most time is spent, and how hard the CPU or GPU is working during different parts of the application. From this you can then decide if work is needed to improve performance and where that work needs doing. +Once you have added in useful CAM annotations, you can build and deploy a debug version of your application. You can run Streamline and see the annotations and CAMs in the timeline view. See the [Streamline documentation](https://developer.arm.com/documentation/101816/latest/) for information on how to make a capture for profiling. After the capture is made and analyzed, you will be able to see when your application is running the inference, performing ML pre-processing or ML post-processing, or other operations from parts of your application. From there you can see where the most time is spent, and how hard the CPU or GPU is working during different parts of the application. From this you can then decide if work is needed to improve performance and where that work needs doing. diff --git a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/nn-profiling-executenetwork.md b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/nn-profiling-executenetwork.md index f4ca26994..1679673b2 100644 --- a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/nn-profiling-executenetwork.md +++ b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/nn-profiling-executenetwork.md @@ -1,21 +1,27 @@ --- -title: ML profiling of a tflite model with ExecuteNetwork +title: ML profiling of a LiteRT model with ExecuteNetwork weight: 6 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## ArmNN's Network Profiler -One way of running tflite models is with ArmNN. This is available as a delegate to the standard tflite interpreter. But to profile the model, ArmNN comes with a command-line utility called `ExecuteNetwork`. This program just runs the model without the rest of the app. It is able to output layer timings and other useful information to let you know where there might be bottlenecks within your model. +## Arm NN Network Profiler +One way of running LiteRT models is to use Arm NN, which is open-source network machine learning (ML) software. This is available as a delegate to the standard LiteRT interpreter. But to profile the model, Arm NN comes with a command-line utility called `ExecuteNetwork`. This program runs the model without the rest of the app. 
It is able to output layer timings and other useful information to report where there might be bottlenecks within your model. -If you are using tflite without ArmNN, then the output from `ExecuteNetwork` will be more of an indication than a definitive answer. But it can still be useful to spot any obvious problems. +If you are using LiteRT without Arm NN, then the output from `ExecuteNetwork` is more of an indication than a definitive answer, but it can still be useful in identifying any obvious problems. -To try this out, you can download a tflite model from the [Arm Model Zoo](https://github.com/ARM-software/ML-zoo). In this Learning Path, you will download [mobilenet tflite](https://github.com/ARM-software/ML-zoo/blob/master/models/image_classification/mobilenet_v2_1.0_224/tflite_int8/mobilenet_v2_1.0_224_INT8.tflite). +### Download a LiteRT Model -To get `ExecuteNetwork` you can download it from the [ArmNN GitHub](https://github.com/ARM-software/armnn/releases). Download the version appropriate for the Android phone you wish to test on - the Android version and the architecture of the phone. If you are unsure of the architecture, you can use a lower one, but you may miss out on some optimizations. Inside the `tar.gz` archive that you download, `ExecuteNetwork` is included. Note among the other release downloads on the ArmNN Github is the separate file for the `aar` delegate which is the easy way to include the ArmNN delegate into your app. +To try this out, you can download a LiteRT model from the [Arm Model Zoo](https://github.com/ARM-software/ML-zoo). Specifically for this Learning Path, you will download [mobilenet tflite](https://github.com/ARM-software/ML-zoo/blob/master/models/image_classification/mobilenet_v2_1.0_224/tflite_int8/mobilenet_v2_1.0_224_INT8.tflite). -To run `ExecuteNetwork` you'll need to use `adb` to push the model and the executable to your phone, and then run it from the adb shell. `adb` is included with Android Studio, but you may need to add it to your path. Android Studio normally installs it to a location like `\\AppData\Local\Android\Sdk\platform-tools`. `adb` can also be downloaded separately from the [Android Developer site](https://developer.android.com/studio/releases/platform-tools). +### Download and setup ExecuteNetwork + +You can download `ExecuteNetwork` from the [Arm NN GitHub](https://github.com/ARM-software/armnn/releases). Download the version appropriate for the Android phone that you are testing on, ensuring that it matches the Android version and architecture of the phone. If you are unsure of the architecture, you can use a lower one, but you might miss out on some optimizations.`ExecuteNetwork` is included inside the `tar.gz` archive that you download. Among the other release downloads on the Arm NN Github is a separate file for the `aar` delegate which you can also easily download. + +To run `ExecuteNetwork,` you need to use `adb` to push the model and the executable to your phone, and then run it from the adb shell. `adb` is included with Android Studio, but you might need to add it to your path. Android Studio normally installs it to a location such as: + + `\\AppData\Local\Android\Sdk\platform-tools`. `adb` can also be downloaded separately from the [Android Developer site](https://developer.android.com/studio/releases/platform-tools). Unzip the `tar.gz` folder you downloaded. From a command prompt, you can then adapt and run the following commands to push the files to your phone. 
The `/data/local/tmp` folder of your Android device is a place with relaxed permissions that you can use to run this profiling. @@ -25,9 +31,11 @@ adb push ExecuteNetwork /data/local/tmp/ adb push libarm_compute.so /data/local/tmp/ adb push libarmnn.so /data/local/tmp/ adb push libarmnn_support_library.so /data/local/tmp/ -# more ArmNN .so library files +# more Arm NN .so library files ``` -Push all the `.so` library files that are in the base folder of the `tar.gz` archive you downloaded, alongside `ExecuteNetwork`, and all the `.so` files in the `delegate` sub-folder. If you are using a recent version of Android Studio this copying can be done much more easily with drag and drop in the *Device Explorer > Files*. +Push all the `.so` library files that are in the base folder of the `tar.gz` archive you downloaded, alongside `ExecuteNetwork`, and all the `.so` files in the `delegate` sub-folder. + +If you are using a recent version of Android Studio this copying can be done much more easily with drag-and-drop in Android Studio in **Device Explorer > Files**. Then you need to set the permissions on the files: @@ -38,17 +46,21 @@ chmod 777 ExecuteNetwork chmod 777 *.so ``` -Now you can run ExecuteNetwork to profile the model. With the example tflite, you can use the following command: +### Run ExecuteNetwork to profile the model + +Now you can run ExecuteNetwork to profile the model. With the example LiteRT, you can use the following command: ```bash LD_LIBRARY_PATH=. ./ExecuteNetwork -m mobilenet_v2_1.0_224_INT8.tflite -c CpuAcc -T delegate --iterations 2 --do-not-print-output --enable-fast-math --fp16-turbo-mode -e --output-network-details > modelout.txt ``` -If you are using your own tflite, replace `mobilenet_v2_1.0_224_INT8.tflite` with the name of your tflite file. +If you are using your own LiteRT, replace `mobilenet_v2_1.0_224_INT8.tflite` with the name of your tflite file. + +This runs the model twice, outputting the layer timings to `modelout.txt`. The `--iterations 2` flag is the command that instructs it to run twice: the first run includes a lot of start-up costs and one-off optimizations, whilst the second run is more indicative of the level of performance. -This will run the model twice, outputting the layer timings to `modelout.txt`. The `--iterations 2` flag is the command that means it runs twice: the first run includes a lot of startup costs and one-off optimizations, so the second run is more indicative of the real performance. +The other flags to note are the `-e` and `--output-network-details` flags which output a lot of timeline information about the model, including the layer timings. The `--do-not-print-output` flag stops the output of the model, which can be very large, and without sensible input it is meaningless. The `--enable-fast-math` and `--fp16-turbo-mode` flags enable some math optimizations. `CpuAcc` is the accelerated CPU backend, and you can replace it with `GpuAcc` for the accelerated GPU backend. -The other flags to note are the `-e` and `--output-network-details` flags which will output a lot of timeline information about the model, including the layer timings. The `--do-not-print-output` flag will stop the output of the model, which can be very large, and without sensible input it is meaningless. The `--enable-fast-math` and `--fp16-turbo-mode` flags enable some math optimizations. `CpuAcc` is the acclerated CPU backend, it can be replaced with `GpuAcc` for the accelerated GPU backend. 
+### Analyze the output

After running the model, you can pull the output file back to your host machine with the following commands:

@@ -56,13 +68,13 @@ After running the model, you can pull the output file back to your host machine
exit
adb pull /data/local/tmp/modelout.txt
```
-Once again, this can be done with drag and drop in Android Studio's *Device Explorer > Files*.
+Once again, you can do this with drag-and-drop in Android Studio's **Device Explorer > Files**.

-Depending on the size of your model, the output will probably be quite large. You can use a text editor to view the file. The output is in JSON format, so you can use a JSON viewer to make it more readable. Usually some scripting can be used to extract the information you need more easily out of the very raw data in the file.
+Depending on the size of your model, the output will probably be quite large. You can use a text editor to view the file. The output is in JSON format, so you can use a JSON viewer to make it more readable. Usually you can use some scripting to extract the information you need more easily out of the raw data in the file.

-At the top is the summary, with the setup time and inference time of your 2 runs, which will look something like this:
+At the top is the summary, with the setup time and inference time of the two runs, which looks something like this:

-```text
+```output
Info: ArmNN v33.2.0
Info: Initialization time: 7.20 ms.
Info: ArmnnSubgraph creation
@@ -78,8 +90,13 @@ Info: Execution time: 468.42 ms.
Info: Inference time: 468.58 ms
```

-After the summary comes the graph of the model, then the layers and their timings from the second run. At the start of the layers there are a few optimizations and their timings recorded before the network itself. You can skip past the graph and the optimization timings to get to the part that needs analyzing.
+After the summary, you will see:
+
+* The graph of the model.
+* The layers and their timings from the second run.
+
+At the start of the layers, there are a few optimizations and their timings recorded before the network itself. You can skip past the graph and the optimization timings to get to the part that you need to analyze.

-In the mobilenet example output, the graph is from lines 18 to 1629. After this is the optimization timings, which are part of the runtime, but not the network - these go until line 1989. Next there are a few wall clock recordings for the loading of the network, before the first layer "Convolution2dLayer_CreateWorkload_#18" at line 2036. Here is where the layer info that needs analyzing starts.
+In the mobilenet example output, the graph is from lines 18 to 1629. After this are the optimization timings, which are part of the runtime, but not the network - these go until line 1989. Next there are a few wall clock recordings for the loading of the network, before the first layer "Convolution2dLayer_CreateWorkload_#18" at line 2036. This is where the layer information that requires analysis starts.

-The layers' "Wall clock time" in microseconds shows how long they took to run. These layers and their timings can then be analyzed to see which layers, and which operators, took the most time.
+The wall-clock time for each layer, shown in microseconds, indicates how long that layer took to run. You can then analyze these layers and timings to identify which layers and operators took the most time to run.
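+
+Because the timeline output is verbose, a short script can help you rank layers by wall-clock time. The following is a minimal sketch that makes assumptions about the file layout: it pairs each quoted workload label with the next wall clock value that follows it in the text, so you might need to adjust the pattern to match the exact field names that your Arm NN version writes to `modelout.txt`.
+
+```python
+import re
+import sys
+
+# Pair each quoted "...Workload..." label with the next "Wall clock time"
+# value that follows it. The exact labels and layout vary between Arm NN
+# versions, so treat this pattern as a starting point, not a fixed format.
+PATTERN = re.compile(
+    r'"(?P<layer>[^"]*Workload[^"]*)"'      # e.g. Convolution2dLayer_CreateWorkload_#18
+    r'.*?Wall clock time[^0-9]*'            # skip ahead to the next timing entry
+    r'(?P<us>[0-9]+(?:\.[0-9]+)?)',         # value in microseconds
+    re.DOTALL,
+)
+
+def slowest_layers(path: str, top: int = 10):
+    """Return the top N (time_us, layer_name) pairs found in the output file."""
+    with open(path, errors="ignore") as f:
+        text = f.read()
+    timings = [(float(m["us"]), m["layer"]) for m in PATTERN.finditer(text)]
+    return sorted(timings, reverse=True)[:top]
+
+if __name__ == "__main__":
+    log = sys.argv[1] if len(sys.argv) > 1 else "modelout.txt"
+    for us, layer in slowest_layers(log):
+        print(f"{us:12.1f} us  {layer}")
+```
+
+Run it with `python slowest_layers.py modelout.txt` to list the slowest layers first; a JSON-aware version is also possible once you know the exact structure your output file uses.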
diff --git a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/nn-profiling-general.md b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/nn-profiling-general.md
index 91a35381f..bf64ce044 100644
--- a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/nn-profiling-general.md
+++ b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/nn-profiling-general.md
@@ -6,11 +6,13 @@ weight: 5
layout: learningpathall
---

-## Profiling your model
-App profilers will give you a good overall view of your performance, but often you might want to look inside the model and work out bottlenecks within the network. The network is often the bulk of the time, in which case it will warrant closer analysis.
+## Tools that you can use
+App profilers provide a good overall view of performance, but you might want to look inside the model and identify bottlenecks within the network. The network is often where the bulk of the bottlenecks lie, so it warrants closer analysis.

-With general profilers this is hard to do, as there needs to be annotations inside the ML framework code to get the information. It is a large task to write the profiling annotations throughout the framework, so it is easier to use tools from a framework or inference engine that already has the required instrumentation.
+With general profilers this is hard to do, because the ML framework code needs annotation to retrieve the information. It is a complex task to write the profiling annotation throughout the framework, so it is easier to use tools from a framework or inference engine that already has the required instrumentation.

-Depending on your model, your choice of tools will differ. For example, if you are using LiteRT (formerly TensorFlow Lite), Arm provides the ArmNN delegate that you can run with the model running on Linux or Android, CPU or GPU. ArmNN in turn provides a tool called `ExecuteNetwork` that can run the model and give you layer timings among other useful information.
+Depending on the model you use, your choice of tools will vary. For example, if you are using LiteRT (formerly TensorFlow Lite), Arm provides the Arm NN delegate, which you can use with the model on Linux or Android, on either the CPU or the GPU.

-If you are using PyTorch, you will probably use ExecuTorch the ons-device inference runtime for your Android phone. ExecuTorch has a profiler available alongside it.
+Arm NN in turn provides a tool called ExecuteNetwork that can run the model and provide layer timings, amongst other useful information.
+
+If you are using PyTorch, you will probably use ExecuTorch, which is the on-device inference runtime for your Android phone. ExecuTorch has a profiler available alongside it.
diff --git a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/plan.txt b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/plan.txt
index 70e766717..6c2926ca3 100644
--- a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/plan.txt
+++ b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/plan.txt
@@ -13,7 +13,7 @@ here's how to do that...
Also Android Profiler, memory example
Ml network, it will depend on the inference engine you are using
-- here's an example for if you are using ArmNN with TFLite
+- here's an example for if you are using Arm NN with TFLite
- if you're not using it, it may still have some useful information, but different operators will be used and their performance will be different
can see structure with netron or google model explorer to compare operators or different versions of networks
may need to use a conversion tool to convert to TFLite (or whatever your inference engine wants)
diff --git a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/why-profile.md b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/why-profile.md
index 7d688a4ad..b1d4b7035 100644
--- a/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/why-profile.md
+++ b/content/learning-paths/smartphones-and-mobile/profiling-ml-on-arm/why-profile.md
@@ -1,23 +1,22 @@
---
-title: Why do you need to profile your ML application?
+title: Why should you profile your ML application?
weight: 2

### FIXED, DO NOT MODIFY
layout: learningpathall
---

-## Performance
-Working out what is taking the time and memory in your application is the first step to getting the performance you want. Profiling can help you identify the bottlenecks in your application and understand how to optimize it.
+## Optimizing performance
+A first step towards achieving optimal performance in an ML model is to identify what is consuming the most time and memory in your application. Profiling can help you identify the bottlenecks, and it can offer clues about how to optimize operations.

-With Machine Learning (ML) applications, the inference of the Neural Network (NN) itself is often the heaviest part of the application in terms of computation and memory usage. This is not guaranteed however, so it is important to profile the application as a whole to see if pre- or post-processing or other code is an issue.
+With Machine Learning (ML) applications, the inference of the Neural Network (NN) is often the heaviest part of the application in terms of computation and memory usage, but this is not always the case. It is therefore important to profile the application as a whole to detect other possible issues that can negatively impact performance, such as issues with pre- or post-processing, or elsewhere in the code.

-In this Learning Path, you will profile an Android example using TFLite, but most of the steps shown will also work with Linux and cover a wide range of Arm devices. The principles for profiling your application are the same for use with other inference engines and platforms, but the tools are different.
+In this Learning Path, you will profile an Android example using LiteRT. Most of the steps are transferable and work with Linux, and you can use them on a wide range of Arm devices.

-## Tools
+The principles for profiling an application apply to many other inference engines and platforms; only the tools differ.

-You will need to use different tools to profile the ML inference or the application's performance running on your Arm device.
+{{% notice Note %}}
+LiteRT is the new name for TensorFlow Lite, or TFLite.
+{{% /notice %}}

-For profiling the ML inference, you will use [ArmNN](https://github.com/ARM-software/armnn/releases)'s ExecuteNetwork.
-
-For profiling the application as a whole, you will use [Arm Performance Studio](https://developer.arm.com/Tools%20and%20Software/Arm%20Performance%20Studio)'s Streamline, and the Android Studio Profiler.
diff --git a/content/learning-paths/smartphones-and-mobile/totalcompute/_review.md b/content/learning-paths/smartphones-and-mobile/totalcompute/_review.md
index 211a30b6c..b0530c3d8 100644
--- a/content/learning-paths/smartphones-and-mobile/totalcompute/_review.md
+++ b/content/learning-paths/smartphones-and-mobile/totalcompute/_review.md
@@ -28,10 +28,10 @@ review:
            - "Trusted firmware"
            - "Android"
            - "CMSIS"
-           - "ArmNN"
+           - "Arm NN"
        correct_answer: 3
        explanation: >
-           The stack includes open-source code available from these upstream projects: SCP firmware, Trusted firmware, Linux kernel, Android, and ArmNN.
+           The stack includes open-source code available from these upstream projects: SCP firmware, Trusted firmware, Linux kernel, Android, and Arm NN.

# ================================================================================
diff --git a/content/learning-paths/smartphones-and-mobile/totalcompute/build.md b/content/learning-paths/smartphones-and-mobile/totalcompute/build.md
index 0b3a23113..b02a3c462 100644
--- a/content/learning-paths/smartphones-and-mobile/totalcompute/build.md
+++ b/content/learning-paths/smartphones-and-mobile/totalcompute/build.md
@@ -7,7 +7,7 @@ weight: 2 # 1 is first, 2 is second, etc.
# Do not modify these elements
layout: "learningpathall"
---
-The [Arm Total Compute](https://developer.arm.com/Tools%20and%20Software/Total%20Compute) reference software stack is a fully integrated open-source stack, from firmware up to Android. he stack includes open-source code available from the relevant upstream projects: SCP firmware, Trusted firmware, Linux kernel, Android, and ArmNN.
+The [Arm Total Compute](https://developer.arm.com/Tools%20and%20Software/Total%20Compute) reference software stack is a fully integrated open-source stack, from firmware up to Android. The stack includes open-source code available from the relevant upstream projects: SCP firmware, Trusted firmware, Linux kernel, Android, and Arm NN.
## Download and install the FVP diff --git a/data/stats_current_test_info.yml b/data/stats_current_test_info.yml index 394747579..0d3713090 100644 --- a/data/stats_current_test_info.yml +++ b/data/stats_current_test_info.yml @@ -1,5 +1,5 @@ summary: - content_total: 305 + content_total: 306 content_with_all_tests_passing: 32 content_with_tests_enabled: 34 sw_categories: diff --git a/data/stats_weekly_data.yml b/data/stats_weekly_data.yml index e9e8de919..25768af29 100644 --- a/data/stats_weekly_data.yml +++ b/data/stats_weekly_data.yml @@ -4086,3 +4086,88 @@ avg_close_time_hrs: 0 num_issues: 18 percent_closed_vs_total: 0.0 +- a_date: '2024-12-09' + content: + cross-platform: 26 + embedded-systems: 19 + install-guides: 90 + laptops-and-desktops: 33 + microcontrollers: 24 + servers-and-cloud-computing: 89 + smartphones-and-mobile: 25 + total: 306 + contributions: + external: 44 + internal: 354 + github_engagement: + num_forks: 30 + num_prs: 7 + individual_authors: + alaaeddine-chakroun: 2 + alexandros-lamprineas: 1 + annie-tallund: 1 + arm: 3 + arnaud-de-grandmaison: 1 + basma-el-gaabouri: 1 + bolt-liu: 2 + brenda-strech: 1 + chen-zhang: 1 + christopher-seidl: 7 + cyril-rohr: 1 + daniel-gubay: 1 + daniel-nguyen: 1 + david-spickett: 2 + dawid-borycki: 30 + diego-russo: 1 + diego-russo-and-leandro-nunes: 1 + elham-harirpoush: 2 + florent-lebeau: 5 + "fr\xE9d\xE9ric--lefred--descamps": 2 + gabriel-peterson: 5 + gayathri-narayana-yegna-narayanan: 1 + georgios-mermigkis-and-konstantinos-margaritis,-vectorcamp: 1 + graham-woodward: 1 + iago-calvo-lista,-arm: 1 + james-whitaker,-arm: 1 + jason-andrews: 90 + joe-stech: 1 + johanna-skinnider: 2 + jonathan-davies: 2 + jose-emilio-munoz-lopez,-arm: 1 + julie-gaskin: 4 + julio-suarez: 5 + kasper-mecklenburg: 1 + kieran-hejmadi: 1 + koki-mitsunami: 1 + konstantinos-margaritis: 7 + kristof-beyls: 1 + liliya-wu: 1 + mathias-brossard: 1 + michael-hall: 5 + nikhil-gupta,-pareena-verma,-nobel-chowdary-mandepudi,-ravi-malhotra: 1 + odin-shen: 1 + owen-wu,-arm: 2 + pareena-verma: 35 + pareena-verma,-annie-tallund: 1 + pareena-verma,-jason-andrews,-and-zach-lasiuk: 1 + pareena-verma,-joe-stech,-adnan-alsinan: 1 + pranay-bakre: 4 + przemyslaw-wirkus: 1 + rin-dobrescu: 1 + roberto-lopez-mendez: 2 + ronan-synnott: 45 + thirdai: 1 + tianyu-li: 1 + tom-pilar: 1 + uma-ramalingam: 1 + varun-chari,-albin-bernhardsson: 1 + varun-chari,-pareena-verma: 1 + visualsilicon: 1 + ying-yu: 1 + ying-yu,-arm: 1 + zach-lasiuk: 1 + zhengjun-xing: 2 + issues: + avg_close_time_hrs: 0 + num_issues: 10 + percent_closed_vs_total: 0.0