From 7be2b0764d65284a7656fd067a324035e7ce01e6 Mon Sep 17 00:00:00 2001 From: Przemyslaw Wirkus Date: Thu, 10 Oct 2024 13:21:54 +0100 Subject: [PATCH 1/6] docs: Sampling CPython with Arm SPE with WindowsPerf Add new learning path for WindowsPerf and SPE sampling --- .../_index.md | 38 ++++ .../_next-steps.md | 76 +++++++ .../_review.md | 100 ++++++++ .../windowsperf_sampling_cpython_spe.md | 196 ++++++++++++++++ ...dowsperf_sampling_cpython_spe_example_1.md | 215 ++++++++++++++++++ ...dowsperf_sampling_cpython_spe_example_2.md | 35 +++ 6 files changed, 660 insertions(+) create mode 100644 content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/_index.md create mode 100644 content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/_next-steps.md create mode 100644 content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/_review.md create mode 100644 content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md create mode 100644 content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe_example_1.md create mode 100644 content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe_example_2.md diff --git a/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/_index.md b/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/_index.md new file mode 100644 index 000000000..4c9a30411 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/_index.md @@ -0,0 +1,38 @@ +--- +title: Sampling CPython with Arm SPE with WindowsPerf + +minutes_to_complete: 30 + +who_is_this_for: This is an introductory topic for developers keen to understand sampling with ARM SPE extension. 
+ +learning_objectives: + - Use WindowsPerf with native Windows on Arm workload + - Understand the basics of sampling with Arm SPE + - Explore the WindowsPerf command line + - Build CPython from sources for Windows on Arm ARM64 target + +prerequisites: + - Windows on Arm desktop or development machine with [WindowsPerf installed](/install-guides/wperf) + - CPU with Arm Statistical Profiling Extension (SPE) support + +author_primary: Przemyslaw Wirkus + +### Tags +skilllevels: Introductory +subjects: Performance and Architecture +armips: + - Neoverse + - Cortex-A +operatingsystems: + - Windows +tools_software_languages: + - WindowsPerf + - Python + - perf + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/_next-steps.md b/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/_next-steps.md new file mode 100644 index 000000000..b46623df9 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/_next-steps.md @@ -0,0 +1,76 @@ +--- +# ================================================================================ +# Edit +# ================================================================================ + +next_step_guidance: > + Now that you have WindowsPerf running, why not learn how to build a native Windows on Arm application? +# 1-3 sentence recommendation outlining how the reader can generally keep learning about these topics, and a specific explanation of why the next step is being recommended. 
+ +recommended_path: "/learning-paths/laptops-and-desktops/win_net/" + +# further_reading links to references related to this path. Can be: + # Manuals for a tool / software mentioned (type: documentation) + # Blog about related topics (type: blog) + # General online references (type: website) + +further_reading: + - resource: + title: Announcing WindowsPerf Open-source performance analysis tool for Windows on Arm + link: https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/announcing-windowsperf + type: blog + - resource: + title: WindowsPerf release 2.4.0 introduces the first stable version of sampling model support + link: https://www.linaro.org/blog/windowsperf-release-2-4-0-introduces-the-first-stable-version-of-sampling-model-support/ + type: blog + - resource: + title: WindowsPerf Release 2.5.1 + link: https://www.linaro.org/blog/windowsperf-release-2-5-1/ + type: blog + - resource: + title: WindowsPerf Release 3.0.0 + link: https://www.linaro.org/blog/windowsperf-release-3-0-0/ + type: blog + - resource: + title: WindowsPerf Release 3.3.0 + link: https://www.linaro.org/blog/windowsperf-release-3-3-0/ + type: blog + - resource: + title: WindowsPerf Release 3.7.2 + link: https://www.linaro.org/blog/expanding-profiling-capabilities-with-windowsperf-372-release + type: blog + - resource: + title: "Introducing the WindowsPerf GUI: the Visual Studio 2022 extension" + link: https://www.linaro.org/blog/introducing-the-windowsperf-gui-the-visual-studio-2022-extension + type: blog + - resource: + title: "Introducing 1.0.0-beta release of WindowsPerf Visual Studio extension" + link: https://www.linaro.org/blog/introducing-1-0-0-beta-release-of-windowsperf-visual-studio-extension + type: blog + - resource: + title: "New Release: WindowsPerf Visual Studio Extension v1.0.0" + link: https://www.linaro.org/blog/new-release-windowsperf-visual-studio-extension-v1000 + type: blog + - resource: + title: "Launching WindowsPerf Visual Studio 
Extension v2.1.0" + link: https://www.linaro.org/blog/launching--windowsperf-visual-studio-extension-v210 + type: blog + - resource: + title: "Windows on Arm overview" + link: https://learn.microsoft.com/en-us/windows/arm/overview + type: website + - resource: + title: "Linaro Windows on Arm project" + link: https://www.linaro.org/windows-on-arm/ + type: website + - resource: + title: "WindowsPerf releases" + link: https://github.com/arm-developer-tools/windowsperf/releases + type: website +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +weight: 21 # set to always be larger than the content in this path, and one more than 'review' +title: "Next Steps" # Always the same +layout: "learningpathall" # All files under learning paths have this same wrapper +--- diff --git a/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/_review.md b/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/_review.md new file mode 100644 index 000000000..29beb2948 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/_review.md @@ -0,0 +1,100 @@ +--- +# ================================================================================ +# Edit +# ================================================================================ + +# Always 3 questions. Should try to test the reader's knowledge, and reinforce the key points you want them to remember. + # question: A one sentence question + # answers: The correct answers (from 2-4 answer options only). Should be surrounded by quotes. + # correct_answer: An integer indicating what answer is correct (index starts from 0) + # explanation: A short (1-3 sentence) explanation of why the correct answer is correct. 
Can add additional context if desired + +review: + - questions: + question: > + The counting model is used for obtaining aggregate counts of occurrences of special events. + answers: + - "True" + - "False" + correct_answer: 1 + explanation: > + In the counting model, the occurrences of PMU events are simply aggregated over a given time period. + + - questions: + question: > + The sampling model is used for determining the frequencies of event occurrences produced by program locations at the function, basic block, and/or instruction levels. + answers: + - "True" + - "False" + correct_answer: 1 + explanation: > + In the sampling model, the frequencies of event occurrences produced by the program determine "hot" locations at the function, basic block, and/or instruction levels. + + - questions: + question: > + WindowsPerf can be used and executed only on native ARM64 WOA hardware, and not in a virtual environment. + answers: + - "True" + - "False" + correct_answer: 1 + explanation: > + Yes, WindowsPerf currently supports a native Windows on Arm environment only. + + - questions: + question: > + The Arm Statistical Profiling Extension (SPE) is an optional feature in ARMv8.2 hardware. + answers: + - "True" + - "False" + correct_answer: 1 + explanation: > + Yes, the Arm Statistical Profiling Extension (SPE) is an optional feature in ARMv8.2 hardware that allows CPU instructions to be sampled and associated with the source code location where that instruction occurred. + + - questions: + question: > + SPE stands for Statistical Profiling Extension. + answers: + - "True" + - "False" + correct_answer: 1 + explanation: > + Yes, the Arm Statistical Profiling Extension (SPE) is an optional feature in ARMv8.2 hardware. + + - questions: + question: > + load_filter is one of the SPE filters supported by WindowsPerf. 
+ answers: + - "True" + - "False" + correct_answer: 1 + explanation: > + Yes, load_filter together with store_filter and branch_filter are SPE filters supported by WindowsPerf. + + - questions: + question: > + store_filter is one of the SPE filters supported by WindowsPerf. + answers: + - "True" + - "False" + correct_answer: 1 + explanation: > + Yes, load_filter together with store_filter and branch_filter are SPE filters supported by WindowsPerf. + + - questions: + question: > + branch_filter is one of the SPE filters supported by WindowsPerf. + answers: + - "True" + - "False" + correct_answer: 1 + explanation: > + Yes, load_filter together with store_filter and branch_filter are SPE filters supported by WindowsPerf. + + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +title: "Review" # Always the same title +weight: 20 # Set to always be larger than the content in this path +layout: "learningpathall" # All files under learning paths have this same wrapper +--- diff --git a/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md b/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md new file mode 100644 index 000000000..073189f53 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md @@ -0,0 +1,196 @@ +--- +layout: learningpathall +title: CPython Sampling with SPE Example Overview +weight: 2 +--- + +# CPython Sampling with SPE Example + +In this example, you will build a debug build of CPython from sources and then execute simple instructions in the Python interactive mode to obtain WindowsPerf sampling results from a CPython runtime image. 
+ +## The Arm Statistical Profiling Extension Introduction + +The Arm Statistical Profiling Extension (SPE) is a feature defined as part of the Armv8-A architecture, starting from version 8.2. It provides non-invasive, hardware-based statistical sampling for CPUs. Unlike the Performance Monitor Unit (PMU), SPE is a different module that integrates the sampling process into the instruction execution process within the CPU's pipelines. + +SPE is particularly useful for performance analysis and optimization, as it provides detailed insights into the behavior of the CPU during execution. This can help identify performance bottlenecks and optimize software for better efficiency. + +## Introduction + +You will use sampling to determine CPython program "hot" locations provided by Arm Statistical Profiling Extension (SPE). + +WindowsPerf added support (in `record` command) for the [Arm Statistical Profiling Extension (SPE)](https://developer.arm.com/documentation/101136/22-1-3/MAP/Arm-Statistical-Profiling-Extension--SPE-). SPE is an optional feature in ARMv8.2 hardware that allows CPU instructions to be sampled and associated with the source code location where that instruction occurred. + +{{% notice Note %}} +Currently SPE is available on Windows On Arm in Test Mode only! +{{% /notice %}} + +## Before you begin + +For this learning path you will need: +* A Windows on Arm (ARM64) native machine with pre-installed WindowsPerf (both driver and `wperf` CLI tool). See [WindowsPerf Install Guide](/install-guides/wperf/) for more details. +* CPU must support Arm SPE, an optional feature in ARMv8.2 hardware - we will show you how to check your CPU compatibility using WindowsPerf command-line tool. +* Basic knowledge of git and Python. + * See [Install Git on Windows](https://github.com/git-guides/install-git#install-git-on-windows) for more details. 
+ +### How to check if your ARM64 CPU supports Arm SPE extension + +#### SPE hardware support detection: + +You can check if your system supports SPE or if WindowsPerf can detect SPE with the `wperf test` command. See below an example of the `spe_device.version_name` property value on a system with SPE: + +```console +wperf test +``` + +```output + Test Name Result + ========= ====== +... + spe_device.version_name FEAT_SPE +``` + +#### How to check if your WindowsPerf binaries and driver support optional SPE + +{{% notice Note %}} +Currently WindowsPerf support of SPE is in development, so not all versions of WindowsPerf enable SPE support. Some WindowsPerf releases may contain separate binaries with SPE support enabled. +{{% /notice %}} + +You can check the feature string `FeatureString` of both `wperf` and `wperf-driver` with the `wperf --version` command: + +```console +wperf --version +``` + +```output + Component Version GitVer FeatureString + ========= ======= ====== ============= + wperf 3.8.0 0d3eba3d +etw-app+spe + wperf-driver 3.8.0 0d3eba3d +trace+spe +``` + +If `FeatureString` for both components (`wperf` and `wperf-driver`) contains `+spe` (and `spe_device.version_name` contains `FEAT_SPE`) you are good to go! + +### Build CPython targeting ARM64 + +Note: all steps are done on Windows on Arm system with ARM64 CPU. + +CPython is an open-source project. There is native support in CPython for Windows on Arm starting with version 3.11. In this learning path you will use a debug build of CPython. For this, you will build [CPython](https://github.com/python/cpython) locally from sources in debug mode for the ARM64 target. + +{{% notice Note %}} +Use the Visual Studio `Developer Command Prompt for VS 2022` which is already set up in the VS environment. Go to Start and search for "Developer Command Prompt for VS 2022". 
+{{% /notice %}} + +You should see a prompt as shown below: + +```output +********************************************************************** +** Visual Studio 2022 Developer Command Prompt v17.7.6 +** Copyright (c) 2022 Microsoft Corporation +********************************************************************** + +C:\Program Files\Microsoft Visual Studio\2022\Community> +``` + +{{% notice Note %}} +Please use `Developer Command Prompt for VS 2022` with all of the next steps. +{{% /notice %}} + +--- + +Let's build CPython locally in debug mode using the `build.bat` script. You have the option to build CPython directly on your ARM64 machine or cross-compile it on an x64 machine. Below is an example demonstrating how to build it on an ARM64 machine. + +#### Clone CPython source code + +```command +git clone git@github.com:python/cpython.git +``` + +The output from this command will be similar to: + +```output +Cloning into 'cpython'... +remote: Enumerating objects: 990145, done. +remote: Counting objects: 100% (43119/43119), done. +remote: Compressing objects: 100% (896/896), done. +remote: Total 990145 (delta 42673), reused 42290 (delta 42223), pack-reused 947026 +Receiving objects: 100% (990145/990145), 527.93 MiB | 14.28 MiB/s, done. +Resolving deltas: 100% (792463/792463), done. +Updating files: 100% (4647/4647), done. +``` + +#### Checkout CPython at specific SHA + +{{% notice Note %}} +This step is optional, but please remember that you may encounter build issues unrelated to this example as the CPython mainline source code that you've just checked out is not stable. Therefore, we recommend that you check out SHA to avoid any unexpected issues and to ensure you are working off the same code base. 
+{{% /notice %}} + +Use a specific CPython commit to match the sampling output in this example: + +```console +cd cpython +git checkout 1ff81c0cb67215694f084e51c4d35ae53b9f5cf9 +``` +The output will be similar to: + +```output +Updating files: 100% (2774/2774), done. +Note: switching to '1ff81c0cb67215694f084e51c4d35ae53b9f5cf9'. +... +``` + +#### Build CPython from sources + +The folder `cpython\PCbuild` contains the `build.bat` script you will use to build CPython from sources. Build CPython with debug symbols by invoking the `-d` command line option and select the ARM64 target with `-p ARM64`. + +{{% notice Note %}} +Make sure you are using `Developer Command Prompt for VS 2022`. +{{% /notice %}} + +```console +cd PCbuild +build.bat -d -p ARM64 +``` +The output will be similar to: + +```output +Downloading nuget... +Installing Python via nuget... + +... + + python.c + python.vcxproj -> C:\\path\to\cpython\PCbuild\arm64\python_d.exe + Wrote C:\path\to\cpython\PCbuild\arm64\LICENSE.txt + WinMain.c + pythonw.vcxproj -> C:\path\to\cpython\PCbuild\arm64\pythonw_d.exe + +Build succeeded. + 0 Warning(s) + 0 Error(s) + +Time Elapsed 00:00:59.50 +``` + +{{% notice Note %}} +The folder `cpython\PCbuild\arm64` should contain all the executables built in this process. You will use `python_d.exe` in this example. +{{% /notice %}} + +##### Execute interactive mode to make sure all the CPython dependencies and libraries are loaded + +On your Windows ARM64 machine, open a command prompt and run: + +```console +cd LearningPath\PCbuild\arm64 +python_d.exe +``` +You should see CPython being invoked in interactive mode: + +```output +Python 3.12.0a6+ (heads/main:1ff81c0cb6, Mar 14 2023, 16:26:50) [MSC v.1935 64 bit (ARM64)] on win32 +Type "help", "copyright", "credits" or "license" for more information. +>>> +``` + +{{% notice Note %}} +Your environment should now be fully set up and you are ready to move on to the next step. 
+{{% /notice %}} diff --git a/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe_example_1.md b/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe_example_1.md new file mode 100644 index 000000000..3a4762acd --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe_example_1.md @@ -0,0 +1,215 @@ +--- +layout: learningpathall +title: WindowsPerf sample using SPE example +weight: 3 +--- + +## Example 1: Sampling of CPython calculating Googolplex using SPE + +{{% notice Note %}} +All the steps in these following sections are done on a native ARM64 Windows on Arm machine. +{{% /notice %}} + +You will use the pre-built [CPython](https://github.com/python/cpython) binaries targeting ARM64 from sources in the debug mode from the previous step and then complete the following: +- Pin `python_d.exe` interactive console to an arbitrary CPU core, calculate `10^10^100` expression, a large integer number [Googolplex](https://en.wikipedia.org/wiki/Googolplex) to stress the CPython application and get a simple workload. +- Run counting and sampling to obtain some simple event information. + +### Pin the new CPython process to a CPU core 1 + +Use the Windows `start` command to execute and pin `python_d.exe` process to CPU core number 1. Below command is executing computation intensive calculations of `10^10^100`, a [Googolplex](https://en.wikipedia.org/wiki/Googolplex) number, with CPython. + +```command +start /affinity 2 cpython\PCbuild\arm64\python_d.exe -c 10**10**100 +``` + +{{% notice Note %}} +The [start](https://learn.microsoft.com/en-us/windows-server/administration/windows-commands/start) command line switch `/affinity ` applies the specified processor affinity mask (expressed as a hexadecimal number) to the new application. In our example decimal `2` is `0x02` or `0b0010`. 
This value selects core no. `1` because bit `1` is the only bit set in the mask, where the bits of the mask are indexed from `0` (zero). +{{% /notice %}} + +You can use the Windows Task Manager to confirm that `python_d.exe` is running on CPU core no. 1. + +### SPE introduces new option for command line switch -e arm_spe_0// + +Users can specify SPE filters using the `-e` command line option with `arm_spe_0//`. We've introduced the `arm_spe_0/*/` notation for the `sample` and `record` commands, where `*` represents a comma-separated list of supported filters. Currently, we support filters such as `store_filter=`, `load_filter=`, and `branch_filter=`, or their short equivalents like `st=`, `ld=`, and `b=`. Use `0` or `1` to disable or enable a given filter. For example: + +```output +arm_spe_0/branch_filter=1/ +arm_spe_0/load_filter=1,branch_filter=0/ +arm_spe_0/ld=1,branch_filter=0/ +arm_spe_0/st=0,ld=0,b=1/ +``` + +#### Filtering sample records + +SPE register `PMSFCR_EL1.FT` enables filtering by operation type. When enabled, `PMSFCR_EL1.{ST, LD, B}` define the collected types: +- `ST` enables collection of store sampled operations, including all atomic operations. +- `LD` enables collection of load sampled operations, including atomic operations that return a value to a register. +- `B` enables collection of branch sampled operations, including direct and indirect branches and exception returns. + +### Using SPE to sample the CPython application running the Googolplex calculation on CPU core 1 + +The command below samples the already running process `python_d.exe` (denoted with `--image_name python_d.exe`) on CPU core no. 1. SPE filter `ld=1` enables collection of load sampled operations, including atomic operations that return a value to a register. 
+ +```command +wperf sample -e arm_spe_0/ld=1/ --pe_file cpython\PCbuild\arm64\python_d.exe --image_name python_d.exe -c 1 +``` + +{{% notice Note%}} +You can use the same sampling `--annotate` and `--disassemble` command line options of WindowsPerf with the SPE extension. See example outputs below. +{{% /notice %}} + +Please wait a few seconds for the samples to arrive from the Kernel driver and then press `Ctrl+C` to stop sampling. You should see: + +```output +base address of 'python_d.exe': 0x7ff765fe1288, runtime delta: 0x7ff625fe0000 +sampling ....eee....eCtrl-C received, quit counting... done! + +Performance counter stats for core 1, no multiplexing, kernel mode excluded, on Arm Limited core implementation: +note: 'e' - normal event, 'gN' - grouped event with group number N, metric name will be appended if 'e' or 'g' comes from it + + counter value event name event idx event note + ============= ========== ========= ========== + 29,337,387,738 cycle fixed e + 76,433,491,476 sample_pop 0x4000 e + 18 sample_feed 0x4001 e + 7 sample_filtrate 0x4002 e + 0 sample_collision 0x4003 e +======================== sample source: LOAD_STORE_ATOMIC-LOAD-GP/retired+level1-data-cache-access+tlb_access, top 50 hot functions ======================== + overhead count symbol + ======== ===== ====== + 85.71 6 x_mul:python312_d.dll + 14.29 1 unknown + 100.00% 7 top 2 in total + + 9.853 seconds time elapsed +``` + +{{% notice Note%}} +You can close the command line window with `python_d.exe` running when you have finished sampling. Sampling will also automatically end when the sampled process has finished. +{{% /notice %}} + + +#### SPE sampling output + +- In the above example, you can see that the majority of "overhead" is generated by the `python_d.exe` executable, in the `x_mul` symbol that resides inside the `python312_d.dll` DLL. +- The SPE sampling output also contains PMU events for SPE registered during sampling: + - `sample_pop` - Statistical Profiling sample population. 
Counts statistical profiling sample population, the count of all operations that could be sampled but may or may not be chosen for sampling. + - `sample_feed` - Statistical Profiling sample taken. Counts statistical profiling samples taken for sampling. + - `sample_filtrate` - Statistical Profiling sample taken and not removed by filtering. Counts statistical profiling samples taken which are not removed by filtering. + - `sample_collision` - Statistical Profiling sample collided with previous sample. Counts statistical profiling samples that have collided with a previous sample and were therefore not taken. +- Note that in sampling `....eee....e` is a progress printout where: + - character `.` represents an SPE sample payload received from the WindowsPerf Kernel driver and + - character `e` represents an unsuccessful attempt (empty SPE fill buffer) to fetch the whole sample payload. + +{{% notice Note%}} +You can also get the output of the `wperf sample` command in JSON format. Use the `--json` command line option to enable the JSON output. +Use the `-v` command line option `verbose` to add more information about sampling. +{{% /notice %}} + +#### Example output with annotate enabled + +Command line option `--annotate` enables translating addresses taken from samples in sample/record mode into source code line numbers. + +```console +wperf sample -e arm_spe_0/ld=1/ --annotate --pe_file cpython\PCbuild\arm64\python_d.exe --image_name python_d.exe -c 1 +``` + +```output +base address of 'python_d.exe': 0x7ff765fe1288, runtime delta: 0x7ff625fe0000 +sampling ....ee.Ctrl-C received, quit counting...e done! 
+ +Performance counter stats for core 1, no multiplexing, kernel mode excluded, on Arm Limited core implementation: +note: 'e' - normal event, 'gN' - grouped event with group number N, metric name will be appended if 'e' or 'g' comes from it + + counter value event name event idx event note + ============= ========== ========= ========== + 15,579,045,952 cycle fixed e + 40,554,143,220 sample_pop 0x4000 e + 10 sample_feed 0x4001 e + 2 sample_filtrate 0x4002 e + 0 sample_collision 0x4003 e +======================== sample source: LOAD_STORE_ATOMIC-LOAD-GP/retired+level1-data-cache-access+tlb_access, top 50 hot functions ======================== +x_mul:python312_d.dll + line_number hits filename + =========== ==== ======== + 3,590 2 C:\path\to\cpython\Objects\longobject.c + + overhead count symbol + ======== ===== ====== + 100.00 2 x_mul:python312_d.dll + 100.00% 2 top 1 in total + + 5.199 seconds time elapsed +``` + +Note: Above SPE sampling pass recorded: +- function `x_mul:python312_d.dll`: + - in source file `C:\path\to\cpython\Objects\longobject.c`, line `3590` as a hot-spot for `load_filter` enabled. + +#### Example output with disassemble enabled + +Command line option `--disassemble` enables disassemble output on sampling mode. Implies `--annotate`. + +```console +wperf sample -e arm_spe_0/ld=1/ --disassemble --pe_file cpython\PCbuild\arm64\python_d.exe --image_name python_d.exe -c 1 +``` + +```output +base address of 'python_d.exe': 0x7ff765fe1288, runtime delta: 0x7ff625fe0000 +sampling ......eCtrl-C received, quit counting... done! 
+ +Performance counter stats for core 1, no multiplexing, kernel mode excluded, on Arm Limited core implementation: +note: 'e' - normal event, 'gN' - grouped event with group number N, metric name will be appended if 'e' or 'g' comes from it + + counter value event name event idx event note + ============= ========== ========= ========== + 13,193,499,134 cycle fixed e + 34,357,259,935 sample_pop 0x4000 e + 8 sample_feed 0x4001 e + 4 sample_filtrate 0x4002 e + 0 sample_collision 0x4003 e +======================== sample source: LOAD_STORE_ATOMIC-LOAD-GP/retired+level1-data-cache-access+tlb_access, top 50 hot functions ======================== +x_mul:python312_d.dll + line_number hits filename instruction_address disassembled_line + =========== ==== ======== =================== ================= + 3,591 2 C:\path\to\cpython\Objects\longobject.c 4043b4 address instruction + ======= =========== + 4043a8 ldr x8, [sp, #0x10] + 4043ac and x8, x8, #0x3fffffff + 4043b0 mov w8, w8 + 4043b4 ldr x9, [sp, #0x20] + 4043b8 str w8, [x9] + 4043bc ldr x8, [sp, #0x20] + 4043c0 add x8, x8, #0x4 + 4043c4 str x8, [sp, #0x20] + 3,589 1 C:\path\to\cpython\Objects\longobject.c 404360 address instruction + ======= =========== + 40435c ldr x9, [sp, #0x108] + 404360 ldr x8, [sp, #0x58] + 404364 cmp x8, x9 + 404368 b.hs 0x18040440c <_PyCrossInterpreterData_UnregisterClass+0x3fc680> + +v_isub:python312_d.dll + line_number hits filename instruction_address disassembled_line + =========== ==== ======== =================== ================= + 1,603 1 C:\path\to\cpython\Objects\longobject.c 402a60 address instruction + ======= =========== + 402a60 ldr w8, [sp, #0x10] + 402a64 and w8, w8, #0x1 + 402a68 str w8, [sp, #0x10] + + overhead count symbol + ======== ===== ====== + 75.00 3 x_mul:python312_d.dll + 25.00 1 v_isub:python312_d.dll + 100.00% 4 top 2 in total + + 4.422 seconds time elapsed +``` + +Note: Above SPE sampling pass recorded: +- function `x_mul:python312_d.dll`: + - in source file 
`C:\path\to\cpython\Objects\longobject.c`, line `3591`, instruction `ldr x9, [sp, #0x20]` at address `0x4043b4` as potential hot-spot. + - in source file `C:\path\to\cpython\Objects\longobject.c`, line `3589`, instruction `ldr x8, [sp, #0x58]` at address `0x404360` as potential hot-spot. +- Function `v_isub:python312_d.dll`: + - in source file `C:\path\to\cpython\Objects\longobject.c`, line `1603`, instruction `ldr w8, [sp, #0x10]` at address `0x402a60` as potential hot-spot. diff --git a/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe_example_2.md b/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe_example_2.md new file mode 100644 index 000000000..76c907585 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe_example_2.md @@ -0,0 +1,35 @@ +--- +layout: learningpathall +title: WindowsPerf record using SPE example +weight: 4 +--- + +## Example 2: Using the `record` command to simplify things + +- The `record` command spawns the process and pins it to the core specified by the `-c` option. +- A double-dash (`--`) is a syntax used in shell commands to signify end of command options and beginning of positional arguments. In other words, it separates `wperf` CLI options from arguments that the command operates on. Use `--` to separate `wperf.exe` command line options from the process you want to spawn followed by its verbatim arguments. + +```console +wperf record -e arm_spe_0/ld=1/ -c 1 --timeout 5 -- cpython\PCbuild\arm64\python_d.exe -c 10**10**100 +``` + +{{% notice Note%}} +You can use the same sampling `--annotate` and `--disassemble` command line interface of WindowsPerf with SPE extension. +{{% /notice %}} + +The WindowsPerf `record` command is versatile, allowing you to start and stop the sampling process easily. 
It also simplifies the command line syntax, making it user-friendly and efficient. + +Example 2 can be replaced by these two commands: + +```console +start /affinity 2 cpython\PCbuild\arm64\python_d.exe -c 10**10**100 +wperf sample -e arm_spe_0/ld=1/ --pe_file cpython\PCbuild\arm64\python_d.exe --image_name python_d.exe -c 1 +``` + +## Summary + +WindowsPerf is a versatile performance analysis tool that can support both software (with CPU PMU events) and hardware sampling (with SPE extension). The type of sampling it can perform depends on the availability of the Arm Statistical Profiling Extension (SPE) in the ARM64 CPU. If the Arm SPE extension is present, WindowsPerf can leverage hardware sampling to provide detailed performance insights. Otherwise, it will rely on software sampling to gather performance data. This flexibility ensures that WindowsPerf can adapt to different hardware configurations and still deliver valuable performance metrics. + +Use `wperf sample`, a sampling mode, for determining the frequencies of event occurrences produced by program locations at the function, basic block, and/or instruction levels. + +Use `wperf record` for the same purpose as `wperf sample` when you also want WindowsPerf to automatically spawn the process and pin it to the core specified by `-c`. The process to spawn is defined by COMMAND, and you can pass verbatim arguments to the process. 
From 4246dca463b55e0206225040148ef7084456cfd6 Mon Sep 17 00:00:00 2001 From: Przemyslaw Wirkus Date: Mon, 14 Oct 2024 11:47:31 +0100 Subject: [PATCH 2/6] windowsperf_sampling_cpython_spe: fix path to python_d.exe --- .../windowsperf_sampling_cpython_spe.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md b/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md index 073189f53..3c928820b 100644 --- a/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md +++ b/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md @@ -180,7 +180,7 @@ The folder `cpython\PCbuild\arm64` should contain all the executables built in t On your Windows ARM64 machine, open a command prompt and run: ```console -cd LearningPath\PCbuild\arm64 +cd c:\path\to\cpython\PCbuild\arm64 python_d.exe ``` You should see CPython being invoked in interactive mode: From 3ce42b0b4a803872a43e73f4aa6241d83917eae3 Mon Sep 17 00:00:00 2001 From: Przemyslaw Wirkus Date: Mon, 14 Oct 2024 11:52:05 +0100 Subject: [PATCH 3/6] windowsperf_sampling_cpython_spe: fix release 3.8.0 GitVer --- .../windowsperf_sampling_cpython_spe.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md b/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md index 3c928820b..bdff89fa3 100644 --- a/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md +++ b/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md @@ -64,8 +64,8 @@ wperf 
--version ```output Component Version GitVer FeatureString ========= ======= ====== ============= - wperf 3.8.0 0d3eba3d +etw-app+spe - wperf-driver 3.8.0 0d3eba3d +trace+spe + wperf 3.8.0 6d15ddfc +etw-app+spe + wperf-driver 3.8.0 6d15ddfc +trace+spe ``` If `FeatureString` for both components (`wperf` and `wperf-driver`) contains `+spe` (and `spe_device.version_name` contains `FEAT_SPE`) you are good to go! From 1ba602833bdeb7a93e2e3166f79f1ce590917bb8 Mon Sep 17 00:00:00 2001 From: Przemyslaw Wirkus Date: Mon, 14 Oct 2024 12:17:39 +0100 Subject: [PATCH 4/6] windowsperf_sampling_cpython_spe: use HTTPS git clone URL to align with other LPs --- .../windowsperf_sampling_cpython_spe.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md b/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md index bdff89fa3..ddaed0e51 100644 --- a/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md +++ b/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md @@ -102,7 +102,7 @@ Let's build CPython locally in debug mode using the `build.bat` script. 
You have #### Clone CPython source code ```command -git clone git@github.com:python/cpython.git +git clone https://github.com/python/cpython.git ``` The output from this command will be similar to: From 4eaff79b899dee216d1a7b13f14861acf684e8f7 Mon Sep 17 00:00:00 2001 From: Przemyslaw Wirkus Date: Wed, 16 Oct 2024 10:56:25 +0100 Subject: [PATCH 5/6] Move this LP from 'laptops-and-desktops' to 'servers-and-cloud-computing' as SPE is a server CPU extension --- .../windowsperf_sampling_cpython_spe/_index.md | 0 .../windowsperf_sampling_cpython_spe/_next-steps.md | 0 .../windowsperf_sampling_cpython_spe/_review.md | 0 .../windowsperf_sampling_cpython_spe.md | 0 .../windowsperf_sampling_cpython_spe_example_1.md | 0 .../windowsperf_sampling_cpython_spe_example_2.md | 0 6 files changed, 0 insertions(+), 0 deletions(-) rename content/learning-paths/{laptops-and-desktops => servers-and-cloud-computing}/windowsperf_sampling_cpython_spe/_index.md (100%) rename content/learning-paths/{laptops-and-desktops => servers-and-cloud-computing}/windowsperf_sampling_cpython_spe/_next-steps.md (100%) rename content/learning-paths/{laptops-and-desktops => servers-and-cloud-computing}/windowsperf_sampling_cpython_spe/_review.md (100%) rename content/learning-paths/{laptops-and-desktops => servers-and-cloud-computing}/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md (100%) rename content/learning-paths/{laptops-and-desktops => servers-and-cloud-computing}/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe_example_1.md (100%) rename content/learning-paths/{laptops-and-desktops => servers-and-cloud-computing}/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe_example_2.md (100%) diff --git a/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/_index.md b/content/learning-paths/servers-and-cloud-computing/windowsperf_sampling_cpython_spe/_index.md similarity index 100% rename from 
content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/_index.md rename to content/learning-paths/servers-and-cloud-computing/windowsperf_sampling_cpython_spe/_index.md diff --git a/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/windowsperf_sampling_cpython_spe/_next-steps.md similarity index 100% rename from content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/_next-steps.md rename to content/learning-paths/servers-and-cloud-computing/windowsperf_sampling_cpython_spe/_next-steps.md diff --git a/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/_review.md b/content/learning-paths/servers-and-cloud-computing/windowsperf_sampling_cpython_spe/_review.md similarity index 100% rename from content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/_review.md rename to content/learning-paths/servers-and-cloud-computing/windowsperf_sampling_cpython_spe/_review.md diff --git a/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md b/content/learning-paths/servers-and-cloud-computing/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md similarity index 100% rename from content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md rename to content/learning-paths/servers-and-cloud-computing/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md diff --git a/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe_example_1.md b/content/learning-paths/servers-and-cloud-computing/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe_example_1.md similarity index 100% rename from 
content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe_example_1.md rename to content/learning-paths/servers-and-cloud-computing/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe_example_1.md diff --git a/content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe_example_2.md b/content/learning-paths/servers-and-cloud-computing/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe_example_2.md similarity index 100% rename from content/learning-paths/laptops-and-desktops/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe_example_2.md rename to content/learning-paths/servers-and-cloud-computing/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe_example_2.md From c43d9f09d4ce33eb1dcbd0191a2e04f618bb99de Mon Sep 17 00:00:00 2001 From: Przemyslaw Wirkus Date: Tue, 22 Oct 2024 15:26:03 +0100 Subject: [PATCH 6/6] docs: add info that WindowsPerf release binary 3.8.0 contains separate SPE build inside the asset --- .../windowsperf_sampling_cpython_spe.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/content/learning-paths/servers-and-cloud-computing/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md b/content/learning-paths/servers-and-cloud-computing/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md index ddaed0e51..339998086 100644 --- a/content/learning-paths/servers-and-cloud-computing/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md +++ b/content/learning-paths/servers-and-cloud-computing/windowsperf_sampling_cpython_spe/windowsperf_sampling_cpython_spe.md @@ -28,7 +28,8 @@ Currently SPE is available on Windows On Arm in Test Mode only! For this learning path you will need: * A Windows on Arm (ARM64) native machine with pre-installed WindowsPerf (both driver and `wperf` CLI tool). 
See [WindowsPerf Install Guide](/install-guides/wperf/) for more details. -* CPU must support Arm SPE, an optional feature in ARMv8.2 hardware - we will show you how to check your CPU compatibility using WindowsPerf command-line tool. + * Note: The [WindowsPerf release 3.8.0](https://github.com/arm-developer-tools/windowsperf/releases/tag/3.8.0) includes a separate build with Arm SPE (Statistical Profiling Extension) support enabled. To install this version, download the release asset; you will find the WindowsPerf SPE build in the `SPE/` subdirectory. +* CPU must support the Arm SPE extension, an optional feature in ARMv8.2 hardware - we will show you how to check your CPU compatibility using the WindowsPerf command-line tool. * Basic knowledge of git and Python. * See [Install Git on Windows](https://github.com/git-guides/install-git#install-git-on-windows) for more details.