From fb58f9dbc898c059ade9cb1baced82d67a03c9ed Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 19 Oct 2023 18:27:46 +0300 Subject: [PATCH 1/4] WIP: restrict keyword C99 Arm Learning Paths material --- .../restrict-keyword-c99/_index.md | 38 ++++ .../restrict-keyword-c99/_next-steps.md | 23 ++ .../restrict-keyword-c99/_review.md | 48 ++++ .../restrict-example-sve2.md | 95 ++++++++ .../restrict-keyword-c99/what-is-restrict.md | 213 ++++++++++++++++++ .../when-to-use-restrict.md | 15 ++ 6 files changed, 432 insertions(+) create mode 100644 content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/_index.md create mode 100644 content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/_next-steps.md create mode 100644 content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/_review.md create mode 100644 content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/restrict-example-sve2.md create mode 100644 content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/what-is-restrict.md create mode 100644 content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/when-to-use-restrict.md diff --git a/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/_index.md b/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/_index.md new file mode 100644 index 000000000..17f9bc591 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/_index.md @@ -0,0 +1,38 @@ +--- +title: restrict keyword in C99 + +minutes_to_complete: 20 + +who_is_this_for: C developers who are interested in software optimization. 
+ +learning_objectives: + - Learn the importance of using 'restrict' keyword in C correctly + +prerequisites: + - An Arm based system with Linux OS and recent compiler (clang or gcc) + +author_primary: Konstantinos Margaritis, VectorCamp + +### Tags +skilllevels: Advanced +subjects: Programming +armips: + - Aarch64 + - Armv8-a + - Armv9-a +tools_software_languages: + - Linux + - GCC + - Clang + - SVE2 + - Coding +operatingsystems: + - Linux + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/_next-steps.md new file mode 100644 index 000000000..ffaa68ad4 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/_next-steps.md @@ -0,0 +1,23 @@ +--- +next_step_guidance: PLACEHOLDER TEXT 1 + +recommended_path: /learning-paths/PLACEHOLDER_CATEGORY/PLACEHOLDER_LEARNING_PATH/ + +further_reading: + - resource: + title: Wikipedia restrict entry + link: https://en.wikipedia.org/wiki/Restrict + type: documentation + - resource: + title: Godbolt restrict tests + link: https://godbolt.org/z/PxWxjc1oh + type: website + + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +weight: 21 # set to always be larger than the content in this path, and one more than 'review' +title: "Next Steps" # Always the same +layout: "learningpathall" # All files under learning paths have this same wrapper 
+--- diff --git a/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/_review.md b/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/_review.md new file mode 100644 index 000000000..db48157a9 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/_review.md @@ -0,0 +1,48 @@ +--- +review: + - questions: + question: > + Where is `restrict` placed in the code? + answers: + - In the function declaration + - As an enum value + - Between the pointer symbol (*) and the argument name + correct_answer: 3 + explanation: > + `restrict` is placed in the arguments list of a function, between the * and the variable name, like this: + `int func(char *restrict arg)` + - questions: + question: > + What does `restrict` do? + answers: + - It increases the performance of the CPU cores, making your program run faster + - It issues a command to clear the cache, leaving more room for your program + - It restricts the standard of the C library used to C99 + - It hints the compiler that the memory pointed to by the variable cannot be accessed through any other means apart from this variable, inside the particular function + correct_answer: 4 + explanation: > + In order for the compiler to better schedule the instructions of a function, it needs to know if there is any + dependency between the argument variables. If there is none, usually the compiler can group together instructions + increasing performance and efficiency. + + - questions: + question: > + Which language supports `restrict` + answers: + - Python + - C and C++ + - C only (after C99) + - Rust + correct_answer: 3 + explanation: > + `restrict` is a C-only keyword, it does nothing on C++. 
+ + + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +title: "Review" # Always the same title +weight: 20 # Set to always be larger than the content in this path +layout: "learningpathall" # All files under learning paths have this same wrapper +--- diff --git a/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/restrict-example-sve2.md b/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/restrict-example-sve2.md new file mode 100644 index 000000000..334f5aadd --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/restrict-example-sve2.md @@ -0,0 +1,95 @@ +--- +title: Another example with SVE2 +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Example 2: SVE2 unleashed + +Let's try another example, one from [gcc restrict pointer examples](https://www.gnu.org/software/c-intro-and-ref/manual/html_node/restrict-Pointer-Example.html): + +```C +void process_data (const char *in, char *out, size_t size) +{ + for (int i = 0; i < size; i++) + out[i] = in[i] + in[i + 1]; +} +``` + +This example will be easier to demonstrate with SVE2, and we found gcc 13 to have a better result than clang, this is the output of `gcc-13 -O3 -march=armv9-a`: + +``` +process_data: + cbz x2, .L1 + add x5, x0, 1 + cntb x3 + sub x4, x1, x5 + sub x3, x3, #1 + cmp x4, x3 + bls .L6 + mov w4, w2 + mov x3, 0 + whilelo p0.b, wzr, w2 +.L4: + ld1b z0.b, p0/z, [x0, x3] + ld1b z1.b, p0/z, [x5, x3] + add z0.b, z0.b, z1.b + st1b z0.b, p0, [x1, x3] + incb x3 + whilelo p0.b, w3, w4 + b.any .L4 +.L1: + ret +.L6: + mov x3, 0 +.L3: + ldrb w4, [x5, x3] + ldrb w6, [x0, x3] + add w4, w4, w6 + strb w4, [x1, x3] + add x3, x3, 1 + cmp x2, x3 + bne .L3 + ret +``` + +We will not go into explaining the assembly, but we will note that gcc correctly uses the SVE2 `while*` 
instructions to do the loops, resulting in far smaller code than with Neon. But in order to illustrate our point, let's try adding `restrict` to pointer `in`: + +```C +void process_data (const char *restrict in, char *out, size_t size) +{ + for (int i = 0; i < size; i++) + out[i] = in[i] + in[i + 1]; +} +``` + +This is now the output from gcc-13: +``` +process_data: + cbz x2, .L1 + add x5, x0, 1 + mov w4, w2 + mov x3, 0 + whilelo p0.b, wzr, w2 +.L3: + ld1b z1.b, p0/z, [x0, x3] + ld1b z0.b, p0/z, [x5, x3] + add z0.b, z0.b, z1.b + st1b z0.b, p0, [x1, x3] + incb x3 + whilelo p0.b, w3, w4 + b.any .L3 +.L1: + ret +``` + +This is a huge improvement! Code size reduction is down from 30 lines to 14, less than half the original size, and faster too. In both cases, you will note that the main loop `.L3` is exactly the same, but the entry and exit code of the function are very much simplified, because the compiler was able to distinguish that the memory pointed by `in` does not overlap with memory pointed by `out`, it was able to simplify the conditions for entering and exiting the main loop. + +But I can almost hear the question: "Why is that important if the main loop is still the same?" +And it is a right question. The answer is this: + +If your function is going to be called once and run over tens of billions of elements, then saving a few instructions before and after the main loop does not really matter. + +But if your function is called on smaller sizes millions or even *billions* of times, then saving a few instructions in this function means we are saving a few *billions* of instructions total, which means less time to spend running on the CPU and less energy wasted. 
diff --git a/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/what-is-restrict.md b/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/what-is-restrict.md new file mode 100644 index 000000000..62dc2d760 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/what-is-restrict.md @@ -0,0 +1,213 @@ +--- +title: What problem does restrict solve? +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## The problem: Overlapping memory regions as pointer arguments + +Before we go into detail of the `restrict` keyword, let's first demonstrate the problem. + +Let's consider this C code, which is a variation of the one in [wikipedia](https://en.wikipedia.org/wiki/Restrict): +```C +#include +#include +#include + +void scaleVectors(int64_t *A, int64_t *B, int64_t *C) { + for (int i = 0; i < 4; i++) { + A[i] *= *C; + B[i] *= *C; + } +} + +void printVector(char *t, int64_t *A) { + printf("%s: ", t); + for (int i=0;i < 4; i++) { + printf("%ld ", A[i]); + } + printf("\n"); +} + +int main() { + int64_t a[] = { 1, 2, 3, 4, 5, 6, 7, 8 }; + int64_t *b = &a[2]; + int64_t c = 2; + + printVector("a(before)", a); + printVector("b(before)", b); + scaleVectors(a, b, &c); + printVector("a(after) ", a); + printVector("b(after) ", b); +} +``` + +So, there are 2 points to make here: +1. `scaleVectors()` is the important function here, it scales two vectors by the same scalefactor `*C` +2. vector `a` overlaps with vector `b`. (`b = &a[2]`). + +this rather simple program produces this output: +``` +a(before): 1 2 3 4 +b(before): 3 4 5 6 +a(after) : 2 4 12 16 +b(after) : 12 16 10 12 +``` + +Notice that after the scaling the contents of `a` are also affected by the scaling of `b` as their elements overlap in memory. 
+ +We will include the assembly output of `scaleVectors` as produced by `clang-17 -O3`: + +``` +scaleVectors: // @scaleVectors + ldr x8, [x2] + ldr x9, [x0] + mul x8, x9, x8 + str x8, [x0] + ldr x8, [x2] + ldr x9, [x1] + mul x8, x9, x8 + str x8, [x1] + ldr x8, [x2] + ldr x9, [x0, #8] + mul x8, x9, x8 + str x8, [x0, #8] + ldr x8, [x2] + ldr x9, [x1, #8] + mul x8, x9, x8 + str x8, [x1, #8] + ldr x8, [x2] + ldr x9, [x0, #16] + mul x8, x9, x8 + str x8, [x0, #16] + ldr x8, [x2] + ldr x9, [x1, #16] + mul x8, x9, x8 + str x8, [x1, #16] + ldr x8, [x2] + ldr x9, [x0, #24] + mul x8, x9, x8 + str x8, [x0, #24] + ldr x8, [x2] + ldr x9, [x1, #24] + mul x8, x9, x8 + str x8, [x1, #24] + ret +``` + +This doesn't look optimal. `scaleVectors` seems to be doing each load,multiplication,store in sequence, surely it can be further optimized? This is because the memory pointers are overlapping, let's try different assignments of `a` and `b` in `main()` to make them explicitly independent, perhaps the compiler can detect that and better schedule the instructions. + +``` + int64_t a[] = { 1, 2, 3, 4 }; + int64_t b[] = { 5, 6, 7, 8 }; +``` + +Unsurprisingly, the disassembled output of `scaleVectors` is the same. The reason for this is that the compiler has no hint of the dependency between the two pointers used in the function so it has no choice than to assume that it has to process one element at a time. The function has no way of knowing with what arguments it is to be called. We see 8 instances of `mul`, which is correct but the number of loads and stores in between indicates that the CPU spends its time waiting for data to arrive from/to the cache. We need a way to be able to hint the compiler that it can assume the buffers passed are independent. + +## The Solution: restrict + +This is what the C99 `restrict` keyword has come to solve. 
It instructs the compiler that the passed arguments are in no way dependant on each other and access to the memory of each happens only through the respective pointer. This way the compiler can schedule the instructions in a much better way. In essence it can group and schedule the loads and stores. `restrict` only works in C, not in C++. + +Let's add `restrict` to `A` in the parameter list: +```C +void scaleVectors(int64_t *restrict A, int64_t *B, int64_t *C) { + for (int i = 0; i < 4; i++) { + A[i] *= *C; + B[i] *= *C; + } +} +``` + +This is the assembly output with `clang-17` (gcc has a similar output): + +```assembly +scaleVectors: // @scaleVectors + ldp x9, x10, [x1] + ldr x8, [x2] + ldp x11, x12, [x1, #16] + mul x9, x9, x8 + ldp x13, x14, [x0] + str x9, [x1] + ldr x9, [x2] + mul x8, x13, x8 + mul x10, x10, x9 + mul x9, x14, x9 + str x10, [x1, #8] + ldr x10, [x2] + stp x8, x9, [x0] + mul x11, x11, x10 + str x11, [x1, #16] + ldp x15, x11, [x0, #16] + ldr x13, [x2] + mul x10, x15, x10 + mul x11, x11, x13 + mul x12, x12, x13 + stp x10, x11, [x0, #16] + str x12, [x1, #24] + ret +``` + +We see an obvious reduction in the number of instructions, from 32 instructions down to 22! That's 68% of the original count, which is impressive on its own. One can easily see that the loads are grouped, as well as the multiplications. Of course, still 8 multiplications, that cannot change, but far fewer loads and stores as the compiler found the opportunity to use `LDP`/`STP` which load/store in pairs for the pointer `A`. 
+ +Let's try adding `restrict` to `B` as well: +```C +void scaleVectors(int64_t *restrict A, int64_t *restrict B, int64_t *C) { + for (int i = 0; i < 4; i++) { + A[i] *= *C; + B[i] *= *C; + } +} +``` + +And the assembly output with `clang-17`: + +``` +scaleVectors: // @scaleVectors + ldp x9, x10, [x0] + ldr x8, [x2] + ldp x11, x12, [x0, #16] + ldp x13, x14, [x1] + mul x9, x9, x8 + ldp x15, x16, [x1, #16] + mul x10, x10, x8 + mul x11, x11, x8 + mul x12, x12, x8 + mul x13, x13, x8 + stp x9, x10, [x0] + mul x9, x14, x8 + mul x10, x15, x8 + mul x8, x16, x8 + stp x11, x12, [x0, #16] + stp x13, x9, [x1] + stp x10, x8, [x1, #16] + ret +``` + +Another reduction in the number of instructions, down to 17, for a total reduction to 53% the original count. This time, only 5 loads and 4 stores. And as before, all the loads/stores are paired (because the `LDP`/`STP` instructions are used). + +It is interesting to see that in such an example, adding just the `restrict` keyword reduced our code size to almost half. This will have an obvious impact in performance and efficiency. + +## What about SVE2? + +We have shown the obvious benefit of `restrict` in this function, on an armv8-a CPU, but we have new armv9-a CPUs out there with SVE2 as well as Neon/ASIMD. +Could the compiler generate better code in that case using `restrict`? To save time, the output without `restrict` is almost the same, however with `restrict` used, this is the result (we used `clang-17 -O3 -march=armv9-a`): + +``` +scaleVectors: // @scaleVectors + ldp q1, q2, [x0] + ldp q3, q4, [x1] + ld1r { v0.2d }, [x2] + mul z1.d, z1.d, z0.d + mul z2.d, z2.d, z0.d + stp q1, q2, [x0] + mul z1.d, z3.d, z0.d + mul z0.d, z4.d, z0.d + stp q1, q0, [x1] + ret +``` + +This is just 10 instructions, only 31% of the original code size! The compiler made a great use of SVE2 features, combining the multiplications and reducing them to 4, at the same time grouping loads and stores down to 2 each. 
We have optimized our code more than 3x by only adding a C99 keyword! + +We are going to look at another example next. \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/when-to-use-restrict.md b/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/when-to-use-restrict.md new file mode 100644 index 000000000..7ed04cb93 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/when-to-use-restrict.md @@ -0,0 +1,15 @@ +--- +title: When can we use restrict +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## So, when can we use restrict? + +This is all very good, but when can we use it? Or put differently, how to recognize we need `restrict` in our code? + +`restrict` as a pointer attribute is rather easy to test. As a rule of thumb, if our function includes one or more pointers to memory objects as arguments, we can use `restrict` if we are certain that the memory pointed by those pointer arguments does not overlap and there is no other way to access it in the body of the function, except by the use of those pointers -eg. there is no other global pointer, or some other indirect way to access these elements. + +If this applies, then it's safe to try `restrict`. Unfortunately, even if the above holds, it is still possible that the compiler will not detect a pattern that is liable for optimization and we might not see any reduction in the code or any speed up. It is up to the compiler, some cases clang handles better or differently than gcc, and vice versa, and that even depends on the version. If you have a particular piece of code that falls in the above criteria that you would care to optimize, before you attempt to refactor it completely, or rewrite it in asm or SIMD, it might be worth a shot to try `restrict`. Even saving a couple of instructions in a critical loop function is worth having to add just one keyword! 
\ No newline at end of file From b991ad1b90d73062c1b35fce1f0a6db5b3c1934a Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 20 Oct 2023 11:18:40 +0300 Subject: [PATCH 2/4] Changed category, fixed minor issues --- .../restrict-keyword-c99/_index.md | 5 ++- .../restrict-keyword-c99/_next-steps.md | 4 +-- .../restrict-keyword-c99/_review.md | 12 +++---- .../restrict-example-sve2.md | 2 +- .../restrict-keyword-c99/what-is-restrict.md | 8 ++--- .../when-to-use-restrict.md | 35 +++++++++++++++++++ .../when-to-use-restrict.md | 15 -------- 7 files changed, 50 insertions(+), 31 deletions(-) rename content/learning-paths/{servers-and-cloud-computing => embedded-systems}/restrict-keyword-c99/_index.md (95%) rename content/learning-paths/{servers-and-cloud-computing => embedded-systems}/restrict-keyword-c99/_next-steps.md (78%) rename content/learning-paths/{servers-and-cloud-computing => embedded-systems}/restrict-keyword-c99/_review.md (70%) rename content/learning-paths/{servers-and-cloud-computing => embedded-systems}/restrict-keyword-c99/restrict-example-sve2.md (91%) rename content/learning-paths/{servers-and-cloud-computing => embedded-systems}/restrict-keyword-c99/what-is-restrict.md (90%) create mode 100644 content/learning-paths/embedded-systems/restrict-keyword-c99/when-to-use-restrict.md delete mode 100644 content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/when-to-use-restrict.md diff --git a/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/_index.md b/content/learning-paths/embedded-systems/restrict-keyword-c99/_index.md similarity index 95% rename from content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/_index.md rename to content/learning-paths/embedded-systems/restrict-keyword-c99/_index.md index 17f9bc591..db51af3da 100644 --- a/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/_index.md +++ 
b/content/learning-paths/embedded-systems/restrict-keyword-c99/_index.md @@ -1,9 +1,9 @@ --- title: restrict keyword in C99 -minutes_to_complete: 20 +minutes_to_complete: 30 -who_is_this_for: C developers who are interested in software optimization. +who_is_this_for: C developers who are interested in software optimization learning_objectives: - Learn the importance of using 'restrict' keyword in C correctly @@ -21,7 +21,6 @@ armips: - Armv8-a - Armv9-a tools_software_languages: - - Linux - GCC - Clang - SVE2 diff --git a/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/_next-steps.md b/content/learning-paths/embedded-systems/restrict-keyword-c99/_next-steps.md similarity index 78% rename from content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/_next-steps.md rename to content/learning-paths/embedded-systems/restrict-keyword-c99/_next-steps.md index ffaa68ad4..ba23e557c 100644 --- a/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/_next-steps.md +++ b/content/learning-paths/embedded-systems/restrict-keyword-c99/_next-steps.md @@ -1,7 +1,7 @@ --- -next_step_guidance: PLACEHOLDER TEXT 1 +next_step_guidance: You should now be able to test the `restrict` keyword on your own or other open-source code and discover potential optimizations! 
-recommended_path: /learning-paths/PLACEHOLDER_CATEGORY/PLACEHOLDER_LEARNING_PATH/ +recommended_path: /learning-paths/embedded-systems/ further_reading: - resource: diff --git a/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/_review.md b/content/learning-paths/embedded-systems/restrict-keyword-c99/_review.md similarity index 70% rename from content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/_review.md rename to content/learning-paths/embedded-systems/restrict-keyword-c99/_review.md index db48157a9..d9f0a8080 100644 --- a/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/_review.md +++ b/content/learning-paths/embedded-systems/restrict-keyword-c99/_review.md @@ -6,23 +6,23 @@ review: answers: - In the function declaration - As an enum value - - Between the pointer symbol (*) and the argument name + - Between the pointer symbol (*) and the parameter name correct_answer: 3 explanation: > - `restrict` is placed in the arguments list of a function, between the * and the variable name, like this: + `restrict` is placed in the arguments list of a function, between the * and the parameter name, like this: `int func(char *restrict arg)` - questions: question: > What does `restrict` do? 
answers: - - It increases the performance of the CPU cores, making your program run faster + - It increases the frequency of the CPU cores, making your program run faster - It issues a command to clear the cache, leaving more room for your program - It restricts the standard of the C library used to C99 - - It hints the compiler that the memory pointed to by the variable cannot be accessed through any other means apart from this variable, inside the particular function + - It hints to the compiler that the memory pointed to by the parameter, cannot be accessed through any other means inside the particular function except, using this pointer correct_answer: 4 explanation: > In order for the compiler to better schedule the instructions of a function, it needs to know if there is any - dependency between the argument variables. If there is none, usually the compiler can group together instructions + dependency between the parameter variables. If there is no dependency, usually the compiler can group together instructions increasing performance and efficiency. - questions: @@ -35,7 +35,7 @@ review: - Rust correct_answer: 3 explanation: > - `restrict` is a C-only keyword, it does nothing on C++. 
+ `restrict` is a C-only keyword, it does not exist on C++ (`__restrict__` does, but it is not exactly the same) diff --git a/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/restrict-example-sve2.md b/content/learning-paths/embedded-systems/restrict-keyword-c99/restrict-example-sve2.md similarity index 91% rename from content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/restrict-example-sve2.md rename to content/learning-paths/embedded-systems/restrict-keyword-c99/restrict-example-sve2.md index 334f5aadd..78111fdf8 100644 --- a/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/restrict-example-sve2.md +++ b/content/learning-paths/embedded-systems/restrict-keyword-c99/restrict-example-sve2.md @@ -55,7 +55,7 @@ process_data: ret ``` -We will not go into explaining the assembly, but we will note that gcc correctly uses the SVE2 `while*` instructions to do the loops, resulting in far smaller code than with Neon. But in order to illustrate our point, let's try adding `restrict` to pointer `in`: +Do not worry about each instruction in the assembly here, but notice that gcc correctly uses the SVE2 `while*` instructions to do the loops, resulting in far smaller code than with Neon. 
But in order to illustrate our point, let's try adding `restrict` to pointer `in`: ```C void process_data (const char *restrict in, char *out, size_t size) diff --git a/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/what-is-restrict.md b/content/learning-paths/embedded-systems/restrict-keyword-c99/what-is-restrict.md similarity index 90% rename from content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/what-is-restrict.md rename to content/learning-paths/embedded-systems/restrict-keyword-c99/what-is-restrict.md index 62dc2d760..9cc9e9732 100644 --- a/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/what-is-restrict.md +++ b/content/learning-paths/embedded-systems/restrict-keyword-c99/what-is-restrict.md @@ -10,7 +10,7 @@ layout: learningpathall Before we go into detail of the `restrict` keyword, let's first demonstrate the problem. -Let's consider this C code, which is a variation of the one in [wikipedia](https://en.wikipedia.org/wiki/Restrict): +Let's consider this C code: ```C #include #include @@ -97,18 +97,18 @@ scaleVectors: // @scaleVectors ret ``` -This doesn't look optimal. `scaleVectors` seems to be doing each load,multiplication,store in sequence, surely it can be further optimized? This is because the memory pointers are overlapping, let's try different assignments of `a` and `b` in `main()` to make them explicitly independent, perhaps the compiler can detect that and better schedule the instructions. +This doesn't look optimal. `scaleVectors` seems to be doing each load, multiplication, store in sequence, surely it can be further optimized? This is because the memory pointers are overlapping, let's try different assignments of `a` and `b` in `main()` to make them explicitly independent, perhaps the compiler can detect that and better schedule the instructions. 
``` int64_t a[] = { 1, 2, 3, 4 }; int64_t b[] = { 5, 6, 7, 8 }; ``` -Unsurprisingly, the disassembled output of `scaleVectors` is the same. The reason for this is that the compiler has no hint of the dependency between the two pointers used in the function so it has no choice than to assume that it has to process one element at a time. The function has no way of knowing with what arguments it is to be called. We see 8 instances of `mul`, which is correct but the number of loads and stores in between indicates that the CPU spends its time waiting for data to arrive from/to the cache. We need a way to be able to hint the compiler that it can assume the buffers passed are independent. +Unsurprisingly, the disassembled output of `scaleVectors` is the same. The reason for this is that the compiler has no hint of the dependency between the two pointers used in the function so it has no choice than to assume that it has to process one element at a time. The function has no way of knowing with what arguments it is to be called. We see 8 instances of `mul`, which is correct but the number of loads and stores inbetween indicates that the CPU spends its time waiting for data to arrive from/to the cache. We need a way to be able to hint the compiler that it can assume the buffers passed are independent. ## The Solution: restrict -This is what the C99 `restrict` keyword has come to solve. It instructs the compiler that the passed arguments are in no way dependant on each other and access to the memory of each happens only through the respective pointer. This way the compiler can schedule the instructions in a much better way. In essence it can group and schedule the loads and stores. `restrict` only works in C, not in C++. +This is what the C99 `restrict` keyword has come to solve. It instructs the compiler that the passed arguments are in no way dependant on each other and access to the memory of each happens only through the respective pointer. 
This way the compiler can schedule the instructions in a much better way. In essence it can group and schedule the loads and stores. As a note, `restrict` only works in C, not in C++. Let's add `restrict` to `A` in the parameter list: ```C diff --git a/content/learning-paths/embedded-systems/restrict-keyword-c99/when-to-use-restrict.md b/content/learning-paths/embedded-systems/restrict-keyword-c99/when-to-use-restrict.md new file mode 100644 index 000000000..65f617558 --- /dev/null +++ b/content/learning-paths/embedded-systems/restrict-keyword-c99/when-to-use-restrict.md @@ -0,0 +1,35 @@ +--- +title: When can we use restrict +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## So, when can we use restrict? + +This is all very good, but when can we use it? Or put differently, how to recognize we need `restrict` in our code? + +`restrict` as a pointer attribute is rather easy to test. As a rule of thumb, if our function includes one or more pointers to memory objects as arguments, we can use `restrict` if we are certain that the memory pointed by those pointer arguments does not overlap and there is no other way to access it in the body of the function, except by the use of those pointers, e.g. there is no other global pointer, or some other indirect way to access these elements. + +Let's show a counter-example: + +``` +int A[10]; + +int f(int *B, size_t n) { + int sum = 0; + for (int i=0; i < n; i++) { + sum += A[i] * B[i]; // B is used in conjunction with A + } +} + +int main() { + int s = f(A, 10); // A is passed to f, so f will be calculating sum of A[i] * A[i] elements + printf("sum = %d", s); +} +``` + +This example does not benefit from `restrict` at all in either gcc or clang. + +However, there are plenty of cases that are candidates for the `restrict` optimization. And it's safe and easy to try. 
Nevertheless, even if it looks like a good candidate, it is still possible that the compiler will not detect a pattern that is suited for optimization and we might not see any reduction in the code or speed gain. It is up to the compiler, some cases clang handles better or differently than gcc, and vice versa, and that even depends on the version. If you have a particular piece of code that falls in the above criteria that you would care to optimize, before you attempt to refactor it completely, or rewrite it in assembly or use any SIMD instructions, it might be worth a shot to try `restrict`. Even saving a couple of instructions in a critical loop function is worth having to add just one keyword! \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/when-to-use-restrict.md b/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/when-to-use-restrict.md deleted file mode 100644 index 7ed04cb93..000000000 --- a/content/learning-paths/servers-and-cloud-computing/restrict-keyword-c99/when-to-use-restrict.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -title: When can we use restrict -weight: 4 - -### FIXED, DO NOT MODIFY -layout: learningpathall ---- - -## So, when can we use restrict? - -This is all very good, but when can we use it? Or put differently, how to recognize we need `restrict` in our code? - -`restrict` as a pointer attribute is rather easy to test. As a rule of thumb, if our function includes one or more pointers to memory objects as arguments, we can use `restrict` if we are certain that the memory pointed by those pointer arguments does not overlap and there is no other way to access it in the body of the function, except by the use of those pointers -eg. there is no other global pointer, or some other indirect way to access these elements. - -If this applies, then it's safe to try `restrict`. 
Unfortunately, even if the above holds, it is still possible that the compiler will not detect a pattern that is liable for optimization and we might not see any reduction in the code or any speed up. It is up to the compiler, some cases clang handles better or differently than gcc, and vice versa, and that even depends on the version. If you have a particular piece of code that falls in the above criteria that you would care to optimize, before you attempt to refactor it completely, or rewrite it in asm or SIMD, it might be worth a shot to try `restrict`. Even saving a couple of instructions in a critical loop function is worth having to add just one keyword! \ No newline at end of file From cc6c2b4c4bb3cf482526f33c9c788e8ca883d600 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 25 Oct 2023 13:42:55 +0300 Subject: [PATCH 3/4] fixed explanations according to comments --- .../restrict-keyword-c99/restrict-example-sve2.md | 4 ++-- .../restrict-keyword-c99/what-is-restrict.md | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/content/learning-paths/embedded-systems/restrict-keyword-c99/restrict-example-sve2.md b/content/learning-paths/embedded-systems/restrict-keyword-c99/restrict-example-sve2.md index 78111fdf8..101bbb33c 100644 --- a/content/learning-paths/embedded-systems/restrict-keyword-c99/restrict-example-sve2.md +++ b/content/learning-paths/embedded-systems/restrict-keyword-c99/restrict-example-sve2.md @@ -55,7 +55,7 @@ process_data: ret ``` -Do not worry about each instruction in the assembly here, but notice that gcc correctly uses the SVE2 `while*` instructions to do the loops, resulting in far smaller code than with Neon. But in order to illustrate our point, let's try adding `restrict` to pointer `in`: +Do not worry about each instruction in the assembly here, but notice that gcc has added two loops: one that uses the SVE2 `while*` instructions to do the processing (.L4) and one scalar loop (.L3).
The latter is executed in case there is any pointer aliasing - basically, if there is any overlap between the memory regions the pointers refer to. Let's try adding `restrict` to pointer `in`: ```C void process_data (const char *restrict in, char *out, size_t size) @@ -85,7 +85,7 @@ process_data: ret ``` -This is a huge improvement! Code size reduction is down from 30 lines to 14, less than half the original size, and faster too. In both cases, you will note that the main loop `.L3` is exactly the same, but the entry and exit code of the function are very much simplified, because the compiler was able to distinguish that the memory pointed by `in` does not overlap with memory pointed by `out`, it was able to simplify the conditions for entering and exiting the main loop. +This is a huge improvement! Code size reduction is down from 30 lines to 14, less than half the original size. In both cases, you will note that the main loop (`.L4` in the former case, `.L3` in the latter) is exactly the same, but the entry and exit code of the function are very much simplified, because the compiler was able to distinguish that the memory pointed by `in` does not overlap with memory pointed by `out`, it was able to simplify the code by eliminating the scalar loop. But I can almost hear the question: "Why is that important if the main loop is still the same?" And it is a right question. The answer is this: diff --git a/content/learning-paths/embedded-systems/restrict-keyword-c99/what-is-restrict.md b/content/learning-paths/embedded-systems/restrict-keyword-c99/what-is-restrict.md index 9cc9e9732..e4d08eeb5 100644 --- a/content/learning-paths/embedded-systems/restrict-keyword-c99/what-is-restrict.md +++ b/content/learning-paths/embedded-systems/restrict-keyword-c99/what-is-restrict.md @@ -97,7 +97,7 @@ scaleVectors: // @scaleVectors ret ``` -This doesn't look optimal. `scaleVectors` seems to be doing each load, multiplication, store in sequence, surely it can be further optimized?
This is because the memory pointers are overlapping, let's try different assignments of `a` and `b` in `main()` to make them explicitly independent, perhaps the compiler can detect that and better schedule the instructions. +This doesn't look optimal. `scaleVectors` seems to be doing each load, multiplication, store in sequence, surely it can be further optimized? This is because the memory pointers are overlapping, let's try different assignments of `a` and `b` in `main()` to make them explicitly independent, perhaps the compiler can detect that and generate faster instructions to do the same thing. ``` int64_t a[] = { 1, 2, 3, 4 }; @@ -120,7 +120,7 @@ void scaleVectors(int64_t *restrict A, int64_t *B, int64_t *C) { } ``` -This is the assembly output with `clang-17` (gcc has a similar output): +This is the assembly output with `clang-17 -O3` (gcc has a similar output): ```assembly scaleVectors: // @scaleVectors @@ -161,7 +161,7 @@ void scaleVectors(int64_t *restrict A, int64_t *restrict B, int64_t *C) { } ``` -And the assembly output with `clang-17`: +And the assembly output with `clang-17 -O3`: ``` scaleVectors: // @scaleVectors From e59702674db1a18307dde44278e2fffe851d20f5 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 27 Oct 2023 11:07:26 +0300 Subject: [PATCH 4/4] Added explanation --- .../restrict-keyword-c99/restrict-example-sve2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/learning-paths/embedded-systems/restrict-keyword-c99/restrict-example-sve2.md b/content/learning-paths/embedded-systems/restrict-keyword-c99/restrict-example-sve2.md index 101bbb33c..947f0e4a5 100644 --- a/content/learning-paths/embedded-systems/restrict-keyword-c99/restrict-example-sve2.md +++ b/content/learning-paths/embedded-systems/restrict-keyword-c99/restrict-example-sve2.md @@ -85,7 +85,7 @@ process_data: ret ``` -This is a huge improvement! Code size reduction is down from 30 lines to 14, less than half the original size. 
In both cases, you will note that the main loop (`.L4` in the former case, `.L3` in the latter) is exactly the same, but the entry and exit code of the function are very much simplified, because the compiler was able to distinguish that the memory pointed by `in` does not overlap with memory pointed by `out`, it was able to simplify the code by eliminating the scalar loop. +This is a huge improvement! Code size is down from 30 lines to 14, less than half the original size. In both cases, you will note that the main loop (`.L4` in the former case, `.L3` in the latter) is exactly the same, but the entry and exit code of the function are very much simplified. Because the compiler was able to determine that the memory pointed to by `in` does not overlap with the memory pointed to by `out`, it was able to simplify the code by eliminating the scalar loop and removing the associated code that checked whether it needed to enter it. But I can almost hear the question: "Why is that important if the main loop is still the same?" And it is a right question. The answer is this: