From 8d39e7895e62adca00670320658880483bb4a439 Mon Sep 17 00:00:00 2001 From: tomek Date: Mon, 10 Jun 2024 18:58:52 +0200 Subject: [PATCH 1/6] Publish cl_img_matrix_multiply extension specification. --- extensions/cl_img_matrix_multiply.asciidoc | 242 +++++++++++++++++++++ extensions/extensions.txt | 2 + 2 files changed, 244 insertions(+) create mode 100644 extensions/cl_img_matrix_multiply.asciidoc diff --git a/extensions/cl_img_matrix_multiply.asciidoc b/extensions/cl_img_matrix_multiply.asciidoc new file mode 100644 index 00000000..a9537ea8 --- /dev/null +++ b/extensions/cl_img_matrix_multiply.asciidoc @@ -0,0 +1,242 @@ +:data-uri: +:icons: font +include::../config/attribs.txt[] +:source-highlighter: coderay + += cl_img_matrix_multiply + +== Name Strings + +`cl_img_matrix_multiply` + +== Contact + +Imagination Technologies Developer Forum: + +https://forums.imgtec.com/ + +Tomasz Platek, Imagination Technologies (Tomasz.Platek 'at' imgtec.com) + +== Contributors + +CY Cheng, Imagination Technologies. + +Joe Molleson, Imagination Technologies. + +Tomasz Platek, Imagination Technologies. + +== Notice + +Copyright (c) 2024 Imagination Technologies Ltd. All Rights Reserved. + +== Status + +Draft spec, NOT APPROVED!! + +== Version + +Built On: {docdate} + +Version: Major.Minor.Patch + +== Dependencies + +This extension is written against the OpenCL Specification Version 3.0, Version V3.0.16. + +This extension requires the `cl_khr_fp16` extension. + +== Overview + +This extension provides the helper functions that allow to perform matrix multiplication efficiently. 
+ +== New OpenCL C Feature Names + +[source,c] +---- +__opencl_img_dot_interleaved +__opencl_img_matmul_2x4_4x4 +---- + +== New OpenCL C Functions + +Performs the interleaved dot product operation: + +[source,c] +---- +float2 img_dot_interleaved(gentypef a, __local float * b); +float2 img_dot_interleaved_acc(gentypef a, __local float * b, float2 acc); +---- + +Performs the matrix multiplication operation: + +[source,c] +---- +float8 img_matmul_2x4_4x4(half4 a0, half4 a1, __local half * b); +half8 img_matmul_2x4_4x4(half4 a0, half4 a1, __local half * b); +float8 img_matmul_acc_2x4_4x4(half4 a0, half4 a1, __local half * b, float4 acc0, float4 acc1); +half8 img_matmul_acc_2x4_4x4(half4 a0, half4 a1, __local half * b, half4 acc0, half4 acc1); +float8 img_matmul_2x4_4x4transposed(half4 a0, half4 a1, __local half * b); +half8 img_matmul_2x4_4x4transposed(half4 a0, half4 a1, __local half * b); +float8 img_matmul_acc_2x4_4x4transposed(half4 a0, half4 a1, __local half * b, float4 acc0, float4 acc1); +half8 img_matmul_acc_2x4_4x4transposed(half4 a0, half4 a1, __local half * b, half4 acc0, half4 acc1); +---- + +== Modifications to the OpenCL C Specification + +(Add to Table 11 - Built-in Scalar and Vector Argument Math Functions in Section 6.15.2 - Math Functions) :: ++ +-- +[cols="1,2",options="header"] +|==== +| Function | Description +| float2 *img_dot_interleaved*(gentypef _a_, pass:[__local] float * _b_) + a| `img_dot_interleaved` performs the dual dot product operation. + The input vectors of the first dot product are `a` and the vector containing the even-indexed elements of `b`, where `b` is expected to be a pointer to a vector of double the size of `a`. The result is stored into the first element of the output vector. + The input vectors of the second dot product are `a` and the vector containing the odd-indexed elements of `b`. The result is stored into the second element of the output vector. 
+ +For example, given: + +---- +a = [a0 a1] +b = [b0 b1 b2 b3] +---- + +the output vector is: + +---- +[res0 res1] = [a0 a1] x [b0 b1] + [b2 b3] +---- + +Requires that the `__opencl_img_dot_interleaved` feature macro is defined. +| float2 *img_dot_interleaved_acc*(gentypef _a_, pass:[__local] float * _b_, float2 _acc_) + a| `img_dot_interleaved` performs the dual dot product operation with the accumulator `acc`. + The input vectors of the first dot product are `a` and the vector containing the even-indexed elements of `b`, where `b` is expected to be a pointer to a vector of double the size of `a`. The result is stored into the first element of the output vector. + The input vectors of the second dot product are `a` and the vector containing the odd-indexed elements of `b`. The result is stored into the second element of the output vector. + +For example, given: + +---- +a = [a0 a1] +b = [b0 b1 b2 b3] +acc = [acc0 acc1] +---- + +the output vector is: + +---- +[res0 res1] = [a0 a1] x [b0 b1] + [acc0 acc1] + [b2 b3] +---- + +Requires that the `__opencl_img_dot_interleaved` feature macro is defined. +| float8 *img_matmul_2x4_4x4*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_) + + half8 *img_matmul_2x4_4x4*(half4 _a0_, half4 a1, pass:[__local] half * _b_) + a| `img_matmul_2x4_4x4` performs the matrix multiplication operation of matrices A and B of dimensions 2x4 and 4x4, where `a0` is the first row and `a1` is the second row of the matrix A. + The matrix B is represented by `b` that is expected to be a pointer to a 4-element array of half4. + +For example, given: + +---- +A = [a00 a01 a02 a03] + [a10 a11 a12 a13] +B = [b00 b01 b02 b03] + [b10 b11 b12 b13] + [b20 b21 b22 b23] + [b30 b31 b32 b33] +---- + +the output vector is: + +---- +[res0 res1 res2 res3] = A x B +[res4 res5 res6 res7] +---- + +Requires that the `__opencl_img_matmul_2x4_4x4` feature macro is defined. 
+| float8 *img_matmul_acc_2x4_4x4*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_, half4 _acc0_, half4 _acc1_) + + half8 *img_matmul_acc_2x4_4x4*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_, half4 acc0, half4 _acc1_) + a| `img_matmul_acc_2x4_4x4` performs the matrix multiplication operation with the accumulator of matrices A and B of dimensions 2x4 and 4x4, where `a0` is the first row and `a1` is the second row of the matrix A, and where `acc0` is the first row and `acc1` is the second row of the accumulator. + The matrix B is represented by `b` that is expected to be a pointer to a 4-element array of half4. + +For example, given: + +---- +A = [a00 a01 a02 a03] + [a10 a11 a12 a13] +B = [b00 b01 b02 b03] + [b10 b11 b12 b13] + [b20 b21 b22 b23] + [b30 b31 b32 b33] +C = [acc00 acc01 acc02 acc03] + [acc10 acc11 acc12 acc13] +---- + +the output vector is: + +---- +[res0 res1 res2 res3] = A x B + C +[res4 res5 res6 res7] +---- + +Requires that the `__opencl_img_matmul_2x4_4x4` feature macro is defined. + +| float8 *img_matmul_2x4_4x4transposed*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_) + + half8 *img_matmul_2x4_4x4transposed*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_) + a| `img_matmul_2x4_4x4transposed` performs the matrix multiplication operation of matrix A and transposed matrix B of dimensions 2x4 and 4x4, where `a0` is the first row and `a1` is the second row of the matrix A. + The matrix B is represented by `b` that is expected to be a pointer to a 4-element array of half4. + +For example, given: + +---- +A = [a00 a01 a02 a03] + [a10 a11 a12 a13] +BT = [b00 b10 b20 b30] + [b01 b11 b21 b31] + [b02 b12 b22 b32] + [b03 b13 b23 b33] +---- + +the output vector is: + +---- +[res0 res1 res2 res3] = A x BT +[res4 res5 res6 res7] +---- + +Requires that the `__opencl_img_matmul_2x4_4x4` feature macro is defined. 
+| float8 *img_matmul_acc_2x4_4x4transposed*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_, half4 _acc0_, half4 _acc1_) + + half8 *img_matmul_acc_2x4_4x4transposed*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_, half4 acc0, half4 _acc1_) + a| `img_matmul_acc_2x4_4x4transposed` performs the matrix multiplication operation with the accumulator of matrix A and transposed matrix B of dimensions 2x4 and 4x4, where `a0` is the first row and `a1` is the second row of the matrix A, and where `acc0` is the first row and `acc1` is the second row of the accumulator. + The matrix B is represented by `b` that is expected to be a pointer to a 4-element array of half4. + +For example, given: + +---- +A = [a00 a01 a02 a03] + [a10 a11 a12 a13] +BT = [b00 b10 b20 b30] + [b01 b11 b21 b31] + [b02 b12 b22 b32] + [b03 b13 b23 b33] +C = [acc00 acc01 acc02 acc03] + [acc10 acc11 acc12 acc13] +---- + +the output vector is: + +---- +[res0 res1 res2 res3] = A x BT + C +[res4 res5 res6 res7] +---- + +Requires that the `__opencl_img_matmul_2x4_4x4` feature macro is defined. +|==== +-- + +== Version History + +[cols="5,15,15,70"] +[grid="rows"] +[options="header"] +|==== +| Version | Date | Author | Changes +| 1.0.0 | 2024-06-07 | Tomasz Platek | *Initial revision* +|==== + diff --git a/extensions/extensions.txt b/extensions/extensions.txt index 573ec116..46596b9f 100644 --- a/extensions/extensions.txt +++ b/extensions/extensions.txt @@ -67,6 +67,8 @@ include::cl_img_cancel_command.asciidoc[] <<< include::cl_img_generate_mipmap.asciidoc[] <<< +include::cl_img_matrix_multiply.asciidoc[] +<<< include::cl_img_mem_properties.asciidoc[] <<< include::cl_img_use_gralloc_ptr.asciidoc[] From 472cadeb4cf9c8532b1a92b6e3fd304be73c5c0f Mon Sep 17 00:00:00 2001 From: tomek Date: Thu, 4 Jul 2024 11:53:10 +0200 Subject: [PATCH 2/6] The final draft of the cl_img_matrix_multiply extension. 
--- extensions/cl_img_matrix_multiply.asciidoc | 158 ++++++++++++++------- 1 file changed, 103 insertions(+), 55 deletions(-) diff --git a/extensions/cl_img_matrix_multiply.asciidoc b/extensions/cl_img_matrix_multiply.asciidoc index a9537ea8..58a4e6c7 100644 --- a/extensions/cl_img_matrix_multiply.asciidoc +++ b/extensions/cl_img_matrix_multiply.asciidoc @@ -28,7 +28,7 @@ Copyright (c) 2024 Imagination Technologies Ltd. All Rights Reserved. == Status -Draft spec, NOT APPROVED!! +Final Draft == Version @@ -37,13 +37,13 @@ Version: Major.Minor.Patch == Dependencies -This extension is written against the OpenCL Specification Version 3.0, Version V3.0.16. +This extension is written against the OpenCL C Specification Version V3.0.16. This extension requires the `cl_khr_fp16` extension. == Overview -This extension provides the helper functions that allow to perform matrix multiplication efficiently. +This extension adds built-in functions that exercise hardware capabilities of Imagination GPU IP and allow matrix multiplication to be implemented in a highly efficient and performant manner. 
== New OpenCL C Feature Names @@ -55,26 +55,32 @@ __opencl_img_matmul_2x4_4x4 == New OpenCL C Functions -Performs the interleaved dot product operation: +Perform the interleaved dot product operation: [source,c] ---- -float2 img_dot_interleaved(gentypef a, __local float * b); -float2 img_dot_interleaved_acc(gentypef a, __local float * b, float2 acc); +float2 img_dot_interleaved(float a,__local float2 * b); +float2 img_dot_interleaved(float2 a,__local float4 * b); +float2 img_dot_interleaved(float4 a,__local float8 * b); +float2 img_dot_interleaved(float8 a,__local float16 * b); +float2 img_dot_interleaved_acc(float a,__local float2 * b, float2 acc); +float2 img_dot_interleaved_acc(float2 a,__local float4 * b, float2 acc); +float2 img_dot_interleaved_acc(float4 a,__local float8 * b, float2 acc); +float2 img_dot_interleaved_acc(float8 a,__local float16 * b, float2 acc); ---- -Performs the matrix multiplication operation: +Perform the matrix multiplication operation: [source,c] ---- -float8 img_matmul_2x4_4x4(half4 a0, half4 a1, __local half * b); -half8 img_matmul_2x4_4x4(half4 a0, half4 a1, __local half * b); -float8 img_matmul_acc_2x4_4x4(half4 a0, half4 a1, __local half * b, float4 acc0, float4 acc1); -half8 img_matmul_acc_2x4_4x4(half4 a0, half4 a1, __local half * b, half4 acc0, half4 acc1); -float8 img_matmul_2x4_4x4transposed(half4 a0, half4 a1, __local half * b); -half8 img_matmul_2x4_4x4transposed(half4 a0, half4 a1, __local half * b); -float8 img_matmul_acc_2x4_4x4transposed(half4 a0, half4 a1, __local half * b, float4 acc0, float4 acc1); -half8 img_matmul_acc_2x4_4x4transposed(half4 a0, half4 a1, __local half * b, half4 acc0, half4 acc1); +float8 img_matmul_2x4_4x4f(half4 a0, half4 a1,__local half16 * b); +half8 img_matmul_2x4_4x4h(half4 a0, half4 a1,__local half16 * b); +float8 img_matmul_acc_2x4_4x4f(half4 a0, half4 a1,__local half16 * b, float4 acc0, float4 acc1); +half8 img_matmul_acc_2x4_4x4h(half4 a0, half4 a1,__local half16 * b, half4 acc0, half4 
acc1); +float8 img_matmul_2x4_4x4transposedf(half4 a0, half4 a1,__local half16 * b); +half8 img_matmul_2x4_4x4transposedh(half4 a0, half4 a1,__local half16 * b); +float8 img_matmul_acc_2x4_4x4transposedf(half4 a0, half4 a1,__local half16 * b, float4 acc0, float4 acc1); +half8 img_matmul_acc_2x4_4x4transposedh(half4 a0, half4 a1,__local half16 * b, half4 acc0, half4 acc1); ---- == Modifications to the OpenCL C Specification @@ -85,9 +91,12 @@ half8 img_matmul_acc_2x4_4x4transposed(half4 a0, half4 a1, __local half * b, hal [cols="1,2",options="header"] |==== | Function | Description -| float2 *img_dot_interleaved*(gentypef _a_, pass:[__local] float * _b_) +| float2 *img_dot_interleaved*(float _a_,pass:[__local] float2 * _b_) + + float2 *img_dot_interleaved*(float2 _a_,pass:[__local] float4 * _b_) + + float2 *img_dot_interleaved*(float4 _a_,pass:[__local] float8 * _b_) + + float2 *img_dot_interleaved*(float8 _a_,pass:[__local] float16 * _b_) a| `img_dot_interleaved` performs the dual dot product operation. - The input vectors of the first dot product are `a` and the vector containing the even-indexed elements of `b`, where `b` is expected to be a pointer to a vector of double the size of `a`. The result is stored into the first element of the output vector. + The input vectors of the first dot product are `a` and the vector containing the even-indexed elements of `b`. The result is stored into the first element of the output vector. The input vectors of the second dot product are `a` and the vector containing the odd-indexed elements of `b`. The result is stored into the second element of the output vector. For example, given: @@ -105,9 +114,12 @@ the output vector is: ---- Requires that the `__opencl_img_dot_interleaved` feature macro is defined. -| float2 *img_dot_interleaved_acc*(gentypef _a_, pass:[__local] float * _b_, float2 _acc_) - a| `img_dot_interleaved` performs the dual dot product operation with the accumulator `acc`. 
- The input vectors of the first dot product are `a` and the vector containing the even-indexed elements of `b`, where `b` is expected to be a pointer to a vector of double the size of `a`. The result is stored into the first element of the output vector. +| float2 *img_dot_interleaved_acc*(float _a_,pass:[__local] float2 * _b_, float2 _acc_) + + float2 *img_dot_interleaved_acc*(float2 _a_,pass:[__local] float4 * _b_, float2 _acc_) + + float2 *img_dot_interleaved_acc*(float4 _a_,pass:[__local] float8 * _b_, float2 _acc_) + + float2 *img_dot_interleaved_acc*(float8 _a_,pass:[__local] float16 * _b_, float2 _acc_) + a| `img_dot_interleaved_acc` performs the dual dot product operation with the accumulator `acc`. + The input vectors of the first dot product are `a` and the vector containing the even-indexed elements of `b`. The result is stored into the first element of the output vector. The input vectors of the second dot product are `a` and the vector containing the odd-indexed elements of `b`. The result is stored into the second element of the output vector. For example, given: @@ -126,20 +138,20 @@ the output vector is: ---- Requires that the `__opencl_img_dot_interleaved` feature macro is defined. -| float8 *img_matmul_2x4_4x4*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_) + - half8 *img_matmul_2x4_4x4*(half4 _a0_, half4 a1, pass:[__local] half * _b_) - a| `img_matmul_2x4_4x4` performs the matrix multiplication operation of matrices A and B of dimensions 2x4 and 4x4, where `a0` is the first row and `a1` is the second row of the matrix A. - The matrix B is represented by `b` that is expected to be a pointer to a 4-element array of half4. 
+| float8 *img_matmul_2x4_4x4f*(half4 _a0_, half4 _a1_,pass:[__local] half16 * _b_) + + half8 *img_matmul_2x4_4x4h*(half4 _a0_, half4 _a1_,pass:[__local] half16 * _b_) + a| `img_matmul_2x4_4x4f` and `img_matmul_2x4_4x4h` perform the matrix multiplication operation of matrices A and B of dimensions 2x4 and 4x4, where `a0` is the first row and `a1` is the second row of the matrix A. + The first row of the matrix B is represented by the elements 0-3 of `b`, the second row by the elements 4-7, the third row by the elements 8-11, and the fourth row by the elements 12-15. For example, given: ---- A = [a00 a01 a02 a03] [a10 a11 a12 a13] -B = [b00 b01 b02 b03] - [b10 b11 b12 b13] - [b20 b21 b22 b23] - [b30 b31 b32 b33] +B = [b0 b1 b2 b3] + [b4 b5 b6 b7] + [b8 b9 b10 b11] + [b12 b13 b14 b15] ---- the output vector is: @@ -150,20 +162,20 @@ the output vector is: ---- Requires that the `__opencl_img_matmul_2x4_4x4` feature macro is defined. -| float8 *img_matmul_acc_2x4_4x4*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_, half4 _acc0_, half4 _acc1_) + - half8 *img_matmul_acc_2x4_4x4*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_, half4 acc0, half4 _acc1_) - a| `img_matmul_acc_2x4_4x4` performs the matrix multiplication operation with the accumulator of matrices A and B of dimensions 2x4 and 4x4, where `a0` is the first row and `a1` is the second row of the matrix A, and where `acc0` is the first row and `acc1` is the second row of the accumulator. - The matrix B is represented by `b` that is expected to be a pointer to a 4-element array of half4. 
+| float8 *img_matmul_acc_2x4_4x4f*(half4 _a0_, half4 _a1_,pass:[__local] half16 * _b_, float4 _acc0_, float4 _acc1_) + + half8 *img_matmul_acc_2x4_4x4h*(half4 _a0_, half4 _a1_,pass:[__local] half16 * _b_, half4 _acc0_, half4 _acc1_) + a| `img_matmul_acc_2x4_4x4f` and `img_matmul_acc_2x4_4x4h` perform the matrix multiplication operation with the accumulator of matrices A and B of dimensions 2x4 and 4x4, where `a0` is the first row and `a1` is the second row of the matrix A, and where `acc0` is the first row and `acc1` is the second row of the accumulator. + The first row of the matrix B is represented by the elements 0-3 of `b`, the second row by the elements 4-7, the third row by the elements 8-11, and the fourth row by the elements 12-15. For example, given: ---- A = [a00 a01 a02 a03] [a10 a11 a12 a13] -B = [b00 b01 b02 b03] - [b10 b11 b12 b13] - [b20 b21 b22 b23] - [b30 b31 b32 b33] +B = [b0 b1 b2 b3] + [b4 b5 b6 b7] + [b8 b9 b10 b11] + [b12 b13 b14 b15] C = [acc00 acc01 acc02 acc03] [acc10 acc11 acc12 acc13] ---- @@ -172,25 +184,25 @@ the output vector is: ---- [res0 res1 res2 res3] = A x B + C -[res4 res5 res6 res7] +[res4 res5 res6 res7] ---- Requires that the `__opencl_img_matmul_2x4_4x4` feature macro is defined. -| float8 *img_matmul_2x4_4x4transposed*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_) + - half8 *img_matmul_2x4_4x4transposed*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_) - a| `img_matmul_2x4_4x4transposed` performs the matrix multiplication operation of matrix A and transposed matrix B of dimensions 2x4 and 4x4, where `a0` is the first row and `a1` is the second row of the matrix A. - The matrix B is represented by `b` that is expected to be a pointer to a 4-element array of half4. 
+| float8 *img_matmul_2x4_4x4transposedf*(half4 _a0_, half4 _a1_,pass:[__local] half16 * _b_) + + half8 *img_matmul_2x4_4x4transposedh*(half4 _a0_, half4 _a1_,pass:[__local] half16 * _b_) + a| `img_matmul_2x4_4x4transposedf` and `img_matmul_2x4_4x4transposedh` perform the matrix multiplication operation of matrix A and transposed matrix B of dimensions 2x4 and 4x4, where `a0` is the first row and `a1` is the second row of the matrix A. + The first row of the matrix B is represented by the elements 0-3 of `b`, the second row by the elements 4-7, the third row by the elements 8-11, and the fourth row by the elements 12-15. For example, given: ---- A = [a00 a01 a02 a03] [a10 a11 a12 a13] -BT = [b00 b10 b20 b30] - [b01 b11 b21 b31] - [b02 b12 b22 b32] - [b03 b13 b23 b33] +BT = [b0 b4 b8 b12] + [b1 b5 b9 b13] + [b2 b6 b10 b14] + [b3 b7 b11 b15] ---- the output vector is: @@ -201,35 +213,71 @@ the output vector is: ---- Requires that the `__opencl_img_matmul_2x4_4x4` feature macro is defined. -| float8 *img_matmul_acc_2x4_4x4transposed*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_, half4 _acc0_, half4 _acc1_) + - half8 *img_matmul_acc_2x4_4x4transposed*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_, half4 acc0, half4 _acc1_) - a| `img_matmul_acc_2x4_4x4transposed` performs the matrix multiplication operation with the accumulator of matrix A and transposed matrix B of dimensions 2x4 and 4x4, where `a0` is the first row and `a1` is the second row of the matrix A, and where `acc0` is the first row and `acc1` is the second row of the accumulator. - The matrix B is represented by `b` that is expected to be a pointer to a 4-element array of half4. 
+| float8 *img_matmul_acc_2x4_4x4transposedf*(half4 _a0_, half4 _a1_,pass:[__local] half16 * _b_, float4 _acc0_, float4 _acc1_) + + half8 *img_matmul_acc_2x4_4x4transposedh*(half4 _a0_, half4 _a1_,pass:[__local] half16 * _b_, half4 _acc0_, half4 _acc1_) + a| `img_matmul_acc_2x4_4x4transposedf` and `img_matmul_acc_2x4_4x4transposedh` perform the matrix multiplication operation with the accumulator of matrix A and transposed matrix B of dimensions 2x4 and 4x4, where `a0` is the first row and `a1` is the second row of the matrix A, and where `acc0` is the first row and `acc1` is the second row of the accumulator. + The first row of the matrix B is represented by the elements 0-3 of `b`, the second row by the elements 4-7, the third row by the elements 8-11, and the fourth row by the elements 12-15. For example, given: ---- A = [a00 a01 a02 a03] [a10 a11 a12 a13] -BT = [b00 b10 b20 b30] - [b01 b11 b21 b31] - [b02 b12 b22 b32] - [b03 b13 b23 b33] +BT = [b0 b4 b8 b12] + [b1 b5 b9 b13] + [b2 b6 b10 b14] + [b3 b7 b11 b15] C = [acc00 acc01 acc02 acc03] - [acc10 acc11 acc12 acc13] + [acc10 acc11 acc12 acc13] ---- the output vector is: ---- [res0 res1 res2 res3] = A x BT + C -[res4 res5 res6 res7] +[res4 res5 res6 res7] ---- Requires that the `__opencl_img_matmul_2x4_4x4` feature macro is defined. 
|==== -- +== Coding Sample + +This coding sample shows how to initialize the input vectors, use the *img_dot_interleaved_acc* function, and access the output vector: +[source] +---- +float4 a = (float4) (1.0f, 1.0f, 1.0f, 1.0f); +__local float8 b; +b = (float8) (0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 1.0f); + +float2 acc = (float2) (1.0f, 1.0f); +float2 res = img_dot_interleaved_acc(a, &b, acc); + +printf("res = [ %f %f ]\n", res.s0, res.s1); +---- + +This coding sample shows how to initialize the input vectors, use the *img_matmul_acc_2x4_4x4f* function, and access the output vector: +[source] +---- +half4 a0 = (half4) (1.0h, 0.0h, 0.0h, 0.0h); +half4 a1 = (half4) (0.0h, 1.0h, 0.0h, 0.0h); + +local half16 b; +b = (half16) (0.0h, 1.0h, 2.0h, 3.0h, + 4.0h, 5.0h, 6.0h, 7.0h, + 8.0h, 9.0h, 10.0h, 11.0h, + 12.0h, 13.0h, 14.0h, 15.0h); + +float4 acc0 = (float4) (1.0f, 1.0f, 1.0f, 1.0f); +float4 acc1 = (float4) (1.0f, 1.0f, 1.0f, 1.0f); + +float8 res = img_matmul_acc_2x4_4x4f(a0, a1, &b, acc0, acc1); + +printf("res = [ %f %f %f %f ]\n", res.s0, res.s1, res.s2, res.s3); +printf(" [ %f %f %f %f ]\n", res.s4, res.s5, res.s6, res.s7); +---- + == Version History [cols="5,15,15,70"] From b17a1f7b3596601b314bdd3dd599c5b1afd85afd Mon Sep 17 00:00:00 2001 From: tomasz-platek Date: Thu, 4 Jul 2024 14:05:05 +0200 Subject: [PATCH 3/6] Publish the cl_img_bitwise_ops extension specification. 
--- extensions/cl_img_bitwise_ops.asciidoc | 118 +++++++++++++++++++++++++ extensions/extensions.txt | 2 + 2 files changed, 120 insertions(+) create mode 100644 extensions/cl_img_bitwise_ops.asciidoc diff --git a/extensions/cl_img_bitwise_ops.asciidoc b/extensions/cl_img_bitwise_ops.asciidoc new file mode 100644 index 00000000..43d7c7d3 --- /dev/null +++ b/extensions/cl_img_bitwise_ops.asciidoc @@ -0,0 +1,118 @@ +:data-uri: +:icons: font +include::../config/attribs.txt[] +:source-highlighter: coderay + += cl_img_bitwise_ops + +== Name Strings + +`cl_img_bitwise_ops` + +== Contact + +Imagination Technologies Developer Forum: + +https://forums.imgtec.com/ + +Tomasz Platek, Imagination Technologies (Tomasz.Platek 'at' imgtec.com) + +== Contributors + +CY Cheng, Imagination Technologies. + +Tomasz Platek, Imagination Technologies. + +== Notice + +Copyright (c) 2024 Imagination Technologies Ltd. All Rights Reserved. + +== Status + +Final Draft + +== Version + +Built On: {docdate} + +Version: Major.Minor.Patch + +== Dependencies + +This extension is written against the OpenCL C Specification Version V3.0.16. + +== Overview + +This extension adds built-in functions that expose the bitwise operations of Imagination GPU IP that are not accessible by standard OpenCL C functions. + +== New OpenCL C Feature Names + +[source,c] +---- +__opencl_img_bit_interleave +---- + +== New OpenCL C Functions + +Performs the bit interleave operation: + +[source,c] +---- +gentype img_bit_interleave(gentype a, gentype b); +---- + +== Modifications to the OpenCL C Specification + +(Add to Table 16 - Built-in Scalar and Vector Argument Common Functions in Section 6.15.4 - Common Functions) :: ++ +-- +[cols="1,2",options="header"] +|==== +| Function | Description +| gentype *img_bit_interleave*(gentype a, gentype b) + a| `img_bit_interleave` interleaves the first `n` bits from two sources where `n` is half of the size of gentype in bits. 
+ +For `a` and `b`, where a0 and b0 are the least significant bits: +[source] +---- +a = a(N-1)\|a(N-2)\|a(N-3)\|...\|a3\|a2\|a1\|a0 +b = b(N-1)\|b(N-2)\|b(N-3)\|...\|b3\|b2\|b1\|b0 +---- + +the output is: +[source] +---- +res = b(N/2-1)\|a(N/2-1)\|b(N/2-2)\|a(N/2-2)\|b(N/2-3)\|a(N/2-3)\|...\|b3\|a3\|b2\|a2\|b1\|a1\|b0\|a0 +---- +so the sizes of `a`,`b`, and `res` are equal. + +Requires that the `__opencl_img_bit_interleave` feature macro is defined. +|==== +-- + +== Coding Sample + +This coding sample shows how to use the *img_bit_interleave* function: +[source] +---- +int4 a = (int4) ( 0x00000000, 0x00000000, 0x0000FFFF, 0xFFFFFFFF); +int4 b = (int4) ( 0xFFFFFFFF, 0x0000FFFF, 0x00000000, 0x00000000); + +int4 res = img_bit_interleave(a,b); + +printf("res = [ 0x%x 0x%x 0x%x 0x%x]\n", res.s0, res.s1, res.s2, res.s3); +---- + +Executing a work-item of this kernel gives the following result: +[source] +---- +res = [ 0xaaaaaaaa 0xaaaaaaaa 0x55555555 0x55555555] +---- + +== Version History + +[cols="5,15,15,70"] +[grid="rows"] +[options="header"] +|==== +| Version | Date | Author | Changes +| 1.0.0 | 2024-06-19 | Tomasz Platek | *Initial revision* +|==== + diff --git a/extensions/extensions.txt b/extensions/extensions.txt index 46596b9f..aae06c84 100644 --- a/extensions/extensions.txt +++ b/extensions/extensions.txt @@ -61,6 +61,8 @@ include::cl_arm_scheduling_controls.asciidoc[] == Imagination Technologies Extensions :leveloffset: 2 <<< +include::cl_img_bitwise_ops.asciidoc[] +<<< include::cl_img_cached_allocations.asciidoc[] <<< include::cl_img_cancel_command.asciidoc[] From 4de81270b0552e711258523a6a09e0326c223454 Mon Sep 17 00:00:00 2001 From: tomasz-platek Date: Thu, 4 Jul 2024 15:10:50 +0200 Subject: [PATCH 4/6] Revert "Publish the cl_img_bitwise_ops extension specification." This reverts commit b17a1f7b3596601b314bdd3dd599c5b1afd85afd. 
--- extensions/cl_img_bitwise_ops.asciidoc | 118 ------------------------- extensions/extensions.txt | 2 - 2 files changed, 120 deletions(-) delete mode 100644 extensions/cl_img_bitwise_ops.asciidoc diff --git a/extensions/cl_img_bitwise_ops.asciidoc b/extensions/cl_img_bitwise_ops.asciidoc deleted file mode 100644 index 43d7c7d3..00000000 --- a/extensions/cl_img_bitwise_ops.asciidoc +++ /dev/null @@ -1,118 +0,0 @@ -:data-uri: -:icons: font -include::../config/attribs.txt[] -:source-highlighter: coderay - -= cl_img_bitwise_ops - -== Name Strings - -`cl_img_bitwise_ops` - -== Contact - -Imagination Technologies Developer Forum: + -https://forums.imgtec.com/ - -Tomasz Platek, Imagination Technologies (Tomasz.Platek 'at' imgtec.com) - -== Contributors - -CY Cheng, Imagination Technologies. + -Tomasz Platek, Imagination Technologies. - -== Notice - -Copyright (c) 2024 Imagination Technologies Ltd. All Rights Reserved. - -== Status - -Final Draft - -== Version - -Built On: {docdate} + -Version: Major.Minor.Patch - -== Dependencies - -This extension is written against the OpenCL C Specification Version V3.0.16. - -== Overview - -This extension adds built-in functions that expose the bitwise operations of Imagination GPU IP that are not accessible by standard OpenCL C functions. - -== New OpenCL C Feature Names - -[source,c] ----- -__opencl_img_bit_interleave ----- - -== New OpenCL C Functions - -Performs the bit interleave operation: - -[source,c] ----- -gentype img_bit_interleave(gentype a, gentype b); ----- - -== Modifications to the OpenCL C Specification - -(Add to Table 16 - Built-in Scalar and Vector Argument Common Functions in Section 6.15.4 - Common Functions) :: -+ --- -[cols="1,2",options="header"] -|==== -| Function | Description -| gentype *img_bit_interleave*(gentype a, gentype b) - a| `img_bit_interleave` interleaves the first `n` bits from two sources where `n` is half of the size of gentype in bits. 
- -For `a` and `b`, where a0 and b0 are the least significant bits: -[source] ----- -a = a(N-1)\|a(N-2)\|a(N-3)\|...\|a3\|a2\|a1\|a0 -b = b(N-1)\|b(N-2)\|b(N-3)\|...\|b3\|b2\|b1\|b0 ----- - -the output is: -[source] ----- -res = b(N/2-1)\|a(N/2-1)\|b(N/2-2)\|a(N/2-2)\|b(N/2-3)\|a(N/2-3)\|...\|b3\|a3\|b2\|a2\|b1\|a1\|b0\|a0 ----- -so the sizes of `a`,`b`, and `res` are equal. - -Requires that the `__opencl_img_bit_interleave` feature macro is defined. -|==== --- - -== Coding Sample - -This coding sample shows how to use the *img_bit_interleave* function: -[source] ----- -int4 a = (int4) ( 0x00000000, 0x00000000, 0x0000FFFF, 0xFFFFFFFF); -int4 b = (int4) ( 0xFFFFFFFF, 0x0000FFFF, 0x00000000, 0x00000000); - -int4 res = img_bit_interleave(a,b); - -printf("res = [ 0x%x 0x%x 0x%x 0x%x]\n", res.s0, res.s1, res.s2, res.s3); ----- - -Executing a work-item of this kernel gives the following result: -[source] ----- -res = [ 0xaaaaaaaa 0xaaaaaaaa 0x55555555 0x55555555] ----- - -== Version History - -[cols="5,15,15,70"] -[grid="rows"] -[options="header"] -|==== -| Version | Date | Author | Changes -| 1.0.0 | 2024-06-19 | Tomasz Platek | *Initial revision* -|==== - diff --git a/extensions/extensions.txt b/extensions/extensions.txt index aae06c84..46596b9f 100644 --- a/extensions/extensions.txt +++ b/extensions/extensions.txt @@ -61,8 +61,6 @@ include::cl_arm_scheduling_controls.asciidoc[] == Imagination Technologies Extensions :leveloffset: 2 <<< -include::cl_img_bitwise_ops.asciidoc[] -<<< include::cl_img_cached_allocations.asciidoc[] <<< include::cl_img_cancel_command.asciidoc[] From 4b29fd1b5620ce10d101d093405dfee7bbc65b6b Mon Sep 17 00:00:00 2001 From: tomasz-platek <165791413+tomasz-platek@users.noreply.github.com> Date: Wed, 10 Jul 2024 09:42:18 +0200 Subject: [PATCH 5/6] Update extensions/cl_img_matrix_multiply.asciidoc Listing the initial extension version. 
Co-authored-by: Ben Ashbaugh --- extensions/cl_img_matrix_multiply.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extensions/cl_img_matrix_multiply.asciidoc b/extensions/cl_img_matrix_multiply.asciidoc index 58a4e6c7..a7fa8807 100644 --- a/extensions/cl_img_matrix_multiply.asciidoc +++ b/extensions/cl_img_matrix_multiply.asciidoc @@ -33,7 +33,7 @@ Final Draft == Version Built On: {docdate} + -Version: Major.Minor.Patch +Version: 1.0.0 == Dependencies From 0b673c577afc501b8bfd2e30d23c782d6a024e1a Mon Sep 17 00:00:00 2001 From: tomasz-platek <165791413+tomasz-platek@users.noreply.github.com> Date: Wed, 10 Jul 2024 14:42:58 +0200 Subject: [PATCH 6/6] Update cl_img_matrix_multiply.asciidoc Adding execution results to the coding samples --- extensions/cl_img_matrix_multiply.asciidoc | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/extensions/cl_img_matrix_multiply.asciidoc b/extensions/cl_img_matrix_multiply.asciidoc index a7fa8807..06883028 100644 --- a/extensions/cl_img_matrix_multiply.asciidoc +++ b/extensions/cl_img_matrix_multiply.asciidoc @@ -257,6 +257,12 @@ float2 res = img_dot_interleaved_acc(a, &b, acc); printf("res = [ %f %f ]\n", res.s0, res.s1); ---- +Executing a work-item containing this code gives the following result: +[source] +---- +res = [ 1.000000 5.000000 ] +---- + This coding sample shows how to initialize the input vectors, use the *img_matmul_acc_2x4_4x4f* function, and access the output vector: [source] ---- @@ -278,6 +284,13 @@ printf("res = [ %f %f %f %f ]\n", res.s0, res.s1, res.s2, res.s3); printf(" [ %f %f %f %f ]\n", res.s4, res.s5, res.s6, res.s7); ---- +Executing a work-item containing this code gives the following result: +[source] +---- +res = [ 1.000000 2.000000 3.000000 4.000000 ] + [ 5.000000 6.000000 7.000000 8.000000 ] +---- + == Version History [cols="5,15,15,70"]