From 8d39e7895e62adca00670320658880483bb4a439 Mon Sep 17 00:00:00 2001
From: tomek <tomasz.platek@imgtec.com>
Date: Mon, 10 Jun 2024 18:58:52 +0200
Subject: [PATCH 1/6] Publish cl_img_matrix_multiply extension specification.

---
 extensions/cl_img_matrix_multiply.asciidoc | 242 +++++++++++++++++++++
 extensions/extensions.txt                  |   2 +
 2 files changed, 244 insertions(+)
 create mode 100644 extensions/cl_img_matrix_multiply.asciidoc

diff --git a/extensions/cl_img_matrix_multiply.asciidoc b/extensions/cl_img_matrix_multiply.asciidoc
new file mode 100644
index 00000000..a9537ea8
--- /dev/null
+++ b/extensions/cl_img_matrix_multiply.asciidoc
@@ -0,0 +1,242 @@
+:data-uri:
+:icons: font
+include::../config/attribs.txt[]
+:source-highlighter: coderay
+
+= cl_img_matrix_multiply
+
+== Name Strings
+
+`cl_img_matrix_multiply`
+
+== Contact
+
+Imagination Technologies Developer Forum: +
+https://forums.imgtec.com/
+
+Tomasz Platek, Imagination Technologies (Tomasz.Platek 'at' imgtec.com)
+
+== Contributors
+
+CY Cheng, Imagination Technologies. +
+Joe Molleson, Imagination Technologies. +
+Tomasz Platek, Imagination Technologies.
+
+== Notice
+
+Copyright (c) 2024 Imagination Technologies Ltd. All Rights Reserved.
+
+== Status
+
+Draft spec, NOT APPROVED!!
+
+== Version
+
+Built On: {docdate} +
+Version: Major.Minor.Patch
+
+== Dependencies
+
+This extension is written against the OpenCL Specification Version 3.0, Version V3.0.16.
+
+This extension requires the `cl_khr_fp16` extension.
+
+== Overview
+
+This extension provides the helper functions that allow to perform matrix multiplication efficiently.
+
+== New OpenCL C Feature Names
+
+[source,c]
+----
+__opencl_img_dot_interleaved
+__opencl_img_matmul_2x4_4x4
+----
+
+== New OpenCL C Functions
+
+Performs the interleaved dot product operation:
+
+[source,c]
+----
+float2 img_dot_interleaved(gentypef a, __local float * b);
+float2 img_dot_interleaved_acc(gentypef a, __local float * b, float2 acc);
+----
+
+Performs the matrix multiplication operation:
+
+[source,c]
+----
+float8 img_matmul_2x4_4x4(half4 a0, half4 a1, __local half * b);
+half8 img_matmul_2x4_4x4(half4 a0, half4 a1, __local half * b);
+float8 img_matmul_acc_2x4_4x4(half4 a0, half4 a1, __local half * b, float4 acc0, float4 acc1);
+half8 img_matmul_acc_2x4_4x4(half4 a0, half4 a1, __local half * b, half4 acc0, half4 acc1);
+float8 img_matmul_2x4_4x4transposed(half4 a0, half4 a1, __local half * b);
+half8 img_matmul_2x4_4x4transposed(half4 a0, half4 a1, __local half * b);
+float8 img_matmul_acc_2x4_4x4transposed(half4 a0, half4 a1, __local half * b, float4 acc0, float4 acc1);
+half8 img_matmul_acc_2x4_4x4transposed(half4 a0, half4 a1, __local half * b, half4 acc0, half4 acc1);
+----
+
+== Modifications to the OpenCL C Specification
+
+(Add to Table 11 - Built-in Scalar and Vector Argument Math Functions in Section 6.15.2 - Math Functions) ::
++
+--
+[cols="1,2",options="header"]
+|====
+| Function | Description
+| float2 *img_dot_interleaved*(gentypef _a_, pass:[__local] float * _b_)
+    a| `img_dot_interleaved` performs the dual dot product operation. 
+    The input vectors of the first dot product are `a` and the vector containing the even-indexed elements of `b`, where `b` is expected to be a pointer to a vector of double the size of `a`. The result is stored into the first element of the output vector.
+    The input vectors of the second dot product are `a` and the vector containing the odd-indexed elements of `b`. The result is stored into the second element of the output vector.
+    
+For example, given:
+ 
+----
+a = [a0 a1]
+b = [b0 b1 b2 b3]
+----
+
+the output vector is:
+
+----
+[res0 res1] = [a0 a1] x [b0 b1]
+                        [b2 b3]
+----
+
+Requires that the `__opencl_img_dot_interleaved` feature macro is defined.
+| float2 *img_dot_interleaved_acc*(gentypef _a_, pass:[__local] float * _b_, float2 _acc_)
+    a| `img_dot_interleaved` performs the dual dot product operation with the accumulator `acc`. 
+    The input vectors of the first dot product are `a` and the vector containing the even-indexed elements of `b`, where `b` is expected to be a pointer to a vector of double the size of `a`. The result is stored into the first element of the output vector.
+    The input vectors of the second dot product are `a` and the vector containing the odd-indexed elements of `b`. The result is stored into the second element of the output vector.
+
+For example, given:
+
+----
+a = [a0 a1]
+b = [b0 b1 b2 b3]
+acc = [acc0 acc1]
+----
+
+the output vector is:
+
+----
+[res0 res1] = [a0 a1] x [b0 b1] + [acc0 acc1]
+                        [b2 b3]
+----
+
+Requires that the `__opencl_img_dot_interleaved` feature macro is defined.
+| float8 *img_matmul_2x4_4x4*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_) +
+  half8 *img_matmul_2x4_4x4*(half4 _a0_, half4 a1, pass:[__local] half * _b_)
+    a| `img_matmul_2x4_4x4` performs the matrix multiplication operation of matrices A and B of dimensions 2x4 and 4x4, where `a0` is the first row and `a1` is the second row of the matrix A.
+    The matrix B is represented by `b` that is expected to be a pointer to a 4-element array of half4.
+
+For example, given:
+
+----
+A = [a00 a01 a02 a03]
+    [a10 a11 a12 a13]
+B = [b00 b01 b02 b03]
+    [b10 b11 b12 b13]
+    [b20 b21 b22 b23]
+    [b30 b31 b32 b33]
+----
+
+the output vector is:
+
+----
+[res0 res1 res2 res3] = A x B
+[res4 res5 res6 res7]                                                        
+----
+
+Requires that the `__opencl_img_matmul_2x4_4x4` feature macro is defined.
+| float8 *img_matmul_acc_2x4_4x4*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_, half4 _acc0_, half4 _acc1_) +
+  half8 *img_matmul_acc_2x4_4x4*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_, half4 acc0, half4 _acc1_)
+    a| `img_matmul_acc_2x4_4x4` performs the matrix multiplication operation with the accumulator of matrices A and B of dimensions 2x4 and 4x4, where `a0` is the first row and `a1` is the second row of the matrix A, and where `acc0` is the first row and `acc1` is the second row of the accumulator.
+    The matrix B is represented by `b` that is expected to be a pointer to a 4-element array of half4.
+
+For example, given:
+
+----
+A = [a00 a01 a02 a03]
+    [a10 a11 a12 a13]
+B = [b00 b01 b02 b03]
+    [b10 b11 b12 b13]
+    [b20 b21 b22 b23]
+    [b30 b31 b32 b33]
+C = [acc00 acc01 acc02 acc03]
+    [acc10 acc11 acc12 acc13]
+----
+
+the output vector is:
+
+----
+[res0 res1 res2 res3] = A x B + C
+[res4 res5 res6 res7]                                                        
+----
+
+Requires that the `__opencl_img_matmul_2x4_4x4` feature macro is defined.
+
+| float8 *img_matmul_2x4_4x4transposed*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_) +
+  half8 *img_matmul_2x4_4x4transposed*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_)
+    a| `img_matmul_2x4_4x4transposed` performs the matrix multiplication operation of matrix A and transposed matrix B of dimensions 2x4 and 4x4, where `a0` is the first row and `a1` is the second row of the matrix A.
+    The matrix B is represented by `b` that is expected to be a pointer to a 4-element array of half4.
+
+For example, given:
+
+----
+A = [a00 a01 a02 a03]
+    [a10 a11 a12 a13]
+BT = [b00 b10 b20 b30]
+     [b01 b11 b21 b31]
+     [b02 b12 b22 b32]
+     [b03 b13 b23 b33]
+----
+
+the output vector is:
+
+----
+[res0 res1 res2 res3] = A x BT
+[res4 res5 res6 res7]                                                        
+----
+
+Requires that the `__opencl_img_matmul_2x4_4x4` feature macro is defined.
+| float8 *img_matmul_acc_2x4_4x4transposed*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_, half4 _acc0_, half4 _acc1_) +
+  half8 *img_matmul_acc_2x4_4x4transposed*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_, half4 acc0, half4 _acc1_)
+    a| `img_matmul_acc_2x4_4x4transposed` performs the matrix multiplication operation with the accumulator of matrix A and transposed matrix B of dimensions 2x4 and 4x4, where `a0` is the first row and `a1` is the second row of the matrix A, and where `acc0` is the first row and `acc1` is the second row of the accumulator.
+    The matrix B is represented by `b` that is expected to be a pointer to a 4-element array of half4.
+
+For example, given:
+
+----
+A = [a00 a01 a02 a03]
+    [a10 a11 a12 a13]
+BT = [b00 b10 b20 b30]
+     [b01 b11 b21 b31]
+     [b02 b12 b22 b32]
+     [b03 b13 b23 b33]
+C = [acc00 acc01 acc02 acc03]
+    [acc10 acc11 acc12 acc13]     
+----
+
+the output vector is:
+
+----
+[res0 res1 res2 res3] = A x BT + C
+[res4 res5 res6 res7]                                                        
+----
+
+Requires that the `__opencl_img_matmul_2x4_4x4` feature macro is defined.
+|====
+--
+
+== Version History
+
+[cols="5,15,15,70"]
+[grid="rows"]
+[options="header"]
+|====
+| Version | Date       | Author        | Changes
+| 1.0.0   | 2024-06-07 | Tomasz Platek | *Initial revision*
+|====
+
diff --git a/extensions/extensions.txt b/extensions/extensions.txt
index 573ec116..46596b9f 100644
--- a/extensions/extensions.txt
+++ b/extensions/extensions.txt
@@ -67,6 +67,8 @@ include::cl_img_cancel_command.asciidoc[]
 <<<
 include::cl_img_generate_mipmap.asciidoc[]
 <<<
+include::cl_img_matrix_multiply.asciidoc[]
+<<<
 include::cl_img_mem_properties.asciidoc[]
 <<<
 include::cl_img_use_gralloc_ptr.asciidoc[]

From 472cadeb4cf9c8532b1a92b6e3fd304be73c5c0f Mon Sep 17 00:00:00 2001
From: tomek <tomasz.platek@imgtec.com>
Date: Thu, 4 Jul 2024 11:53:10 +0200
Subject: [PATCH 2/6] The final draft of the cl_img_matrix_multiply extension.

---
 extensions/cl_img_matrix_multiply.asciidoc | 158 ++++++++++++++-------
 1 file changed, 103 insertions(+), 55 deletions(-)

diff --git a/extensions/cl_img_matrix_multiply.asciidoc b/extensions/cl_img_matrix_multiply.asciidoc
index a9537ea8..58a4e6c7 100644
--- a/extensions/cl_img_matrix_multiply.asciidoc
+++ b/extensions/cl_img_matrix_multiply.asciidoc
@@ -28,7 +28,7 @@ Copyright (c) 2024 Imagination Technologies Ltd. All Rights Reserved.
 
 == Status
 
-Draft spec, NOT APPROVED!!
+Final Draft
 
 == Version
 
@@ -37,13 +37,13 @@ Version: Major.Minor.Patch
 
 == Dependencies
 
-This extension is written against the OpenCL Specification Version 3.0, Version V3.0.16.
+This extension is written against the OpenCL C Specification Version V3.0.16.
 
 This extension requires the `cl_khr_fp16` extension.
 
 == Overview
 
-This extension provides the helper functions that allow to perform matrix multiplication efficiently.
+This extension adds built-in functions that exercise hardware capabilities of Imagination GPU IP and allow matrix multiplication to be implemented in a highly efficient and performant manner.
 
 == New OpenCL C Feature Names
 
@@ -55,26 +55,32 @@ __opencl_img_matmul_2x4_4x4
 
 == New OpenCL C Functions
 
-Performs the interleaved dot product operation:
+Perform the interleaved dot product operation:
 
 [source,c]
 ----
-float2 img_dot_interleaved(gentypef a, __local float * b);
-float2 img_dot_interleaved_acc(gentypef a, __local float * b, float2 acc);
+float2 img_dot_interleaved(float a,__local float2 * b);
+float2 img_dot_interleaved(float2 a,__local float4 * b);
+float2 img_dot_interleaved(float4 a,__local float8 * b);
+float2 img_dot_interleaved(float8 a,__local float16 * b);
+float2 img_dot_interleaved_acc(float a,__local float2 * b, float2 acc);
+float2 img_dot_interleaved_acc(float2 a,__local float4 * b, float2 acc);
+float2 img_dot_interleaved_acc(float4 a,__local float8 * b, float2 acc);
+float2 img_dot_interleaved_acc(float8 a,__local float16 * b, float2 acc);
 ----
 
-Performs the matrix multiplication operation:
+Perform the matrix multiplication operation:
 
 [source,c]
 ----
-float8 img_matmul_2x4_4x4(half4 a0, half4 a1, __local half * b);
-half8 img_matmul_2x4_4x4(half4 a0, half4 a1, __local half * b);
-float8 img_matmul_acc_2x4_4x4(half4 a0, half4 a1, __local half * b, float4 acc0, float4 acc1);
-half8 img_matmul_acc_2x4_4x4(half4 a0, half4 a1, __local half * b, half4 acc0, half4 acc1);
-float8 img_matmul_2x4_4x4transposed(half4 a0, half4 a1, __local half * b);
-half8 img_matmul_2x4_4x4transposed(half4 a0, half4 a1, __local half * b);
-float8 img_matmul_acc_2x4_4x4transposed(half4 a0, half4 a1, __local half * b, float4 acc0, float4 acc1);
-half8 img_matmul_acc_2x4_4x4transposed(half4 a0, half4 a1, __local half * b, half4 acc0, half4 acc1);
+float8 img_matmul_2x4_4x4f(half4 a0, half4 a1,__local half16 * b);
+half8 img_matmul_2x4_4x4h(half4 a0, half4 a1,__local half16 * b);
+float8 img_matmul_acc_2x4_4x4f(half4 a0, half4 a1,__local half16 * b, float4 acc0, float4 acc1);
+half8 img_matmul_acc_2x4_4x4h(half4 a0, half4 a1,__local half16 * b, half4 acc0, half4 acc1);
+float8 img_matmul_2x4_4x4transposedf(half4 a0, half4 a1,__local half16 * b);
+half8 img_matmul_2x4_4x4transposedh(half4 a0, half4 a1,__local half16 * b);
+float8 img_matmul_acc_2x4_4x4transposedf(half4 a0, half4 a1,__local half16 * b, float4 acc0, float4 acc1);
+half8 img_matmul_acc_2x4_4x4transposedh(half4 a0, half4 a1,__local half16 * b, half4 acc0, half4 acc1);
 ----
 
 == Modifications to the OpenCL C Specification
@@ -85,9 +91,12 @@ half8 img_matmul_acc_2x4_4x4transposed(half4 a0, half4 a1, __local half * b, hal
 [cols="1,2",options="header"]
 |====
 | Function | Description
-| float2 *img_dot_interleaved*(gentypef _a_, pass:[__local] float * _b_)
+| float2 *img_dot_interleaved*(float _a_,pass:[__local] float2 * _b_) +
+  float2 *img_dot_interleaved*(float2 _a_,pass:[__local] float4 * _b_) +
+  float2 *img_dot_interleaved*(float4 _a_,pass:[__local] float8 * _b_) +
+  float2 *img_dot_interleaved*(float8 _a_,pass:[__local] float16 * _b_)
     a| `img_dot_interleaved` performs the dual dot product operation. 
-    The input vectors of the first dot product are `a` and the vector containing the even-indexed elements of `b`, where `b` is expected to be a pointer to a vector of double the size of `a`. The result is stored into the first element of the output vector.
+    The input vectors of the first dot product are `a` and the vector containing the even-indexed elements of `b`. The result is stored into the first element of the output vector.
     The input vectors of the second dot product are `a` and the vector containing the odd-indexed elements of `b`. The result is stored into the second element of the output vector.
     
 For example, given:
@@ -105,9 +114,12 @@ the output vector is:
 ----
 
 Requires that the `__opencl_img_dot_interleaved` feature macro is defined.
-| float2 *img_dot_interleaved_acc*(gentypef _a_, pass:[__local] float * _b_, float2 _acc_)
-    a| `img_dot_interleaved` performs the dual dot product operation with the accumulator `acc`. 
-    The input vectors of the first dot product are `a` and the vector containing the even-indexed elements of `b`, where `b` is expected to be a pointer to a vector of double the size of `a`. The result is stored into the first element of the output vector.
+| float2 *img_dot_interleaved_acc*(float _a_,pass:[__local] float2 * _b_, float2 _acc_) +
+  float2 *img_dot_interleaved_acc*(float2 _a_,pass:[__local] float4 * _b_, float2 _acc_) +
+  float2 *img_dot_interleaved_acc*(float4 _a_,pass:[__local] float8 * _b_, float2 _acc_) +
+  float2 *img_dot_interleaved_acc*(float8 _a_,pass:[__local] float16 * _b_, float2 _acc_)
+    a| `img_dot_interleaved_acc` performs the dual dot product operation with the accumulator `acc`. 
+    The input vectors of the first dot product are `a` and the vector containing the even-indexed elements of `b`. The result is stored into the first element of the output vector.
     The input vectors of the second dot product are `a` and the vector containing the odd-indexed elements of `b`. The result is stored into the second element of the output vector.
 
 For example, given:
@@ -126,20 +138,20 @@ the output vector is:
 ----
 
 Requires that the `__opencl_img_dot_interleaved` feature macro is defined.
-| float8 *img_matmul_2x4_4x4*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_) +
-  half8 *img_matmul_2x4_4x4*(half4 _a0_, half4 a1, pass:[__local] half * _b_)
-    a| `img_matmul_2x4_4x4` performs the matrix multiplication operation of matrices A and B of dimensions 2x4 and 4x4, where `a0` is the first row and `a1` is the second row of the matrix A.
-    The matrix B is represented by `b` that is expected to be a pointer to a 4-element array of half4.
+| float8 *img_matmul_2x4_4x4f*(half4 _a0_, half4 _a1_,pass:[__local] half16 * _b_) +
+  half8 *img_matmul_2x4_4x4h*(half4 _a0_, half4 _a1_,pass:[__local] half16 * _b_)
+    a| `img_matmul_2x4_4x4f` and `img_matmul_2x4_4x4h` perform the matrix multiplication operation of matrices A and B of dimensions 2x4 and 4x4, where `a0` is the first row and `a1` is the second row of the matrix A.
+    The first row of the matrix B is represented by the elements 0-3 of `b`, the second row by the elements 4-7, the third row by the elements 8-11, and the fourth row by the elements 12-15.
 
 For example, given:
 
 ----
 A = [a00 a01 a02 a03]
     [a10 a11 a12 a13]
-B = [b00 b01 b02 b03]
-    [b10 b11 b12 b13]
-    [b20 b21 b22 b23]
-    [b30 b31 b32 b33]
+B = [b0  b1  b2  b3]
+    [b4  b5  b6  b7]
+    [b8  b9  b10 b11]
+    [b12 b13 b14 b15]
 ----
 
 the output vector is:
@@ -150,20 +162,20 @@ the output vector is:
 ----
 
 Requires that the `__opencl_img_matmul_2x4_4x4` feature macro is defined.
-| float8 *img_matmul_acc_2x4_4x4*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_, half4 _acc0_, half4 _acc1_) +
-  half8 *img_matmul_acc_2x4_4x4*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_, half4 acc0, half4 _acc1_)
-    a| `img_matmul_acc_2x4_4x4` performs the matrix multiplication operation with the accumulator of matrices A and B of dimensions 2x4 and 4x4, where `a0` is the first row and `a1` is the second row of the matrix A, and where `acc0` is the first row and `acc1` is the second row of the accumulator.
-    The matrix B is represented by `b` that is expected to be a pointer to a 4-element array of half4.
+| float8 *img_matmul_acc_2x4_4x4f*(half4 _a0_, half4 _a1_,pass:[__local] half16 * _b_, float4 _acc0_, float4 _acc1_) +
+  half8 *img_matmul_acc_2x4_4x4h*(half4 _a0_, half4 _a1_,pass:[__local] half16 * _b_, half4 _acc0_, half4 _acc1_)
+    a| `img_matmul_acc_2x4_4x4f` and `img_matmul_acc_2x4_4x4h` perform the matrix multiplication operation with the accumulator of matrices A and B of dimensions 2x4 and 4x4, where `a0` is the first row and `a1` is the second row of the matrix A, and where `acc0` is the first row and `acc1` is the second row of the accumulator.
+    The first row of the matrix B is represented by the elements 0-3 of `b`, the second row by the elements 4-7, the third row by the elements 8-11, and the fourth row by the elements 12-15.
 
 For example, given:
 
 ----
 A = [a00 a01 a02 a03]
     [a10 a11 a12 a13]
-B = [b00 b01 b02 b03]
-    [b10 b11 b12 b13]
-    [b20 b21 b22 b23]
-    [b30 b31 b32 b33]
+B = [b0  b1  b2  b3]
+    [b4  b5  b6  b7]
+    [b8  b9  b10 b11]
+    [b12 b13 b14 b15]
 C = [acc00 acc01 acc02 acc03]
     [acc10 acc11 acc12 acc13]
 ----
@@ -172,25 +184,25 @@ the output vector is:
 
 ----
 [res0 res1 res2 res3] = A x B + C
-[res4 res5 res6 res7]                                                        
+[res4 res5 res6 res7]                                                   
 ----
 
 Requires that the `__opencl_img_matmul_2x4_4x4` feature macro is defined.
 
-| float8 *img_matmul_2x4_4x4transposed*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_) +
-  half8 *img_matmul_2x4_4x4transposed*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_)
-    a| `img_matmul_2x4_4x4transposed` performs the matrix multiplication operation of matrix A and transposed matrix B of dimensions 2x4 and 4x4, where `a0` is the first row and `a1` is the second row of the matrix A.
-    The matrix B is represented by `b` that is expected to be a pointer to a 4-element array of half4.
+| float8 *img_matmul_2x4_4x4transposedf*(half4 _a0_, half4 _a1_,pass:[__local] half16 * _b_) +
+  half8 *img_matmul_2x4_4x4transposedh*(half4 _a0_, half4 _a1_,pass:[__local] half16 * _b_)
+    a| `img_matmul_2x4_4x4transposedf` and `img_matmul_2x4_4x4transposedh` perform the matrix multiplication operation of matrix A and transposed matrix B of dimensions 2x4 and 4x4, where `a0` is the first row and `a1` is the second row of the matrix A.
+    The first row of the matrix B is represented by the elements 0-3 of `b`, the second row by the elements 4-7, the third row by the elements 8-11, and the fourth row by the elements 12-15.
 
 For example, given:
 
 ----
 A = [a00 a01 a02 a03]
     [a10 a11 a12 a13]
-BT = [b00 b10 b20 b30]
-     [b01 b11 b21 b31]
-     [b02 b12 b22 b32]
-     [b03 b13 b23 b33]
+BT = [b0 b4 b8  b12]
+     [b1 b5 b9  b13]
+     [b2 b6 b10 b14]
+     [b3 b7 b11 b15]
 ----
 
 the output vector is:
@@ -201,35 +213,71 @@ the output vector is:
 ----
 
 Requires that the `__opencl_img_matmul_2x4_4x4` feature macro is defined.
-| float8 *img_matmul_acc_2x4_4x4transposed*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_, half4 _acc0_, half4 _acc1_) +
-  half8 *img_matmul_acc_2x4_4x4transposed*(half4 _a0_, half4 _a1_, pass:[__local] half * _b_, half4 acc0, half4 _acc1_)
-    a| `img_matmul_acc_2x4_4x4transposed` performs the matrix multiplication operation with the accumulator of matrix A and transposed matrix B of dimensions 2x4 and 4x4, where `a0` is the first row and `a1` is the second row of the matrix A, and where `acc0` is the first row and `acc1` is the second row of the accumulator.
-    The matrix B is represented by `b` that is expected to be a pointer to a 4-element array of half4.
+| float8 *img_matmul_acc_2x4_4x4transposedf*(half4 _a0_, half4 _a1_,pass:[__local] half16 * _b_, float4 _acc0_, float4 _acc1_) +
+  half8 *img_matmul_acc_2x4_4x4transposedh*(half4 _a0_, half4 _a1_,pass:[__local] half16 * _b_, half4 _acc0_, half4 _acc1_)
+    a| `img_matmul_acc_2x4_4x4transposedf` and `img_matmul_acc_2x4_4x4transposedh` perform the matrix multiplication operation with the accumulator of matrix A and transposed matrix B of dimensions 2x4 and 4x4, where `a0` is the first row and `a1` is the second row of the matrix A, and where `acc0` is the first row and `acc1` is the second row of the accumulator.
+    The first row of the matrix B is represented by the elements 0-3 of `b`, the second row by the elements 4-7, the third row by the elements 8-11, and the fourth row by the elements 12-15.
 
 For example, given:
 
 ----
 A = [a00 a01 a02 a03]
     [a10 a11 a12 a13]
-BT = [b00 b10 b20 b30]
-     [b01 b11 b21 b31]
-     [b02 b12 b22 b32]
-     [b03 b13 b23 b33]
+BT = [b0 b4 b8  b12]
+     [b1 b5 b9  b13]
+     [b2 b6 b10 b14]
+     [b3 b7 b11 b15]
 C = [acc00 acc01 acc02 acc03]
-    [acc10 acc11 acc12 acc13]     
+    [acc10 acc11 acc12 acc13]  
 ----
 
 the output vector is:
 
 ----
 [res0 res1 res2 res3] = A x BT + C
-[res4 res5 res6 res7]                                                        
+[res4 res5 res6 res7]                                                       
 ----
 
 Requires that the `__opencl_img_matmul_2x4_4x4` feature macro is defined.
 |====
 --
 
+== Coding Sample
+
+This coding sample shows how to initialize the input vectors, use the *img_dot_interleaved_acc* function, and access the output vector:
+[source]
+----
+float4 a = (float4) (1.0f, 1.0f, 1.0f, 1.0f);
+__local float8 b;
+b = (float8) (0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 1.0f);
+
+float2 acc = (float2) (1.0f, 1.0f);
+float2 res = img_dot_interleaved_acc(a, &b, acc);
+
+printf("res = [ %f %f ]\n", res.s0, res.s1);
+----
+
+This coding sample shows how to initialize the input vectors, use the *img_matmul_acc_2x4_4x4f* function, and access the output vector:
+[source]
+----
+half4  a0 = (half4) (1.0h, 0.0h, 0.0h, 0.0h);
+half4  a1 = (half4) (0.0h, 1.0h, 0.0h, 0.0h);
+
+__local half16 b;
+b = (half16) (0.0h,  1.0h,  2.0h,  3.0h,
+              4.0h,  5.0h,  6.0h,  7.0h,
+              8.0h,  9.0h,  10.0h, 11.0h,
+              12.0h, 13.0h, 14.0h, 15.0h);
+
+float4 acc0 = (float4) (1.0f, 1.0f, 1.0f, 1.0f);
+float4 acc1 = (float4) (1.0f, 1.0f, 1.0f, 1.0f);
+
+float8 res = img_matmul_acc_2x4_4x4f(a0, a1, &b, acc0, acc1);
+
+printf("res = [ %f %f %f %f ]\n", res.s0, res.s1, res.s2, res.s3);
+printf("      [ %f %f %f %f ]\n", res.s4, res.s5, res.s6, res.s7);
+----
+
 == Version History
 
 [cols="5,15,15,70"]

From b17a1f7b3596601b314bdd3dd599c5b1afd85afd Mon Sep 17 00:00:00 2001
From: tomasz-platek <tomasz.platek@imgtec.com>
Date: Thu, 4 Jul 2024 14:05:05 +0200
Subject: [PATCH 3/6] Publish the cl_img_bitwise_ops extension specification.

---
 extensions/cl_img_bitwise_ops.asciidoc | 118 +++++++++++++++++++++++++
 extensions/extensions.txt              |   2 +
 2 files changed, 120 insertions(+)
 create mode 100644 extensions/cl_img_bitwise_ops.asciidoc

diff --git a/extensions/cl_img_bitwise_ops.asciidoc b/extensions/cl_img_bitwise_ops.asciidoc
new file mode 100644
index 00000000..43d7c7d3
--- /dev/null
+++ b/extensions/cl_img_bitwise_ops.asciidoc
@@ -0,0 +1,118 @@
+:data-uri:
+:icons: font
+include::../config/attribs.txt[]
+:source-highlighter: coderay
+
+= cl_img_bitwise_ops
+
+== Name Strings
+
+`cl_img_bitwise_ops`
+
+== Contact
+
+Imagination Technologies Developer Forum: +
+https://forums.imgtec.com/
+
+Tomasz Platek, Imagination Technologies (Tomasz.Platek 'at' imgtec.com)
+
+== Contributors
+
+CY Cheng, Imagination Technologies. +
+Tomasz Platek, Imagination Technologies.
+
+== Notice
+
+Copyright (c) 2024 Imagination Technologies Ltd. All Rights Reserved.
+
+== Status
+
+Final Draft
+
+== Version
+
+Built On: {docdate} +
+Version: Major.Minor.Patch
+
+== Dependencies
+
+This extension is written against the OpenCL C Specification Version V3.0.16.
+
+== Overview
+
+This extension adds built-in functions that expose the bitwise operations of Imagination GPU IP that are not accessible by standard OpenCL C functions.
+
+== New OpenCL C Feature Names
+
+[source,c]
+----
+__opencl_img_bit_interleave
+----
+
+== New OpenCL C Functions
+
+Performs the bit interleave operation:
+
+[source,c]
+----
+gentype img_bit_interleave(gentype a, gentype b);
+----
+
+== Modifications to the OpenCL C Specification
+
+(Add to Table 16 - Built-in Scalar and Vector Argument Common Functions in Section 6.15.4 - Common Functions) ::
++
+--
+[cols="1,2",options="header"]
+|====
+| Function | Description
+| gentype *img_bit_interleave*(gentype a, gentype b)
+    a| `img_bit_interleave` interleaves the first `n` bits from two sources where `n` is half of the size of gentype in bits.
+
+For `a` and `b`, where a0 and b0 are the least significant bits:
+[source]
+----
+a = a(N-1)\|a(N-2)\|a(N-3)\|...\|a3\|a2\|a1\|a0
+b = b(N-1)\|b(N-2)\|b(N-3)\|...\|b3\|b2\|b1\|b0
+----
+
+the output is:
+[source]
+----
+res = b(N/2-1)\|a(N/2-1)\|b(N/2-2)\|a(N/2-2)\|b(N/2-3)\|a(N/2-3)\|...\|b3\|a3\|b2\|a2\|b1\|a1\|b0\|a0
+----
+so the sizes of `a`,`b`, and `res` are equal.
+
+Requires that the `__opencl_img_bit_interleave` feature macro is defined.
+|====
+--
+
+== Coding Sample
+
+This coding sample shows how to use the *img_bit_interleave* function:
+[source]
+----
+int4 a = (int4) ( 0x00000000, 0x00000000, 0x0000FFFF, 0xFFFFFFFF);
+int4 b = (int4) ( 0xFFFFFFFF, 0x0000FFFF, 0x00000000, 0x00000000);
+
+int4 res = img_bit_interleave(a,b);
+
+printf("res = [ 0x%x 0x%x 0x%x 0x%x]\n", res.s0, res.s1, res.s2, res.s3);
+----
+
+Executing a work-item of this kernel gives the following result:
+[source]
+----
+res = [ 0xaaaaaaaa 0xaaaaaaaa 0x55555555 0x55555555]
+----
+
+== Version History
+
+[cols="5,15,15,70"]
+[grid="rows"]
+[options="header"]
+|====
+| Version | Date       | Author        | Changes
+| 1.0.0   | 2024-06-19 | Tomasz Platek | *Initial revision*
+|====
+
diff --git a/extensions/extensions.txt b/extensions/extensions.txt
index 46596b9f..aae06c84 100644
--- a/extensions/extensions.txt
+++ b/extensions/extensions.txt
@@ -61,6 +61,8 @@ include::cl_arm_scheduling_controls.asciidoc[]
 == Imagination Technologies Extensions
 :leveloffset: 2
 <<<
+include::cl_img_bitwise_ops.asciidoc[]
+<<<
 include::cl_img_cached_allocations.asciidoc[]
 <<<
 include::cl_img_cancel_command.asciidoc[]

From 4de81270b0552e711258523a6a09e0326c223454 Mon Sep 17 00:00:00 2001
From: tomasz-platek <tomasz.platek@imgtec.com>
Date: Thu, 4 Jul 2024 15:10:50 +0200
Subject: [PATCH 4/6] Revert "Publish the cl_img_bitwise_ops extension
 specification."

This reverts commit b17a1f7b3596601b314bdd3dd599c5b1afd85afd.
---
 extensions/cl_img_bitwise_ops.asciidoc | 118 -------------------------
 extensions/extensions.txt              |   2 -
 2 files changed, 120 deletions(-)
 delete mode 100644 extensions/cl_img_bitwise_ops.asciidoc

diff --git a/extensions/cl_img_bitwise_ops.asciidoc b/extensions/cl_img_bitwise_ops.asciidoc
deleted file mode 100644
index 43d7c7d3..00000000
--- a/extensions/cl_img_bitwise_ops.asciidoc
+++ /dev/null
@@ -1,118 +0,0 @@
-:data-uri:
-:icons: font
-include::../config/attribs.txt[]
-:source-highlighter: coderay
-
-= cl_img_bitwise_ops
-
-== Name Strings
-
-`cl_img_bitwise_ops`
-
-== Contact
-
-Imagination Technologies Developer Forum: +
-https://forums.imgtec.com/
-
-Tomasz Platek, Imagination Technologies (Tomasz.Platek 'at' imgtec.com)
-
-== Contributors
-
-CY Cheng, Imagination Technologies. +
-Tomasz Platek, Imagination Technologies.
-
-== Notice
-
-Copyright (c) 2024 Imagination Technologies Ltd. All Rights Reserved.
-
-== Status
-
-Final Draft
-
-== Version
-
-Built On: {docdate} +
-Version: Major.Minor.Patch
-
-== Dependencies
-
-This extension is written against the OpenCL C Specification Version V3.0.16.
-
-== Overview
-
-This extension adds built-in functions that expose the bitwise operations of Imagination GPU IP that are not accessible by standard OpenCL C functions.
-
-== New OpenCL C Feature Names
-
-[source,c]
-----
-__opencl_img_bit_interleave
-----
-
-== New OpenCL C Functions
-
-Performs the bit interleave operation:
-
-[source,c]
-----
-gentype img_bit_interleave(gentype a, gentype b);
-----
-
-== Modifications to the OpenCL C Specification
-
-(Add to Table 16 - Built-in Scalar and Vector Argument Common Functions in Section 6.15.4 - Common Functions) ::
-+
---
-[cols="1,2",options="header"]
-|====
-| Function | Description
-| gentype *img_bit_interleave*(gentype a, gentype b)
-    a| `img_bit_interleave` interleaves the first `n` bits from two sources where `n` is half of the size of gentype in bits.
-
-For `a` and `b`, where a0 and b0 are the least significant bits:
-[source]
-----
-a = a(N-1)\|a(N-2)\|a(N-3)\|...\|a3\|a2\|a1\|a0
-b = b(N-1)\|b(N-2)\|b(N-3)\|...\|b3\|b2\|b1\|b0
-----
-
-the output is:
-[source]
-----
-res = b(N/2-1)\|a(N/2-1)\|b(N/2-2)\|a(N/2-2)\|b(N/2-3)\|a(N/2-3)\|...\|b3\|a3\|b2\|a2\|b1\|a1\|b0\|a0
-----
-so the sizes of `a`,`b`, and `res` are equal.
-
-Requires that the `__opencl_img_bit_interleave` feature macro is defined.
-|====
---
-
-== Coding Sample
-
-This coding sample shows how to use the *img_bit_interleave* function:
-[source]
-----
-int4 a = (int4) ( 0x00000000, 0x00000000, 0x0000FFFF, 0xFFFFFFFF);
-int4 b = (int4) ( 0xFFFFFFFF, 0x0000FFFF, 0x00000000, 0x00000000);
-
-int4 res = img_bit_interleave(a,b);
-
-printf("res = [ 0x%x 0x%x 0x%x 0x%x]\n", res.s0, res.s1, res.s2, res.s3);
-----
-
-Executing a work-item of this kernel gives the following result:
-[source]
-----
-res = [ 0xaaaaaaaa 0xaaaaaaaa 0x55555555 0x55555555]
-----
-
-== Version History
-
-[cols="5,15,15,70"]
-[grid="rows"]
-[options="header"]
-|====
-| Version | Date       | Author        | Changes
-| 1.0.0   | 2024-06-19 | Tomasz Platek | *Initial revision*
-|====
-
diff --git a/extensions/extensions.txt b/extensions/extensions.txt
index aae06c84..46596b9f 100644
--- a/extensions/extensions.txt
+++ b/extensions/extensions.txt
@@ -61,8 +61,6 @@ include::cl_arm_scheduling_controls.asciidoc[]
 == Imagination Technologies Extensions
 :leveloffset: 2
 <<<
-include::cl_img_bitwise_ops.asciidoc[]
-<<<
 include::cl_img_cached_allocations.asciidoc[]
 <<<
 include::cl_img_cancel_command.asciidoc[]

From 4b29fd1b5620ce10d101d093405dfee7bbc65b6b Mon Sep 17 00:00:00 2001
From: tomasz-platek <165791413+tomasz-platek@users.noreply.github.com>
Date: Wed, 10 Jul 2024 09:42:18 +0200
Subject: [PATCH 5/6] Update extensions/cl_img_matrix_multiply.asciidoc

Listing the initial extension version.

Co-authored-by: Ben Ashbaugh <ben.ashbaugh@intel.com>
---
 extensions/cl_img_matrix_multiply.asciidoc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extensions/cl_img_matrix_multiply.asciidoc b/extensions/cl_img_matrix_multiply.asciidoc
index 58a4e6c7..a7fa8807 100644
--- a/extensions/cl_img_matrix_multiply.asciidoc
+++ b/extensions/cl_img_matrix_multiply.asciidoc
@@ -33,7 +33,7 @@ Final Draft
 == Version
 
 Built On: {docdate} +
-Version: Major.Minor.Patch
+Version: 1.0.0
 
 == Dependencies
 

From 0b673c577afc501b8bfd2e30d23c782d6a024e1a Mon Sep 17 00:00:00 2001
From: tomasz-platek <165791413+tomasz-platek@users.noreply.github.com>
Date: Wed, 10 Jul 2024 14:42:58 +0200
Subject: [PATCH 6/6] Update cl_img_matrix_multiply.asciidoc

Adding execution results to the coding samples
---
 extensions/cl_img_matrix_multiply.asciidoc | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/extensions/cl_img_matrix_multiply.asciidoc b/extensions/cl_img_matrix_multiply.asciidoc
index a7fa8807..06883028 100644
--- a/extensions/cl_img_matrix_multiply.asciidoc
+++ b/extensions/cl_img_matrix_multiply.asciidoc
@@ -257,6 +257,12 @@ float2 res = img_dot_interleaved_acc(a, &b, acc);
 printf("res = [ %f %f ]\n", res.s0, res.s1);
 ----
 
+Executing a work-item containing this code gives the following result:
+[source]
+----
+res = [ 1.000000 5.000000 ]
+----
+
 This coding sample shows how to initialize the input vectors, use the *img_matmul_acc_2x4_4x4f* function, and access the output vector:
 [source]
 ----
@@ -278,6 +284,13 @@ printf("res = [ %f %f %f %f ]\n", res.s0, res.s1, res.s2, res.s3);
 printf("      [ %f %f %f %f ]\n", res.s4, res.s5, res.s6, res.s7);
 ----
 
+Executing a work-item containing this code gives the following result:
+[source]
+----
+res = [ 1.000000 2.000000 3.000000 4.000000 ]
+      [ 5.000000 6.000000 7.000000 8.000000 ]
+----
+
 == Version History
 
 [cols="5,15,15,70"]