From 6e1bd1c4f24552815e30489787e96d4e194dd3ab Mon Sep 17 00:00:00 2001
From: Emmanuele Bassi <ebassi@gnome.org>
Date: Mon, 12 Aug 2024 11:55:44 +0100
Subject: [PATCH] Use madd() intrinsic if available

AVX introduced the _mm_fmadd_ps() intrinsic, so we can use it if AVX (or
an equivalent instruction set) is available when building Graphene.

There is no functional difference in this commit if AVX is not
available, except that we moved from a generic static inline
implementation to a SIMD-specific one.
---
 include/graphene-simd4f.h | 73 ++++++++++++++++++++++++++++-----------
 src/graphene-simd4f.c     | 28 +++++++++++++++
 2 files changed, 81 insertions(+), 20 deletions(-)

diff --git a/include/graphene-simd4f.h b/include/graphene-simd4f.h
index 758343d..67957b6 100644
--- a/include/graphene-simd4f.h
+++ b/include/graphene-simd4f.h
@@ -179,6 +179,11 @@ graphene_simd4f_t       graphene_simd4f_ceil            (const graphene_simd4f_t
 GRAPHENE_AVAILABLE_IN_1_12
 graphene_simd4f_t       graphene_simd4f_floor           (const graphene_simd4f_t s);
 
+GRAPHENE_AVAILABLE_IN_1_0
+graphene_simd4f_t       graphene_simd4f_madd            (const graphene_simd4f_t a,
+                                                         const graphene_simd4f_t b,
+                                                         const graphene_simd4f_t c);
+
 #if !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_SSE)
 
 /* SSE2 implementation of SIMD 4f */
@@ -504,6 +509,18 @@ typedef GRAPHENE_ALIGN16 union {
   }))
 #  endif
 
+#  if defined(GRAPHENE_USE_AVX)
+#   define graphene_simd4f_madd(a,b,c) \
+  (__extension__ ({ \
+    (graphene_simd4f_t) _mm_fmadd_ps ((a), (b), (c)); \
+  }))
+#  else
+#   define graphene_simd4f_madd(a,b,c) \
+  (__extension__ ({ \
+    (graphene_simd4f_t) _mm_add_ps (_mm_mul_ps ((a), (b)), (c)); \
+  }))
+#  endif
+
 /* On MSVC, we use static inlines */
 # elif defined (_MSC_VER) /* Visual Studio SSE intrinsics */
 
@@ -835,6 +852,20 @@ _simd4f_floor (const graphene_simd4f_t s)
 #endif
 }
 
+#define graphene_simd4f_madd(a,b,c) _simd4f_madd(a,b,c)
+/* (a * b) + c: _mm_fmadd_ps with GRAPHENE_USE_AVX, else mul then add. */
+static inline graphene_simd4f_t
+_simd4f_madd (const graphene_simd4f_t a,
+              const graphene_simd4f_t b,
+              const graphene_simd4f_t c)
+{
+#if defined(GRAPHENE_USE_AVX)
+  return _mm_fmadd_ps (a, b, c);
+#else
+  return _mm_add_ps (_mm_mul_ps (a, b), c);
+#endif
+}
+
 #else /* SSE intrinsics-not GCC or Visual Studio */
 
 #  error "Need GCC-compatible or Visual Studio compiler for SSE extensions."
@@ -1158,6 +1189,11 @@ typedef int graphene_simd4i_t __attribute__((vector_size (16)));
     (graphene_simd4f_t) graphene_simd4f_init (__floor_x, __floor_y, __floor_z, __floor_w); \
   }))
 
+# define graphene_simd4f_madd(a,b,c) \
+  (__extension__ ({ \
+    (graphene_simd4f_t) graphene_simd4f_add (graphene_simd4f_mul ((a), (b)), (c)); \
+  }))
+
 #elif !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_ARM_NEON)
 
 /* ARM Neon implementation of SIMD4f */
@@ -1498,6 +1534,11 @@ typedef float32x2_t graphene_simd2f_t;
     (graphene_simd4f_t) graphene_simd4f_init (__floor_x, __floor_y, __floor_z, __floor_w); \
   }))
 
+# define graphene_simd4f_madd(a,b,c) \
+  (__extension__ ({ \
+    (graphene_simd4f_t) graphene_simd4f_add (graphene_simd4f_mul ((a), (b)), (c)); \
+  }))
+
 #elif defined _MSC_VER /* Visual Studio ARM */
 
 # define graphene_simd4f_init(x,y,z,w) _simd4f_init(x,y,z,w)
@@ -1840,6 +1881,16 @@ _simd4f_floor (const graphene_simd4f_t s)
   return graphene_simd4f_init (__floor_x, __floor_y, __floor_z, __floor_w);
 }
 
+# define graphene_simd4f_madd(a,b,c) _simd4f_madd(a,b,c)
+/* (a * b) + c, built from graphene_simd4f_mul and graphene_simd4f_add. */
+static inline graphene_simd4f_t
+_simd4f_madd (const graphene_simd4f_t a,
+              const graphene_simd4f_t b,
+              const graphene_simd4f_t c)
+{
+  return graphene_simd4f_add (graphene_simd4f_mul (a, b), c);
+}
+
 #else /* ARM NEON intrinsics-not GCC or Visual Studio */
 
 #  error "Need GCC-compatible or Visual Studio compiler for ARM NEON extensions."
@@ -1956,6 +2007,8 @@ _simd4f_floor (const graphene_simd4f_t s)
   (graphene_simd4f_ceil ((s)))
 #define graphene_simd4f_floor(s) \
   (graphene_simd4f_floor ((s)))
+#define graphene_simd4f_madd(a,b,c) \
+  (graphene_simd4f_madd ((a), (b), (c)))
 
 #else
 # error "Unsupported simd4f implementation."
@@ -1963,26 +2016,6 @@ _simd4f_floor (const graphene_simd4f_t s)
 
 /* Generic operations, inlined */
 
-/**
- * graphene_simd4f_madd:
- * @m1: a #graphene_simd4f_t
- * @m2: a #graphene_simd4f_t
- * @a: a #graphene_simd4f_t
- *
- * Adds @a to the product of @m1 and @m2.
- *
- * Returns: the result vector
- *
- * Since: 1.0
- */
-static inline graphene_simd4f_t
-graphene_simd4f_madd (const graphene_simd4f_t m1,
-                      const graphene_simd4f_t m2,
-                      const graphene_simd4f_t a)
-{
-  return graphene_simd4f_add (graphene_simd4f_mul (m1, m2), a);
-}
-
 /**
  * graphene_simd4f_sum:
  * @v: a #graphene_simd4f_t
diff --git a/src/graphene-simd4f.c b/src/graphene-simd4f.c
index 00c545b..d9f7e99 100644
--- a/src/graphene-simd4f.c
+++ b/src/graphene-simd4f.c
@@ -1073,6 +1073,26 @@ graphene_simd4f_t
   return graphene_simd4f_floor (s);
 }
 
+/**
+ * graphene_simd4f_madd:
+ * @a: a #graphene_simd4f_t
+ * @b: a #graphene_simd4f_t
+ * @c: a #graphene_simd4f_t
+ *
+ * Adds @c to the product of @a and @b.
+ *
+ * Returns: the result vector
+ *
+ * Since: 1.0
+ */
+graphene_simd4f_t
+(graphene_simd4f_madd) (const graphene_simd4f_t a,
+                        const graphene_simd4f_t b,
+                        const graphene_simd4f_t c)
+{
+  return graphene_simd4f_madd (a, b, c);
+}
+
 #else /* GRAPHENE_USE_SCALAR */
 
 graphene_simd4f_t
@@ -1516,4 +1536,12 @@ graphene_simd4f_t
   return graphene_simd4f_init (floorf (s.x), floorf (s.y), floorf (s.z), floorf (s.w));
 }
 
+graphene_simd4f_t
+(graphene_simd4f_madd) (const graphene_simd4f_t a,
+                        const graphene_simd4f_t b,
+                        const graphene_simd4f_t c)
+{
+  return graphene_simd4f_add (graphene_simd4f_mul (a, b), c);
+}
+
 #endif /* GRAPHENE_USE_SCALAR */