ebassi · ebassi · Aug 15, 2024 · Aug 12, 2024 · Aug 12, 2024
diff --git a/include/graphene-config.h.meson b/include/graphene-config.h.meson
@@ -57,6 +57,7 @@ extern "C" {
 # if defined(GRAPHENE_USE_SSE)
 #  include <xmmintrin.h>
 #  include <emmintrin.h>
+#
 #  if defined(_M_IX86_FP)
 #   if _M_IX86_FP >= 2
 #    define GRAPHENE_USE_SSE4_1
@@ -66,9 +67,18 @@ extern "C" {
 #  elif defined(_MSC_VER)
 #   define GRAPHENE_USE_SSE4_1
 #  endif
+#
+#  if defined(__FMA__) || (defined(_MSC_VER) && defined(__AVX2__))
+#   define GRAPHENE_USE_AVX /* gates _mm_fmadd_ps, which needs FMA and not merely AVX */
+#  endif
+#
 #  if defined(GRAPHENE_USE_SSE4_1)
 #   include <smmintrin.h>
 #  endif
+#
+#  if defined(GRAPHENE_USE_AVX)
+#   include <immintrin.h>
+#  endif
 typedef __m128 graphene_simd4f_t;
 # elif defined(GRAPHENE_USE_ARM_NEON)
 #  if defined (_MSC_VER) && (_MSC_VER < 1920) && defined (_M_ARM64)

diff --git a/include/graphene-simd4f.h b/include/graphene-simd4f.h
@@ -179,6 +179,11 @@ graphene_simd4f_t       graphene_simd4f_ceil            (const graphene_simd4f_t
 GRAPHENE_AVAILABLE_IN_1_12
 graphene_simd4f_t       graphene_simd4f_floor           (const graphene_simd4f_t s);
 
+GRAPHENE_AVAILABLE_IN_1_0
+graphene_simd4f_t       graphene_simd4f_madd            (const graphene_simd4f_t a,
+                                                         const graphene_simd4f_t b,
+                                                         const graphene_simd4f_t c);
+
 #if !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_SSE)
 
 /* SSE2 implementation of SIMD 4f */
@@ -504,6 +509,18 @@ typedef GRAPHENE_ALIGN16 union {
   }))
 #  endif
 
+#  if defined(GRAPHENE_USE_AVX)
+#   define graphene_simd4f_madd(a,b,c) \
+  (__extension__ ({ \
+    (graphene_simd4f_t) _mm_fmadd_ps ((a), (b), (c)); \
+  }))
+#  else
+#   define graphene_simd4f_madd(a,b,c) \
+  (__extension__ ({ \
+    (graphene_simd4f_t) _mm_add_ps (_mm_mul_ps ((a), (b)), (c)); \
+  }))
+#  endif
+
 /* On MSVC, we use static inlines */
 # elif defined (_MSC_VER) /* Visual Studio SSE intrinsics */
 
@@ -835,6 +852,20 @@ _simd4f_floor (const graphene_simd4f_t s)
 #endif
 }
 
+#define graphene_simd4f_madd(a,b,c) _simd4f_madd(a,b,c)
+
+static inline graphene_simd4f_t
+_simd4f_madd (const graphene_simd4f_t a,
+              const graphene_simd4f_t b,
+              const graphene_simd4f_t c)
+{
+#if defined(GRAPHENE_USE_AVX)
+  return _mm_fmadd_ps (a, b, c);
+#else
+  return _mm_add_ps (_mm_mul_ps (a, b), c);
+#endif
+}
+
 #else /* SSE intrinsics-not GCC or Visual Studio */
 
 #  error "Need GCC-compatible or Visual Studio compiler for SSE extensions."
@@ -1158,6 +1189,11 @@ typedef int graphene_simd4i_t __attribute__((vector_size (16)));
     (graphene_simd4f_t) graphene_simd4f_init (__floor_x, __floor_y, __floor_z, __floor_w); \
   }))
 
+# define graphene_simd4f_madd(a,b,c) \
+  (__extension__ ({ \
+    (graphene_simd4f_t) graphene_simd4f_add (graphene_simd4f_mul ((a), (b)), (c)); \
+  }))
+
 #elif !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_ARM_NEON)
 
 /* ARM Neon implementation of SIMD4f */
@@ -1498,6 +1534,11 @@ typedef float32x2_t graphene_simd2f_t;
     (graphene_simd4f_t) graphene_simd4f_init (__floor_x, __floor_y, __floor_z, __floor_w); \
   }))
 
+# define graphene_simd4f_madd(a,b,c) \
+  (__extension__ ({ \
+    (graphene_simd4f_t) graphene_simd4f_add (graphene_simd4f_mul ((a), (b)), (c)); \
+  }))
+
 #elif defined _MSC_VER /* Visual Studio ARM */
 
 # define graphene_simd4f_init(x,y,z,w) _simd4f_init(x,y,z,w)
@@ -1840,6 +1881,16 @@ _simd4f_floor (const graphene_simd4f_t s)
   return graphene_simd4f_init (__floor_x, __floor_y, __floor_z, __floor_w);
 }
 
+# define graphene_simd4f_madd(a,b,c) _simd4f_madd(a,b,c)
+
+static inline graphene_simd4f_t
+_simd4f_madd (const graphene_simd4f_t a,
+              const graphene_simd4f_t b,
+              const graphene_simd4f_t c)
+{
+  return graphene_simd4f_add (graphene_simd4f_mul (a, b), c);
+}
+
 #else /* ARM NEON intrinsics-not GCC or Visual Studio */
 
 #  error "Need GCC-compatible or Visual Studio compiler for ARM NEON extensions."
@@ -1956,33 +2007,15 @@ _simd4f_floor (const graphene_simd4f_t s)
   (graphene_simd4f_ceil ((s)))
 #define graphene_simd4f_floor(s) \
   (graphene_simd4f_floor ((s)))
+#define graphene_simd4f_madd(a,b,c) \
+  (graphene_simd4f_madd ((a), (b), (c)))
 
 #else
 # error "Unsupported simd4f implementation."
 #endif
 
 /* Generic operations, inlined */
 
-/**
- * graphene_simd4f_madd:
- * @m1: a #graphene_simd4f_t
- * @m2: a #graphene_simd4f_t
- * @a: a #graphene_simd4f_t
- *
- * Adds @a to the product of @m1 and @m2.
- *
- * Returns: the result vector
- *
- * Since: 1.0
- */
-static inline graphene_simd4f_t
-graphene_simd4f_madd (const graphene_simd4f_t m1,
-                      const graphene_simd4f_t m2,
-                      const graphene_simd4f_t a)
-{
-  return graphene_simd4f_add (graphene_simd4f_mul (m1, m2), a);
-}
-
 /**
  * graphene_simd4f_sum:
  * @v: a #graphene_simd4f_t

diff --git a/src/graphene-simd4f.c b/src/graphene-simd4f.c
@@ -1073,6 +1073,26 @@ graphene_simd4f_t
   return graphene_simd4f_floor (s);
 }
 
+/**
+ * graphene_simd4f_madd:
+ * @a: a #graphene_simd4f_t
+ * @b: a #graphene_simd4f_t
+ * @c: a #graphene_simd4f_t
+ *
+ * Adds @c to the product of @a and @b.
+ *
+ * Returns: the result vector
+ *
+ * Since: 1.0
+ */
+graphene_simd4f_t
+(graphene_simd4f_madd) (const graphene_simd4f_t a,
+                        const graphene_simd4f_t b,
+                        const graphene_simd4f_t c)
+{
+  return graphene_simd4f_madd (a, b, c);
+}
+
 #else /* GRAPHENE_USE_SCALAR */
 
 graphene_simd4f_t
@@ -1516,4 +1536,12 @@ graphene_simd4f_t
   return graphene_simd4f_init (floorf (s.x), floorf (s.y), floorf (s.z), floorf (s.w));
 }
 
+graphene_simd4f_t
+(graphene_simd4f_madd) (const graphene_simd4f_t a,
+                        const graphene_simd4f_t b,
+                        const graphene_simd4f_t c)
+{
+  return graphene_simd4f_add (graphene_simd4f_mul (a, b), c);
+}
+
 #endif /* GRAPHENE_USE_SCALAR */