Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add AVX implementation of graphene_simd4f_madd() #269

Merged
merged 2 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions include/graphene-config.h.meson
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ extern "C" {
# if defined(GRAPHENE_USE_SSE)
# include <xmmintrin.h>
# include <emmintrin.h>
#
# if defined(_M_IX86_FP)
# if _M_IX86_FP >= 2
# define GRAPHENE_USE_SSE4_1
Expand All @@ -66,9 +67,18 @@ extern "C" {
# elif defined(_MSC_VER)
# define GRAPHENE_USE_SSE4_1
# endif
#
/* Advertise AVX when the compiler targets it. Note that AVX by itself does
 * not imply FMA: code using fused multiply-add intrinsics must check for
 * FMA support separately.
 */
# if defined(__AVX__)
#  define GRAPHENE_USE_AVX
# endif
#
# if defined(GRAPHENE_USE_SSE4_1)
# include <smmintrin.h>
# endif
#
# if defined(GRAPHENE_USE_AVX)
# include <immintrin.h>
# endif
typedef __m128 graphene_simd4f_t;
# elif defined(GRAPHENE_USE_ARM_NEON)
# if defined (_MSC_VER) && (_MSC_VER < 1920) && defined (_M_ARM64)
Expand Down
73 changes: 53 additions & 20 deletions include/graphene-simd4f.h
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,11 @@ graphene_simd4f_t graphene_simd4f_ceil (const graphene_simd4f_t
GRAPHENE_AVAILABLE_IN_1_12
graphene_simd4f_t graphene_simd4f_floor (const graphene_simd4f_t s);

/* Out-of-line entry point for the multiply-add operation. The backend
 * sections of this header also define a function-like macro with the same
 * name, so this declaration has to appear before any of those macros.
 */
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_madd (const graphene_simd4f_t a,
const graphene_simd4f_t b,
const graphene_simd4f_t c);

#if !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_SSE)

/* SSE2 implementation of SIMD 4f */
Expand Down Expand Up @@ -504,6 +509,18 @@ typedef GRAPHENE_ALIGN16 union {
}))
# endif

/* graphene_simd4f_madd(a,b,c): computes (a * b) + c on every lane.
 *
 * _mm_fmadd_ps() belongs to the FMA extension, not to AVX: a compiler that
 * targets AVX without FMA rejects it (or the binary faults on AVX-only
 * CPUs), so the fused path is taken only when __FMA__ is defined as well.
 * GRAPHENE_USE_AVX is still required because it is what pulls in
 * <immintrin.h>.
 *
 * The fused form rounds once; the fallback rounds after the multiplication
 * and again after the addition, so the last bit of the result may differ
 * between the two paths.
 */
# if defined(GRAPHENE_USE_AVX) && defined(__FMA__)
#  define graphene_simd4f_madd(a,b,c) \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_fmadd_ps ((a), (b), (c)); \
  }))
# else
#  define graphene_simd4f_madd(a,b,c) \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_add_ps (_mm_mul_ps ((a), (b)), (c)); \
  }))
# endif

/* On MSVC, we use static inlines */
# elif defined (_MSC_VER) /* Visual Studio SSE intrinsics */

Expand Down Expand Up @@ -835,6 +852,20 @@ _simd4f_floor (const graphene_simd4f_t s)
#endif
}

#define graphene_simd4f_madd(a,b,c) _simd4f_madd(a,b,c)

/* Returns (a * b) + c on every lane.
 *
 * _mm_fmadd_ps() is an FMA instruction and AVX alone does not guarantee it,
 * so GRAPHENE_USE_AVX is not a sufficient guard. GCC-compatible front ends
 * define __FMA__ when FMA is enabled. MSVC never defines __FMA__; there the
 * fused path is keyed on __AVX2__ instead.
 * NOTE(review): this assumes /arch:AVX2 targets always have FMA - confirm
 * against the MSVC /arch documentation.
 *
 * The fused form rounds once, the fallback twice, so results may differ in
 * the last bit.
 */
static inline graphene_simd4f_t
_simd4f_madd (const graphene_simd4f_t a,
              const graphene_simd4f_t b,
              const graphene_simd4f_t c)
{
#if defined(GRAPHENE_USE_AVX) && \
    (defined(__FMA__) || (defined(__AVX2__) && !defined(__clang__)))
  return _mm_fmadd_ps (a, b, c);
#else
  return _mm_add_ps (_mm_mul_ps (a, b), c);
#endif
}

#else /* SSE intrinsics-not GCC or Visual Studio */

# error "Need GCC-compatible or Visual Studio compiler for SSE extensions."
Expand Down Expand Up @@ -1158,6 +1189,11 @@ typedef int graphene_simd4i_t __attribute__((vector_size (16)));
(graphene_simd4f_t) graphene_simd4f_init (__floor_x, __floor_y, __floor_z, __floor_w); \
}))

/* graphene_simd4f_madd(a,b,c): multiplies a by b, then adds c to the
 * product, using this backend's mul and add operations.
 */
# define graphene_simd4f_madd(a,b,c) \
  (__extension__ ({ \
    const graphene_simd4f_t __madd_ab = graphene_simd4f_mul ((a), (b)); \
    (graphene_simd4f_t) graphene_simd4f_add (__madd_ab, (c)); \
  }))

#elif !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_ARM_NEON)

/* ARM Neon implementation of SIMD4f */
Expand Down Expand Up @@ -1498,6 +1534,11 @@ typedef float32x2_t graphene_simd2f_t;
(graphene_simd4f_t) graphene_simd4f_init (__floor_x, __floor_y, __floor_z, __floor_w); \
}))

/* graphene_simd4f_madd(a,b,c): the product of a and b is computed first and
 * c is then added to it, via the mul and add operations of this backend.
 */
# define graphene_simd4f_madd(a,b,c) \
  (__extension__ ({ \
    const graphene_simd4f_t __madd_prod = graphene_simd4f_mul ((a), (b)); \
    (graphene_simd4f_t) graphene_simd4f_add (__madd_prod, (c)); \
  }))

#elif defined _MSC_VER /* Visual Studio ARM */

# define graphene_simd4f_init(x,y,z,w) _simd4f_init(x,y,z,w)
Expand Down Expand Up @@ -1840,6 +1881,16 @@ _simd4f_floor (const graphene_simd4f_t s)
return graphene_simd4f_init (__floor_x, __floor_y, __floor_z, __floor_w);
}

# define graphene_simd4f_madd(a,b,c) _simd4f_madd(a,b,c)

/* Returns c added to the product of a and b. */
static inline graphene_simd4f_t
_simd4f_madd (const graphene_simd4f_t a,
              const graphene_simd4f_t b,
              const graphene_simd4f_t c)
{
  const graphene_simd4f_t product = graphene_simd4f_mul (a, b);

  return graphene_simd4f_add (product, c);
}

#else /* ARM NEON intrinsics-not GCC or Visual Studio */

# error "Need GCC-compatible or Visual Studio compiler for ARM NEON extensions."
Expand Down Expand Up @@ -1956,33 +2007,15 @@ _simd4f_floor (const graphene_simd4f_t s)
(graphene_simd4f_ceil ((s)))
#define graphene_simd4f_floor(s) \
(graphene_simd4f_floor ((s)))
/* A function-like macro is not re-expanded inside its own replacement list,
 * so this forwards to the real graphene_simd4f_madd() function instead of
 * recursing; the macro only adds parentheses around the arguments.
 */
#define graphene_simd4f_madd(a,b,c) \
(graphene_simd4f_madd ((a), (b), (c)))

#else
# error "Unsupported simd4f implementation."
#endif

/* Generic operations, inlined */

/**
 * graphene_simd4f_madd:
 * @m1: a #graphene_simd4f_t
 * @m2: a #graphene_simd4f_t
 * @a: a #graphene_simd4f_t
 *
 * Multiplies @m1 by @m2 and adds @a to the product.
 *
 * Returns: the result vector
 *
 * Since: 1.0
 */
static inline graphene_simd4f_t
graphene_simd4f_madd (const graphene_simd4f_t m1,
                      const graphene_simd4f_t m2,
                      const graphene_simd4f_t a)
{
  const graphene_simd4f_t product = graphene_simd4f_mul (m1, m2);

  return graphene_simd4f_add (product, a);
}

/**
* graphene_simd4f_sum:
* @v: a #graphene_simd4f_t
Expand Down
28 changes: 28 additions & 0 deletions src/graphene-simd4f.c
Original file line number Diff line number Diff line change
Expand Up @@ -1073,6 +1073,26 @@ graphene_simd4f_t
return graphene_simd4f_floor (s);
}

/**
 * graphene_simd4f_madd:
 * @a: a #graphene_simd4f_t
 * @b: a #graphene_simd4f_t
 * @c: a #graphene_simd4f_t
 *
 * Adds @c to the product of @a and @b.
 *
 * Depending on the SIMD backend selected in graphene-simd4f.h, the
 * multiplication and the addition may be fused into a single operation.
 *
 * Returns: the result vector
 *
 * Since: 1.0
 */
graphene_simd4f_t
(graphene_simd4f_madd) (const graphene_simd4f_t a,
const graphene_simd4f_t b,
const graphene_simd4f_t c)
{
/* The parentheses around the function name above keep the function-like
 * macro of the same name from expanding in the definition; the call below
 * is not parenthesised, so it does expand to the backend macro.
 */
return graphene_simd4f_madd (a, b, c);
}

#else /* GRAPHENE_USE_SCALAR */

graphene_simd4f_t
Expand Down Expand Up @@ -1516,4 +1536,12 @@ graphene_simd4f_t
return graphene_simd4f_init (floorf (s.x), floorf (s.y), floorf (s.z), floorf (s.w));
}

graphene_simd4f_t
(graphene_simd4f_madd) (const graphene_simd4f_t a,
const graphene_simd4f_t b,
const graphene_simd4f_t c)
{
return graphene_simd4f_add (graphene_simd4f_mul (a, b), c);
}

#endif /* GRAPHENE_USE_SCALAR */
Loading