sslotin · Thraetaona · Dec 11, 2022
diff --git a/matmul/self-contained.cc b/matmul/self-contained.cc
@@ -1,76 +1,235 @@
 #pragma GCC optimize("Ofast,unroll-loops")
 #pragma GCC target("avx2,fma")
 
-#include <bits/stdc++.h>
-#include <x86intrin.h>
-
-const int u = 48; // all block coordinates must be a multiple of lcm(6, 16) = 48
-
-const int n = 40 * u; // = 1920 (also try 240, 480, 960)
-alignas(64) float a[n * n], b[n * n], c[n * n];
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <stdint.h>
+#include <errno.h> // needed for the extern int errno;
+
+#ifndef __cplusplus // Dependencies that are specific to C (as opposed to C++)
+#include <stdalign.h> // for aligned memory allocations (needed by SIMD)
+#endif
+
+// C version of C++'s std::min().
+#define MIN(a, b) ({ \
+    __typeof__ (a) _a = (a); \
+    __typeof__ (b) _b = (b); \
+    _a < _b ? _a : _b; \
+})
+
+#define KERNEL_ROWS 6
+#define KERNEL_COLS 2
+
+// These may be fine-tuned for different target architectures and cache sizes.
+#define L3_STEP 64  // How many columns of matrix B to select.
+#define L2_STEP 120 // How many rows of matrix A to select.
+#define L1_STEP 240 // How many rows of matrix B to select.
+
+// Documented here: https://gcc.gnu.org/onlinedocs/gcc/Vector-Extensions.html
+//
+// v8sf means that this vector is a vector of 8 units of 4 bytes.  In other
+// words, this vector is 32 (8x4) bytes wide and is divided into 8 float-sized
+// units that are 4 bytes each.
+typedef float v8sf __attribute__ (( vector_size(8 * sizeof (float)) ));
+
+
+// A 6x16 micro-kernel to benefit from instruction-level SIMD parallelism.
+//
+// NOTE: While these for-loops are nested, they are also quite short in length.
+// For this exact reason, they (and the 2-D SIMD vectors) will automatically be
+// unrolled by the compiler (as defined in the #pragma) for greater performance.
+//
+// This is a shortcut way in C to force an inline "variable" into a register.
+static const v8sf _FORCE_INTO_REGISTER = {};
+__attribute__ ((optimize("unroll-loops"))) static inline void kernel(
+    const float* mat_a, const v8sf* mat_b, v8sf* __restrict__ result,
+    const int x, const int y, const int l, const int r, const int n 
+) {
+    // Zero'ed initially, and then stored in the YMM (on x86) registers.
+    v8sf simd_registers[KERNEL_ROWS][KERNEL_COLS] = {0};
+
+    // NOTE: Use signed only; other types (e.g., uint_fast32_t, unsigned int, or
+    // uint32_t) prevent the Compiler from making the (rightful) assumption that
+    // overflows are undefined, especially in the first for-loop.
+    // Additionally, using int_fastN_t will leave it to the Compiler to  use the
+    // fastest available register.  This is usually a long int on x86 (so there
+    // won't be any saving on space) but this may still benefit smaller systems
+    // such as ARM where the cache space is scarce and the fastest available
+    // register may actually be smaller than a long int or even an int.
+    for (int_fast32_t k = l; k < r; ++k)
+        for (int_fast8_t  i = 0; i < KERNEL_ROWS; ++i)
+            for (int_fast8_t j = 0; j < KERNEL_COLS; ++j) {
+                // This forces mat_a[x + i][k] into a register, and 
+                // multiplies mat_b[k][y:y+16] by it, then updates
+                // simd_registers[i][0] and simd_registers[i][1].
+                // 
+                // Gets converted to an FMA (Fused Multiply-Add) by GCC.
+                simd_registers[i][j] += (_FORCE_INTO_REGISTER + mat_a[(x + i)*n + k]) * ((v8sf*)mat_b)[(k*n + y)/8 + j];
+            }
+
+    // Writes the sub-matrix results back from the vector registers.
+    for (int_fast8_t i = 0; i < KERNEL_ROWS; ++i)
+        for (int_fast8_t j = 0; j < KERNEL_COLS; ++j)
+            ((v8sf*)result)[(((x + i)*n + y)/8) + j] += simd_registers[i][j];
+}
 
-typedef float vec __attribute__ (( vector_size(32) ));
+// Helper function to allocate and zero a region of memory (similar to calloc())
+// but in an aligned manner for 32-bit floats. (Also error-checked)
+float* float_aligned_calloc(const unsigned int size) {
+    alignas(64) float* ptr = (float*) aligned_alloc(64, size * sizeof (*ptr));
+    // aligned_alloc() returns NULL on failure and/or 0-sized allocations.
+    if (ptr == NULL && size != 0) { 
+        // (If any) prints the error code with its message to stderr.
+        if (errno != 0) {
+            fprintf(stderr, \
+                "Error no. `%d` in aligned memory allocation was "
+                "encountered: \"%s\"\n ", errno, strerror(errno) \
+            );
+        }
+        // Memory allocation was not successful, so we return NULL to indicate
+        // that an error had occured.
+        return NULL;
+    }
+
+    memset(ptr, 0, size * sizeof (*ptr));
+    // The requested memory was successfully allocated and will be returned.
+    return ptr;
+}
 
-// 6x16 micro-kernel
-void kernel(int x, int y, int l, int r) {
-    vec t[6][2]{0};
+// Multiplies two square matrices of the same width and height.
+// Takes two pointers to said matrices, as well as their "dimension" (i.e.,
+// width or height)
+float* matmul(const float *mat_a, const float *mat_b, int n) {
+    // To simplify the implementation, we pad the height and width,
+    // so that they are divisible by 6 and 16, respectively.
+    int nx = ((n + 5)/6) * 6;
+    int ny = ((n + 15)/16) * 16;
+
+    // SIMD operartions need to be able to access their memory regions with
+    // aligned read/write operations, otherwise the program could crash.
+    float *aligned_mat_a = float_aligned_calloc(nx * ny);
+    float *aligned_mat_b = float_aligned_calloc(nx * ny);
+    float *aligned_result = float_aligned_calloc(nx * ny);
+
+    // Copies the non-aligned input arrays into the new, aligned ones.
+    for (int i = 0; i < n; ++i) {
+        memcpy(&aligned_mat_a[i * ny], &mat_a[i * n], sizeof (*aligned_mat_a) * n);
+        memcpy(&aligned_mat_b[i * ny], &mat_b[i * n], sizeof (*aligned_mat_b) * n);
+    }
 
-    for (int k = l; k < r; k++)
-        for (int i = 0; i < 6; i++)
-            for (int j = 0; j < 2; j++)
-                t[i][j] += (vec{} + a[x * n + i * n + k]) * ((vec*) b)[n / 8 * k + y / 8 + j];
+    // The outermost loops comprise the Macro Kernel.
+    for (int i3 = 0; i3 < ny; i3 += L3_STEP)
+        // We are now working with mat_b[:][i3:i3+L3_STEP].
+        for (int i2 = 0; i2 < nx; i2 += L2_STEP)
+        // We are now working with mat_aa[i2:i2+L2_STEP][:].
+            for (int i1 = 0; i1 < ny; i1 += L1_STEP)
+                // We are now working with b[i1:i1+s1][i3:i3+s3].
+                // This equates to updating c[i2:i2+s2][i3:i3+s3]
+                // with [l:r] = [i1:i1+s1]
+                for (int x = i2; x < MIN(i2 + L2_STEP, n); x += 6)
+                    for (int y = i3; y < MIN(i3 + L3_STEP, n); y += 16) {
+                        kernel(
+                            aligned_mat_a, (v8sf*)aligned_mat_b, (v8sf*)aligned_result,
+                            x, y, i1, MIN(i1 + L1_STEP, n), ny
+                        );
+                    }
+
+    float* result = (float*) malloc(n*n * sizeof (*result));
+    for (int i = 0; i < n; ++i)
+        memcpy(&result[i * n], &aligned_result[i * ny], n * sizeof (*result));
+
+    // Frees the previous temporary arrays as we are done working with them.
+    free(aligned_mat_a);
+    free(aligned_mat_b);
+    free(aligned_result);
 
-    for (int i = 0; i < 6; i++)
-        for (int j = 0; j < 2; j++)
-            ((vec*) c)[x * n / 8 + i * n / 8 + y / 8 + j] += t[i][j];
+    return (float*) result;
 }
 
-void matmul() {
-    const int s3 = 2 * u;
-    const int s2 = 4 * u;
-    const int s1 = 8 * u;
+// A very slow but correct method for double-checking the results from matmul()
+// and benchmarks' comparison.
+float* naive(const float* mat_a, const float* mat_b, const int n) {
+    float *result = (float*) calloc(n * n, sizeof (result));
+    if (result == NULL) {
+        return NULL;
+    }
 
-    for (int i3 = 0; i3 < n; i3 += s3)
-        for (int i2 = 0; i2 < n; i2 += s2)
-            for (int i1 = 0; i1 < n; i1 += s1)
-                for (int x = i2; x < std::min(i2 + s2, n); x += 6)
-                    for (int y = i3; y < std::min(i3 + s3, n); y += 16)
-                        kernel(x, y, i1, std::min(i1 + s1, n));
-}
-
-void naive() {
     for (int i = 0; i < n; i++)
         for (int j = 0; j < n; j++)
             for (int k = 0; k < n; k++)
-                c[i * n + j] += a[i * n + k] * b[k * n + j];
+                result[i*n + j] += mat_a[i*n + k] * mat_b[k*n + j];
+
+    return result;
 }
 
-void timeit(void (*f)()) {
-    memset(c, 0, sizeof c);
-    f();
-
-    static const int idx[5] = {rand(), rand(), rand(), rand(), rand()};
-    for (int i : idx)
-        printf("%.4f ", c[i % (n * n)]);
+static int idx[5];
+float* timeit(
+    float* (*f)(const float*, const float*, const int),
+    const float* mat_a, const float* mat_b, const int n
+) {
+    float *result_matrix = f(mat_a, mat_b, n);
+    // Selects 5 randomly pre-determined elements of the matrices to be shown.
+    for (int i = 0; i < 5; ++i)
+        printf("%.4f ", result_matrix[idx[i] % (n * n)]);
     printf("\n");
 
     clock_t start = clock();
-
     int cnt = 0;
     while (clock() - start < CLOCKS_PER_SEC)
-        f(), cnt++;
-
-    printf("%.6f x %d\n", float(clock() - start) / CLOCKS_PER_SEC / cnt, cnt);
+        f(mat_a, mat_b, n), cnt++;
+    printf("Took %.6f x %d\n", (float)(clock() - start) / CLOCKS_PER_SEC / cnt, cnt);
+
+    return result_matrix;
 }
 
+// Test Driver
 int main() {
-    for (int i = 0; i < n * n; i++) {
-        a[i] = float(rand()) / RAND_MAX;
-        b[i] = float(rand()) / RAND_MAX;
+    const int n = 1920; // Matrix of size n x n (n needs to be a natural number.)
+
+    // These are just test samples.  They should be allocated on the heap if
+    // their sizes are large, otherwise an stack overflow will happen.
+    float *mat_a = (float*) malloc(sizeof (*mat_a) * n*n);
+    float *mat_b = (float*) malloc(sizeof (*mat_b) * n*n);
+
+    for (int i = 0; i < n*n; ++i) {
+        mat_a[i] = (float)rand() / RAND_MAX;
+        mat_b[i] = (float)rand() / RAND_MAX;
+    }
+
+    // idx is a static global variable that timit() will use to randomly print
+    // 5 elements of its callee's result.
+    for (int i = 0; i < 5; ++i)
+        idx[i] = rand();
+
+    printf("Running matmul() on a matrix of size %dx%d...\n", n, n);
+    float* matmul_res = timeit(matmul, mat_a, mat_b, n);
+
+    printf("\n");
+
+    printf("Running naive() on a matrix of size %dx%d....\n", n, n);
+    float* naive_res = timeit(naive, mat_a, mat_b, n);
+
+    printf("\n\n");
+
+    printf("Calculations finished, comparing all the results now...\n");
+    // Compares each and every element of the results of both naive() and
+    // matmul() and reports if a mismatch was found (due to the use of -Ofast,
+    // an error/difference of up to 0.0010f is tolerated.)
+    for (int i = 0; i < n*n; ++i) {
+        if (abs(matmul_res[i] - naive_res[i]) > 0.0010f) {
+            printf(
+                "Mismatching element found: %.4f and %.4f at index %d with a delta of %f.\n", 
+                matmul_res[i], naive_res[i], i, matmul_res[i] - naive_res[i]
+            );
+            return EXIT_FAILURE;
+        }
     }
+    printf("Both results were fully matching.\n");
 
-    timeit(naive);
-    timeit(matmul);
+    free(matmul_res);
+    free(naive_res);
 
-    return 0;
+    return EXIT_SUCCESS;
 }