Adds a simple image processing example

Two kernels are available: demosaicking and inverting. Cluster and FC implementations. Find more info in the README
SEU-NetSI · Apr 9, 2021 · 60081a5 · 60081a5
1 parent 2438954
commit 60081a5
Show file tree

Hide file tree

Showing 5 changed files with 581 additions and 0 deletions.
diff --git a/GAP8/image_processing_examples/simple_kernel_example/Makefile b/GAP8/image_processing_examples/simple_kernel_example/Makefile
@@ -0,0 +1,39 @@
+APP = test
+APP_SRCS += test.c $(GAP_LIB_PATH)/img_io/ImgIO.c img_proc.c
+APP_INC  += . $(GAP_LIB_PATH)/include
+
+APP_CFLAGS += -O3 -g
+
+
+PMSIS_OS ?= pulp_os
+
+# this flag enables asynchronous image capturing
+# keep in mind that you should always have a buffer enqueued
+# if recording an image stream to avoid missing data (meaning you need 2)
+APP_CFLAGS += -DASYNC_CAPTURE
+# configures camera in QVGA format (324x244)
+# the extra 4 is from padding for the RGB camera (but also present on the grey one)
+APP_CFLAGS += -DQVGA_MODE
+
+# YOU CAN ONLY ACTIVATE ONE KERNEL AT A TIME
+
+# apply the demosaicking algorithm (only useful if using RGB camera)
+# runs sequentially on the fabric controller
+# APP_CFLAGS += -DDEMOSAICKING_KERNEL_FC
+# apply the demosaicking algorithm (only useful if using RGB camera)
+# runs in parallel on the cluster
+APP_CFLAGS += -DDEMOSAICKING_KERNEL_CLUSTER
+# only if one of the demosaicking kernels above is active
+# activate for a colored output
+APP_CFLAGS += -DCOLOR_IMAGE
+# apply an example kernel that inverts the image
+# runs sequentially on the fabric controller
+# APP_CFLAGS += -DINVERTING_KERNEL_FC
+# apply an example kernel that inverts the image
+# runs in parallel on the cluster
+# APP_CFLAGS += -DINVERTING_KERNEL_CLUSTER
+
+# remove the image files listed below in addition to the regular clean
+clean::
+	rm -rf img_raw.ppm img_color.ppm img_gray.ppm img_inverted.ppm
+
+include $(RULES_DIR)/pmsis_rules.mk
diff --git a/GAP8/image_processing_examples/simple_kernel_example/README.md b/GAP8/image_processing_examples/simple_kernel_example/README.md
@@ -0,0 +1,24 @@
+# Image processing examples
+
+This example takes an image, applies a kernel and writes the image over your JTAG cable (using openOCD file semi-hosting) to your computer.
+You can choose between two kernels:
+
+- demosaicking 
+- inverting
+
+For both, you can choose to execute the kernel either sequentially on the fabric controller or in parallel on the cluster. Check out the defines in the Makefile to configure your kernel! But be careful: only one kernel can be selected at a time.
+## Performance
+To keep this example simple, we are just saving the time before and after the kernel runs - if you want more precise performance measurements check out  https://greenwaves-technologies.com/manuals/BUILD/PMSIS_API/html/group__Perf.html .
+Here are some pitfalls that we fixed in the demosaicking kernel:
+At first, computing a grayscale image from an RGB camera took around 2.33s (all measurements here are at a frequency of 50MHz on the fabric controller and cluster). That's awfully long, so what were we doing wrong?
+
+- `output[idx] = 0.33*red + 0.33*green + 0.33*blue;` was not a good idea - we do not have a floating point unit (FPU) on GAP8, so float multiplications are really slow. 
+- `output[idx] = red/3 + green/3 + blue/3;` This got us to 303.3ms - but we can still improve, right?
+- `output[idx] = (red + green + blue)/3;` We only need one division! This gives us 199.2ms
+- We also have 8 cores on the cluster! Parallelizing it brought us down to 33.3ms.
+
+With those 3 steps we could already improve by a factor of 70. There are still other possible improvements, such as moving the check `grayscale == 1` out of the for-loop to avoid branch mispredictions, or unrolling the loop.
+
+One last hint: be careful with printf - this is a very long and complicated function and should never be inside your performance measurement.
+
+This example was developed and tested with the gap-sdk 3.8.1.
diff --git a/GAP8/image_processing_examples/simple_kernel_example/img_proc.c b/GAP8/image_processing_examples/simple_kernel_example/img_proc.c
@@ -0,0 +1,240 @@
+#include "img_proc.h"
+
+/**
+ * Demosaic a raw Bayer frame sequentially (fabric-controller version).
+ *
+ * Layout handled by the branches below: red samples at even x / even y,
+ * blue samples at odd x / odd y, green samples at the two mixed-parity
+ * positions. A colour that is not sampled at a pixel is the integer mean of
+ * the nearest neighbours that do carry it. The one-pixel border of the frame
+ * has no complete 3x3 neighbourhood and is written as 0.
+ *
+ * @param input     raw frame, width * height bytes, row-major
+ * @param output    result buffer: width * height bytes when grayscale is
+ *                  non-zero, otherwise 3 * width * height bytes (R, G, B)
+ * @param width     frame width in pixels
+ * @param height    frame height in pixels
+ * @param grayscale non-zero: store (R + G + B) / 3 per pixel;
+ *                  zero: store R, G, B per pixel
+ */
+void demosaicking(char *input, char* output, int width, int height, const int grayscale)
+{
+    /*
+     * Pixels are 8-bit unsigned samples. Plain char may be signed, which
+     * would turn every sample >= 128 negative and corrupt the averages, so
+     * all accesses go through unsigned char.
+     */
+    const unsigned char *src = (const unsigned char *)input;
+    unsigned char *dst = (unsigned char *)output;
+
+    /* Offsets of the Bayer pattern origin; 0/0 puts red at pixel (0, 0). */
+    const int x_shift = 0;
+    const int y_shift = 0;
+
+    for (int y = 0; y < height; y++)
+    {
+        for (int x = 0; x < width; x++)
+        {
+            const int idx = y * width + x;
+            /* Border pixels keep these zeros. */
+            unsigned char red = 0;
+            unsigned char green = 0;
+            unsigned char blue = 0;
+
+            if (x != 0 && y != 0 && x != width - 1 && y != height - 1)
+            {
+                const int up = idx - width;
+                const int down = idx + width;
+
+                /* Neighbour sums; int arithmetic, so no 8-bit overflow. */
+                const int horizontal = src[idx - 1] + src[idx + 1];
+                const int vertical = src[up] + src[down];
+                const int cross = horizontal + vertical;
+                const int diagonal = src[up - 1] + src[up + 1]
+                                   + src[down - 1] + src[down + 1];
+
+                const int x_odd = (x + x_shift) & 1;
+                const int y_odd = (y + y_shift) & 1;
+
+                if (!x_odd && !y_odd) //R
+                {
+                    red = src[idx];
+                    green = (unsigned char)(cross / 4);
+                    blue = (unsigned char)(diagonal / 4);
+                }
+                else if (x_odd && !y_odd) //G2
+                {
+                    red = (unsigned char)(horizontal / 2);
+                    green = src[idx];
+                    blue = (unsigned char)(vertical / 2);
+                }
+                else if (!x_odd && y_odd) //G1
+                {
+                    red = (unsigned char)(vertical / 2);
+                    green = src[idx];
+                    blue = (unsigned char)(horizontal / 2);
+                }
+                else //B
+                {
+                    red = (unsigned char)(diagonal / 4);
+                    green = (unsigned char)(cross / 4);
+                    blue = src[idx];
+                }
+            }
+
+            if (grayscale)
+            {
+                dst[idx] = (unsigned char)((red + green + blue) / 3);
+            }
+            else
+            {
+                dst[idx * 3] = red;
+                dst[idx * 3 + 1] = green;
+                dst[idx * 3 + 2] = blue;
+            }
+        }
+    }
+}
+
+/**
+ * Demosaic a raw Bayer frame on the cluster; every core runs this function.
+ *
+ * The image rows are split into nPE contiguous bands (band height rounded
+ * up) and the calling core processes the band selected by pi_core_id().
+ * Per-pixel processing matches demosaicking(): red at even x / even y, blue
+ * at odd x / odd y, green at the mixed-parity positions, missing colours
+ * averaged from the neighbours, frame border written as 0.
+ *
+ * @param args pointer to a plp_example_kernel_instance_i32 describing the
+ *             buffers, the image size, the number of cores and the output
+ *             mode (grayscale non-zero: 1 byte per pixel, zero: 3 bytes)
+ */
+void cluster_demosaicking(void* args)
+{
+    const plp_example_kernel_instance_i32 *a = (const plp_example_kernel_instance_i32 *)args;
+
+    /*
+     * Pixels are 8-bit unsigned samples; go through unsigned char so the
+     * averages do not depend on whether plain char is signed.
+     */
+    const unsigned char *src = (const unsigned char *)a->srcBuffer;
+    unsigned char *dst = (unsigned char *)a->resBuffer;
+    const uint32_t width = a->width;
+    const uint32_t height = a->height;
+    const uint32_t nPE = a->nPE;
+    const uint32_t grayscale = a->grayscale;
+    const uint32_t core_id = pi_core_id();
+
+    /* Without cores to share the work the split below would divide by 0. */
+    if (nPE == 0)
+    {
+        return;
+    }
+
+    // number of rows per core, rounded up
+    const uint32_t rows_per_core = (height + nPE - 1) / nPE;
+    // first row after the band of this core
+    uint32_t row_end = (core_id + 1) * rows_per_core;
+    // rounding up can push the last band(s) past the image: clamp
+    if (row_end > height)
+    {
+        row_end = height;
+    }
+
+    /* Offsets of the Bayer pattern origin; 0/0 puts red at pixel (0, 0). */
+    const uint32_t x_shift = 0;
+    const uint32_t y_shift = 0;
+
+    // loop over the band assigned to this core
+    for (uint32_t y = core_id * rows_per_core; y < row_end; y++)
+    {
+        for (uint32_t x = 0; x < width; x++)
+        {
+            const uint32_t idx = y * width + x;
+            /* Border pixels keep these zeros. */
+            unsigned char red = 0;
+            unsigned char green = 0;
+            unsigned char blue = 0;
+
+            if (x != 0 && y != 0 && x != width - 1 && y != height - 1)
+            {
+                const uint32_t up = idx - width;
+                const uint32_t down = idx + width;
+
+                /* Neighbour sums; int arithmetic, so no 8-bit overflow. */
+                const int horizontal = src[idx - 1] + src[idx + 1];
+                const int vertical = src[up] + src[down];
+                const int cross = horizontal + vertical;
+                const int diagonal = src[up - 1] + src[up + 1]
+                                   + src[down - 1] + src[down + 1];
+
+                const uint32_t x_odd = (x + x_shift) & 1u;
+                const uint32_t y_odd = (y + y_shift) & 1u;
+
+                if (!x_odd && !y_odd) //R
+                {
+                    red = src[idx];
+                    green = (unsigned char)(cross / 4);
+                    blue = (unsigned char)(diagonal / 4);
+                }
+                else if (x_odd && !y_odd) //G2
+                {
+                    red = (unsigned char)(horizontal / 2);
+                    green = src[idx];
+                    blue = (unsigned char)(vertical / 2);
+                }
+                else if (!x_odd && y_odd) //G1
+                {
+                    red = (unsigned char)(vertical / 2);
+                    green = src[idx];
+                    blue = (unsigned char)(horizontal / 2);
+                }
+                else //B
+                {
+                    red = (unsigned char)(diagonal / 4);
+                    green = (unsigned char)(cross / 4);
+                    blue = src[idx];
+                }
+            }
+
+            if (grayscale)
+            {
+                dst[idx] = (unsigned char)((red + green + blue) / 3);
+            }
+            else
+            {
+                dst[idx * 3] = red;
+                dst[idx * 3 + 1] = green;
+                dst[idx * 3 + 2] = blue;
+            }
+        }
+    }
+}
+
+/**
+ * Invert an 8-bit image sequentially (fabric-controller version).
+ *
+ * Every output byte is 255 minus the corresponding input byte.
+ *
+ * @param input  source image, width * height bytes, row-major
+ * @param output destination image, width * height bytes
+ * @param width  image width in pixels
+ * @param height image height in pixels
+ */
+void inverting(char *input, char* output, int width, int height)
+{
+    for (int row = 0; row < height; ++row)
+    {
+        const int row_start = row * width;
+
+        for (int col = 0; col < width; ++col)
+        {
+            const int pos = row_start + col;
+
+            output[pos] = 255 - input[pos];
+        }
+    }
+}
+
+
+/**
+ * Invert an 8-bit image on the cluster; every core runs this function.
+ *
+ * The width * height bytes are split into nPE contiguous chunks (chunk size
+ * rounded up) and the calling core processes the chunk selected by
+ * pi_core_id(). Every output byte is 255 minus the corresponding input byte.
+ *
+ * @param args pointer to a plp_example_kernel_instance_i32 describing the
+ *             buffers, the image size and the number of cores
+ */
+void cluster_inverting(void* args)
+{
+    const plp_example_kernel_instance_i32 *a = (const plp_example_kernel_instance_i32 *)args;
+    const char *srcBuffer = a->srcBuffer;
+    char *resBuffer = a->resBuffer;
+    const uint32_t nPE = a->nPE;
+    const uint32_t total = a->width * a->height;
+    const uint32_t core_id = pi_core_id();
+
+    /* Without cores to share the work the split below would divide by 0. */
+    if (nPE == 0)
+    {
+        return;
+    }
+
+    // number of elements per core, rounded up
+    const uint32_t per_core = (total + nPE - 1) / nPE;
+    // first element after the chunk of this core
+    uint32_t upper_bound = (core_id + 1) * per_core;
+    // rounding up can push the last chunk(s) past the image: clamp
+    if (upper_bound > total)
+    {
+        upper_bound = total;
+    }
+
+    // loop over the chunk assigned to this core
+    for (uint32_t idx = core_id * per_core; idx < upper_bound; idx++)
+    {
+        resBuffer[idx] = 255 - srcBuffer[idx];
+    }
+}
diff --git a/GAP8/image_processing_examples/simple_kernel_example/img_proc.h b/GAP8/image_processing_examples/simple_kernel_example/img_proc.h
@@ -0,0 +1,20 @@
+#ifndef __IMG_PROC_H__
+#define __IMG_PROC_H__
+
+#include "pmsis.h"
+
+// Argument bundle passed (as void*) to cluster_demosaicking and cluster_inverting.
+typedef struct {
+    char *srcBuffer;     // pointer to the input image (read by the kernels)
+    char *resBuffer;     // pointer to the output image (written by the kernels)
+    uint32_t width;      // image width
+    uint32_t height;     // image height
+    uint32_t nPE;        // number of cores the work is divided among
+    uint32_t grayscale;        // cluster_demosaicking only: non-zero = 1 byte per pixel, zero = 3 bytes (R, G, B) per pixel
+} plp_example_kernel_instance_i32;
+
+// Sequential demosaicking: output holds width*height bytes if grayscale is non-zero, else 3*width*height bytes (R, G, B); the one-pixel border is set to 0.
+void demosaicking(char *input, char* output, int width, int height, int grayscale);
+// Demosaicking for one cluster core; args points to a plp_example_kernel_instance_i32, the rows handled are selected by pi_core_id().
+void cluster_demosaicking(void* args);
+// Sequential inversion: output[i] = 255 - input[i] for all width*height bytes.
+void inverting(char *input, char* output, int width, int height);
+// Inversion for one cluster core; args points to a plp_example_kernel_instance_i32, the bytes handled are selected by pi_core_id().
+void cluster_inverting(void* args);
+
+#endif