forked from bitcraze/aideck-gap8-examples
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adds a simple image processing example
Two kernels are available: demosaicking and inverting. Cluster and FC implementations. Find more info in the README
- Loading branch information
Hanna Müller
committed
Apr 9, 2021
1 parent
2438954
commit 60081a5
Showing
5 changed files
with
581 additions
and
0 deletions.
There are no files selected for viewing
39 changes: 39 additions & 0 deletions
39
GAP8/image_processing_examples/simple_kernel_example/Makefile
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
APP = test | ||
APP_SRCS += test.c $(GAP_LIB_PATH)/img_io/ImgIO.c img_proc.c | ||
APP_INC += . $(GAP_LIB_PATH)/include | ||
|
||
APP_CFLAGS += -O3 -g | ||
|
||
|
||
PMSIS_OS ?= pulp_os | ||
|
||
# this flag enables asynchronous image capturing | ||
# keep in mind that you should always have a buffer enqueued | ||
# if recording an image stream to avoid missing data (meaning you need 2) | ||
APP_CFLAGS += -DASYNC_CAPTURE | ||
# configures camera in QVGA format (324x244) | ||
# the extra 4 is from padding for the RGB camera (but also present on the grey one) | ||
APP_CFLAGS += -DQVGA_MODE | ||
|
||
# YOU CAN ONLY ACTIVATE ONE KERNEL AT A TIME | ||
|
||
# apply the demosaicking algorithm (only useful if using RGB camera) | ||
# runs sequentially on the fabric controller | |
# APP_CFLAGS += -DDEMOSAICKING_KERNEL_FC | ||
# apply the demosaicking algorithm (only useful if using RGB camera) | ||
# runs in parallel on the cluster | |
APP_CFLAGS += -DDEMOSAICKING_KERNEL_CLUSTER | ||
# only if one of the demosaicking kernel above is active | ||
# activate for a colored output | ||
APP_CFLAGS += -DCOLOR_IMAGE | ||
# apply an example kernel that inverts the image | ||
# runs sequentially on the fabric controller | |
# APP_CFLAGS += -DINVERTING_KERNEL_FC | ||
# apply an example kernel that inverts the image | ||
# runs in parallel on the cluster | |
# APP_CFLAGS += -DINVERTING_KERNEL_CLUSTER | ||
|
||
clean:: | ||
rm -rf img_raw.ppm img_color.ppm img_gray.ppm img_inverted.ppm | ||
|
||
include $(RULES_DIR)/pmsis_rules.mk |
24 changes: 24 additions & 0 deletions
24
GAP8/image_processing_examples/simple_kernel_example/README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# Image processing examples | ||
|
||
This example takes an image, applies a kernel and writes the image over your JTAG cable (using openOCD file semi-hosting) to your computer. | ||
You can choose between two kernels: | ||
|
||
- demosaicking | ||
- inverting | ||
|
||
For both, you can choose to execute the kernel either sequentially on the fabric controller or parallelized on the cluster. Check out the defines in the Makefile to configure your kernel! But be careful: only one kernel can be active at a time. | |
## Performance | ||
To keep this example simple, we are just saving the time before and after the kernel runs - if you want more precise performance measurements check out https://greenwaves-technologies.com/manuals/BUILD/PMSIS_API/html/group__Perf.html . | ||
Here are some pitfalls that we fixed in the demosaicking kernel: | |
At first, computing a grayscale image from an RGB camera took around 2.33s (all measurements here are at a frequency of 50MHz on the fabric controller and cluster). That's awfully long, so what were we doing wrong? | |
|
||
- `output[idx] = 0.33*red + 0.33*green + 0.33*blue;` was not a good idea - we do not have a floating point unit (FPU) on GAP8, so float multiplications are really slow. | ||
- `output[idx] = red/3 + green/3 + blue/3;` This got us to 303.3ms - but we can still improve, right? | ||
- `output[idx] = (red + green + blue)/3;` We only need one division! This gives us 199.2ms | ||
- we also have 8 cores on the cluster! Parallelizing it brought us down to 33.3ms. | |
|
||
With those 3 steps we could already improve by a factor of 70. There are still other possible improvements, like loop unrolling or moving the check `grayscale == 1` out of the for-loop to avoid branch mispredictions. | |
|
||
One last hint: be careful with printf - this is a very long and complicated function and should never be inside your performance measurement. | ||
|
||
This example was developed and tested with the gap-sdk 3.8.1. |
240 changes: 240 additions & 0 deletions
240
GAP8/image_processing_examples/simple_kernel_example/img_proc.c
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,240 @@ | ||
#include "img_proc.h" | ||
|
||
/**
 * Demosaic a raw Bayer frame by averaging neighbouring samples
 * (sequential version).
 *
 * Pattern, as used by the branches below:
 *   even x, even y -> red sample      odd x, even y -> green sample (G2)
 *   even x, odd  y -> green sample (G1)  odd x, odd y -> blue sample
 * The missing channels of every interior pixel are the integer average of the
 * nearest neighbours carrying that channel. The one-pixel frame border has no
 * complete neighbourhood and is written as black.
 *
 * @param input     raw frame, width * height bytes, one 8-bit sample per pixel
 * @param output    result buffer: width * height bytes if grayscale is
 *                  non-zero, otherwise width * height * 3 bytes (R, G, B)
 * @param width     frame width in pixels
 * @param height    frame height in pixels
 * @param grayscale non-zero: store (R + G + B) / 3 per pixel;
 *                  zero: store the three channels interleaved
 */
void demosaicking(char *input, char* output, int width, int height, const int grayscale)
{
    /* Samples are intensities 0..255. Plain char may be signed, in which case
     * bright samples (>= 128) would enter the averages as negative numbers,
     * so all pixel accesses go through unsigned char. */
    const unsigned char *src = (const unsigned char *)input;
    unsigned char *dst = (unsigned char *)output;

    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x++)
        {
            const int idx = y * width + x;
            /* Border pixels keep these zeros and are emitted as black. */
            int red = 0;
            int green = 0;
            int blue = 0;

            if (x > 0 && y > 0 && x < width - 1 && y < height - 1)
            {
                const int up = idx - width;
                const int down = idx + width;
                const int odd_col = x & 1;
                const int odd_row = y & 1;

                if (odd_col == odd_row)
                {
                    /* Red (even/even) or blue (odd/odd) site: the opposite
                     * colour sits on the diagonals, green on the cross. */
                    const int diagonal = (src[up - 1] + src[down - 1] + src[down + 1] + src[up + 1]) / 4;

                    green = (src[idx - 1] + src[down] + src[idx + 1] + src[up]) / 4;
                    if (odd_col)
                    {
                        blue = src[idx];
                        red = diagonal;
                    }
                    else
                    {
                        red = src[idx];
                        blue = diagonal;
                    }
                }
                else
                {
                    /* Green site: one colour lies left/right, the other
                     * above/below, depending on the row parity. */
                    const int horizontal = (src[idx - 1] + src[idx + 1]) / 2;
                    const int vertical = (src[down] + src[up]) / 2;

                    green = src[idx];
                    if (odd_col)
                    {
                        /* G2: odd column, even row */
                        red = horizontal;
                        blue = vertical;
                    }
                    else
                    {
                        /* G1: even column, odd row */
                        red = vertical;
                        blue = horizontal;
                    }
                }
            }

            if (grayscale)
            {
                /* one integer division instead of three */
                dst[idx] = (unsigned char)((red + green + blue) / 3);
            }
            else
            {
                dst[idx * 3] = (unsigned char)red;
                dst[idx * 3 + 1] = (unsigned char)green;
                dst[idx * 3 + 2] = (unsigned char)blue;
            }
        }
    }
}
|
||
void cluster_demosaicking(void* args) | ||
{ | ||
uint32_t x = 0; | ||
uint32_t y = 0; | ||
uint32_t core_id = pi_core_id(), cluster_id = pi_cluster_id(); | ||
plp_example_kernel_instance_i32 *a = (plp_example_kernel_instance_i32*)args; | ||
char *input = a->srcBuffer; | ||
char *output = a->resBuffer; | ||
uint32_t width = a->width; | ||
uint32_t height = a->height; | ||
uint32_t nPE = a->nPE; | ||
uint32_t grayscale = a->grayscale; | ||
|
||
int idxr[8]; | ||
char red, blue, green; | ||
|
||
// uint32_t total = width*height; | ||
|
||
// amount of elements per core, rounded up | ||
uint32_t x_per_core = (height+nPE-1)/nPE; | ||
// compute the last element of the area each core has to process | ||
uint32_t upper_bound = (core_id+1)*x_per_core; | ||
// as we always rounded up before (to distribute the load as equal as possible) we need to check if the upper bound is still in our matrix | ||
if(upper_bound > height ) upper_bound = height; | ||
// loop over the area assigned to the core | ||
for (y = core_id*x_per_core; y < upper_bound; y++) { | ||
|
||
for (int x = 0; x < width ; x++) | ||
{ | ||
int idx = y * width + x; | ||
|
||
if (x == 0 || y == 0 || x == width-1 || y == height-1) | ||
{ | ||
if(grayscale) | ||
{ | ||
output[idx] = 0; | ||
} | ||
else | ||
{ | ||
output[idx * 3] = 0; | ||
output[idx * 3 + 1] = 0; | ||
output[idx * 3 + 2] = 0; | ||
} | ||
} | ||
else | ||
{ | ||
|
||
idxr[0] = (y - 1) * width + (x - 1); | ||
idxr[1] = (y)*width + (x - 1); | ||
idxr[2] = (y + 1) * width + (x - 1); | ||
idxr[3] = (y + 1) * width + (x); | ||
idxr[4] = (y + 1) * width + (x + 1); | ||
idxr[5] = (y)*width + (x + 1); | ||
idxr[6] = (y - 1) * width + (x + 1); | ||
idxr[7] = (y - 1) * width + (x); | ||
|
||
int x_shift = 0; | ||
int y_shift = 0; | ||
|
||
if ((x + x_shift) % 2 == 0 && (y + y_shift) % 2 == 0) //R | ||
{ | ||
red = input[idx]; | ||
blue = (input[idxr[0]] + input[idxr[2]] + input[idxr[4]] + input[idxr[6]]) / 4; | ||
green = (input[idxr[1]] + input[idxr[3]] + input[idxr[5]] + input[idxr[7]]) / 4; | ||
} | ||
else if ((x + x_shift) % 2 == 1 && (y + y_shift) % 2 == 0) //G2 | ||
{ | ||
red = (input[idxr[1]] + input[idxr[5]]) / 2; | ||
blue = (input[idxr[3]] + input[idxr[7]]) / 2; | ||
green = input[idx]; | ||
} | ||
else if ((x + x_shift) % 2 == 0 && (y + y_shift) % 2 == 1) //G1 | ||
{ | ||
red = (input[idxr[3]] + input[idxr[7]]) / 2; | ||
blue = (input[idxr[1]] + input[idxr[5]]) / 2; | ||
green = input[idx]; | ||
} | ||
else if ((x + x_shift) % 2 == 1 && (y + y_shift) % 2 == 1) //B | ||
{ | ||
red = (input[idxr[0]] + input[idxr[2]] + input[idxr[4]] + input[idxr[6]]) / 4; | ||
blue = input[idx]; | ||
green = (input[idxr[1]] + input[idxr[3]] + input[idxr[5]] + input[idxr[7]]) / 4; | ||
} | ||
else | ||
{ | ||
red = 0; | ||
green = 0; | ||
blue = 0; | ||
} | ||
|
||
if(grayscale) | ||
{ | ||
output[idx] = (red + green + blue)/3; | ||
|
||
}else | ||
{ | ||
output[idx * 3] = red; | ||
output[idx * 3 + 1] = green; | ||
output[idx * 3 + 2] = blue; | ||
} | ||
|
||
|
||
} | ||
} | ||
} | ||
} | ||
|
||
/**
 * Invert an 8-bit image (sequential version).
 *
 * Every output byte is 255 minus the corresponding input byte.
 *
 * @param input  source image, width * height bytes
 * @param output destination image, width * height bytes
 * @param width  image width in pixels
 * @param height image height in pixels
 */
void inverting(char *input, char* output, int width, int height)
{
    /* Rows are stored back to back, so a single running offset visits the
     * pixels in the same order as y * width + x. */
    int pixel = 0;
    int row = 0;

    while (row < height)
    {
        int col = 0;

        while (col < width)
        {
            output[pixel] = 255 - input[pixel];
            ++pixel;
            ++col;
        }
        ++row;
    }
}
|
||
|
||
void cluster_inverting(void* args) | ||
{ | ||
uint32_t idx = 0; | ||
uint32_t core_id = pi_core_id(), cluster_id = pi_cluster_id(); | ||
plp_example_kernel_instance_i32 *a = (plp_example_kernel_instance_i32*)args; | ||
char *srcBuffer = a->srcBuffer; | ||
char *resBuffer = a->resBuffer; | ||
uint32_t width = a->width; | ||
uint32_t height = a->height; | ||
uint32_t nPE = a->nPE; | ||
|
||
uint32_t total = width*height; | ||
|
||
// amount of elements per core, rounded up | ||
uint32_t per_core = (total+nPE-1)/nPE; | ||
// compute the last element of the area each core has to process | ||
uint32_t upper_bound = (core_id+1)*per_core; | ||
// as we always rounded up before (to distribute the load as equal as possible) we need to check if the upper bound is still in our matrix | ||
if(upper_bound > total ) upper_bound = total; | ||
// loop over the area assigned to the core | ||
for (idx = core_id*per_core; idx < upper_bound; idx++) { | ||
|
||
resBuffer[idx] = 255 - srcBuffer[idx]; | ||
|
||
} | ||
} |
20 changes: 20 additions & 0 deletions
20
GAP8/image_processing_examples/simple_kernel_example/img_proc.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
#ifndef __IMG_PROC_H__ | ||
#define __IMG_PROC_H__ | ||
|
||
#include "pmsis.h" | ||
|
||
// Argument bundle for the cluster kernels: cluster_demosaicking() and
// cluster_inverting() receive a pointer to this struct through their
// void* argument and cast it back.
typedef struct { | |
char *srcBuffer; // pointer to the input vector | |
char *resBuffer; // pointer to the output vector | |
uint32_t width; // image width | |
uint32_t height; // image height | |
// the kernels split the work into nPE contiguous chunks (rows for
// demosaicking, pixels for inverting) and pick one by core id
uint32_t nPE; // number of cores | |
// only read by cluster_demosaicking(): non-zero -> 1 byte per pixel,
// zero -> 3 bytes (R, G, B) per pixel in resBuffer
uint32_t grayscale; // grayscale if one | |
} plp_example_kernel_instance_i32; | |
|
||
// Sequential demosaicking of a width x height Bayer frame. output receives
// 1 byte per pixel if grayscale is non-zero, otherwise 3 bytes (R, G, B) per
// pixel; the one-pixel image border is written as zeros.
void demosaicking(char *input, char* output, int width, int height, int grayscale); | |
// Parallel demosaicking; args points to a plp_example_kernel_instance_i32.
// Each core processes its own band of rows.
void cluster_demosaicking(void* args); | |
// Sequential inversion: output[i] = 255 - input[i] for all width * height pixels.
void inverting(char *input, char* output, int width, int height); | |
// Parallel inversion; args points to a plp_example_kernel_instance_i32.
// Each core processes its own contiguous range of pixels.
void cluster_inverting(void* args); | |
|
||
#endif |
Oops, something went wrong.