Skip to content

Commit

Permalink
Adds a simple image processing example
Browse files Browse the repository at this point in the history
Two kernels are available: demosaicking and inverting.
Cluster and FC implementations. Find more info in the README
  • Loading branch information
Hanna Müller committed Apr 9, 2021
1 parent 2438954 commit 60081a5
Show file tree
Hide file tree
Showing 5 changed files with 581 additions and 0 deletions.
39 changes: 39 additions & 0 deletions GAP8/image_processing_examples/simple_kernel_example/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
APP = test
APP_SRCS += test.c $(GAP_LIB_PATH)/img_io/ImgIO.c img_proc.c
APP_INC += . $(GAP_LIB_PATH)/include

APP_CFLAGS += -O3 -g


PMSIS_OS ?= pulp_os

# This flag enables asynchronous image capturing.
# Keep in mind that you should always have a buffer enqueued
# when recording an image stream to avoid missing data (meaning you need 2 buffers).
APP_CFLAGS += -DASYNC_CAPTURE
# Configures the camera in QVGA format (324x244).
# The extra 4 comes from padding for the RGB camera (but is also present on the grey one).
APP_CFLAGS += -DQVGA_MODE

# YOU CAN ONLY ACTIVATE ONE KERNEL AT A TIME

# Apply the demosaicking algorithm (only useful if using the RGB camera);
# runs sequentially on the fabric controller.
# APP_CFLAGS += -DDEMOSAICKING_KERNEL_FC
# Apply the demosaicking algorithm (only useful if using the RGB camera);
# runs in parallel on the cluster.
APP_CFLAGS += -DDEMOSAICKING_KERNEL_CLUSTER
# Only relevant if one of the demosaicking kernels above is active:
# activate for a colored output.
APP_CFLAGS += -DCOLOR_IMAGE
# Apply an example kernel that inverts the image;
# runs sequentially on the fabric controller.
# APP_CFLAGS += -DINVERTING_KERNEL_FC
# Apply an example kernel that inverts the image;
# runs in parallel on the cluster.
# APP_CFLAGS += -DINVERTING_KERNEL_CLUSTER

# Also remove the images written by the example when cleaning.
clean::
rm -rf img_raw.ppm img_color.ppm img_gray.ppm img_inverted.ppm

include $(RULES_DIR)/pmsis_rules.mk
24 changes: 24 additions & 0 deletions GAP8/image_processing_examples/simple_kernel_example/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Image processing examples

This example takes an image, applies a kernel and writes the image over your JTAG cable (using openOCD file semi-hosting) to your computer.
You can choose between two kernels:

- demosaicking
- inverting

For both, you can choose to execute the kernel either sequentially on the fabric controller or in parallel on the cluster. Check out the defines in the Makefile to configure your kernel! But be careful: only one kernel can be selected at a time.
## Performance
To keep this example simple, we are just saving the time before and after the kernel runs - if you want more precise performance measurements check out https://greenwaves-technologies.com/manuals/BUILD/PMSIS_API/html/group__Perf.html .
Here are some pitfalls that we fixed in the demosaicking kernel:
At first, computing a grayscale image from an RGB camera took around 2.33s (all measurements here are at a frequency of 50MHz on the fabric controller and cluster). That's awfully long, so what were we doing wrong?

- `output[idx] = 0.33*red + 0.33*green + 0.33*blue;` was not a good idea - we do not have a floating point unit (FPU) on GAP8, so float multiplications are really slow.
- `output[idx] = red/3 + green/3 + blue/3;` This got us to 303.3ms - but we can still improve, right?
- `output[idx] = (red + green + blue)/3;` We only need one division! This gives us 199.2ms
- We also have 8 cores on the cluster! Parallelizing it brought us down to 33.3ms.

With those 3 steps we could already improve by a factor of 70. There are still other possible improvements, like moving the check `grayscale == 1` out of the for-loop to avoid branch mispredictions, or loop unrolling.

One last hint: be careful with printf - this is a very long and complicated function and should never be inside your performance measurement.

This example was developed and tested with the gap-sdk 3.8.1.
240 changes: 240 additions & 0 deletions GAP8/image_processing_examples/simple_kernel_example/img_proc.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
#include "img_proc.h"

/*
 * Reconstructs the RGB triple of one interior pixel of a Bayer mosaic by
 * averaging its direct neighbours.
 *
 * Mosaic layout (as used by the kernels in this file):
 *   even x, even y -> red sample
 *   odd  x, even y -> green sample (G2)
 *   even x, odd  y -> green sample (G1)
 *   odd  x, odd  y -> blue sample
 *
 * in     raw mosaic, one byte per pixel, row-major
 * width  number of pixels per row
 * x, y   pixel position; must NOT lie on the image border, because all
 *        eight neighbours are read
 * rgb    receives red, green, blue (in that order)
 */
static void demosaic_interior_pixel(const unsigned char *in, int width,
                                    int x, int y, unsigned char rgb[3])
{
    const unsigned char *mid = in + y * width + x;
    const unsigned char *above = mid - width;
    const unsigned char *below = mid + width;

    /* x and y are >= 1 here, so the remainders are exactly 0 or 1. */
    const int odd_x = x % 2;
    const int odd_y = y % 2;

    if (!odd_x && !odd_y) {
        /* Red site: blue sits on the diagonals, green on the cross. */
        rgb[0] = mid[0];
        rgb[1] = (unsigned char)((mid[-1] + below[0] + mid[1] + above[0]) / 4);
        rgb[2] = (unsigned char)((above[-1] + below[-1] + below[1] + above[1]) / 4);
    } else if (odd_x && !odd_y) {
        /* G2 site: red neighbours left/right, blue neighbours above/below. */
        rgb[0] = (unsigned char)((mid[-1] + mid[1]) / 2);
        rgb[1] = mid[0];
        rgb[2] = (unsigned char)((below[0] + above[0]) / 2);
    } else if (!odd_x && odd_y) {
        /* G1 site: red neighbours above/below, blue neighbours left/right. */
        rgb[0] = (unsigned char)((below[0] + above[0]) / 2);
        rgb[1] = mid[0];
        rgb[2] = (unsigned char)((mid[-1] + mid[1]) / 2);
    } else {
        /* Blue site: red sits on the diagonals, green on the cross. */
        rgb[0] = (unsigned char)((above[-1] + below[-1] + below[1] + above[1]) / 4);
        rgb[1] = (unsigned char)((mid[-1] + below[0] + mid[1] + above[0]) / 4);
        rgb[2] = mid[0];
    }
}

/*
 * Sequential demosaicking of a Bayer image.
 *
 * input      raw mosaic, width*height bytes, row-major
 * output     result buffer: width*height bytes if grayscale is non-zero
 *            (one averaged value per pixel), otherwise 3*width*height bytes
 *            (interleaved R, G, B per pixel)
 * width      image width in pixels
 * height     image height in pixels
 * grayscale  non-zero: write (R+G+B)/3 per pixel; zero: write the RGB triple
 *
 * Pixels on the outermost rows/columns have no complete neighbourhood and
 * are written as 0 (black).
 *
 * Samples are processed as unsigned bytes. Whether plain `char` is signed is
 * implementation-defined; with a signed `char`, samples above 127 would turn
 * negative and corrupt every average.
 */
void demosaicking(char *input, char* output, int width, int height, const int grayscale)
{
    const unsigned char *in = (const unsigned char *)input;
    unsigned char *out = (unsigned char *)output;

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            const int idx = y * width + x;
            const int on_border =
                (x == 0 || y == 0 || x == width - 1 || y == height - 1);

            /* Border pixels keep the all-zero default. */
            unsigned char rgb[3] = {0, 0, 0};
            if (!on_border) {
                demosaic_interior_pixel(in, width, x, y, rgb);
            }

            if (grayscale) {
                /* One integer division only; see the README for why. */
                out[idx] = (unsigned char)((rgb[0] + rgb[1] + rgb[2]) / 3);
            } else {
                out[idx * 3] = rgb[0];
                out[idx * 3 + 1] = rgb[1];
                out[idx * 3 + 2] = rgb[2];
            }
        }
    }
}

/*
 * Cluster entry point for demosaicking: every core that runs this function
 * processes its own band of image rows.
 *
 * args  pointer to a plp_example_kernel_instance_i32 describing the job:
 *       srcBuffer  raw Bayer mosaic, width*height bytes, row-major
 *       resBuffer  output; width*height bytes if grayscale is non-zero,
 *                  otherwise 3*width*height bytes (interleaved R, G, B)
 *       width, height  image dimensions in pixels
 *       nPE        number of cores the rows are distributed over
 *       grayscale  non-zero: write (R+G+B)/3; zero: write the RGB triple
 *
 * Mosaic layout: even x / even y = red, odd x / even y = green (G2),
 * even x / odd y = green (G1), odd x / odd y = blue. Pixels on the outermost
 * rows/columns are written as 0 because they lack a full neighbourhood.
 *
 * Samples are processed as unsigned bytes. Whether plain `char` is signed is
 * implementation-defined; with a signed `char`, samples above 127 would turn
 * negative and corrupt every average.
 */
void cluster_demosaicking(void* args)
{
    plp_example_kernel_instance_i32 *a = (plp_example_kernel_instance_i32*)args;
    const unsigned char *input = (const unsigned char *)a->srcBuffer;
    unsigned char *output = (unsigned char *)a->resBuffer;
    const uint32_t width = a->width;
    const uint32_t height = a->height;
    const uint32_t nPE = a->nPE;
    const uint32_t grayscale = a->grayscale;
    const uint32_t core_id = pi_core_id();

    /* Nothing sensible can be partitioned over zero cores (and the
     * division below would be a division by zero). */
    if (nPE == 0) {
        return;
    }

    /* Rows per core, rounded up so the load is spread as evenly as possible. */
    const uint32_t rows_per_core = (height + nPE - 1) / nPE;
    const uint32_t first_row = core_id * rows_per_core;
    /* Because of the rounding up, the last band(s) may reach past the image. */
    uint32_t end_row = first_row + rows_per_core;
    if (end_row > height) {
        end_row = height;
    }

    for (uint32_t y = first_row; y < end_row; y++) {
        for (uint32_t x = 0; x < width; x++) {
            const uint32_t idx = y * width + x;
            /* Border pixels keep the all-zero default. */
            unsigned int red = 0;
            unsigned int green = 0;
            unsigned int blue = 0;

            if (!(x == 0 || y == 0 || x == width - 1 || y == height - 1)) {
                const unsigned char *mid = input + idx;
                const unsigned char *above = mid - width;
                const unsigned char *below = mid + width;
                const uint32_t odd_x = x % 2;
                const uint32_t odd_y = y % 2;

                if (!odd_x && !odd_y) {
                    /* Red site: blue on the diagonals, green on the cross. */
                    red = mid[0];
                    green = (mid[-1] + below[0] + mid[1] + above[0]) / 4;
                    blue = (above[-1] + below[-1] + below[1] + above[1]) / 4;
                } else if (odd_x && !odd_y) {
                    /* G2 site: red left/right, blue above/below. */
                    red = (mid[-1] + mid[1]) / 2;
                    green = mid[0];
                    blue = (below[0] + above[0]) / 2;
                } else if (!odd_x && odd_y) {
                    /* G1 site: red above/below, blue left/right. */
                    red = (below[0] + above[0]) / 2;
                    green = mid[0];
                    blue = (mid[-1] + mid[1]) / 2;
                } else {
                    /* Blue site: red on the diagonals, green on the cross. */
                    red = (above[-1] + below[-1] + below[1] + above[1]) / 4;
                    green = (mid[-1] + below[0] + mid[1] + above[0]) / 4;
                    blue = mid[0];
                }
            }

            if (grayscale) {
                /* One integer division only; see the README for why. */
                output[idx] = (unsigned char)((red + green + blue) / 3);
            } else {
                output[idx * 3] = (unsigned char)red;
                output[idx * 3 + 1] = (unsigned char)green;
                output[idx * 3 + 2] = (unsigned char)blue;
            }
        }
    }
}

/*
 * Sequential example kernel: writes the inverted image (255 - value) of
 * `input` into `output`.
 *
 * input   source image, width*height bytes, row-major
 * output  destination image, width*height bytes
 * width   image width in pixels
 * height  image height in pixels
 *
 * Nothing is written when width or height is not positive.
 */
void inverting(char *input, char* output, int width, int height)
{
    /* Rows are stored back to back, so a single running offset visits the
     * pixels in exactly the order row * width + col would. */
    int offset = 0;

    for (int row = 0; row < height; ++row) {
        for (int col = 0; col < width; ++col, ++offset) {
            output[offset] = 255 - input[offset];
        }
    }
}


/*
 * Cluster entry point for the inverting example kernel: every core that
 * runs this function inverts (255 - value) its own contiguous slice of the
 * image.
 *
 * args  pointer to a plp_example_kernel_instance_i32 describing the job:
 *       srcBuffer  source image, width*height bytes
 *       resBuffer  destination image, width*height bytes
 *       width, height  image dimensions in pixels
 *       nPE        number of cores the pixels are distributed over
 */
void cluster_inverting(void* args)
{
    plp_example_kernel_instance_i32 *a = (plp_example_kernel_instance_i32*)args;
    const unsigned char *src = (const unsigned char *)a->srcBuffer;
    unsigned char *dst = (unsigned char *)a->resBuffer;
    const uint32_t nPE = a->nPE;
    const uint32_t core_id = pi_core_id();

    /* Nothing sensible can be partitioned over zero cores (and the
     * division below would be a division by zero). */
    if (nPE == 0) {
        return;
    }

    const uint32_t total = a->width * a->height;

    /* Pixels per core, rounded up so the load is spread as evenly as possible. */
    const uint32_t per_core = (total + nPE - 1) / nPE;
    const uint32_t begin = core_id * per_core;
    /* Because of the rounding up, the last slice(s) may reach past the image. */
    uint32_t end = begin + per_core;
    if (end > total) {
        end = total;
    }

    for (uint32_t idx = begin; idx < end; idx++) {
        dst[idx] = (unsigned char)(255 - src[idx]);
    }
}
20 changes: 20 additions & 0 deletions GAP8/image_processing_examples/simple_kernel_example/img_proc.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#ifndef __IMG_PROC_H__
#define __IMG_PROC_H__

#include "pmsis.h"

// Argument bundle for the cluster kernels: cluster_demosaicking() and
// cluster_inverting() receive a pointer to this struct as their void* argument.
typedef struct {
char *srcBuffer; // pointer to the input image (width*height bytes, row-major)
char *resBuffer; // pointer to the output image; for a color demosaicking result it must hold 3 bytes per pixel
uint32_t width; // image width in pixels
uint32_t height; // image height in pixels
uint32_t nPE; // number of cores the work is divided among
uint32_t grayscale; // demosaicking only: non-zero writes one gray byte per pixel, zero writes R, G, B per pixel
} plp_example_kernel_instance_i32;

// Sequential demosaicking of a width x height Bayer image (one byte per pixel).
// Writes one gray byte per pixel if grayscale is non-zero, otherwise interleaved
// R, G, B (output must then hold 3*width*height bytes). Border pixels are set to 0.
void demosaicking(char *input, char* output, int width, int height, int grayscale);
// Cluster variant of demosaicking(): args points to a plp_example_kernel_instance_i32;
// each core processes a band of rows selected by its core id and nPE.
void cluster_demosaicking(void* args);
// Sequential inversion: output[i] = 255 - input[i] for all width*height pixels.
void inverting(char *input, char* output, int width, int height);
// Cluster variant of inverting(): args points to a plp_example_kernel_instance_i32;
// each core processes a contiguous slice of pixels selected by its core id and nPE.
void cluster_inverting(void* args);

#endif
Loading

0 comments on commit 60081a5

Please sign in to comment.