Skip to content

Commit

Permalink
13: Rewrite SYCL stencil using USM
Browse files Browse the repository at this point in the history
  • Loading branch information
al42and committed Nov 13, 2024
1 parent 9278fba commit 8ed0d3c
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 56 deletions.
9 changes: 5 additions & 4 deletions content/13-examples.rst
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,8 @@ GPU parallelization: first steps

Let's apply several techniques presented in previous episodes to make stencil update GPU-parallel.

OpenMP (or OpenACC) offloading requires to define a region to be executed in parallel as well as data that shall be copied over/ used in GPU memory. Similarly, SYCL programming model offers convenient ways to define execution kernels, context to run them in (called queue) and simplified CPU-GPU transfer of needed data.
OpenMP (or OpenACC) offloading requires to define a region to be executed in parallel as well as data that shall be copied over/ used in GPU memory.
Similarly, SYCL programming model offers convenient ways to define execution kernels, context to run them in (called queue). However, for better control over data movement, we use manual memory allocation and movement.

Changes of stencil update code for OpenMP and SYCL are shown in the tabs below:

Expand All @@ -304,7 +305,7 @@ Changes of stencil update code for OpenMP and SYCL are shown in the tabs below:

.. literalinclude:: examples/stencil/sycl/core-naive.cpp
:language: cpp
:emphasize-lines: 31,35
:emphasize-lines: 24-26,28,42-44

.. callout:: Loading modules on LUMI

Expand Down Expand Up @@ -398,7 +399,7 @@ Changes of stencil update code as well as the main program are shown in tabs bel

.. literalinclude:: examples/stencil/sycl/core.cpp
:language: cpp
:emphasize-lines: 13-14,27-28,41-55
:emphasize-lines: 13-14,25,40-50

.. tab:: Python

Expand All @@ -411,7 +412,7 @@ Changes of stencil update code as well as the main program are shown in tabs bel

.. literalinclude:: examples/stencil/sycl/main.cpp
:language: cpp
:emphasize-lines: 38-39,44-45,51,56,59,75
:emphasize-lines: 38-39,44-45,51,56,59,75-77

.. challenge:: Exercise: updated GPU ports

Expand Down
42 changes: 21 additions & 21 deletions content/examples/stencil/sycl/core-naive.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,37 +9,37 @@
// prev: temperature values from previous time step
// a: diffusivity
// dt: time step
void evolve(sycl::queue &Q,
field *curr, field *prev, double a, double dt)
{
void evolve(sycl::queue &Q, field *curr, field *prev, double a, double dt) {
// Help the compiler avoid being confused by the structs
double *currdata = curr->data.data();
double *prevdata = prev->data.data();
int nx = prev->nx;
int ny = prev->ny;
int size = (nx + 2) * (ny + 2);

// Determine the temperature field at next time step
// As we have fixed boundary conditions, the outermost gridpoints
// are not updated.
double dx2 = prev->dx * prev->dx;
double dy2 = prev->dy * prev->dy;

{
sycl::buffer<double, 2> buf_curr { currdata, sycl::range<2>(nx + 2, ny + 2) },
buf_prev { prevdata, sycl::range<2>(nx + 2, ny + 2) };
double *currdata = sycl::malloc_device<double>(size, Q);
double *prevdata = sycl::malloc_device<double>(size, Q);
Q.copy<double>(prev->data.data(), prevdata, size);

Q.submit([&](sycl::handler &cgh) {
auto acc_curr = sycl::accessor(buf_curr, cgh, sycl::read_write);
auto acc_prev = sycl::accessor(buf_prev, cgh, sycl::read_only);
Q.parallel_for(sycl::range<2>(nx, ny), [=](sycl::id<2> id) {
auto j = id[0] + 1;
auto i = id[1] + 1;

cgh.parallel_for(sycl::range<2>(nx, ny), [=](sycl::id<2> id) {
auto j = id[0] + 1;
auto i = id[1] + 1;
acc_curr[j][i] = acc_prev[j][i] + a * dt *
((acc_prev[j][i + 1] - 2.0 * acc_prev[j][i] + acc_prev[j][i - 1]) / dx2 +
(acc_prev[j + 1][i] - 2.0 * acc_prev[j][i] + acc_prev[j - 1][i]) / dy2);
});
});
}
// Data is automatically copied back to the CPU when buffers go out of scope
int ind = i * (ny + 2) + j;
int ip = (i + 1) * (ny + 2) + j;
int im = (i - 1) * (ny + 2) + j;
int jp = i * (ny + 2) + j + 1;
int jm = i * (ny + 2) + j - 1;
currdata[ind] = prevdata[ind] + a*dt*
((prevdata[ip] - 2.0*prevdata[ind] + prevdata[im]) / dx2 +
(prevdata[jp] - 2.0*prevdata[ind] + prevdata[jm]) / dy2);
});

Q.copy<double>(currdata, curr->data.data(), size);
sycl::free(currdata, Q);
sycl::free(prevdata, Q);
}
47 changes: 21 additions & 26 deletions content/examples/stencil/sycl/core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@
// Update the temperature values using five-point stencil
// Arguments:
// queue: SYCL queue
// d_curr: current temperature values
// d_prev: temperature values from previous time step
// currdata: current temperature values (device pointer)
// prevdata: temperature values from previous time step (device pointer)
// prev: description of the grid parameters
// a: diffusivity
// dt: time step
void evolve(sycl::queue &Q, sycl::buffer<double, 2> d_curr, sycl::buffer<double, 2> d_prev,
void evolve(sycl::queue &Q, double* currdata, const double* prevdata,
const field *prev, double a, double dt)
{
int nx = prev->nx;
Expand All @@ -22,34 +22,29 @@ void evolve(sycl::queue &Q, sycl::buffer<double, 2> d_curr, sycl::buffer<double,
double dx2 = prev->dx * prev->dx;
double dy2 = prev->dy * prev->dy;

{
Q.submit([&](sycl::handler &cgh) {
auto acc_curr = sycl::accessor(d_curr, cgh, sycl::read_write);
auto acc_prev = sycl::accessor(d_prev, cgh, sycl::read_only);
Q.parallel_for(sycl::range<2>(nx, ny), [=](sycl::id<2> id) {
auto j = id[0] + 1;
auto i = id[1] + 1;

cgh.parallel_for(sycl::range<2>(nx, ny), [=](sycl::id<2> id) {
auto j = id[0] + 1;
auto i = id[1] + 1;
acc_curr[j][i] = acc_prev[j][i] + a * dt *
((acc_prev[j][i + 1] - 2.0 * acc_prev[j][i] + acc_prev[j][i - 1]) / dx2 +
(acc_prev[j + 1][i] - 2.0 * acc_prev[j][i] + acc_prev[j - 1][i]) / dy2);
});
});
}
int ind = i * (ny + 2) + j;
int ip = (i + 1) * (ny + 2) + j;
int im = (i - 1) * (ny + 2) + j;
int jp = i * (ny + 2) + j + 1;
int jm = i * (ny + 2) + j - 1;
currdata[ind] = prevdata[ind] + a*dt*
((prevdata[ip] - 2.0*prevdata[ind] + prevdata[im]) / dx2 +
(prevdata[jp] - 2.0*prevdata[ind] + prevdata[jm]) / dy2);
});
}

void copy_to_buffer(sycl::queue Q, sycl::buffer<double, 2> buffer, const field* f)
void copy_to_buffer(sycl::queue Q, double* buffer, const field* f)
{
Q.submit([&](sycl::handler& h) {
auto acc = buffer.get_access<sycl::access::mode::write>(h);
h.copy(f->data.data(), acc);
});
int size = (f->nx + 2) * (f->ny + 2);
Q.copy<double>(f->data.data(), buffer, size);
}

void copy_from_buffer(sycl::queue Q, sycl::buffer<double, 2> buffer, field *f)
void copy_from_buffer(sycl::queue Q, const double* buffer, field *f)
{
Q.submit([&](sycl::handler& h) {
auto acc = buffer.get_access<sycl::access::mode::read>(h);
h.copy(acc, f->data.data());
}).wait();
int size = (f->nx + 2) * (f->ny + 2);
Q.copy<double>(buffer, f->data.data(), size).wait();
}
6 changes: 3 additions & 3 deletions content/examples/stencil/sycl/heat.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ void initialize(int argc, char *argv[], field *heat1,

void evolve(sycl::queue &Q, field *curr, field *prev, double a, double dt);

void evolve(sycl::queue &Q, sycl::buffer<double, 2> buf_curr, sycl::buffer<double, 2> buf_prev,
void evolve(sycl::queue &Q, double* buf_curr, const double* buf_prev,
const field *prev, double a, double dt);

void field_set_size(field *heat, int nx, int ny);
Expand All @@ -61,8 +61,8 @@ void field_swap(field *heat1, field *heat2);
void field_allocate(field *heat);

// Data movement function prototypes
void copy_to_buffer(sycl::queue Q, sycl::buffer<double, 2> buffer, const field* f);
void copy_to_buffer(sycl::queue Q, double* buffer, const field* f);

void copy_from_buffer(sycl::queue Q, sycl::buffer<double, 2> buffer, field *f);
void copy_from_buffer(sycl::queue Q, const double* buffer, field *f);

#endif // __HEAT_H__
6 changes: 4 additions & 2 deletions content/examples/stencil/sycl/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ int main(int argc, char **argv)
sycl::queue Q;

// Create two identical device buffers
const sycl::range<2> buffer_size{ size_t(current.nx + 2), size_t(current.ny + 2) };
sycl::buffer<double, 2> d_current{buffer_size}, d_previous{buffer_size};
double *d_current = sycl::malloc_device<double>((current.nx + 2) * (current.ny + 2), Q);
double *d_previous = sycl::malloc_device<double>((current.nx + 2) * (current.ny + 2), Q);

// Start timer
auto start_clock = start_time();
Expand Down Expand Up @@ -73,5 +73,7 @@ int main(int argc, char **argv)
std::chrono::duration<double> elapsed = stop_clock - start_clock;
printf("Iterations took %.3f seconds.\n", elapsed.count());
Q.wait_and_throw();
sycl::free(d_previous, Q);
sycl::free(d_current, Q);
return 0;
}

0 comments on commit 8ed0d3c

Please sign in to comment.