diff --git a/.ci/linkchecker.sh b/.ci/linkchecker.sh new file mode 100644 index 000000000..974466dc4 --- /dev/null +++ b/.ci/linkchecker.sh @@ -0,0 +1,9 @@ +# We ignore ANSYS CFX and Fluent links as they give +# https://github.com/curl/curl/issues/4409, which needs a +# rather new version of OpenSSL. +# The cytopia/linkcheck is rather slow. There are faster alternatives, +# especially https://github.com/filiph/linkcheck, but that +# requires Dart etc. +# Consider to utilize a faster link checker as soon as we have +# all the exclude links configured and it works. +git clone https://github.com/cytopia/linkcheck.git /tmp/linkchecker && /tmp/linkchecker/linkcheck -i '^http(s)?:\/\/(localhost)|(127.0.0.1)|(documentation.sigma2.no/page/on/same/site.html)|(download.open-mpi.org/release/open-mpi/v4.0/openmpi-)|(example.org/institution/simulationDataq)|(rt.uninett.no/SelfService)|(desktop.saga.sigma2.no)|(desktop.fram.sigma2.no)|(example.org/institution/simulationData)|(www.pythonware.com/products/pil)|(www.linuxconfig.org/Bash_scripting_Tutorial)|(https://documentation.sigma2.no/_downloads/bdfbca90a90a8d1b824fc6b1154ceee7/serial.zip)|(https://www.ansys.com/products/fluids/ansys-cfx)|(https://www.ansys.com/products/fluids/ansys-fluent)' -e 'md,txt,rst' -k -c '200,301,302,303,307,308' . diff --git a/.github/workflows/sphinx.yml b/.github/workflows/sphinx.yml new file mode 100644 index 000000000..4fe74dac0 --- /dev/null +++ b/.github/workflows/sphinx.yml @@ -0,0 +1,41 @@ +name: Build HTML + +on: + push: + branches: + - main + pull_request: + branches: + - main + +jobs: + build: + runs-on: ubuntu-latest + if: github.ref == 'refs/heads/main' + steps: + - name: Check out repo + uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: "3.10" + - name: Install Python dependencies + run: | + python -m pip install -r requirements.txt + - name: Run Sphinx + run: | + sphinx-build . _build + - name: Commit and push to gh-pages + run: |- + git config --global user.email "workflow-bot@example.com" + git config --global user.name "workflow-bot" + git checkout --orphan gh-pages + git rm --cached -r . + mv CNAME _build .. + rm -rf * + mv ../_build/* . + mv ../CNAME . + touch .nojekyll + git add . + git commit -m "generated using sphinx-build" + git push --set-upstream origin gh-pages --force diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..1210d4299 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +# vim temporary files +.*.swp +.*.swo + +# macOS +.DS_Store + +# sphinx +_build/ +venv/ diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 000000000..323cc92ca --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,37 @@ +--- +stages: + - linkchecker + - spellchecker + - build + +image: python:3.10-alpine + +linkchecker: + stage: linkchecker + only: + - schedules + script: + - apk add --no-cache git bash curl + - bash .ci/linkchecker.sh + +spellchecker: + stage: spellchecker + script: + - apk add --no-cache bash curl + - curl -L -o ./install-misspell.sh https://git.io/misspell + - bash ./install-misspell.sh + - ./bin/misspell -error . 
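  # For illustration: the same checks can be run locally before pushing
  # (assuming the repository root as the working directory):
  #   bash .ci/linkchecker.sh
  #   ./bin/misspell -error .
  #   sphinx-build -W --keep-going -n . _build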
+ +build: + stage: build + when: on_success + script: + - pip install --upgrade pip + - pip install -r requirements.txt + # '-W': Turn warnings into errors + # '--keep-going': When encountering a warning continue to process (this + # allows us to capture multiple warnings at the same time, avoiding the + # 'build->warning->fix->build->warning' loop where both fixes could be + # solved at the same time) + # '-n': Warn about internal missing references + - sphinx-build -W --keep-going -n . _build diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 000000000..e69de29bb diff --git a/CNAME b/CNAME new file mode 100644 index 000000000..766ef1b93 --- /dev/null +++ b/CNAME @@ -0,0 +1 @@ +documentation.sigma2.no diff --git a/_downloads/0105ae9db861b5434b03f0f2ee915de4/mandelbrot_initial.c b/_downloads/0105ae9db861b5434b03f0f2ee915de4/mandelbrot_initial.c new file mode 100644 index 000000000..e054953ca --- /dev/null +++ b/_downloads/0105ae9db861b5434b03f0f2ee915de4/mandelbrot_initial.c @@ -0,0 +1,155 @@ +/** + * Mandelbrot implementation for accelerators (e.g. GPUs) + */ + +#include "utils/lodepng.h" +#include "utils/palette.h" +#include +#include +#include +#include +#include + +// Default width and height for image if not given +static const int WIDTH = 1280; +static const int HEIGHT = 720; +// Default output name if not given +static const char* OUTPUT_NAME = "mandelbrot.png"; +// Maximum iteration count before exiting mandelbrot function +static const uint32_t MAX_ITER = 1000; + +// Helper function to scale 'num' to the range '[min, max]' +#pragma acc routine seq +float scale(float num, const float min, const float max) { + const float scale = max - min; + return num * scale + min; +} + +/** + * Mandelbrot function, calculates the value of the mandelbrot set at pixel 'px/py' + */ +#pragma acc routine seq +uint32_t mandelbrot(const int px, const int py, const int width, const int height, + const int max_iter) { + const float x0 = scale((float) px / (float) width, -2.5, 1.); + const float y0 = scale((float) py / (float) height, -1., 1.); + float x = 0.; + float y = 0.; + float x2 = 0.; + float y2 = 0.; + int iters = 0; + while (x2 + y2 < 4. && iters < max_iter) { + y = 2. 
* x * y + y0; + x = x2 - y2 + x0; + x2 = x * x; + y2 = y * y; + iters += 1; + } + return (uint32_t) iters; +} + +int main (int argc, char** argv) { + int width = WIDTH; + int height = HEIGHT; + char output_name[128]; + int max_iter = MAX_ITER; + strncpy (output_name, OUTPUT_NAME, strnlen (OUTPUT_NAME, 127) + 1); + // Assume the first argument is the width and height of the image + if (argc > 1) { + if (strncmp (argv[1], "-h", 2) == 0 || strncmp (argv[1], "--help", 6) == 0) { + printf("Usage: %s x \n", argv[0]); + printf("\tImage size can also be one of {8k, 4k, 3k, 1080p, 720p}\n"); + return EXIT_SUCCESS; + } + // First we check image size is one of the predefined sizes + if (strncmp (argv[1], "8k", 2) == 0) { + width = 7680; + height = 4320; + } else if (strncmp (argv[1], "4k", 2) == 0) { + width = 3840; + height = 2160; + } else if (strncmp (argv[1], "3k", 2) == 0) { + width = 3000; + height = 2000; + } else if (strncmp (argv[1], "1080p", 5) == 0) { + width = 1920; + height = 1080; + } else if (strncmp (argv[1], "720p", 4) == 0) { + width = 1280; + height = 720; + } else { + // Assume user has supplied x + // Try to find 'x' in argument + char* token; + token = strtok (argv[1], "x"); + if (token != NULL) { + width = atoi (token); + } else { + printf("\033[0;31mInvalid width/height definition:\033[0m '%s'\n", argv[1]); + printf("\tShould be 'x'\n"); + return EXIT_FAILURE; + } + token = strtok (NULL, "x"); + if (token != NULL) { + height = atoi (token); + } else { + printf("\033[0;31mInvalid width/height definition:\033[0m '%s'\n", argv[1]); + printf("\tShould be 'x'\n"); + return EXIT_FAILURE; + } + } + } + // Second argument is the maximum iteration count + if (argc > 2) { + max_iter = atoi (argv[2]); + } + // Third argument is the output filename to write PNG file to + if (argc > 3) { + if (strlen (argv[3]) > 127) { + printf("\033[0;31mOutput filename to large!\033[0m"); + return EXIT_FAILURE; + } + strncpy (output_name, argv[3], strnlen (argv[3], 127) + 1); + } + // Allocate storage for image + uint32_t* image = calloc (width * height, sizeof (uint32_t)); + if (image == NULL) { + printf("\033[0;31mCould not allocate memory for image!\033[0m\n"); + return EXIT_FAILURE; + } + printf("Generating \033[0;35m%dx%d\033[0m image with max \033[0;35m%d\033[0m iterations\n", + width, height, + max_iter); + /****************************************************************************/ + /*************************** Main computation ***************************/ + /****************************************************************************/ + const double start_time = omp_get_wtime (); + // For each pixel of our image calculate the value of the mandelbrot set + #pragma acc parallel loop \ + copyout(image[:width * height]) \ + copyin(palette[:palette_size]) \ + collapse(2) + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + const uint32_t iters = mandelbrot (x, y, width, height, max_iter); + image[y * width + x] = palette[iters % palette_size]; + } + } + const double end_time = omp_get_wtime (); + printf("Used \033[0;35m%.3f\033[0m ms for computation\n", + (end_time - start_time) * 1000.0); + /****************************************************************************/ + // Write image to file + const unsigned char png_error = lodepng_encode32_file(output_name, + (const unsigned char*) image, + width, height); + // Free image storage + free (image); + if (png_error) { + printf("\033[0;31mAn error occurred while writing to PNG:\033[0m %s\n", + lodepng_error_text (png_error)); 
+ return EXIT_FAILURE; + } + printf("Wrote Mandelbrot result to \033[0;35m%s\033[0m\n", output_name); + return EXIT_SUCCESS; +} diff --git a/_downloads/0222d30f6733c67f4afcb6e626b30824/saga_mpi_job.sh b/_downloads/0222d30f6733c67f4afcb6e626b30824/saga_mpi_job.sh new file mode 100644 index 000000000..f7da73f44 --- /dev/null +++ b/_downloads/0222d30f6733c67f4afcb6e626b30824/saga_mpi_job.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +############################################### +# Script example for a normal MPI job on Saga # +############################################### + +## Project: replace XXXX with your project ID +#SBATCH --account=nnXXXXk + +## Job name: +#SBATCH --job-name=MyJob +## Number of tasks (aka processes) to start: Pure mpi, one cpu per task +#SBATCH --ntasks=16 +## Amount of memory per cpu (= per task, since we get 1 cpu per task): +#SBATCH --mem-per-cpu=4G +## Run for 10 minutes, syntax is d-hh:mm:ss +#SBATCH --time=0-00:10:00 + +# you may not place bash commands before the last SBATCH directive +###################################################### +## Setting variables and prepare runtime environment: +##---------------------------------------------------- +## Recommended safety settings: +set -o errexit # Make bash exit on any error +set -o nounset # Treat unset variables as errors + +# Loading Software modules +# Allways be explicit on loading modules and setting run time environment!!! +module --quiet purge # Restore loaded modules to the default +module load MySoftWare/Versions #nb: Versions is important! + +# Type "module avail MySoftware" to find available modules and versions +# It is also recommended to to list loaded modules, for easier debugging: +module list + +####################################################### +## Prepare jobs, moving input files and making sure +# output is copied back and taken care of +##----------------------------------------------------- + +# Prepare input files +cp inputfiles $SCRATCH +cd $SCRATCH + +# Make sure output is copied back after job finishes +savefile outputfile1 outputfile2 + +######################################################## +# Run the application, and we typically time it: +##------------------------------------------------------ + +# Run the application - please add hash in front of srun and remove +# hash in front of mpirun if using intel-toolchain + +# For OpenMPI (foss and iomkl toolchains), srun is recommended: +time srun MySoftWare-exec + +## For IntelMPI (intel toolchain), mpirun is recommended: +#time mpirun MySoftWare-exec + +######################################################### +# That was about all this time; lets call it a day... 
+##------------------------------------------------------- +# Finish the script +exit 0 diff --git a/_downloads/044d6d549de4442a8175e9e2ef82633e/laplace_mpiacc_noaware.f90 b/_downloads/044d6d549de4442a8175e9e2ef82633e/laplace_mpiacc_noaware.f90 new file mode 100644 index 000000000..ee5d4f571 --- /dev/null +++ b/_downloads/044d6d549de4442a8175e9e2ef82633e/laplace_mpiacc_noaware.f90 @@ -0,0 +1,230 @@ + program laplace_mpiacc_noaware + + use mpi + use openacc + + implicit none + integer status(MPI_STATUS_SIZE) + integer :: i,j,k,ii + integer :: iter,count_rate, count_max,count,nenv_var + integer :: myid,ierr,nproc,nxp,nyp,tag,tag1,tag2,nsend + integer, parameter :: nx=20000,ny=nx + integer, parameter :: max_iter=525 + double precision, parameter :: pi=4d0*datan(1d0) + real, parameter :: error=0.05 + double precision :: max_err,time_s,& + d2fx,d2fy,max_err_part + real :: t_start,t_final + double precision, allocatable :: f(:,:),f_k(:,:) + double precision, allocatable :: f_send(:,:),f_full(:,:) + character(len=300) :: env_var + + integer(kind=acc_device_kind) deviceType + integer :: myDevice,numDevice,host_rank,host_comm + + !MPI starts + ! Initialise OpenMPI communication. + call MPI_INIT(ierr) + ! Get number of active processes (from 0 to nproc-1). + call MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierr ) + ! Identify the ID rank (process). + call MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr ) + +!check if GPU-aware support is enabled + if(myid.eq.0) then + print*, '' + call getenv("MPICH_GPU_SUPPORT_ENABLED", env_var) + read(env_var, '(i10)' ) nenv_var + if (nenv_var.eq. 1) then + print *, '--MPICH_GPU_SUPPORT_ENABLED is enabled!' + print*, '' + elseif (nenv_var.ne. 1) then + print *, '--MPICH_GPU_SUPPORT_ENABLED is NOT enabled!' + print *, '' + endif + endif + + t_start = MPI_WTIME() + + if (mod(nx,nproc).ne.0) then + if (myid.eq.0) write(*,*) 'nproc has to divide nx' + stop + else + nxp = nx/nproc + endif + if (mod(ny,nproc).ne.0) then + if (myid.eq.0) write(*,*) 'nproc has to divide ny' + stop + else + nyp = ny/nproc + endif + + if(myid.eq.0) then + print*,'--nbr of proc', nproc + write(*,*)'--nbr of points nx,ny',nx,ny + write(*,*)'--nbr of elmts on each proc, nyp=ny/nproc', nyp + endif + +!Generate the Initial Conditions (ICs) +!Distribute the ICs over all processes using the operation MPI_Scatter + allocate(f(0:nx+1,0:nyp+1)); + + f=0d0; tag1=2020; tag2=2021 + + if(myid.eq.0) then + allocate(f_send(1:nx,1:ny)) + CALL RANDOM_NUMBER(f_send) + endif + + call MPI_Scatter(f_send,nx*nyp,MPI_DOUBLE_PRECISION,& + f(1:nx,1:nyp), nx*nyp,MPI_DOUBLE_PRECISION,& + 0,MPI_COMM_WORLD, ierr) + + call MPI_Barrier(MPI_COMM_WORLD, ierr) + + if(myid.eq.0) deallocate(f_send) + +!Set a device: Determine which processes are on each node +!such that each process will be connected to a GPU + +!!Split the world communicator into subgroups of commu, each of which +!contains processes that run on the same node, and which can create a +!shared +!memory region (via the type MPI_COMM_TYPE_SHARED). +!The call returns a new communicator "host_comm", which is created by +!each subgroup. 
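!Note: host_rank (the rank within this per-node communicator) is used below
!as the device number, so ranks sharing a node are mapped to distinct GPUs.
!This assumes the job runs at most one MPI rank per GPU on each node.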
+ + call MPI_COMM_SPLIT_TYPE(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,& + MPI_INFO_NULL, host_comm,ierr) + call MPI_COMM_RANK(host_comm, host_rank,ierr) + + myDevice = host_rank + +!returns the device type to be used + deviceType = acc_get_device_type() + +!returns the number of devices available on the host + numDevice = acc_get_num_devices(deviceType) + +!sets the device number and the device type to be used + call acc_set_device_num(myDevice, deviceType) + + if(myid.eq.0)print*, "--Number of devices per node:", numDevice + if(myid.eq.0)print*,"" + + print*, "--MPI rank", myid, "is connected to GPU", myDevice + + allocate(f_k(1:nx,1:nyp)) + + iter = 0 + + if(myid.eq.0) then + print*,"" + print*, "--Start iterations",iter + print*,"" + endif + +!Unstructed data locality +!$acc enter data copyin(f) create(f_k) + do while (max_err.gt.error.and.iter.le.max_iter) + +!copy data from GPU to CPU +!$acc update host(f) + +!transfer the data at the boundaries to the neighbouring MPI-process +!send f(:,nyp) from myid-1 to be stored in f(:,0) in myid+1 + if(myid.lt.nproc-1) then + call MPI_Send(f(:,nyp),(nx+2)*1,MPI_DOUBLE_PRECISION,myid+1,tag1,& + MPI_COMM_WORLD, ierr) + endif + +!receive f(:,0) from myid-1 + if(myid.gt.0) then + call MPI_Recv(f(:,0),(nx+2)*1,MPI_DOUBLE_PRECISION,myid-1, & + tag1,MPI_COMM_WORLD, status,ierr) + endif + +!send f(:,1) from myid+1 to be stored in f(:,nyp+1) in myid-1 + if(myid.gt.0) then + call MPI_Send(f(:,1),(nx+2)*1,MPI_DOUBLE_PRECISION,myid-1,tag2,& + MPI_COMM_WORLD, ierr) + endif + +!receive f(:,npy+1) from myid-1 + if(myid.lt.nproc-1) then + call MPI_Recv(f(:,nyp+1),(nx+2)*1,MPI_DOUBLE_PRECISION,myid+1,& + tag2,MPI_COMM_WORLD, status,ierr) + endif + +!update data from CPU to GPU +!$acc update device(f) +!$acc parallel loop present(f,f_k) collapse(2) + do j=1,nyp + do i=1,nx + d2fx = f(i+1,j) + f(i-1,j) + d2fy = f(i,j+1) + f(i,j-1) + f_k(i,j) = 0.25*(d2fx + d2fy) + enddo + enddo +!$acc end parallel loop + + max_err=0. 
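!The loop below runs on the GPU: it computes the largest pointwise change
!|f_k - f| with an OpenACC max reduction and copies f_k back into f for the
!next iteration; max_err is then combined across ranks with MPI_ALLREDUCE.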
+ +!$acc parallel loop present(f,f_k) collapse(2) & +!$acc reduction(max:max_err) + do j=1,nyp + do i=1,nx + max_err = max(dabs(f_k(i,j) - f(i,j)),max_err) + f(i,j) = f_k(i,j) + enddo + enddo +!$acc end parallel loop + +!max_err is copied back to the CPU-host by default + call MPI_ALLREDUCE(MPI_IN_PLACE,max_err,1,& + MPI_DOUBLE_PRECISION,MPI_MAX, MPI_COMM_WORLD,ierr ) + + if(myid.eq.0) then + if(mod(iter,50).eq.0 )write(*,'(i5,f10.6)')iter,max_err + endif + + iter = iter + 1 + + enddo +!$acc exit data copyout(f_k) delete(f) + + deallocate(f) + + if(myid.eq.0) write(*,'(i5,f10.6)') iter,max_err + + call MPI_Barrier(MPI_COMM_WORLD, ierr) + + t_final = MPI_WTIME() + time_s = t_final - t_start + + if(myid.eq.0)print*, '--Time it takes (s)', time_s + + if(myid.eq.0) then + print*, '--Job is completed successfully--' + print*,'' + endif + +!to check the result + allocate(f_full(nx,ny)) + call MPI_Gather(f_k, nx*nyp, MPI_DOUBLE_PRECISION, & + f_full, nx*nyp, MPI_DOUBLE_PRECISION, 0, & + MPI_COMM_WORLD, ierr) + + if(myid.eq.0) then + do j=1,ny + write(111,*)j,sum(f_full(:,j)) + enddo + print*,"--Sum",sum(f_full(:,:))/nx/2 + print*,"--END :)" + endif + + deallocate(f_full,f_k) + + call MPI_FINALIZE( ierr ) + + end diff --git a/_downloads/0c1656603fcc9bc29011507fab75537b/mandelbrot_gpu.tar.gz b/_downloads/0c1656603fcc9bc29011507fab75537b/mandelbrot_gpu.tar.gz new file mode 100644 index 000000000..47ff29208 Binary files /dev/null and b/_downloads/0c1656603fcc9bc29011507fab75537b/mandelbrot_gpu.tar.gz differ diff --git a/_downloads/0ddfeb84c79cc59ca5004f7408ff8102/jacobi_serial.cpp b/_downloads/0ddfeb84c79cc59ca5004f7408ff8102/jacobi_serial.cpp new file mode 100644 index 000000000..599f1488b --- /dev/null +++ b/_downloads/0ddfeb84c79cc59ca5004f7408ff8102/jacobi_serial.cpp @@ -0,0 +1,63 @@ +/** + * Serial implementation of the Jacobi iteration + */ + +#include +#include + +// Number of rows and columns in our matrix +static const int NUM_ELEMENTS = 2000; +// Total number of elements in our matrix +static const int TOT_ELEMENTS = NUM_ELEMENTS * NUM_ELEMENTS; +// Maximum number of iterations before quiting +static const int MAX_ITER = 10000; +// Error tolerance for iteration +static const float MAX_ERROR = 0.01; +// Seed for random number generator +static const int SEED = 12345; + +int main (int argc, char** argv) { + // Initialize random number generator + srand (SEED); + + // Create array to calculate on + float arr[TOT_ELEMENTS]; + + // Fill array with data + for (int i = 0; i < TOT_ELEMENTS; i++) { + // The following will create random values between [0, 1] + arr[i] = (float) rand () / (float) RAND_MAX; + } + + // Before starting calculation we will define a few helper variables + float tmp[TOT_ELEMENTS]; + float err = __FLT_MAX__; + + // We copy here to get the boundary elements, which will be copied back and forth unchanged + std::memcpy(tmp, arr, TOT_ELEMENTS*sizeof(float)); + + int iterations = 0; + // Perform Jacobi iterations until we either have low enough error or too many iterations + while (err > MAX_ERROR && iterations < MAX_ITER) { + err = 0.; + // For each element take the average of the surrounding elements + for (int i = 1; i < NUM_ELEMENTS - 1; i++) { + for (int j = 1; j < NUM_ELEMENTS - 1; j++) { + tmp[i * NUM_ELEMENTS + j] = 0.25 * (arr[i * NUM_ELEMENTS + j+1] + + arr[i * NUM_ELEMENTS + j-1] + + arr[(i-1) * NUM_ELEMENTS + j] + + arr[(i+1) * NUM_ELEMENTS + j]); + err = std::max(err, std::abs(tmp[i*NUM_ELEMENTS + j] - arr[i*NUM_ELEMENTS + j])); + } + } + + // Transfer new array to 
old (including boundary, which was untouched in the loop) + std::memcpy(arr, tmp, TOT_ELEMENTS*sizeof(float)); + + iterations++; + } + + std::cout << "Iterations : " << iterations << " | Error : " << err << std::endl; + + return EXIT_SUCCESS; +} diff --git a/_downloads/1128542edf572b203314b54d0dde3de2/minimal_array_job.sh b/_downloads/1128542edf572b203314b54d0dde3de2/minimal_array_job.sh new file mode 100644 index 000000000..720ca8d9b --- /dev/null +++ b/_downloads/1128542edf572b203314b54d0dde3de2/minimal_array_job.sh @@ -0,0 +1,14 @@ +#!/bin/bash +#SBATCH --account=YourProject +#SBATCH --time=1:0:0 +#SBATCH --mem-per-cpu=4G --ntasks=2 +#SBATCH --array=1-200 + +set -o errexit # exit on errors +set -o nounset # treat unset variables as errors +module --quiet purge # clear any inherited modules + +DATASET=dataset.$SLURM_ARRAY_TASK_ID +OUTFILE=result.$SLURM_ARRAY_TASK_ID + +YourProgram $DATASET > $OUTFILE diff --git a/_downloads/15af6a1c50c8466bf49b917af8f0d35f/kernels.qdrep b/_downloads/15af6a1c50c8466bf49b917af8f0d35f/kernels.qdrep new file mode 100644 index 000000000..3785b1713 Binary files /dev/null and b/_downloads/15af6a1c50c8466bf49b917af8f0d35f/kernels.qdrep differ diff --git a/_downloads/1b97dd40682c2ab09e28f64b698eff77/loop_add_cuda.cu b/_downloads/1b97dd40682c2ab09e28f64b698eff77/loop_add_cuda.cu new file mode 100644 index 000000000..a62e29208 --- /dev/null +++ b/_downloads/1b97dd40682c2ab09e28f64b698eff77/loop_add_cuda.cu @@ -0,0 +1,60 @@ +#include +#include +#include +#include +#include + +// CUDA kernel, callable from host due to `__global__` +__global__ void add(const float* a, const float* b, float* c, const size_t n) { + // Calculate the array index of this thread + const int id = blockIdx.x * blockDim.x + threadIdx.x; + if (id < n) { + c[id] = a[id] + b[id]; + } +} + +int main(int argc, char* argv[]) { + printf("ENTER MAIN\n"); + // Number of elements to compute over + const size_t num_elements = 1000000; + + // Allocate memory that can be accessed both on host and device + float* a; + float* b; + float* c; + // Should ideally catch errors here, but skip for brevity + cudaMallocManaged(&a, num_elements * sizeof(float)); + cudaMallocManaged(&b, num_elements * sizeof(float)); + cudaMallocManaged(&c, num_elements * sizeof(float)); + + // Fill our input arrays, on host, with some data to calculate + for (int i = 0; i < num_elements; i++) { + a[i] = sinf(i) * sinf(i); + b[i] = cosf(i) * cosf(i); + } + + // Define how many threads to launch on CUDA device + const int block_size = 1024; // Number of threads in each thread block + // Number of thread blocks in a grid + const int grid_size = (int) ceil((float) num_elements / block_size); + + for (int i = 0; i < 100000; i++) { + // Call CUDA kernel to run on device + add<<>>(a, b, c, num_elements); + // Wait for computation before doing anything with data on host + cudaDeviceSynchronize(); + } + + // Should print 1.0 at all entries + printf("c[0] : %f\n", c[0]); + printf("c[1] : %f\n", c[1]); + printf("c[42] : %f\n", c[42]); + + // Free memory + cudaFree(a); + cudaFree(b); + cudaFree(c); + + printf("EXIT SUCCESS\n"); + return EXIT_SUCCESS; +} diff --git a/_downloads/204ae7edd04bdf9cd6b7f36843a4fcf0/omp.c b/_downloads/204ae7edd04bdf9cd6b7f36843a4fcf0/omp.c new file mode 100644 index 000000000..1b22ec94d --- /dev/null +++ b/_downloads/204ae7edd04bdf9cd6b7f36843a4fcf0/omp.c @@ -0,0 +1,86 @@ +/** +* Example program to show how to combine OpenMP offload and cuBLAS library calls +*/ + +#include +#include +#include +#include +#include + 
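/*
 * In short: this example computes b := alpha*a + b with cublasSaxpy on
 * N-element vectors, then sums b with an OpenMP offload reduction. With
 * a[i] = 1, b[i] = 2 and alpha = 2 every element of b becomes 4, so the
 * expected sum checked at the end is 4 * N.
 */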
+#define N 10000 + +int main() { + printf("Starting SAXPY + OpenMP offload program\n"); + // Allocate vectors which we will use for computations + float* a = (float*) calloc(N, sizeof(float)); + float* b = (float*) calloc(N, sizeof(float)); + float sum = 0.0; + const float alpha = 2.0; + + if (a == NULL || b == NULL) { + printf("Could not allocate compute vectors!"); + return EXIT_FAILURE; + } + + // Initialize input arrays, this is done on CPU host + printf(" Initializing vectors on CPU\n"); + for (int i = 0; i < N; i++) { + a[i] = 1.0; + b[i] = 2.0; + } + + // Create cuBLAS handle for interacting with cuBLAS routines + printf(" Creating cuBLAS handle\n"); + cublasHandle_t handle; + cublasStatus_t status; // Variable to hold return status from cuBLAS routines + status = cublasCreate(&handle); + if (status != CUBLAS_STATUS_SUCCESS) { + printf("Could not initialize cuBLAS handle!\n"); + return EXIT_FAILURE; + } + + // Create OpenMP data region so that our compute vectors are accessible on + // GPU device for cuBLAS + printf(" Starting calculation\n"); + #pragma omp target data map(tofrom:b[0:N]) map(to:a[0:N]) + { + // To allow cuBLAS to interact with our compute vectors we need to make + // them available as pointers. NOTE however that these pointers point to + // areas in the GPU memory so they cannot be dereferenced on the CPU, + // however, by using the 'host_data' directive we can use the pointers from + // CPU code passing them to other functions that require pointers to GPU + // memory + #pragma omp target data use_device_ptr(a, b) + { + status = cublasSaxpy(handle, N, &alpha, a, 1, b, 1); + if (status != CUBLAS_STATUS_SUCCESS) { + printf("SAXPY failed!\n"); + // NOTE we cannot exit here since this is within an accelerated region + } + } + // We can now continue to use a and b in OpenMP offloading code + #pragma omp target teams distribute parallel for schedule(nonmonotonic:static,1) reduction(+:sum) + for (int i = 0; i < N; i++) { + sum += b[i]; + } + } + // After the above OpenMP region has ended 'a' has not changed, 'b' contains + // the result of the SAXPY routine and 'sum' contains the sum over 'b' + + // To ensure everything worked we can check that the sum is as we expected + if (fabs(sum - 4.0 * (float) N) < 0.001) { + printf(" Calculation produced the correct result of '4 * %d == %.0f'!\n", N, sum); + } else { + printf(" Calculation produced _incorrect_ result, expected '4 * %d == %.3f'\n", N, sum); + } + + // Free cuBLAS handle + cublasDestroy(handle); + // Free computation vectors + free(a); + free(b); + // Indicate to caller that everything worked as expected + printf("Ending SAXPY + OpenMP program\n"); + return EXIT_SUCCESS; +} diff --git a/_downloads/217f4f555f5a40d4eafaba778dfaa170/HPL.dat b/_downloads/217f4f555f5a40d4eafaba778dfaa170/HPL.dat new file mode 100644 index 000000000..ff96de4ce --- /dev/null +++ b/_downloads/217f4f555f5a40d4eafaba778dfaa170/HPL.dat @@ -0,0 +1,36 @@ +HPLinpack benchmark input file +Innovative Computing Laboratory, University of Tennessee +HPL.out output file name (if any) +6 device out (6=stdout,7=stderr,file) +1 # of problems sizes (N) +50000 Ns +1 # of NBs +192 NBs +0 PMAP process mapping (0=Row-,1=Column-major) +1 # of process grids (P x Q) +4 Ps +8 Qs +16.0 threshold +1 # of panel fact +2 PFACTs (0=left, 1=Crout, 2=Right) +1 # of recursive stopping criterium +4 NBMINs (>= 1) +1 # of panels in recursion +2 NDIVs +1 # of recursive panel fact. 
+1 RFACTs (0=left, 1=Crout, 2=Right) +1 # of broadcast +1 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM) +1 # of lookahead depth +1 DEPTHs (>=0) +2 SWAP (0=bin-exch,1=long,2=mix) +64 swapping threshold +0 L1 in (0=transposed,1=no-transposed) form +0 U in (0=transposed,1=no-transposed) form +1 Equilibration (0=no,1=yes) +8 memory alignment in double (> 0) +##### This line (no. 32) is ignored (it serves as a separator). ###### +0 Number of additional problem sizes for PTRANS +1200 10000 30000 values of N +0 number of additional blocking sizes for PTRANS +40 9 8 13 13 20 16 32 64 values of NB diff --git a/_downloads/264e0f82aaf76f3c27a58a3a80205412/data.qdrep b/_downloads/264e0f82aaf76f3c27a58a3a80205412/data.qdrep new file mode 100644 index 000000000..895f513a1 Binary files /dev/null and b/_downloads/264e0f82aaf76f3c27a58a3a80205412/data.qdrep differ diff --git a/_downloads/26f47f6f96a83fe5d3ef564e0a754652/gpu_intro.py b/_downloads/26f47f6f96a83fe5d3ef564e0a754652/gpu_intro.py new file mode 100644 index 000000000..65646d6ad --- /dev/null +++ b/_downloads/26f47f6f96a83fe5d3ef564e0a754652/gpu_intro.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +import tensorflow as tf + +# Test if there are any GPUs available +print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU'))) + +# Have Tensorflow output where computations are run +tf.debugging.set_log_device_placement(True) + +# Create some tensors +a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) +b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]) +c = tf.matmul(a, b) + +# Print result +print(c) diff --git a/_downloads/2720351be9098961b6792f3befd13896/fftw_serial.f90 b/_downloads/2720351be9098961b6792f3befd13896/fftw_serial.f90 new file mode 100644 index 000000000..5f167924c --- /dev/null +++ b/_downloads/2720351be9098961b6792f3befd13896/fftw_serial.f90 @@ -0,0 +1,84 @@ +module parameter_kind + implicit none + public + integer, parameter :: FFTW_FORWARD=-1,FFTW_BACKWARD=+1 + integer, parameter :: FFTW_MEASURE=0 + integer, parameter :: sp = selected_real_kind(6, 37) !Single precision + integer, parameter :: dp = selected_real_kind(15, 307) !Double precision + integer, parameter :: fp = dp + real(fp), parameter :: pi = 4.0_fp*atan(1.0_fp),dt=0.25_fp + end module parameter_kind + + program fftw_serial + + use parameter_kind + + implicit none + + !include "fftw3.f" + + integer, parameter :: nt=512 + integer :: i,ierr + integer*8 :: plan_forward,plan_backward + complex(fp), allocatable :: in(:),out(:),f(:) + real(fp), allocatable :: t(:),w(:) + + allocate(t(nt),w(nt)); allocate(f(nt)) + + call grid_1d(nt,t,w) + +!Example of sine function + do i=1,nt + f(i) = cmplx(sin(2.0_fp*t(i)),0.0_fp) + enddo + + print*,"--sum before FFT", sum(real(f(1:nt/2))) + +!Creating 1D plans + allocate(in(nt),out(nt)) + call dfftw_plan_dft_1d(plan_forward,nt,in,out,FFTW_FORWARD,FFTW_MEASURE) + call dfftw_plan_dft_1d(plan_backward,nt,in,out,FFTW_BACKWARD,FFTW_MEASURE) + +!Forward FFT + in(:) = f(:) + call dfftw_execute_dft(plan_forward, in, out) + f(:) = out(:) + +!Backward FFT + call dfftw_execute_dft(plan_backward, out, in) +!The data on the backforward are unnormalized, so they should be divided by N. 
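!FFTW transforms are unnormalized: a forward transform followed by a backward
!transform returns the original data multiplied by nt, hence the division by
!real(nt) below.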
+ in(:) = in(:)/real(nt) + +!Destroying plans + call dfftw_destroy_plan(plan_forward) + call dfftw_destroy_plan(plan_backward) + + print*,"--sum iFFT", sum(real(in(1:nt/2))) + +!Printing the FFT of sin(2t) + do i=1,nt/2 + write(204,*)w(i),dsqrt(cdabs(f(i))**2) + enddo + deallocate(in); deallocate(out); deallocate(f) + end + + subroutine grid_1d(nt,t,w) + use parameter_kind + + implicit none + integer :: i,nt + real(fp) :: t(nt),w(nt) + +!Defining a uniform temporal grid + do i=1,nt + t(i) = (-dble(nt-1)/2.0_fp + (i-1))*dt + enddo + +!Defining a uniform frequency grid + do i=0,nt/2-1 + w(i+1) = 2.0_fp*pi*dble(i)/(nt*dt) + enddo + do i=nt/2,nt-1 + w(i+1) = 2.0_fp*pi*dble(i-nt)/(nt*dt) + enddo + end subroutine grid_1d diff --git a/_downloads/279a661df1a778477720a8abbad526a0/run.sh b/_downloads/279a661df1a778477720a8abbad526a0/run.sh new file mode 100644 index 000000000..346ab2481 --- /dev/null +++ b/_downloads/279a661df1a778477720a8abbad526a0/run.sh @@ -0,0 +1,24 @@ +#!/bin/bash +#SBATCH --job-name=CUDA-test +#SBATCH --account=nnk +#SBATCH --time=05:00 +#SBATCH --mem-per-cpu=1G +#SBATCH --qos=devel +#SBATCH --partition=accel +#SBATCH --gpus=1 + +## Set up job environment: +set -o errexit # Exit the script on any error +set -o nounset # Treat any unset variables as an error + +module --quiet purge # Reset the modules to the system default +module load CUDA/11.1.1-GCC-10.2.0 +module list + +# Compile our code +nvcc loop_add_cuda.cu -o loop_add_cuda + +# Run our computation +./loop_add_cuda + +exit 0 diff --git a/_downloads/27bc02a24553adf798b5e0145f88cfe4/mnist.py b/_downloads/27bc02a24553adf798b5e0145f88cfe4/mnist.py new file mode 100644 index 000000000..a5a9224dd --- /dev/null +++ b/_downloads/27bc02a24553adf798b5e0145f88cfe4/mnist.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python + +import datetime +import os +import tensorflow as tf + +# Access storage path for '$SLURM_SUBMIT_DIR' +storage_path = os.path.join(os.environ['SLURM_SUBMIT_DIR'], + os.environ['SLURM_JOB_ID']) + +# Load dataset +mnist = tf.keras.datasets.mnist +(x_train, y_train), (x_test, y_test) = mnist.load_data() +x_train, x_test = x_train / 255., x_test / 255. 
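# MNIST pixels are stored as integers in [0, 255]; dividing by 255 rescales
# them to floats in [0, 1], a common normalisation for network inputs.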
+ + +def create_model(): + model = tf.keras.models.Sequential([ + tf.keras.layers.Flatten(input_shape=(28, 28)), + tf.keras.layers.Dense(512, activation='relu'), + tf.keras.layers.Dropout(0.2), + tf.keras.layers.Dense(10, activation='softmax') + ]) + model.compile(optimizer='adam', + loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=['accuracy']) + return model + + +# Create and display summary of model +model = create_model() +# Output, such as from the following command, is outputted into the '.out' file +# produced by 'sbatch' +model.summary() +# Output to check if we are using GPU +print(tf.config.experimental.list_physical_devices('GPU')) +# Tensorboard support +log_dir = os.path.join(os.environ['SLURM_SUBMIT_DIR'], + "logs", + "fit", + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")) +tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, + histogram_freq=1) + + +# Save model in TensorFlow format +model.save(os.path.join(storage_path, "model")) + +# Create checkpointing of weights +ckpt_path = os.path.join(storage_path, "checkpoints", "mnist-{epoch:04d}.ckpt") +ckpt_callback = tf.keras.callbacks.ModelCheckpoint( + filepath=ckpt_path, + save_weights_only=True, + verbose=1) + +# Save initial weights +model.save_weights(ckpt_path.format(epoch=0)) + +# Train model with checkpointing +model.fit(x_train, y_train, + epochs=50, + callbacks=[ckpt_callback, tensorboard_callback], + validation_data=(x_test, y_test), + verbose=0) diff --git a/_downloads/2845b8c35f8be3ab482009cf26ba139f/laplace_mpiomp_aware.f90 b/_downloads/2845b8c35f8be3ab482009cf26ba139f/laplace_mpiomp_aware.f90 new file mode 100644 index 000000000..482405d74 --- /dev/null +++ b/_downloads/2845b8c35f8be3ab482009cf26ba139f/laplace_mpiomp_aware.f90 @@ -0,0 +1,237 @@ + program laplace_mpiomp_aware + + use mpi + use omp_lib + + implicit none + integer status(MPI_STATUS_SIZE) + integer :: i,j,k,ii + integer :: iter,count_rate, count_max,count,nenv_var + integer :: myid,ierr,nproc,nxp,nyp,tag,tag1,tag2,nsend + integer, parameter :: nx=20000,ny=nx + integer, parameter :: max_iter=525 + double precision, parameter :: pi=4d0*datan(1d0) + real, parameter :: error=0.05 + double precision :: max_err,time_s,& + d2fx,d2fy,max_err_part + real :: t_start,t_final + double precision, allocatable :: f(:,:),f_k(:,:) + double precision, allocatable :: f_send(:,:),f_full(:,:) + character(len=300) :: env_var + + integer :: deviceType,myDevice,numDevice,host_rank,host_comm + + !MPI starts + ! Initialise OpenMPI communication. + call MPI_INIT(ierr) + ! Get number of active processes (from 0 to nproc-1). + call MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierr ) + ! Identify the ID rank (process). + call MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr ) + +!check if GPU-aware support is enabled + if(myid.eq.0) then + print*, '' + call getenv("MPICH_GPU_SUPPORT_ENABLED", env_var) + read(env_var, '(i10)' ) nenv_var + if (nenv_var.eq. 1) then + print *, '--MPICH_GPU_SUPPORT_ENABLED is enabled!' + print*, '' + elseif (nenv_var.ne. 1) then + print *, '--MPICH_GPU_SUPPORT_ENABLED is NOT enabled!' 
+ print *, '' + endif + endif + + t_start = MPI_WTIME() + + if (mod(nx,nproc).ne.0) then + if (myid.eq.0) write(*,*) 'nproc has to divide nx' + stop + else + nxp = nx/nproc + endif + if (mod(ny,nproc).ne.0) then + if (myid.eq.0) write(*,*) 'nproc has to divide ny' + stop + else + nyp = ny/nproc + endif + + if(myid.eq.0) then + print*,'--nbr of proc', nproc + write(*,*)'--nbr of points nx,ny',nx,ny + write(*,*)'--nbr of elmts on each proc, nyp=ny/nproc', nyp + endif + +!Generate the Initial Conditions (ICs) +!Distribute the ICs over all processes using the operation MPI_Scatter + allocate(f(0:nx+1,0:nyp+1)); + + f=0d0; tag1=2020; tag2=2021 + + if(myid.eq.0) then + allocate(f_send(1:nx,1:ny)) + CALL RANDOM_NUMBER(f_send) + endif + + call MPI_Scatter(f_send,nx*nyp,MPI_DOUBLE_PRECISION,& + f(1:nx,1:nyp), nx*nyp,MPI_DOUBLE_PRECISION,& + 0,MPI_COMM_WORLD, ierr) + + call MPI_Barrier(MPI_COMM_WORLD, ierr) + + if(myid.eq.0) deallocate(f_send) + +!Set a device: Determine which processes are on each node +!such that each process will be connected to a GPU + +!!Split the world communicator into subgroups of commu, each of which +!contains processes that run on the same node, and which can create a +!shared +!memory region (via the type MPI_COMM_TYPE_SHARED). +!The call returns a new communicator "host_comm", which is created by +!each subgroup. + + call MPI_COMM_SPLIT_TYPE(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,& + MPI_INFO_NULL, host_comm,ierr) + call MPI_COMM_RANK(host_comm, host_rank,ierr) + + myDevice = host_rank + +!returns the device number of the device on which the calling thread is +!executing + deviceType = omp_get_device_num() +!returns the number of devices available for offloading. + numDevice = omp_get_num_devices() +!sets the device number to use in device constructs by setting the +!initial value of the default-device-var + + call omp_set_default_device(myDevice) + + if(myid.eq.0)print*, "--Number of devices per node:", numDevice + if(myid.eq.0)print*,"" + + print*, "--MPI rank", myid, "is connected to GPU", myDevice + + allocate(f_k(1:nx,1:nyp)) + + iter = 0 + + if(myid.eq.0) then + print*,"" + print*, "--Start iterations",iter + print*,"" + endif + +!Unstructed data locality +!$omp target enter data device(myDevice) map(to:f) map(alloc:f_k) + + do while (max_err.gt.error.and.iter.le.max_iter) + +!Performing MPI_send and MPI_Recv between GPUs without passing through the host +!$omp target data use_device_ptr(f) + +!transfer the data at the boundaries to the neighbouring MPI-process +!send f(:,nyp) from myid-1 to be stored in f(:,0) in myid+1 + if(myid.lt.nproc-1) then + call MPI_Send(f(:,nyp),(nx+2)*1,MPI_DOUBLE_PRECISION,myid+1,tag1,& + MPI_COMM_WORLD, ierr) + endif + +!receive f(:,0) from myid-1 + if(myid.gt.0) then + call MPI_Recv(f(:,0),(nx+2)*1,MPI_DOUBLE_PRECISION,myid-1, & + tag1,MPI_COMM_WORLD, status,ierr) + endif + +!send f(:,1) from myid+1 to be stored in f(:,nyp+1) in myid-1 + if(myid.gt.0) then + call MPI_Send(f(:,1),(nx+2)*1,MPI_DOUBLE_PRECISION,myid-1,tag2,& + MPI_COMM_WORLD, ierr) + endif + +!receive f(:,npy+1) from myid-1 + if(myid.lt.nproc-1) then + call MPI_Recv(f(:,nyp+1),(nx+2)*1,MPI_DOUBLE_PRECISION,myid+1,& + tag2,MPI_COMM_WORLD, status,ierr) + endif + +!$omp end target data + +!$omp target teams distribute parallel do collapse(2) schedule(static,1) + do j=1,nyp + do i=1,nx + d2fx = f(i+1,j) + f(i-1,j) + d2fy = f(i,j+1) + f(i,j-1) + f_k(i,j) = 0.25*(d2fx + d2fy) + enddo + enddo +!$omp end target teams distribute parallel do + + max_err=0. 
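!The max reduction below runs on the GPU; in this GPU-aware version max_err
!is then reduced across ranks directly from device memory via the
!use_device_ptr region wrapping MPI_ALLREDUCE.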
+ +!$omp target teams distribute parallel do reduction(max:max_err) & +!$omp collapse(2) schedule(static,1) + do j=1,nyp + do i=1,nx + max_err = max(dabs(f_k(i,j) - f(i,j)),max_err) + f(i,j) = f_k(i,j) + enddo + enddo +!$omp end target teams distribute parallel do + +!max_err is copied back to the CPU-host by default + +!$omp target enter data device(myDevice) map(to:max_err) +!Performing MPI_Allreduce between GPUs without passing through the host +!$omp target data use_device_ptr(max_err) + call MPI_ALLREDUCE(MPI_IN_PLACE,max_err,1,& + MPI_DOUBLE_PRECISION,MPI_MAX, MPI_COMM_WORLD,ierr ) +!$omp end target data +!$omp target exit data map(from:max_err) + + if(myid.eq.0) then + if(mod(iter,50).eq.0 )write(*,'(i5,f10.6)')iter,max_err + endif + + iter = iter + 1 + + enddo +!$omp target exit data map(from:f_k) map(delete:f) + + deallocate(f) + + if(myid.eq.0) write(*,'(i5,f10.6)') iter,max_err + + call MPI_Barrier(MPI_COMM_WORLD, ierr) + + t_final = MPI_WTIME() + time_s = t_final - t_start + + if(myid.eq.0)print*, '--Time it takes (s)', time_s + + if(myid.eq.0) then + print*, '--Job is completed successfully--' + print*,'' + endif + +!to check the result + allocate(f_full(nx,ny)) + call MPI_Gather(f_k, nx*nyp, MPI_DOUBLE_PRECISION, & + f_full, nx*nyp, MPI_DOUBLE_PRECISION, 0, & + MPI_COMM_WORLD, ierr) + + if(myid.eq.0) then + do j=1,ny + write(111,*)j,sum(f_full(:,j)) + enddo + print*,"--Sum",sum(f_full(:,:))/nx/2 + print*,"--END :)" + endif + + deallocate(f_full,f_k) + + call MPI_FINALIZE( ierr ) + + end diff --git a/_downloads/2f7a1135a098bcc07494d9a428762c3e/betzy_mpi_job.sh b/_downloads/2f7a1135a098bcc07494d9a428762c3e/betzy_mpi_job.sh new file mode 100644 index 000000000..95721d718 --- /dev/null +++ b/_downloads/2f7a1135a098bcc07494d9a428762c3e/betzy_mpi_job.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +############################################### +# Script example for a normal MPI job on Betzy # +############################################### + +## Project: replace XXXX with your project ID +#SBATCH --account=nnXXXXk + +## Job name: +#SBATCH --job-name=MyJob +## Allocating amount of resources: +#SBATCH --nodes=10 +## Number of tasks (aka processes) to start on each node: Pure mpi, one task per core +#SBATCH --ntasks-per-node=128 +## No memory pr task since this option is turned off on Betzy in partition normal. +## Run for 10 minutes, syntax is d-hh:mm:ss +#SBATCH --time=0-00:10:00 + +# you may not place bash commands before the last SBATCH directive +###################################################### +## Setting variables and prepare runtime environment: +##---------------------------------------------------- +## Recommended safety settings: +set -o errexit # Make bash exit on any error +set -o nounset # Treat unset variables as errors + +# Loading Software modules +# Allways be explicit on loading modules and setting run time environment!!! +module --quiet purge # Restore loaded modules to the default +module load MySoftWare/Versions #nb: Versions is important! 
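# For illustration only (placeholder name and version): a concrete module line
# typically looks like
#   module load SomeSoftware/1.2.3-foss-2020a
# where the suffix identifies the toolchain, which also determines the
# srun/mpirun choice further down in this script.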
+ +# Type "module avail MySoftware" to find available modules and versions +# It is also recommended to to list loaded modules, for easier debugging: +module list + +####################################################### +## Prepare jobs, moving input files and making sure +# output is copied back and taken care of +##----------------------------------------------------- + +# Prepare input files +cp inputfiles $SCRATCH +cd $SCRATCH + +# Make sure output is copied back after job finishes +savefile outputfile1 outputfile2 + +######################################################## +# Run the application, and we typically time it: +##------------------------------------------------------ + +# Run the application - please add hash in front of srun and remove +# hash in front of mpirun if using intel-toolchain + +# For OpenMPI (foss and iomkl toolchains), srun is recommended: +time srun MySoftWare-exec + +## For IntelMPI (intel toolchain), mpirun is recommended: +#time mpirun MySoftWare-exec + +######################################################### +# That was about all this time; lets call it a day... +##------------------------------------------------------- +# Finish the script +exit 0 diff --git a/_downloads/36659a8f7a7f3020144408a86d35d626/jacobi_serial.c b/_downloads/36659a8f7a7f3020144408a86d35d626/jacobi_serial.c new file mode 100644 index 000000000..945e81b8b --- /dev/null +++ b/_downloads/36659a8f7a7f3020144408a86d35d626/jacobi_serial.c @@ -0,0 +1,57 @@ +/** + * Serial implementation of the Jacobi iteration + */ + +#include +#include +#include + +// Number of rows and columns in our matrix +static const int NUM_ELEMENTS = 2000; +// Maximum number of iterations before quiting +static const int MAX_ITER = 10000; +// Error tolerance for iteration +static const float MAX_ERROR = 0.01; +// Seed for random number generator +static const int SEED = 12345; + +int main (int argc, char** argv) { + // Initialize random number generator + srand (SEED); + // Create array to calculate on + float array[NUM_ELEMENTS][NUM_ELEMENTS]; + // Fill array with data + for (int i = 0; i < NUM_ELEMENTS; i++) { + for (int j = 0; j < NUM_ELEMENTS; j++) { + // The following will create random values between [0, 1] + array[i][j] = (float) rand () / (float) RAND_MAX; + } + } + // Before starting calculation we will define a few helper variables + float arr_new[NUM_ELEMENTS][NUM_ELEMENTS]; + float error = __FLT_MAX__; + int iterations = 0; + // Perform Jacobi iterations until we either have low enough error or too + // many iterations + while (error > MAX_ERROR && iterations < MAX_ITER) { + error = 0.; + // For each element take the average of the surrounding elements + for (int i = 1; i < NUM_ELEMENTS - 1; i++) { + for (int j = 1; j < NUM_ELEMENTS - 1; j++) { + arr_new[i][j] = 0.25 * (array[i][j + 1] + + array[i][j - 1] + + array[i - 1][j] + + array[i + 1][j]); + error = fmaxf (error, fabsf (arr_new[i][j] - array[i][j])); + } + } + // Transfer new array to old + for (int i = 1; i < NUM_ELEMENTS - 1; i++) { + for (int j = 1; j < NUM_ELEMENTS - 1; j++) { + array[i][j] = arr_new[i][j]; + } + } + iterations += 1; + } + return EXIT_SUCCESS; +} diff --git a/_downloads/38e33f6cabeb18dfc9a1f8e935c927d5/jacobi_optimized.c b/_downloads/38e33f6cabeb18dfc9a1f8e935c927d5/jacobi_optimized.c new file mode 100644 index 000000000..710eab437 --- /dev/null +++ b/_downloads/38e33f6cabeb18dfc9a1f8e935c927d5/jacobi_optimized.c @@ -0,0 +1,58 @@ +/** + * Final optimized OpenACC version + */ + +#include +#include +#include + +// Number 
of rows and columns in our matrix +static const int NUM_ELEMENTS = 2000; +// Maximum number of iterations before quiting +static const int MAX_ITER = 10000; +// Error tolerance for iteration +static const float MAX_ERROR = 0.01; +// Seed for random number generator +static const int SEED = 12345; + +int main (int argc, char** argv) { + // Initialize random number generator + srand (SEED); + // Create array to calculate on + float array[NUM_ELEMENTS][NUM_ELEMENTS]; + // Fill array with data + for (int i = 0; i < NUM_ELEMENTS; i++) { + for (int j = 0; j < NUM_ELEMENTS; j++) { + // The following will create random values between [0, 1] + array[i][j] = (float) rand () / (float) RAND_MAX; + } + } + // Before starting calculation we will define a few helper variables + float arr_new[NUM_ELEMENTS][NUM_ELEMENTS]; + float error = __FLT_MAX__; + int iterations = 0; + // Perform Jacobi iterations until we either have low enough error or too + // many iterations + #pragma acc data copy(array) create(arr_new) + while (error > MAX_ERROR && iterations < MAX_ITER) { + error = 0.; + #pragma acc parallel loop reduction(max:error) collapse(2) + for (int i = 1; i < NUM_ELEMENTS - 1; i++) { + for (int j = 1; j < NUM_ELEMENTS - 1; j++) { + arr_new[i][j] = 0.25 * (array[i][j + 1] + + array[i][j - 1] + + array[i - 1][j] + + array[i + 1][j]); + error = fmaxf (error, fabsf (arr_new[i][j] - array[i][j])); + } + } + #pragma acc parallel loop collapse(2) + for (int i = 1; i < NUM_ELEMENTS - 1; i++) { + for (int j = 1; j < NUM_ELEMENTS - 1; j++) { + array[i][j] = arr_new[i][j]; + } + } + iterations += 1; + } + return EXIT_SUCCESS; +} diff --git a/_downloads/39300185bd1cf3a39e44fa16acb210df/array_test.py b/_downloads/39300185bd1cf3a39e44fa16acb210df/array_test.py new file mode 100644 index 000000000..a4e6bb6a8 --- /dev/null +++ b/_downloads/39300185bd1cf3a39e44fa16acb210df/array_test.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python + +import time + +print('start at ' + time.strftime('%H:%M:%S')) + +print('sleep for 10 seconds ...') +time.sleep(10) + +print('stop at ' + time.strftime('%H:%M:%S')) diff --git a/_downloads/3c4537a3dbc39ac7e73eadba3f202a9c/cuda_const.cu b/_downloads/3c4537a3dbc39ac7e73eadba3f202a9c/cuda_const.cu new file mode 100644 index 000000000..a6d39e76f --- /dev/null +++ b/_downloads/3c4537a3dbc39ac7e73eadba3f202a9c/cuda_const.cu @@ -0,0 +1,253 @@ +#include "png_writer.h" +#include +#include +#include +#include +#include +#include + +// Default grid size (GRID_SIZE * GRID_SIZE elements are needed) +static const int DEFAULT_GRID_SIZE = 500; +// Default number of iterations if no command line argument is given +static const int DEFAULT_ITER = 1000; +// Default diffusion constant +static const float DEFAULT_ALPHA = 0.1; +static const float DEFAULT_CELL_SIZE = 0.01; +// Number of blocks to use when launching CUDA kernels +static const int FIXED_BLOCKS = 16; + +// Forward declarations +// Initialize the field with a size of (size + 2)^2 +__global__ void init_field(float* field, const int size); +// Evolve the 'next' field from 'curr' with total grid size of (size + 2)^2, +// 'alpha' is the diffusion constant and 'dt' is the time derivative + +/*__global__ void evolve(const float* curr, float* next, const int size, const float cell_size, const float alpha, const float dt); + */ +__global__ void evolve(const float* curr, float* next, const int size); +// Helper method to save the field to PNG +void save(const float* field, const int size, const int iteration); +// Check the return value of a CUDA function and abort 
if abnormal behavior +void check_cuda(const cudaError_t err, const char* msg); + + +// constants array declaration using constant memory + +__constant__ float constants[2]; // cell_size, alpha, dt + +void setup_constants () { + float cell_size = DEFAULT_CELL_SIZE; + float alpha = DEFAULT_ALPHA; + float dt = pow(cell_size, 4) / (2.0 * alpha * (pow(cell_size, 2) + pow(cell_size, 2))); + float r = alpha * dt; + const float host_constants[] = {cell_size, r}; + check_cuda(cudaMemcpyToSymbol(constants, host_constants, 2*sizeof(float)), "Error constant memory" ); +} + +int main(int argc, char* argv[]) { + printf("%s", "Cuda with Constant memory\n"); + // grid_size represents the N x N grid to compute over + int grid_size = DEFAULT_GRID_SIZE; + // The number of iterations to perform to solve the equation + int num_iter = DEFAULT_ITER; + // Diffusion constant + float alpha = DEFAULT_ALPHA; + // Size of each grid cell + float cell_size = DEFAULT_CELL_SIZE; + // Calculate the time increment to propagate with + const float dt = pow(cell_size, 4) / (2.0 * alpha * (pow(cell_size, 2) + pow(cell_size, 2))); + // Save interval + int save_interval = -1; + + // Command line handling + if (argc > 1) { + grid_size = atoi(argv[1]); + } + if (argc > 2) { + num_iter = atoi(argv[2]); + } + if (argc > 3) { + save_interval = atoi(argv[3]); + } + + // Initialization + printf("Solving heat equation for grid \033[0;35m%d x %d\033[0m with \033[0;35m%d\033[0m iterations\n", + grid_size, grid_size, num_iter); + // Setup CUDA block and grid dimensions to use for kernel launch + dim3 dim_block; + dim3 dim_grid; + if (grid_size + 2 < 32) { + dim_block = dim3(grid_size + 2, grid_size + 2); + dim_grid = dim3(1, 1); + } else { + dim_block = dim3(FIXED_BLOCKS, FIXED_BLOCKS); + const int grids = (grid_size + 2 + FIXED_BLOCKS - 1) / FIXED_BLOCKS; + dim_grid = dim3(grids, grids); + } + printf("Launching \033[0;35m(%d, %d)\033[0m grids with \033[0;35m(%d, %d)\033[0m blocks\n", + dim_grid.x, dim_grid.y, dim_block.x, dim_block.y); + // Setup grid arrays + float* grid; + float* next_grid; + check_cuda(cudaMallocManaged(&grid, (grid_size + 2) * (grid_size + 2) * sizeof(float)), + "Could not allocate 'grid'"); + check_cuda(cudaMallocManaged(&next_grid, (grid_size + 2) * (grid_size + 2) * sizeof(float)), + "Could not allocate 'next_grid'"); + + init_field<<>>(grid, grid_size); + check_cuda(cudaGetLastError(), "'init_field' of 'grid' failed"); + init_field<<>>(next_grid, grid_size); + check_cuda(cudaGetLastError(), "'init_field' of 'next_grid' failed"); + + if (save_interval > 0) { + check_cuda(cudaDeviceSynchronize(), "'init_field' of 'grid' or 'next_grid' failed"); + save(grid, grid_size, 0); + if (grid_size < 34) { + for(int i = 0; i < grid_size + 2; i++) { + for(int j = 0; j < grid_size + 2; j++) { + const int index = i * (grid_size + 2) + j; + printf(" %2.0f", grid[index]); + } + printf("\n"); + } + } + } + + // Main calculation + setup_constants(); + const double start_time = omp_get_wtime(); + for (int i = 1; i <= num_iter; i++) { + // One iteration of the heat equation + // evolve<<>>(grid, next_grid, grid_size, cell_size, alpha, dt); + evolve<<>>(grid, next_grid, grid_size); + // Wait until the kernel is done running before performing pointer swap + check_cuda(cudaDeviceSynchronize(), "Waiting for evolve before pointer swap"); + // Exchange old grid with the new updated grid + float* tmp = grid; + grid = next_grid; + next_grid = tmp; + + // Save image if necessary + if (save_interval > 0 && (i % save_interval) == 0) { + save(grid, 
grid_size, i); + } + } + const double total_time = omp_get_wtime() - start_time; + printf("Used \033[0;35m%.3f\033[0m seconds to evolve field\n", total_time); + printf("Average time per field update: \033[0;35m%.3f\033[0m ms\n", (total_time * 1e3) / num_iter); + + // Free data and terminate + cudaFree(grid); + cudaFree(next_grid); + check_cuda(cudaDeviceReset(), "Device Reset Failed\n"); + return EXIT_SUCCESS; +} + +// Initialize the field with a size of (size + 2)^2 +// +// This function will fill the field with an initial condition that we want to +// simulate from +__global__ void init_field(float* field, const int size) { + // Calculate CUDA index in two dimensions + const int row = blockIdx.x * blockDim.x + threadIdx.x; + const int col = blockIdx.y * blockDim.y + threadIdx.y; + // Calculate field index from CUDA indexes + const int index = row * (size + 2) + col; + if (index < (size + 2) * (size + 2)) { + // First create a uniform temperature with a source disk in the middle + // Radius of source disk + const float radius = (float) size / 6.0; + // Distance of the current index to center of the field + const int dx = row - size / 2 + 1; + const int dy = col - size / 2 + 1; + if (dx * dx + dy * dy < radius * radius) { + field[index] = 5.0; + } else if (0 < col && col < size + 1 && 0 < row && row < size + 1){ + field[index] = 65.0; + } + + // The following will be slow and lead to thread divergence, but it isn't + // that important since this is not a hot loop + if (row == 0) { + // Top of the field + field[index] = 85.0; + } + if (row == size + 1) { + // Bottom of the field + field[index] = 5.0; + } + if (col == 0) { + // Left side of the field + field[index] = 20.0; + } + if (col == size + 1) { + // Right side of the field + field[index] = 70.0; + } + } +} + +// Evolve the 'next' field from 'curr' with total grid size of (size + 2)^2, +// 'alpha' is the diffusion constant and 'dt' is the time derivative + +/*__global__ void evolve(const float* curr, float* next, const int size, const float + cell_size, const float alpha, const float dt) { +*/ + +__global__ void evolve(const float* curr, float* next, const int size) { + /* + // Calculate unique index in CUDA + const int row = blockIdx.x * blockDim.x + threadIdx.x; + const int col = blockIdx.y * blockDim.y + threadIdx.y; + const int index = row * (size + 2) + col; + */ + const int i = blockIdx.x * blockDim.x + threadIdx.x; + const int j = blockIdx.y * blockDim.y + threadIdx.y; + + +#define CURR(i,j) curr[((i)+1)*(size+2)+(j)+1] +#define NEXT(i,j) next[((i)+1)*(size+2)+(j)+1] + + // Additional variables + // const float cell = cell_size * cell_size; + // const float r = alpha * dt; + + const float cell_size = constants[0]; + const float r = constants[1]; + + // When launching this kernel we don't take into account that we don't want + // it run for the boundary, we solve this by the following if guard, this + // means that we launch 4 threads more than we actually need, but this is a + // very low overhead + + if (0 < i && i < size + 1 && 0 < j && j < size + 1) { + NEXT(i,j) = CURR(i,j) + r * ( + (CURR(i-1,j)+CURR(i+1,j)+ + CURR(i,j-1)+CURR(i,j+1)- + 4.0*CURR(i,j)) / (cell_size*cell_size) + ); + } + +} + +// Helper method to save the field to PNG +void save(const float* field, const int size, const int iteration) { + char filename[256]; + sprintf(filename, "field_%05d.png", iteration); + const int write_res = write_field(filename, field, size); + if (write_res != 0) { + fprintf(stderr, "\033[0;31mCould not write initial 
image!\033[0m\n\tError: %d\n", + write_res); + abort(); + } +} + +// Check the return value of a CUDA function and abort if abnormal behavior +void check_cuda(const cudaError_t err, const char* msg) { + if (err != cudaSuccess) { + fprintf(stderr, "\033[0;31m%s:\033[0m\n", msg); + fprintf(stderr, "\tError(\033[0;33m%s\033[0m): %s\n", cudaGetErrorName(err), cudaGetErrorString(err)); + abort(); + } +} + diff --git a/_downloads/4274afd07d1d674f4a267cb8f580b787/jacobi_kernels.c b/_downloads/4274afd07d1d674f4a267cb8f580b787/jacobi_kernels.c new file mode 100644 index 000000000..1c7e0ecee --- /dev/null +++ b/_downloads/4274afd07d1d674f4a267cb8f580b787/jacobi_kernels.c @@ -0,0 +1,60 @@ +/** + * Initial translation to OpenACC using 'kernels' directive + */ + +#include +#include +#include + +// Number of rows and columns in our matrix +static const int NUM_ELEMENTS = 2000; +// Maximum number of iterations before quiting +static const int MAX_ITER = 10000; +// Error tolerance for iteration +static const float MAX_ERROR = 0.01; +// Seed for random number generator +static const int SEED = 12345; + +int main (int argc, char** argv) { + // Initialize random number generator + srand (SEED); + // Create array to calculate on + float array[NUM_ELEMENTS][NUM_ELEMENTS]; + // Fill array with data + for (int i = 0; i < NUM_ELEMENTS; i++) { + for (int j = 0; j < NUM_ELEMENTS; j++) { + // The following will create random values between [0, 1] + array[i][j] = (float) rand () / (float) RAND_MAX; + } + } + // Before starting calculation we will define a few helper variables + float arr_new[NUM_ELEMENTS][NUM_ELEMENTS]; + float error = __FLT_MAX__; + int iterations = 0; + // Perform Jacobi iterations until we either have low enough error or too + // many iterations + while (error > MAX_ERROR && iterations < MAX_ITER) { + error = 0.; + #pragma acc kernels + { + // For each element take the average of the surrounding elements + for (int i = 1; i < NUM_ELEMENTS - 1; i++) { + for (int j = 1; j < NUM_ELEMENTS - 1; j++) { + arr_new[i][j] = 0.25 * (array[i][j + 1] + + array[i][j - 1] + + array[i - 1][j] + + array[i + 1][j]); + error = fmaxf (error, fabsf (arr_new[i][j] - array[i][j])); + } + } + // Transfer new array to old + for (int i = 1; i < NUM_ELEMENTS - 1; i++) { + for (int j = 1; j < NUM_ELEMENTS - 1; j++) { + array[i][j] = arr_new[i][j]; + } + } + } + iterations += 1; + } + return EXIT_SUCCESS; +} diff --git a/_downloads/42ab328c87be33cec72229ad6b7705b1/test.xml b/_downloads/42ab328c87be33cec72229ad6b7705b1/test.xml new file mode 100644 index 000000000..130e183ee --- /dev/null +++ b/_downloads/42ab328c87be33cec72229ad6b7705b1/test.xml @@ -0,0 +1,214 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + beast.math.distributions.Uniform + beast.math.distributions.Exponential + beast.math.distributions.LogNormalDistributionModel + beast.math.distributions.Normal + beast.math.distributions.Beta + beast.math.distributions.Gamma + beast.math.distributions.LaplaceDistribution + beast.math.distributions.Prior + beast.math.distributions.InverseGamma + beast.math.distributions.OneOnX + + + + + + + + 0.1 + 1 + 1.0 + + + + 1.0 + + + + + + + + + + + 0.5396 + 0.3819 + + + + + + + 1.0 + 1.0 + 0.0 + + + + + 1.0 + + 1.0 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git 
a/_downloads/42bf886f1ad9fb1c7384f0a7ca61db90/jacobi_shared.cpp b/_downloads/42bf886f1ad9fb1c7384f0a7ca61db90/jacobi_shared.cpp
new file mode 100644
index 000000000..76a9d230a
--- /dev/null
+++ b/_downloads/42bf886f1ad9fb1c7384f0a7ca61db90/jacobi_shared.cpp
@@ -0,0 +1,89 @@
+/**
+ * SYCL accelerated implementation of the Jacobi iteration
+ */
+
+#include <cstring>
+#include <iostream>
+
+#include <CL/sycl.hpp>
+
+// Number of rows and columns in our matrix
+static const int NUM_ELEMENTS = 2000;
+// Total number of elements in our matrix
+static const int TOT_ELEMENTS = NUM_ELEMENTS * NUM_ELEMENTS;
+// Maximum number of iterations before quitting
+static const int MAX_ITER = 10000;
+// Error tolerance for iteration
+static const float MAX_ERROR = 0.01;
+// Seed for random number generator
+static const int SEED = 12345;
+
+int main (int argc, char** argv) {
+  // Create default SYCL queue and print name of device
+  auto Q = sycl::queue{sycl::default_selector{}};
+  std::cout << "Chosen device: "
+            << Q.get_device().get_info<sycl::info::device::name>()
+            << std::endl;
+
+  // Initialize random number generator
+  srand (SEED);
+
+  // Create *SHARED* array to store the input/output
+  float *arr_s = sycl::malloc_shared<float>(TOT_ELEMENTS, Q);
+
+  // Fill *SHARED* array with data
+  for (int i = 0; i < TOT_ELEMENTS; i++) {
+    // The following will create random values between [0, 1]
+    arr_s[i] = (float) rand () / (float) RAND_MAX;
+  }
+
+  // Create *SHARED* array to calculate on
+  float *tmp_s = sycl::malloc_shared<float>(TOT_ELEMENTS, Q);
+  float err = __FLT_MAX__;
+
+  // We copy here to get the boundary elements, which will be copied back and forth unchanged
+  std::memcpy(tmp_s, arr_s, TOT_ELEMENTS*sizeof(float));
+
+  int iterations = 0;
+  // Perform Jacobi iterations until we either have low enough error or too many iterations
+  while (err > MAX_ERROR && iterations < MAX_ITER) {
+    err = 0.;
+    // Submit work item to the SYCL queue
+    Q.submit(
+      [&](sycl::handler &h) {
+        // Define work kernel as single loop
+        h.parallel_for(
+          sycl::range{(NUM_ELEMENTS - 2) * (NUM_ELEMENTS - 2)},
+          [=](sycl::id<1> idx) {
+            // Retain array indices from single loop variable
+            int i = (idx[0] / NUM_ELEMENTS) + 1;
+            int j = (idx[0] % NUM_ELEMENTS) + 1;
+            // For each element take the average of the surrounding elements
+            tmp_s[i * NUM_ELEMENTS + j] = 0.25 * (arr_s[i * NUM_ELEMENTS + j+1] +
+                                                  arr_s[i * NUM_ELEMENTS + j-1] +
+                                                  arr_s[(i-1) * NUM_ELEMENTS + j] +
+                                                  arr_s[(i+1) * NUM_ELEMENTS + j]);
+          }
+        );
+      }
+    ).wait(); // Wait for completion before moving on
+
+    // Find maximum error (cannot be done in the loop kernel above)
+    for (int i = 0; i < TOT_ELEMENTS; i++) {
+      err = std::max(err, std::abs(tmp_s[i] - arr_s[i]));
+    }
+
+    // Transfer new array to old (including boundary, which was untouched in the loop)
+    std::memcpy(arr_s, tmp_s, TOT_ELEMENTS*sizeof(float));
+
+    iterations++;
+  }
+
+  std::cout << "Iterations : " << iterations << " | Error : " << err << std::endl;
+
+  // Free *SHARED* memory
+  sycl::free(arr_s, Q);
+  sycl::free(tmp_s, Q);
+
+  return EXIT_SUCCESS;
+}
diff --git a/_downloads/4bdab811d5f5e50554c7889591b12bda/jacobi_data.c b/_downloads/4bdab811d5f5e50554c7889591b12bda/jacobi_data.c
new file mode 100644
index 000000000..8a6bbc582
--- /dev/null
+++ b/_downloads/4bdab811d5f5e50554c7889591b12bda/jacobi_data.c
@@ -0,0 +1,61 @@
+/**
+ * Updated Kernels directive code with better data transfer
+ */
+
+#include <float.h>
+#include <math.h>
+#include <stdlib.h>
+
+// Number of rows and columns in our matrix
+static const int NUM_ELEMENTS = 2000;
+// Maximum number of iterations before quitting
+static const int
MAX_ITER = 10000; +// Error tolerance for iteration +static const float MAX_ERROR = 0.01; +// Seed for random number generator +static const int SEED = 12345; + +int main (int argc, char** argv) { + // Initialize random number generator + srand (SEED); + // Create array to calculate on + float array[NUM_ELEMENTS][NUM_ELEMENTS]; + // Fill array with data + for (int i = 0; i < NUM_ELEMENTS; i++) { + for (int j = 0; j < NUM_ELEMENTS; j++) { + // The following will create random values between [0, 1] + array[i][j] = (float) rand () / (float) RAND_MAX; + } + } + // Before starting calculation we will define a few helper variables + float arr_new[NUM_ELEMENTS][NUM_ELEMENTS]; + float error = __FLT_MAX__; + int iterations = 0; + // Perform Jacobi iterations until we either have low enough error or too + // many iterations + #pragma acc data copy(array, arr_new) + while (error > MAX_ERROR && iterations < MAX_ITER) { + error = 0.; + #pragma acc kernels + { + // For each element take the average of the surrounding elements + for (int i = 1; i < NUM_ELEMENTS - 1; i++) { + for (int j = 1; j < NUM_ELEMENTS - 1; j++) { + arr_new[i][j] = 0.25 * (array[i][j + 1] + + array[i][j - 1] + + array[i - 1][j] + + array[i + 1][j]); + error = fmaxf (error, fabsf (arr_new[i][j] - array[i][j])); + } + } + // Transfer new array to old + for (int i = 1; i < NUM_ELEMENTS - 1; i++) { + for (int j = 1; j < NUM_ELEMENTS - 1; j++) { + array[i][j] = arr_new[i][j]; + } + } + } + iterations += 1; + } + return EXIT_SUCCESS; +} diff --git a/_downloads/4c3e545f897bf6b3b786c36c390f20b2/serial.zip b/_downloads/4c3e545f897bf6b3b786c36c390f20b2/serial.zip new file mode 100644 index 000000000..b175fd30e Binary files /dev/null and b/_downloads/4c3e545f897bf6b3b786c36c390f20b2/serial.zip differ diff --git a/_downloads/5a68466b657561def828ad89e5eb0d3d/timeout_cleanup.sh b/_downloads/5a68466b657561def828ad89e5eb0d3d/timeout_cleanup.sh new file mode 100644 index 000000000..c7f0d6c7e --- /dev/null +++ b/_downloads/5a68466b657561def828ad89e5eb0d3d/timeout_cleanup.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# job name +#SBATCH --job-name=example + +# replace this by your account +#SBATCH --account=YourAccount + +#SBATCH --qos=devel +#SBATCH --ntasks=1 +## Note: On Saga, you will also have to specify --mem-per-cpu + +# we give this job 4 minutes +#SBATCH --time=0-00:04:00 + +# asks Slurm to send the USR1 signal 120 seconds before end of the time limit +#SBATCH --signal=B:USR1@120 + +# define the handler function +# note that this is not executed here, but rather +# when the associated signal is sent +your_cleanup_function() +{ + echo "function your_cleanup_function called at $(date)" + # do whatever cleanup you want here +} + +# call your_cleanup_function once we receive USR1 signal +trap 'your_cleanup_function' USR1 + +echo "starting calculation at $(date)" + +# the calculation "computes" (in this case sleeps) for 1000 seconds +# but we asked slurm only for 240 seconds so it will not finish +# the "&" after the compute step and "wait" are important +sleep 1000 & +wait diff --git a/_downloads/5fa1f36cea9de3dc6c6ae6c4919f02cc/openacc.c b/_downloads/5fa1f36cea9de3dc6c6ae6c4919f02cc/openacc.c new file mode 100644 index 000000000..2d51441c3 --- /dev/null +++ b/_downloads/5fa1f36cea9de3dc6c6ae6c4919f02cc/openacc.c @@ -0,0 +1,86 @@ +/** +* Example program to show how to combine OpenACC and cuBLAS library calls +*/ + +#include +#include +#include +#include +#include + +#define N 10000 + +int main() { + printf("Starting SAXPY + OpenACC program\n"); + // 
Allocate vectors which we will use for computations + float* a = (float*) calloc(N, sizeof(float)); + float* b = (float*) calloc(N, sizeof(float)); + float sum = 0.0; + const float alpha = 2.0; + + if (a == NULL || b == NULL) { + printf("Could not allocate compute vectors!"); + return EXIT_FAILURE; + } + + // Initialize input arrays, this is done on CPU host + printf(" Initializing vectors on CPU\n"); + for (int i = 0; i < N; i++) { + a[i] = 1.0; + b[i] = 2.0; + } + + // Create cuBLAS handle for interacting with cuBLAS routines + printf(" Creating cuBLAS handle\n"); + cublasHandle_t handle; + cublasStatus_t status; // Variable to hold return status from cuBLAS routines + status = cublasCreate(&handle); + if (status != CUBLAS_STATUS_SUCCESS) { + printf("Could not initialize cuBLAS handle!\n"); + return EXIT_FAILURE; + } + + // Create OpenACC data region so that our compute vectors are accessible on + // GPU device for cuBLAS + printf(" Starting calculation\n"); + #pragma acc data copy(b[0:N]) copyin(a[0:N]) + { + // To allow cuBLAS to interact with our compute vectors we need to make + // them available as pointers. NOTE however that these pointers point to + // areas in the GPU memory so they cannot be dereferenced on the CPU, + // however, by using the 'host_data' directive we can use the pointers from + // CPU code passing them to other functions that require pointers to GPU + // memory + #pragma acc host_data use_device(a, b) + { + status = cublasSaxpy(handle, N, &alpha, a, 1, b, 1); + if (status != CUBLAS_STATUS_SUCCESS) { + printf("SAXPY failed!\n"); + // NOTE we cannot exit here since this is within an accelerated region + } + } + // We can now continue to use a and b in OpenACC kernels and parallel loop + #pragma acc kernels + for (int i = 0; i < N; i++) { + sum += b[i]; + } + } + // After the above OpenACC region has ended 'a' has not changed, 'b' contains + // the result of the SAXPY routine and 'sum' contains the sum over 'b' + + // To ensure everything worked we can check that the sum is as we expected + if (fabs(sum - 4.0 * (float) N) < 0.001) { + printf(" Calculation produced the correct result of '4 * %d == %.0f'!\n", N, sum); + } else { + printf(" Calculation produced _incorrect_ result, expected '4 * %d == %.3f'\n", N, sum); + } + + // Free cuBLAS handle + cublasDestroy(handle); + // Free computation vectors + free(a); + free(b); + // Indicate to caller that everything worked as expected + printf("Ending SAXPY + OpenACC program\n"); + return EXIT_SUCCESS; +} diff --git a/_downloads/66e06d531d6677ed369843327e780cb0/wave_data.c b/_downloads/66e06d531d6677ed369843327e780cb0/wave_data.c new file mode 100644 index 000000000..8e40f57ca --- /dev/null +++ b/_downloads/66e06d531d6677ed369843327e780cb0/wave_data.c @@ -0,0 +1,260 @@ +/** + * OpenACC + MPI implementation of the 1D wave equation + */ + +#include +#include +#include +#include +#include + +// Default number of points to calculate over, if not given on command line +static const int NUM_POINTS = 400; +// Default number of steps to perform per point, if not given on command line +static const int NUM_STEPS = 4000; +// Default time interval, if not given on command line +static const double DEFAULT_DT = 0.00125; +// Speed of sound used for calculation +static const double SOUND_SPEED = 1.0; + +// Define MPI tags for program +static const int lower_tag = 1010; // Send to lower rank +static const int upper_tag = 2020; // Send to higher rank +static const int scatter_tag = 3030; // Gather / Scatter data +static const int 
gather_tag = 4040; // Gather / Scatter data +// MPI Error codes +static const int ALLOC_WAVE_FAIL = 1001; +static const int ALLOC_WAVES_FAIL = 1002; +static const int INITIAL_DIST_RECV = 1003; +static const int LAST_DIST_RECV = 1004; + +// Helper macro to check an MPI call and print error if it failed +#define check_mpi(code, err) \ +if (code != MPI_SUCCESS) { \ + printf("\033[0;31m%s\033[0m\n", err); \ + printf("\tError code: \033[0;31m%d\033[0m\n", code); \ + MPI_Abort(MPI_COMM_WORLD, 1337); \ + return EXIT_FAILURE; \ +} + +/** + * Helper method to calculate the exact solution at 'x' with time step 't' and + * speed of sound 'c' + */ +double exact (const double x, const double t, const double c) { + return sin (2. * M_PI * (x - c * t)); +} + +/** + * Helper function to calculate the partial derivative du/dt + */ +double dudt (const double x, const double t, const double c) { + return -2. * M_PI * c * cos (2. * M_PI * (x - c * t)); +} + +int main (int argc, char** argv) { + // Define variables to use in calculation, initialized to default values + int points = NUM_POINTS; + int steps = NUM_STEPS; + double dt = DEFAULT_DT; + + /************************** Command line handling ***************************/ + if (argc > 1) { + if (strncmp (argv[1], "-h", 3) == 0 || strncmp (argv[1], "--help", 7) == 0) { + printf("Usage: \033[0;32m%s\033[0m \n", argv[0]); + return EXIT_SUCCESS; + } + points = atoi (argv[1]); + if (points < 1) { + printf("\033[0;31mThe number of points must be a positive number larger than '1'!\033[0m\n"); + return EXIT_FAILURE; + } + } + if (argc > 2) { + steps = atoi (argv[2]); + if (steps < 0) { + printf("\033[0;31mThe number of steps must be a positive number!\033[0m\n"); + return EXIT_FAILURE; + } + } + if (argc > 3) { + dt = atof (argv[3]); + if (dt <= 0.) { + printf("\033[0;31mTime interval must be larger than '0.0'!\033[0m\n"); + return EXIT_FAILURE; + } + } + + /*************************** MPI work sharing *******************************/ + // Initialize MPI + check_mpi (MPI_Init(&argc, &argv), "Could not initialize MPI!"); + // Extract MPI size and current rank + int num_processes = 1; + int rank = 0; + check_mpi (MPI_Comm_size(MPI_COMM_WORLD, &num_processes), "Could not fetch COMM_WORLD size"); + check_mpi (MPI_Comm_rank(MPI_COMM_WORLD, &rank), "Could not fetch COMM_WORLD rank"); + if (points % num_processes != 0) { + if (rank == 0) { + printf("\033[0;31m%d points can't be split into %d processes!\033[0m\n", points, num_processes); + } + MPI_Finalize(); + return EXIT_FAILURE; + } + const int equal_share = points / num_processes; + // The first and last rank calculates one additional element, while all other + // ranks calculates two additional points + const int local_points = (rank == 0 || rank == num_processes - 1) ? equal_share + 1 : equal_share + 2; + const int local_start = (rank == 0) ? 
0 : equal_share * rank - 1; + + /*************************** Implementation *********************************/ + // Define pointer to global result so that we can compile, this variable is + // only allocated on the root rank + double* wave = NULL; + if (rank == 0) { + printf("Calculating 1D wave equation with \033[0;35m%d\033[0m points over \033[0;35m%d\033[0m steps with \033[0;35m%f\033[0m time step\n", + points, steps, dt); + printf("\t...split over \033[0;35m%d\033[0m processes, processing \033[0;35m%d\033[0m points each\n", + num_processes, local_points); + // On the root rank we allocate enough space for the full wave, + // it is used as the full result + wave = calloc (points, sizeof (double)); + if (wave == NULL) { + printf("\033[0;31mCould not allocate %d points for wave results\033[0m\n", points); + // No need to check output, we will shortly exit anyway + MPI_Abort(MPI_COMM_WORLD, ALLOC_WAVE_FAIL); + return EXIT_FAILURE; + } + } + // Allocate memory for local work arrays + double* wave0 = calloc (local_points, sizeof (double)); + double* wave1 = calloc (local_points, sizeof (double)); + double* wave2 = calloc (local_points, sizeof (double)); + if (wave0 == NULL || wave1 == NULL || wave2 == NULL) { + printf("\033[0;31mRank %d could not allocate enough space for arrays!\033[0m\n", rank); + MPI_Abort(MPI_COMM_WORLD, ALLOC_WAVES_FAIL); + return EXIT_FAILURE; + } + const double dx = 1. / ((double) points - 1); + const double alpha = SOUND_SPEED * dt / dx; + const double alpha2 = alpha * alpha; + if (rank == 0) { + if (fabs (alpha) >= 1.) { + printf("\033[0;33mComputation will be unstable with the given parameters\033[0m\n"); + printf("\tdt = %f\n", dt); + printf("\tdx = %f (1. / %d)\n", dx, points); + printf("\t|alpha| = %f\n", fabs (alpha)); + } + // Initialize the wave only on the root rank + for (int i = 0; i < points; i++) { + const double x = (double) i / (double) (points - 1); + wave[i] = exact (x, 0., SOUND_SPEED); + } + // Distribute computation to all other ranks + for (int r = 1; r < num_processes; r++) { + const int index = r * equal_share - 1; + const int num_points = (r < num_processes - 1) ? equal_share + 2 : equal_share + 1; + check_mpi (MPI_Send(&wave[index], num_points, MPI_DOUBLE, r, scatter_tag, MPI_COMM_WORLD), + "Could not distribute data"); + } + // Distribute data to root rank also + for (int i = 0; i < local_points; i++) { + wave0[i] = wave[i]; + wave1[i] = wave0[i]; + } + } else { + MPI_Status out; + check_mpi (MPI_Recv(wave0, local_points, MPI_DOUBLE, 0, scatter_tag, MPI_COMM_WORLD, &out), + "Could not receive data"); + if (out.MPI_ERROR != MPI_SUCCESS) { + printf("\033[0;31mMPI Recv error!\033[0m count: %ld, cancelled: %d, error: %d\n", + out._ucount / sizeof (double), out._cancelled, out.MPI_ERROR); + MPI_Abort(MPI_COMM_WORLD, INITIAL_DIST_RECV); + return EXIT_FAILURE; + } + for (int i = 0; i < local_points; i++) { + wave1[i] = wave0[i]; + } + } + // Subsequent steps utilize the existing arrays for computation + #pragma acc data copy(wave1[:local_points]) copyin(wave0[:local_points]) \ + create(wave2[:local_points]) + for (int s = 1; s < steps + 1; s++) { + const double t = (double) s * dt; + if (s == 1) { + // First time step we use the initial derivative information to calculate + // the solution + #pragma acc parallel loop + for (int i = 1; i < local_points - 1; i++) { + const double x = (double) (i + local_start) / (double) (points - 1); + wave2[i] = (1. 
- alpha2) * wave1[i] + + 0.5 * alpha2 * (wave1[i - 1] + wave1[i + 1]) + + dt * dudt (x, t, SOUND_SPEED); + } + } else { + // After first step we use previous calculations for future values + #pragma acc parallel loop + for (int i = 1; i < local_points - 1; i++) { + wave2[i] = 2. * (1. - alpha2) * wave1[i] + + alpha2 * (wave1[i - 1] + wave1[i + 1]) + - wave0[i]; + } + } + // Share data with neighboors + if (rank > 0) { + MPI_Send(&wave2[1], 1, MPI_DOUBLE, rank - 1, lower_tag, MPI_COMM_WORLD); + MPI_Status out; + MPI_Recv(&wave2[0], 1, MPI_DOUBLE, rank - 1, upper_tag, MPI_COMM_WORLD, &out); + } else { + wave2[0] = exact (0., t, SOUND_SPEED); + } + if (rank < num_processes - 1) { + MPI_Status out; + MPI_Recv(&wave2[local_points - 1], 1, MPI_DOUBLE, rank + 1, lower_tag, MPI_COMM_WORLD, &out); + MPI_Send(&wave2[local_points - 2], 1, MPI_DOUBLE, rank + 1, upper_tag, MPI_COMM_WORLD); + } else { + wave2[local_points - 1] = exact (1., t, SOUND_SPEED); + } + // Shift data + #pragma acc parallel loop + for (int i = 0; i < local_points; i++) { + wave0[i] = wave1[i]; + wave1[i] = wave2[i]; + } + } + // Synchronize data back to root rank + if (rank == 0) { + printf("Synchronizing results\033[0;33m...\033[0m "); + // Copy root rank data back into result array + for (int i = 0; i < local_points; i++) { + wave[i] = wave1[i]; + } + // Receive data from all other ranks + for (int r = 1; r < num_processes; r++) { + const int index = r * equal_share - 1; + const int num_points = (r < num_processes - 1) ? equal_share + 2 : equal_share + 1; + MPI_Status out; + check_mpi (MPI_Recv(&wave[index], num_points, MPI_DOUBLE, r, gather_tag, MPI_COMM_WORLD, &out), + "Could not receive data when gathering result"); + if (out.MPI_ERROR != MPI_SUCCESS) { + printf("\033[0;31mMPI Recv error!\033[0m count: %ld, cancelled: %d, error: %d\n", + out._ucount / sizeof (double), out._cancelled, out.MPI_ERROR); + MPI_Abort(MPI_COMM_WORLD, LAST_DIST_RECV); + return EXIT_FAILURE; + } + } + printf("\033[0;32mcompleted\033[0m!\n"); + printf("Calculation ended \033[0;32msuccesfully\033[0m!\n"); + } else { + check_mpi (MPI_Send(wave1, local_points, MPI_DOUBLE, 0, gather_tag, MPI_COMM_WORLD), + "Could not send data back to root when gathering results"); + } + // Free data before exit + free(wave0); + free(wave1); + free(wave2); + if (rank == 0) { + free(wave); + } + MPI_Finalize(); + return EXIT_SUCCESS; +} diff --git a/_downloads/66ff041b4ef0bd31a54dc7cd77155bdc/submit_monitor.sh b/_downloads/66ff041b4ef0bd31a54dc7cd77155bdc/submit_monitor.sh new file mode 100644 index 000000000..977aae551 --- /dev/null +++ b/_downloads/66ff041b4ef0bd31a54dc7cd77155bdc/submit_monitor.sh @@ -0,0 +1,25 @@ +#!/bin/bash +#SBATCH --job-name=TestGPUOnSaga +#SBATCH --account=nnk +#SBATCH --time=05:00 +#SBATCH --mem-per-cpu=4G +#SBATCH --qos=devel +#SBATCH --partition=accel +#SBATCH --gpus=1 + +## Set up job environment: +set -o errexit # Exit the script on any error +set -o nounset # Treat any unset variables as an error + +module --quiet purge # Reset the modules to the system default +module load TensorFlow/2.6.0-foss-2021a-CUDA-11.3.1 +module list + +# Setup monitoring +nvidia-smi --query-gpu=timestamp,utilization.gpu,utilization.memory \ + --format=csv --loop=1 > "gpu_util-$SLURM_JOB_ID.csv" & +NVIDIA_MONITOR_PID=$! 
# Capture PID of monitoring process
+# Run our computation
+python gpu_intro.py
+# After computation stop monitoring
+kill -SIGINT "$NVIDIA_MONITOR_PID"
diff --git a/_downloads/69b8d69611b521fa3cf306827b36ef95/monitor.sh b/_downloads/69b8d69611b521fa3cf306827b36ef95/monitor.sh
new file mode 100644
index 000000000..828390bd2
--- /dev/null
+++ b/_downloads/69b8d69611b521fa3cf306827b36ef95/monitor.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+#SBATCH --job-name=CUDA-monitor
+#SBATCH --account=nnk
+#SBATCH --time=05:00
+#SBATCH --mem-per-cpu=1G
+#SBATCH --qos=devel
+#SBATCH --partition=accel
+#SBATCH --gpus=1
+
+## Set up job environment:
+set -o errexit  # Exit the script on any error
+set -o nounset  # Treat any unset variables as an error
+
+module --quiet purge  # Reset the modules to the system default
+module load CUDA/11.1.1-GCC-10.2.0
+module list
+
+# Compile our code
+nvcc loop_add_cuda.cu -o loop_add_cuda
+
+# Setup monitoring
+nvidia-smi --query-gpu=timestamp,utilization.gpu,utilization.memory \
+    --format=csv --loop=1 > "monitor-$SLURM_JOB_ID.csv" &
+NVIDIA_MONITOR_PID=$!  # Capture PID of monitoring process
+
+# Run our computation
+./loop_add_cuda
+
+# After computation stop monitoring
+kill -SIGINT "$NVIDIA_MONITOR_PID"
+
+exit 0
diff --git a/_downloads/69c5e1976eb0a6fb71e1669d25432187/vec_add_cuda.cu b/_downloads/69c5e1976eb0a6fb71e1669d25432187/vec_add_cuda.cu
new file mode 100644
index 000000000..8166f5690
--- /dev/null
+++ b/_downloads/69c5e1976eb0a6fb71e1669d25432187/vec_add_cuda.cu
@@ -0,0 +1,58 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+// CUDA kernel, callable from host due to `__global__`
+__global__ void add(const float* a, const float* b, float* c, const size_t n) {
+    // Calculate the array index of this thread
+    const int id = blockIdx.x * blockDim.x + threadIdx.x;
+    if (id < n) {
+        c[id] = a[id] + b[id];
+    }
+}
+
+int main(int argc, char* argv[]) {
+    printf("ENTER MAIN\n");
+    // Number of elements to compute over
+    const size_t num_elements = 1000000;
+
+    // Allocate memory that can be accessed both on host and device
+    float* a;
+    float* b;
+    float* c;
+    // Should ideally catch errors here, but skip for brevity
+    cudaMallocManaged(&a, num_elements * sizeof(float));
+    cudaMallocManaged(&b, num_elements * sizeof(float));
+    cudaMallocManaged(&c, num_elements * sizeof(float));
+
+    // Fill our input arrays, on host, with some data to calculate
+    for (int i = 0; i < num_elements; i++) {
+        a[i] = sinf(i) * sinf(i);
+        b[i] = cosf(i) * cosf(i);
+    }
+
+    // Define how many threads to launch on CUDA device
+    const int block_size = 1024;  // Number of threads in each thread block
+    // Number of thread blocks in a grid
+    const int grid_size = (int) ceil((float) num_elements / block_size);
+
+    // Call CUDA kernel to run on device
+    add<<<grid_size, block_size>>>(a, b, c, num_elements);
+    // Wait for computation before doing anything with data on host
+    cudaDeviceSynchronize();
+
+    // Should print 1.0 at all entries
+    printf("c[0]  : %f\n", c[0]);
+    printf("c[1]  : %f\n", c[1]);
+    printf("c[42] : %f\n", c[42]);
+
+    // Free memory
+    cudaFree(a);
+    cudaFree(b);
+    cudaFree(c);
+
+    printf("EXIT SUCCESS\n");
+    return EXIT_SUCCESS;
+}
diff --git a/_downloads/6b32b7da967dbb465945db1f758fc5a2/laplace_mpiomp_noaware.f90 b/_downloads/6b32b7da967dbb465945db1f758fc5a2/laplace_mpiomp_noaware.f90
new file mode 100644
index 000000000..a63aaab7e
--- /dev/null
+++ b/_downloads/6b32b7da967dbb465945db1f758fc5a2/laplace_mpiomp_noaware.f90
@@ -0,0 +1,233 @@
+ program laplace_mpiomp_noaware
+
+ use mpi
+ use omp_lib
+
+ implicit
none + integer status(MPI_STATUS_SIZE) + integer :: i,j,k,ii + integer :: iter,count_rate, count_max,count,nenv_var + integer :: myid,ierr,nproc,nxp,nyp,tag,tag1,tag2,nsend + integer, parameter :: nx=20000,ny=nx + integer, parameter :: max_iter=525 + double precision, parameter :: pi=4d0*datan(1d0) + real, parameter :: error=0.05 + double precision :: max_err,time_s,& + d2fx,d2fy,max_err_part + real :: t_start,t_final + double precision, allocatable :: f(:,:),f_k(:,:) + double precision, allocatable :: f_send(:,:),f_full(:,:) + character(len=300) :: env_var + + integer :: deviceType,myDevice,numDevice,host_rank,host_comm + + !MPI starts + ! Initialise OpenMPI communication. + call MPI_INIT(ierr) + ! Get number of active processes (from 0 to nproc-1). + call MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierr ) + ! Identify the ID rank (process). + call MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr ) + +!check if GPU-aware support is enabled + if(myid.eq.0) then + print*, '' + call getenv("MPICH_GPU_SUPPORT_ENABLED", env_var) + read(env_var, '(i10)' ) nenv_var + if (nenv_var.eq. 1) then + print *, '--MPICH_GPU_SUPPORT_ENABLED is enabled!' + print*, '' + elseif (nenv_var.ne. 1) then + print *, '--MPICH_GPU_SUPPORT_ENABLED is NOT enabled!' + print *, '' + endif + endif + + t_start = MPI_WTIME() + + if (mod(nx,nproc).ne.0) then + if (myid.eq.0) write(*,*) 'nproc has to divide nx' + stop + else + nxp = nx/nproc + endif + if (mod(ny,nproc).ne.0) then + if (myid.eq.0) write(*,*) 'nproc has to divide ny' + stop + else + nyp = ny/nproc + endif + + if(myid.eq.0) then + print*,'--nbr of proc', nproc + write(*,*)'--nbr of points nx,ny',nx,ny + write(*,*)'--nbr of elmts on each proc, nyp=ny/nproc', nyp + endif + +!Generate the Initial Conditions (ICs) +!Distribute the ICs over all processes using the operation MPI_Scatter + allocate(f(0:nx+1,0:nyp+1)); + + f=0d0; tag1=2020; tag2=2021 + + if(myid.eq.0) then + allocate(f_send(1:nx,1:ny)) + CALL RANDOM_NUMBER(f_send) + endif + + call MPI_Scatter(f_send,nx*nyp,MPI_DOUBLE_PRECISION,& + f(1:nx,1:nyp), nx*nyp,MPI_DOUBLE_PRECISION,& + 0,MPI_COMM_WORLD, ierr) + + call MPI_Barrier(MPI_COMM_WORLD, ierr) + + if(myid.eq.0) deallocate(f_send) + +!Set a device: Determine which processes are on each node +!such that each process will be connected to a GPU + +!!Split the world communicator into subgroups of commu, each of which +!contains processes that run on the same node, and which can create a +!shared +!memory region (via the type MPI_COMM_TYPE_SHARED). +!The call returns a new communicator "host_comm", which is created by +!each subgroup. + + call MPI_COMM_SPLIT_TYPE(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,& + MPI_INFO_NULL, host_comm,ierr) + call MPI_COMM_RANK(host_comm, host_rank,ierr) + + myDevice = host_rank + +!returns the device number of the device on which the calling thread is +!executing + deviceType = omp_get_device_num() +!returns the number of devices available for offloading. 
+ numDevice = omp_get_num_devices() +!sets the device number to use in device constructs by setting the +!initial value of the default-device-var + + call omp_set_default_device(myDevice) + + if(myid.eq.0)print*, "--Number of devices per node:", numDevice + if(myid.eq.0)print*,"" + + print*, "--MPI rank", myid, "is connected to GPU", myDevice + + allocate(f_k(1:nx,1:nyp)) + + iter = 0 + + if(myid.eq.0) then + print*,"" + print*, "--Start iterations",iter + print*,"" + endif + +!Structed data locality +!$omp target data device(myDevice) map(to:f) map(from:f_k) + + do while (max_err.gt.error.and.iter.le.max_iter) + +!copy data from GPU to CPU +!$omp target update device(myDevice) from(f) +!!$omp target update mapfrom(f) + +!transfer the data at the boundaries to the neighbouring MPI-process +!send f(:,nyp) from myid-1 to be stored in f(:,0) in myid+1 + if(myid.lt.nproc-1) then + call MPI_Send(f(:,nyp),(nx+2)*1,MPI_DOUBLE_PRECISION,myid+1,tag1,& + MPI_COMM_WORLD, ierr) + endif + +!receive f(:,0) from myid-1 + if(myid.gt.0) then + call MPI_Recv(f(:,0),(nx+2)*1,MPI_DOUBLE_PRECISION,myid-1, & + tag1,MPI_COMM_WORLD, status,ierr) + endif + +!send f(:,1) from myid+1 to be stored in f(:,nyp+1) in myid-1 + if(myid.gt.0) then + call MPI_Send(f(:,1),(nx+2)*1,MPI_DOUBLE_PRECISION,myid-1,tag2,& + MPI_COMM_WORLD, ierr) + endif + +!receive f(:,npy+1) from myid-1 + if(myid.lt.nproc-1) then + call MPI_Recv(f(:,nyp+1),(nx+2)*1,MPI_DOUBLE_PRECISION,myid+1,& + tag2,MPI_COMM_WORLD, status,ierr) + endif + +!update data from CPU to GPU +!$omp target update device(myDevice) to(f) +!!$omp target update mapto(f) +!$omp target teams distribute parallel do collapse(2) schedule(static,1) + do j=1,nyp + do i=1,nx + d2fx = f(i+1,j) + f(i-1,j) + d2fy = f(i,j+1) + f(i,j-1) + f_k(i,j) = 0.25*(d2fx + d2fy) + enddo + enddo +!$omp end target teams distribute parallel do + + max_err=0. 
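+!The loop nest below computes the per-rank maximum difference on the device:
+!the reduction(max:max_err) clause gives each thread a private max_err and
+!combines them into a single value when the target construct ends, so only
+!that scalar is brought back to the host for the MPI_ALLREDUCE across ranks.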
+ +!$omp target teams distribute parallel do reduction(max:max_err) & +!$omp collapse(2) schedule(static,1) + do j=1,nyp + do i=1,nx + max_err = max(dabs(f_k(i,j) - f(i,j)),max_err) + f(i,j) = f_k(i,j) + enddo + enddo +!$omp end target teams distribute parallel do + +!max_err is copied back to the CPU-host by default + call MPI_ALLREDUCE(MPI_IN_PLACE,max_err,1,& + MPI_DOUBLE_PRECISION,MPI_MAX, MPI_COMM_WORLD,ierr ) + + if(myid.eq.0) then + if(mod(iter,50).eq.0 )write(*,'(i5,f10.6)')iter,max_err + endif + + iter = iter + 1 + + enddo +!$omp end target data + + deallocate(f) + + if(myid.eq.0) write(*,'(i5,f10.6)') iter,max_err + + call MPI_Barrier(MPI_COMM_WORLD, ierr) + + t_final = MPI_WTIME() + time_s = t_final - t_start + + if(myid.eq.0)print*, '--Time it takes (s)', time_s + + if(myid.eq.0) then + print*, '--Job is completed successfully--' + print*,'' + endif + +!to check the result + allocate(f_full(nx,ny)) + call MPI_Gather(f_k, nx*nyp, MPI_DOUBLE_PRECISION, & + f_full, nx*nyp, MPI_DOUBLE_PRECISION, 0, & + MPI_COMM_WORLD, ierr) + + if(myid.eq.0) then + do j=1,ny + write(111,*)j,sum(f_full(:,j)) + enddo + print*,"--Sum",sum(f_full(:,:))/nx/2 + print*,"--END :)" + endif + + deallocate(f_full,f_k) + + call MPI_FINALIZE( ierr ) + + end diff --git a/_downloads/6fde652f5ba5592601446d10228a341b/mxm.f90 b/_downloads/6fde652f5ba5592601446d10228a341b/mxm.f90 new file mode 100644 index 000000000..6fa005177 --- /dev/null +++ b/_downloads/6fde652f5ba5592601446d10228a341b/mxm.f90 @@ -0,0 +1,25 @@ +program mxm + integer, parameter :: r8 = selected_real_kind(p=15,r=307) + parameter(N=4000) + real(r8) a(N,N), b(N,N) , c(N,N), temp + integer i, j, l, c1, c2 + + call random_number(a) + call random_number(b) + + call system_clock(count=c1) + +!$acc kernels + do j = 1,N + do l = 1,N + do i = 1,N + c(i,j) = c(i,j) + a(i,l)*b(l,j) + enddo + enddo + enddo +!$acc end kernels + call system_clock(count=c2) + + write(*,*) "Calc time : ",(c2-c1)/1e6," secs" + write(*,*) c(1,1), c(N,N), sum(c) +end program mxm diff --git a/_downloads/74571088608e81a8cca34eb04d783075/wave_acc_mpi.c b/_downloads/74571088608e81a8cca34eb04d783075/wave_acc_mpi.c new file mode 100644 index 000000000..6e601df4b --- /dev/null +++ b/_downloads/74571088608e81a8cca34eb04d783075/wave_acc_mpi.c @@ -0,0 +1,283 @@ +/** + * OpenACC + MPI implementation of the 1D wave equation + */ + +#include +#include +#include +#include +#include +#include + +// Default number of points to calculate over, if not given on command line +static const int NUM_POINTS = 400; +// Default number of steps to perform per point, if not given on command line +static const int NUM_STEPS = 4000; +// Default time interval, if not given on command line +static const double DEFAULT_DT = 0.00125; +// Speed of sound used for calculation +static const double SOUND_SPEED = 1.0; + +// Define MPI tags for program +static const int lower_tag = 1010; // Send to lower rank +static const int upper_tag = 2020; // Send to higher rank +static const int scatter_tag = 3030; // Gather / Scatter data +static const int gather_tag = 4040; // Gather / Scatter data +// MPI Error codes +static const int ALLOC_WAVE_FAIL = 1001; +static const int ALLOC_WAVES_FAIL = 1002; +static const int INITIAL_DIST_RECV = 1003; +static const int LAST_DIST_RECV = 1004; + +// Helper macro to check an MPI call and print error if it failed +#define check_mpi(code, err) \ +if (code != MPI_SUCCESS) { \ + printf("\033[0;31m%s\033[0m\n", err); \ + printf("\tError code: \033[0;31m%d\033[0m\n", code); \ + 
MPI_Abort(MPI_COMM_WORLD, 1337); \ + return EXIT_FAILURE; \ +} + +/** + * Helper method to calculate the exact solution at 'x' with time step 't' and + * speed of sound 'c' + */ +#pragma acc routine seq +double exact (const double x, const double t, const double c) { + return sin (2. * M_PI * (x - c * t)); +} + +/** + * Helper function to calculate the partial derivative du/dt + */ +#pragma acc routine seq +double dudt (const double x, const double t, const double c) { + return -2. * M_PI * c * cos (2. * M_PI * (x - c * t)); +} + +int main (int argc, char** argv) { + // Define variables to use in calculation, initialized to default values + int points = NUM_POINTS; + int steps = NUM_STEPS; + double dt = DEFAULT_DT; + + /************************** Command line handling ***************************/ + if (argc > 1) { + if (strncmp (argv[1], "-h", 3) == 0 || strncmp (argv[1], "--help", 7) == 0) { + printf("Usage: \033[0;32m%s\033[0m \n", argv[0]); + return EXIT_SUCCESS; + } + points = atoi (argv[1]); + if (points < 1) { + printf("\033[0;31mThe number of points must be a positive number larger than '1'!\033[0m\n"); + return EXIT_FAILURE; + } + } + if (argc > 2) { + steps = atoi (argv[2]); + if (steps < 0) { + printf("\033[0;31mThe number of steps must be a positive number!\033[0m\n"); + return EXIT_FAILURE; + } + } + if (argc > 3) { + dt = atof (argv[3]); + if (dt <= 0.) { + printf("\033[0;31mTime interval must be larger than '0.0'!\033[0m\n"); + return EXIT_FAILURE; + } + } + + /*************************** MPI work sharing *******************************/ + // Initialize MPI + check_mpi (MPI_Init(&argc, &argv), "Could not initialize MPI!"); + // Extract MPI size and current rank + int num_processes = 1; + int rank = 0; + check_mpi (MPI_Comm_size(MPI_COMM_WORLD, &num_processes), "Could not fetch COMM_WORLD size"); + check_mpi (MPI_Comm_rank(MPI_COMM_WORLD, &rank), "Could not fetch COMM_WORLD rank"); + if (points % num_processes != 0) { + if (rank == 0) { + printf("\033[0;31m%d points can't be split into %d processes!\033[0m\n", points, num_processes); + } + MPI_Finalize(); + return EXIT_FAILURE; + } + const int equal_share = points / num_processes; + // The first and last rank calculates one additional element, while all other + // ranks calculates two additional points + const int local_points = (rank == 0 || rank == num_processes - 1) ? equal_share + 1 : equal_share + 2; + const int local_start = (rank == 0) ? 
0 : equal_share * rank - 1; + // Determine local rank relative to the node, this is used to allocate GPUs as + // we assume that each rank has its own GPU to utilize + MPI_Comm shared_node; + check_mpi (MPI_Comm_split_type (MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, + MPI_INFO_NULL, &shared_node), + "Could not split COMM_WORLD into shared communicator"); + int local_rank; + check_mpi (MPI_Comm_rank(shared_node, &local_rank), "Could not get local rank"); + // Assign GPU based on local rank + const int devices = acc_get_num_devices (acc_device_nvidia); + acc_set_device_num (local_rank % devices, acc_device_nvidia); + printf("Global rank \033[0;35m%d\033[0m, local \033[0;35m%d\033[0m, using GPU ID: \033[0;35m%d\033[0m (number of GPUs: \033[0;35m%d\033[0m)\n", + rank, local_rank, local_rank % devices, devices); + + /*************************** Implementation *********************************/ + // Define pointer to global result so that we can compile, this variable is + // only allocated on the root rank + double* wave = NULL; + if (rank == 0) { + printf("Calculating 1D wave equation with \033[0;35m%d\033[0m points over \033[0;35m%d\033[0m steps with \033[0;35m%f\033[0m time step\n", + points, steps, dt); + printf("\t...split over \033[0;35m%d\033[0m processes, processing \033[0;35m%d\033[0m points each\n", + num_processes, local_points); + // On the root rank we allocate enough space for the full wave, + // it is used as the full result + wave = calloc (points, sizeof (double)); + if (wave == NULL) { + printf("\033[0;31mCould not allocate %d points for wave results\033[0m\n", points); + // No need to check output, we will shortly exit anyway + MPI_Abort(MPI_COMM_WORLD, ALLOC_WAVE_FAIL); + return EXIT_FAILURE; + } + } + // Allocate memory for local work arrays + double* wave0 = calloc (local_points, sizeof (double)); + double* wave1 = calloc (local_points, sizeof (double)); + double* wave2 = calloc (local_points, sizeof (double)); + if (wave0 == NULL || wave1 == NULL || wave2 == NULL) { + printf("\033[0;31mRank %d could not allocate enough space for arrays!\033[0m\n", rank); + MPI_Abort(MPI_COMM_WORLD, ALLOC_WAVES_FAIL); + return EXIT_FAILURE; + } + const double dx = 1. / ((double) points - 1); + const double alpha = SOUND_SPEED * dt / dx; + const double alpha2 = alpha * alpha; + if (rank == 0) { + if (fabs (alpha) >= 1.) { + printf("\033[0;33mComputation will be unstable with the given parameters\033[0m\n"); + printf("\tdt = %f\n", dt); + printf("\tdx = %f (1. / %d)\n", dx, points); + printf("\t|alpha| = %f\n", fabs (alpha)); + } + // Initialize the wave only on the root rank + #pragma acc parallel loop copyout(wave[:points]) + for (int i = 0; i < points; i++) { + const double x = (double) i / (double) (points - 1); + wave[i] = exact (x, 0., SOUND_SPEED); + } + // Distribute computation to all other ranks + for (int r = 1; r < num_processes; r++) { + const int index = r * equal_share - 1; + const int num_points = (r < num_processes - 1) ? 
equal_share + 2 : equal_share + 1; + check_mpi (MPI_Send(&wave[index], num_points, MPI_DOUBLE, r, scatter_tag, MPI_COMM_WORLD), + "Could not distribute data"); + } + // Distribute data to root rank also + for (int i = 0; i < local_points; i++) { + wave0[i] = wave[i]; + wave1[i] = wave0[i]; + } + } else { + MPI_Status out; + check_mpi (MPI_Recv(wave0, local_points, MPI_DOUBLE, 0, scatter_tag, MPI_COMM_WORLD, &out), + "Could not receive data"); + if (out.MPI_ERROR != MPI_SUCCESS) { + printf("\033[0;31mMPI Recv error!\033[0m count: %ld, cancelled: %d, error: %d\n", + out._ucount / sizeof (double), out._cancelled, out.MPI_ERROR); + MPI_Abort(MPI_COMM_WORLD, INITIAL_DIST_RECV); + return EXIT_FAILURE; + } + for (int i = 0; i < local_points; i++) { + wave1[i] = wave0[i]; + } + } + // Subsequent steps utilize the existing arrays for computation + #pragma acc data copy(wave1[:local_points]) copyin(wave0[:local_points]) \ + create(wave2[:local_points]) + for (int s = 1; s < steps + 1; s++) { + const double t = (double) s * dt; + if (s == 1) { + // First time step we use the initial derivative information to calculate + // the solution + #pragma acc parallel loop + for (int i = 1; i < local_points - 1; i++) { + const double x = (double) (i + local_start) / (double) (points - 1); + wave2[i] = (1. - alpha2) * wave1[i] + + 0.5 * alpha2 * (wave1[i - 1] + wave1[i + 1]) + + dt * dudt (x, t, SOUND_SPEED); + } + } else { + // After first step we use previous calculations for future values + #pragma acc parallel loop + for (int i = 1; i < local_points - 1; i++) { + wave2[i] = 2. * (1. - alpha2) * wave1[i] + + alpha2 * (wave1[i - 1] + wave1[i + 1]) + - wave0[i]; + } + } + // Copy data from GPU to CPU to prepare for MPI sharing + #pragma acc update self(wave2[1:1]) + #pragma acc update self(wave2[local_points - 2:1]) + // Share data with neighboors + if (rank > 0) { + MPI_Send(&wave2[1], 1, MPI_DOUBLE, rank - 1, lower_tag, MPI_COMM_WORLD); + MPI_Status out; + MPI_Recv(&wave2[0], 1, MPI_DOUBLE, rank - 1, upper_tag, MPI_COMM_WORLD, &out); + } else { + wave2[0] = exact (0., t, SOUND_SPEED); + } + if (rank < num_processes - 1) { + MPI_Status out; + MPI_Recv(&wave2[local_points - 1], 1, MPI_DOUBLE, rank + 1, lower_tag, MPI_COMM_WORLD, &out); + MPI_Send(&wave2[local_points - 2], 1, MPI_DOUBLE, rank + 1, upper_tag, MPI_COMM_WORLD); + } else { + wave2[local_points - 1] = exact (1., t, SOUND_SPEED); + } + // Copy data we got from MPI neighbors back to GPU + #pragma acc update device(wave2[0:1]) + #pragma acc update device(wave2[local_points - 1:1]) + // Shift data + #pragma acc parallel loop + for (int i = 0; i < local_points; i++) { + wave0[i] = wave1[i]; + wave1[i] = wave2[i]; + } + } + // Synchronize data back to root rank + if (rank == 0) { + printf("Synchronizing results\033[0;33m...\033[0m "); + // Copy root rank data back into result array + for (int i = 0; i < local_points; i++) { + wave[i] = wave1[i]; + } + // Receive data from all other ranks + for (int r = 1; r < num_processes; r++) { + const int index = r * equal_share - 1; + const int num_points = (r < num_processes - 1) ? 
equal_share + 2 : equal_share + 1; + MPI_Status out; + check_mpi (MPI_Recv(&wave[index], num_points, MPI_DOUBLE, r, gather_tag, MPI_COMM_WORLD, &out), + "Could not receive data when gathering result"); + if (out.MPI_ERROR != MPI_SUCCESS) { + printf("\033[0;31mMPI Recv error!\033[0m count: %ld, cancelled: %d, error: %d\n", + out._ucount / sizeof (double), out._cancelled, out.MPI_ERROR); + MPI_Abort(MPI_COMM_WORLD, LAST_DIST_RECV); + return EXIT_FAILURE; + } + } + printf("\033[0;32mcompleted\033[0m!\n"); + printf("Calculation ended \033[0;32msuccesfully\033[0m!\n"); + } else { + check_mpi (MPI_Send(wave1, local_points, MPI_DOUBLE, 0, gather_tag, MPI_COMM_WORLD), + "Could not send data back to root when gathering results"); + } + // Free data before exit + free(wave0); + free(wave1); + free(wave2); + if (rank == 0) { + free(wave); + } + MPI_Finalize(); + return EXIT_SUCCESS; +} diff --git a/_downloads/78d683a69d5ba67f51612a5d731056a1/run_mnist.sh b/_downloads/78d683a69d5ba67f51612a5d731056a1/run_mnist.sh new file mode 100644 index 000000000..d3d0c2299 --- /dev/null +++ b/_downloads/78d683a69d5ba67f51612a5d731056a1/run_mnist.sh @@ -0,0 +1,17 @@ +#!/usr/bin/bash + +#SBATCH --account= +#SBATCH --job-name= +#SBATCH --partition=accel --gpus=1 +#SBATCH --ntasks=1 +#SBATCH --mem-per-cpu=8G +#SBATCH --time=00:30:00 + +# Purge modules and load tensorflow +module purge +module load TensorFlow/2.2.0-fosscuda-2019b-Python-3.7.4 +# List loaded modules for reproducibility +module list + +# Run python script +python $SLURM_SUBMIT_DIR/mnist.py diff --git a/_downloads/798b26722e27e242611451876cb7eb8f/kernels.job b/_downloads/798b26722e27e242611451876cb7eb8f/kernels.job new file mode 100644 index 000000000..5932cc962 --- /dev/null +++ b/_downloads/798b26722e27e242611451876cb7eb8f/kernels.job @@ -0,0 +1,22 @@ +#!/bin/sh + +#SBATCH --account= +#SBATCH --job-name=openacc_guide_kernels +#SBATCH --time=05:00 +#SBATCH --mem-per-cpu=512M +#SBATCH --partition=accel +#SBATCH --gpus=1 + +set -o errexit # Exit the script on any error +set -o nounset # Treat any unset variables as an error + +module --quiet purge # Reset the modules to the system default +module load NVHPC/20.7 # Load Nvidia HPC SDK with profiler +module list # List modules for easier debugging + +# Run the program through the Nsight command line profiler 'nsys' +# The '-t' flag tells the profiler which aspects it should profile, e.g. CUDA +# and OpenACC code +# The '-f' flag tells the profiler that it can override existing output files +# The '-o' flag tells the profiler the name we would like for the output file +nsys profile -t cuda,openacc -f true -o kernels ./jacobi diff --git a/_downloads/7beb7ef2f0ef17a10074082f488a5a21/generic_job.sh b/_downloads/7beb7ef2f0ef17a10074082f488a5a21/generic_job.sh new file mode 100644 index 000000000..1fdbd149f --- /dev/null +++ b/_downloads/7beb7ef2f0ef17a10074082f488a5a21/generic_job.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Job name: +#SBATCH --job-name=YourJobname +# +# Project: +#SBATCH --account=nnXXXXk +# +# Wall time limit: +#SBATCH --time=DD-HH:MM:SS +# +# Other parameters: +#SBATCH ... 
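+# For example (illustrative values only, adjust to your job):
+#   --ntasks=1  --mem-per-cpu=4G  --qos=devel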
+ +## Set up job environment: +set -o errexit # Exit the script on any error +set -o nounset # Treat any unset variables as an error + +module --quiet purge # Reset the modules to the system default +module load SomeProgram/SomeVersion +module list + +## Do some work: +YourCommands diff --git a/_downloads/7c93b9e2b73c2c569645f2670e928111/gaussianoverib.pdf b/_downloads/7c93b9e2b73c2c569645f2670e928111/gaussianoverib.pdf new file mode 100644 index 000000000..2f257b881 Binary files /dev/null and b/_downloads/7c93b9e2b73c2c569645f2670e928111/gaussianoverib.pdf differ diff --git a/_downloads/8098402610e00a1e4208fc9e1026affb/fram_mpi_job.sh b/_downloads/8098402610e00a1e4208fc9e1026affb/fram_mpi_job.sh new file mode 100644 index 000000000..2b78b7644 --- /dev/null +++ b/_downloads/8098402610e00a1e4208fc9e1026affb/fram_mpi_job.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +############################################### +# Script example for a normal MPI job on Fram # +############################################### + +## Project: replace XXXX with your project ID +#SBATCH --account=nnXXXXk + +## Job name: +#SBATCH --job-name=MyJob +## Allocating amount of resources: +#SBATCH --nodes=10 +## Number of tasks (aka processes) to start on each node: Pure mpi, one task per core +#SBATCH --ntasks-per-node=32 +## No memory pr task since this option is turned off on Fram in partition normal. +## Run for 10 minutes, syntax is d-hh:mm:ss +#SBATCH --time=0-00:10:00 + +# you may not place bash commands before the last SBATCH directive +###################################################### +## Setting variables and prepare runtime environment: +##---------------------------------------------------- +## Recommended safety settings: +set -o errexit # Make bash exit on any error +set -o nounset # Treat unset variables as errors + +# Loading Software modules +# Allways be explicit on loading modules and setting run time environment!!! +module --quiet purge # Restore loaded modules to the default +module load MySoftWare/Versions #nb: Versions is important! + +# Type "module avail MySoftware" to find available modules and versions +# It is also recommended to to list loaded modules, for easier debugging: +module list + +####################################################### +## Prepare jobs, moving input files and making sure +# output is copied back and taken care of +##----------------------------------------------------- + +# Prepare input files +cp inputfiles $SCRATCH +cd $SCRATCH + +# Make sure output is copied back after job finishes +savefile outputfile1 outputfile2 + +######################################################## +# Run the application, and we typically time it: +##------------------------------------------------------ + +# Run the application - please add hash in front of srun and remove +# hash in front of mpirun if using intel-toolchain + +# For OpenMPI (foss and iomkl toolchains), srun is recommended: +time srun MySoftWare-exec + +## For IntelMPI (intel toolchain), mpirun is recommended: +#time mpirun MySoftWare-exec + +######################################################### +# That was about all this time; lets call it a day... 
+##------------------------------------------------------- +# Finish the script +exit 0 diff --git a/_downloads/829624d414d2e4a20ea19ec715f31341/wave_loop.c b/_downloads/829624d414d2e4a20ea19ec715f31341/wave_loop.c new file mode 100644 index 000000000..2c7d79bf1 --- /dev/null +++ b/_downloads/829624d414d2e4a20ea19ec715f31341/wave_loop.c @@ -0,0 +1,262 @@ +/** + * OpenACC + MPI implementation of the 1D wave equation + */ + +#include +#include +#include +#include +#include + +// Default number of points to calculate over, if not given on command line +static const int NUM_POINTS = 400; +// Default number of steps to perform per point, if not given on command line +static const int NUM_STEPS = 4000; +// Default time interval, if not given on command line +static const double DEFAULT_DT = 0.00125; +// Speed of sound used for calculation +static const double SOUND_SPEED = 1.0; + +// Define MPI tags for program +static const int lower_tag = 1010; // Send to lower rank +static const int upper_tag = 2020; // Send to higher rank +static const int scatter_tag = 3030; // Gather / Scatter data +static const int gather_tag = 4040; // Gather / Scatter data +// MPI Error codes +static const int ALLOC_WAVE_FAIL = 1001; +static const int ALLOC_WAVES_FAIL = 1002; +static const int INITIAL_DIST_RECV = 1003; +static const int LAST_DIST_RECV = 1004; + +// Helper macro to check an MPI call and print error if it failed +#define check_mpi(code, err) \ +if (code != MPI_SUCCESS) { \ + printf("\033[0;31m%s\033[0m\n", err); \ + printf("\tError code: \033[0;31m%d\033[0m\n", code); \ + MPI_Abort(MPI_COMM_WORLD, 1337); \ + return EXIT_FAILURE; \ +} + +/** + * Helper method to calculate the exact solution at 'x' with time step 't' and + * speed of sound 'c' + */ +double exact (const double x, const double t, const double c) { + return sin (2. * M_PI * (x - c * t)); +} + +/** + * Helper function to calculate the partial derivative du/dt + */ +double dudt (const double x, const double t, const double c) { + return -2. * M_PI * c * cos (2. * M_PI * (x - c * t)); +} + +int main (int argc, char** argv) { + // Define variables to use in calculation, initialized to default values + int points = NUM_POINTS; + int steps = NUM_STEPS; + double dt = DEFAULT_DT; + + /************************** Command line handling ***************************/ + if (argc > 1) { + if (strncmp (argv[1], "-h", 3) == 0 || strncmp (argv[1], "--help", 7) == 0) { + printf("Usage: \033[0;32m%s\033[0m \n", argv[0]); + return EXIT_SUCCESS; + } + points = atoi (argv[1]); + if (points < 1) { + printf("\033[0;31mThe number of points must be a positive number larger than '1'!\033[0m\n"); + return EXIT_FAILURE; + } + } + if (argc > 2) { + steps = atoi (argv[2]); + if (steps < 0) { + printf("\033[0;31mThe number of steps must be a positive number!\033[0m\n"); + return EXIT_FAILURE; + } + } + if (argc > 3) { + dt = atof (argv[3]); + if (dt <= 0.) 
{ + printf("\033[0;31mTime interval must be larger than '0.0'!\033[0m\n"); + return EXIT_FAILURE; + } + } + + /*************************** MPI work sharing *******************************/ + // Initialize MPI + check_mpi (MPI_Init(&argc, &argv), "Could not initialize MPI!"); + // Extract MPI size and current rank + int num_processes = 1; + int rank = 0; + check_mpi (MPI_Comm_size(MPI_COMM_WORLD, &num_processes), "Could not fetch COMM_WORLD size"); + check_mpi (MPI_Comm_rank(MPI_COMM_WORLD, &rank), "Could not fetch COMM_WORLD rank"); + if (points % num_processes != 0) { + if (rank == 0) { + printf("\033[0;31m%d points can't be split into %d processes!\033[0m\n", points, num_processes); + } + MPI_Finalize(); + return EXIT_FAILURE; + } + const int equal_share = points / num_processes; + // The first and last rank calculates one additional element, while all other + // ranks calculates two additional points + const int local_points = (rank == 0 || rank == num_processes - 1) ? equal_share + 1 : equal_share + 2; + const int local_start = (rank == 0) ? 0 : equal_share * rank - 1; + + /*************************** Implementation *********************************/ + // Define pointer to global result so that we can compile, this variable is + // only allocated on the root rank + double* wave = NULL; + if (rank == 0) { + printf("Calculating 1D wave equation with \033[0;35m%d\033[0m points over \033[0;35m%d\033[0m steps with \033[0;35m%f\033[0m time step\n", + points, steps, dt); + printf("\t...split over \033[0;35m%d\033[0m processes, processing \033[0;35m%d\033[0m points each\n", + num_processes, local_points); + // On the root rank we allocate enough space for the full wave, + // it is used as the full result + wave = calloc (points, sizeof (double)); + if (wave == NULL) { + printf("\033[0;31mCould not allocate %d points for wave results\033[0m\n", points); + // No need to check output, we will shortly exit anyway + MPI_Abort(MPI_COMM_WORLD, ALLOC_WAVE_FAIL); + return EXIT_FAILURE; + } + } + // Allocate memory for local work arrays + double* wave0 = calloc (local_points, sizeof (double)); + double* wave1 = calloc (local_points, sizeof (double)); + double* wave2 = calloc (local_points, sizeof (double)); + if (wave0 == NULL || wave1 == NULL || wave2 == NULL) { + printf("\033[0;31mRank %d could not allocate enough space for arrays!\033[0m\n", rank); + MPI_Abort(MPI_COMM_WORLD, ALLOC_WAVES_FAIL); + return EXIT_FAILURE; + } + const double dx = 1. / ((double) points - 1); + const double alpha = SOUND_SPEED * dt / dx; + const double alpha2 = alpha * alpha; + if (rank == 0) { + if (fabs (alpha) >= 1.) { + printf("\033[0;33mComputation will be unstable with the given parameters\033[0m\n"); + printf("\tdt = %f\n", dt); + printf("\tdx = %f (1. / %d)\n", dx, points); + printf("\t|alpha| = %f\n", fabs (alpha)); + } + // Initialize the wave only on the root rank + for (int i = 0; i < points; i++) { + const double x = (double) i / (double) (points - 1); + wave[i] = exact (x, 0., SOUND_SPEED); + } + // Distribute computation to all other ranks + for (int r = 1; r < num_processes; r++) { + const int index = r * equal_share - 1; + const int num_points = (r < num_processes - 1) ? 
equal_share + 2 : equal_share + 1; + check_mpi (MPI_Send(&wave[index], num_points, MPI_DOUBLE, r, scatter_tag, MPI_COMM_WORLD), + "Could not distribute data"); + } + // Distribute data to root rank also + for (int i = 0; i < local_points; i++) { + wave0[i] = wave[i]; + wave1[i] = wave0[i]; + } + } else { + MPI_Status out; + check_mpi (MPI_Recv(wave0, local_points, MPI_DOUBLE, 0, scatter_tag, MPI_COMM_WORLD, &out), + "Could not receive data"); + if (out.MPI_ERROR != MPI_SUCCESS) { + printf("\033[0;31mMPI Recv error!\033[0m count: %ld, cancelled: %d, error: %d\n", + out._ucount / sizeof (double), out._cancelled, out.MPI_ERROR); + MPI_Abort(MPI_COMM_WORLD, INITIAL_DIST_RECV); + return EXIT_FAILURE; + } + for (int i = 0; i < local_points; i++) { + wave1[i] = wave0[i]; + } + } + // Subsequent steps utilize the existing arrays for computation + for (int s = 1; s < steps + 1; s++) { + const double t = (double) s * dt; + if (s == 1) { + // First time step we use the initial derivative information to calculate + // the solution + #pragma acc parallel loop + for (int i = 1; i < local_points - 1; i++) { + const double x = (double) (i + local_start) / (double) (points - 1); + wave2[i] = (1. - alpha2) * wave1[i] + + 0.5 * alpha2 * (wave1[i - 1] + wave1[i + 1]) + + dt * dudt (x, t, SOUND_SPEED); + } + } else { + // After first step we use previous calculations for future values + #pragma acc parallel loop + for (int i = 1; i < local_points - 1; i++) { + wave2[i] = 2. * (1. - alpha2) * wave1[i] + + alpha2 * (wave1[i - 1] + wave1[i + 1]) + - wave0[i]; + } + } + // Share data with neighboors + if (rank > 0) { + check_mpi (MPI_Send(&wave2[1], 1, MPI_DOUBLE, rank - 1, lower_tag, MPI_COMM_WORLD), + "Could not send lower update"); + MPI_Status out; + check_mpi (MPI_Recv(&wave2[0], 1, MPI_DOUBLE, rank - 1, upper_tag, MPI_COMM_WORLD, &out), + "Could not receive data for lower update"); + } else { + wave2[0] = exact (0., t, SOUND_SPEED); + } + if (rank < num_processes - 1) { + MPI_Status out; + check_mpi (MPI_Recv(&wave2[local_points - 1], 1, MPI_DOUBLE, rank + 1, lower_tag, MPI_COMM_WORLD, &out), + "Could not receive data for upper update"); + check_mpi (MPI_Send(&wave2[local_points - 2], 1, MPI_DOUBLE, rank + 1, upper_tag, MPI_COMM_WORLD), + "Could not send upper update"); + } else { + wave2[local_points - 1] = exact (1., t, SOUND_SPEED); + } + // Shift data + #pragma acc parallel loop + for (int i = 0; i < local_points; i++) { + wave0[i] = wave1[i]; + wave1[i] = wave2[i]; + } + } + // Synchronize data back to root rank + if (rank == 0) { + printf("Synchronizing results\033[0;33m...\033[0m "); + // Copy root rank data back into result array + for (int i = 0; i < local_points; i++) { + wave[i] = wave1[i]; + } + // Receive data from all other ranks + for (int r = 1; r < num_processes; r++) { + const int index = r * equal_share - 1; + const int num_points = (r < num_processes - 1) ? 
equal_share + 2 : equal_share + 1; + MPI_Status out; + check_mpi (MPI_Recv(&wave[index], num_points, MPI_DOUBLE, r, gather_tag, MPI_COMM_WORLD, &out), + "Could not receive data when gathering result"); + if (out.MPI_ERROR != MPI_SUCCESS) { + printf("\033[0;31mMPI Recv error!\033[0m count: %ld, cancelled: %d, error: %d\n", + out._ucount / sizeof (double), out._cancelled, out.MPI_ERROR); + MPI_Abort(MPI_COMM_WORLD, LAST_DIST_RECV); + return EXIT_FAILURE; + } + } + printf("\033[0;32mcompleted\033[0m!\n"); + printf("Calculation ended \033[0;32msuccesfully\033[0m!\n"); + } else { + check_mpi (MPI_Send(wave1, local_points, MPI_DOUBLE, 0, gather_tag, MPI_COMM_WORLD), + "Could not send data back to root when gathering results"); + } + // Free data before exit + free(wave0); + free(wave1); + free(wave2); + if (rank == 0) { + free(wave); + } + MPI_Finalize(); + return EXIT_SUCCESS; +} diff --git a/_downloads/85c0fe44e0a67ad226fce46a9f9d01b7/cufft_acc.f90 b/_downloads/85c0fe44e0a67ad226fce46a9f9d01b7/cufft_acc.f90 new file mode 100644 index 000000000..ad9fc14a0 --- /dev/null +++ b/_downloads/85c0fe44e0a67ad226fce46a9f9d01b7/cufft_acc.f90 @@ -0,0 +1,88 @@ +module parameter_kind + implicit none + public + integer, parameter :: sp = selected_real_kind(6, 37) !Single precision + integer, parameter :: dp = selected_real_kind(15, 307) !Double precision + integer, parameter :: fp = dp + real(fp), parameter :: pi = 4.0_fp*atan(1.0_fp),dt=0.25_fp + end module parameter_kind + + program cufft_acc + + use parameter_kind + use cufft + use openacc + + implicit none + + integer, parameter :: nt=512 + integer :: i,ierr,plan + complex(fp), allocatable :: in(:),out(:) + real(fp), allocatable :: t(:),w(:) + + allocate(t(nt),w(nt)); allocate(in(nt),out(nt)) + + call grid_1d(nt,t,w) + +!Example of a sinus function + do i=1,nt + in(i) = cmplx(sin(2.0_fp*t(i)),0.0_fp) + enddo + + print*,"--sum before FFT", sum(real(in(1:nt/2))) +!cufftExecZ2Z executes a double precision complex-to-complex transform plan + ierr = cufftPlan1D(plan,nt,CUFFT_Z2Z,1) +!acc_get_cuda_stream: tells the openACC runtime to identify the CUDA +!stream used by CUDA + ierr = ierr + cufftSetStream(plan,acc_get_cuda_stream(acc_async_sync)) + +!$acc data copy(in) copyout(out) +!$acc host_data use_device(in,out) + ierr = ierr + cufftExecZ2Z(plan, in, out, CUFFT_FORWARD) + ierr = ierr + cufftExecZ2Z(plan, out, in, CUFFT_INVERSE) +!$acc end host_data + +!$acc kernels + out(:) = out(:)/nt + in(:) = in(:)/nt +!$acc end kernels +!$acc end data + + ierr = ierr + cufftDestroy(plan) + + print*,"" + if(ierr.eq.0) then + print*,"--Yep it works :)" + else + print*,"Nop it fails, I stop :(" + endif + print*,"" + print*,"--sum iFFT", sum(real(in(1:nt/2))) + +!printing the fft of sinus + do i=1,nt/2 + write(204,*)w(i),sqrt(cabs(out(i))**2) + enddo + deallocate(in); deallocate(out) + end + + subroutine grid_1d(nt,t,w) + use parameter_kind + + implicit none + integer :: i,nt + real(fp) :: t(nt),w(nt) + +!Defining a uniform temporal grid + do i=1,nt + t(i) = (-dble(nt-1)/2.0_fp + (i-1))*dt + enddo + +!Defining a uniform frequency grid + do i=0,nt/2-1 + w(i+1) = 2.0_fp*pi*dble(i)/(nt*dt) + enddo + do i=nt/2,nt-1 + w(i+1) = 2.0_fp*pi*dble(i-nt)/(nt*dt) + enddo + end subroutine grid_1d diff --git a/_downloads/8cad6f1a82f67cbb7534d8fc3479bce1/mandelbrot_serial.c b/_downloads/8cad6f1a82f67cbb7534d8fc3479bce1/mandelbrot_serial.c new file mode 100644 index 000000000..69dd2989a --- /dev/null +++ b/_downloads/8cad6f1a82f67cbb7534d8fc3479bce1/mandelbrot_serial.c @@ -0,0 +1,149 @@ 
+/** + * Mandelbrot implementation for accelerators (e.g. GPUs) + */ + +#include "utils/lodepng.h" +#include "utils/palette.h" +#include +#include +#include +#include +#include + +// Default width and height for image if not given +static const int WIDTH = 1280; +static const int HEIGHT = 720; +// Default output name if not given +static const char* OUTPUT_NAME = "mandelbrot.png"; +// Maximum iteration count before exiting mandelbrot function +static const uint32_t MAX_ITER = 1000; + +// Helper function to scale 'num' to the range '[min, max]' +float scale(float num, const float min, const float max) { + const float scale = max - min; + return num * scale + min; +} + +/** + * Mandelbrot function, calculates the value of the mandelbrot set at pixel 'px/py' + */ +uint32_t mandelbrot(const int px, const int py, const int width, const int height, + const int max_iter) { + const float x0 = scale((float) px / (float) width, -2.5, 1.); + const float y0 = scale((float) py / (float) height, -1., 1.); + float x = 0.; + float y = 0.; + float x2 = 0.; + float y2 = 0.; + int iters = 0; + while (x2 + y2 < 4. && iters < max_iter) { + y = 2. * x * y + y0; + x = x2 - y2 + x0; + x2 = x * x; + y2 = y * y; + iters += 1; + } + return (uint32_t) iters; +} + +int main (int argc, char** argv) { + int width = WIDTH; + int height = HEIGHT; + char output_name[128]; + int max_iter = MAX_ITER; + strncpy (output_name, OUTPUT_NAME, strnlen (OUTPUT_NAME, 127) + 1); + // Assume the first argument is the width and height of the image + if (argc > 1) { + if (strncmp (argv[1], "-h", 2) == 0 || strncmp (argv[1], "--help", 6) == 0) { + printf("Usage: %s x \n", argv[0]); + printf("\tImage size can also be one of {8k, 4k, 3k, 1080p, 720p}\n"); + return EXIT_SUCCESS; + } + // First we check image size is one of the predefined sizes + if (strncmp (argv[1], "8k", 2) == 0) { + width = 7680; + height = 4320; + } else if (strncmp (argv[1], "4k", 2) == 0) { + width = 3840; + height = 2160; + } else if (strncmp (argv[1], "3k", 2) == 0) { + width = 3000; + height = 2000; + } else if (strncmp (argv[1], "1080p", 5) == 0) { + width = 1920; + height = 1080; + } else if (strncmp (argv[1], "720p", 4) == 0) { + width = 1280; + height = 720; + } else { + // Assume user has supplied x + // Try to find 'x' in argument + char* token; + token = strtok (argv[1], "x"); + if (token != NULL) { + width = atoi (token); + } else { + printf("\033[0;31mInvalid width/height definition:\033[0m '%s'\n", argv[1]); + printf("\tShould be 'x'\n"); + return EXIT_FAILURE; + } + token = strtok (NULL, "x"); + if (token != NULL) { + height = atoi (token); + } else { + printf("\033[0;31mInvalid width/height definition:\033[0m '%s'\n", argv[1]); + printf("\tShould be 'x'\n"); + return EXIT_FAILURE; + } + } + } + // Second argument is the maximum iteration count + if (argc > 2) { + max_iter = atoi (argv[2]); + } + // Third argument is the output filename to write PNG file to + if (argc > 3) { + if (strlen (argv[3]) > 127) { + printf("\033[0;31mOutput filename to large!\033[0m"); + return EXIT_FAILURE; + } + strncpy (output_name, argv[3], strnlen (argv[3], 127) + 1); + } + // Allocate storage for image + uint32_t* image = calloc (width * height, sizeof (uint32_t)); + if (image == NULL) { + printf("\033[0;31mCould not allocate memory for image!\033[0m\n"); + return EXIT_FAILURE; + } + printf("Generating \033[0;35m%dx%d\033[0m image with max \033[0;35m%d\033[0m iterations\n", + width, height, + max_iter); + 
/****************************************************************************/ + /*************************** Main computation ***************************/ + /****************************************************************************/ + const double start_time = omp_get_wtime (); + // For each pixel of our image calculate the value of the mandelbrot set + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + const uint32_t iters = mandelbrot (x, y, width, height, max_iter); + image[y * width + x] = palette[iters % palette_size]; + } + } + const double end_time = omp_get_wtime (); + printf("Used \033[0;35m%.3f\033[0m ms for computation\n", + (end_time - start_time) * 1000.0); + /****************************************************************************/ + // Write image to file + const unsigned char png_error = lodepng_encode32_file(output_name, + (const unsigned char*) image, + width, height); + // Free image storage + free (image); + if (png_error) { + printf("\033[0;31mAn error occurred while writing to PNG:\033[0m %s\n", + lodepng_error_text (png_error)); + return EXIT_FAILURE; + } + printf("Wrote Mandelbrot result to \033[0;35m%s\033[0m\n", output_name); + return EXIT_SUCCESS; +} diff --git a/_downloads/a0020b206613f5c790f55364adfd6125/wave_data_profile.png b/_downloads/a0020b206613f5c790f55364adfd6125/wave_data_profile.png new file mode 100644 index 000000000..d54bcac1f Binary files /dev/null and b/_downloads/a0020b206613f5c790f55364adfd6125/wave_data_profile.png differ diff --git a/_downloads/a10a1cbe5dff5f1d8ba3e7e366f27369/submit_cpu.sh b/_downloads/a10a1cbe5dff5f1d8ba3e7e366f27369/submit_cpu.sh new file mode 100644 index 000000000..e716a0ce2 --- /dev/null +++ b/_downloads/a10a1cbe5dff5f1d8ba3e7e366f27369/submit_cpu.sh @@ -0,0 +1,16 @@ +#!/bin/bash +#SBATCH --job-name=TestGPUOnSaga +#SBATCH --account=nnk +#SBATCH --time=05:00 +#SBATCH --mem-per-cpu=4G +#SBATCH --qos=devel + +## Set up job environment: +set -o errexit # Exit the script on any error +set -o nounset # Treat any unset variables as an error + +module --quiet purge # Reset the modules to the system default +module load TensorFlow/2.6.0-foss-2021a-CUDA-11.3.1 +module list + +python gpu_intro.py diff --git a/_downloads/a31f35be66ecfe7159328bd04a809e8e/async.qdrep b/_downloads/a31f35be66ecfe7159328bd04a809e8e/async.qdrep new file mode 100644 index 000000000..784d1451b Binary files /dev/null and b/_downloads/a31f35be66ecfe7159328bd04a809e8e/async.qdrep differ diff --git a/_downloads/a5978034e9a9ed430df07b0fad304ddc/async.zip b/_downloads/a5978034e9a9ed430df07b0fad304ddc/async.zip new file mode 100644 index 000000000..07b5c63c1 Binary files /dev/null and b/_downloads/a5978034e9a9ed430df07b0fad304ddc/async.zip differ diff --git a/_downloads/aa466374eee099fbea253f26b7d1e9ef/multi.zip b/_downloads/aa466374eee099fbea253f26b7d1e9ef/multi.zip new file mode 100644 index 000000000..0310b5e73 Binary files /dev/null and b/_downloads/aa466374eee099fbea253f26b7d1e9ef/multi.zip differ diff --git a/_downloads/ae848bfa7940645f80558ff9cb186e71/wave_acc.c b/_downloads/ae848bfa7940645f80558ff9cb186e71/wave_acc.c new file mode 100644 index 000000000..5e27cd2a5 --- /dev/null +++ b/_downloads/ae848bfa7940645f80558ff9cb186e71/wave_acc.c @@ -0,0 +1,269 @@ +/** + * OpenACC + MPI implementation of the 1D wave equation + */ + +#include +#include +#include +#include +#include + +// Default number of points to calculate over, if not given on command line +static const int NUM_POINTS = 400; +// Default number of steps to perform 
per point, if not given on command line +static const int NUM_STEPS = 4000; +// Default time interval, if not given on command line +static const double DEFAULT_DT = 0.00125; +// Speed of sound used for calculation +static const double SOUND_SPEED = 1.0; + +// Define MPI tags for program +static const int lower_tag = 1010; // Send to lower rank +static const int upper_tag = 2020; // Send to higher rank +static const int scatter_tag = 3030; // Gather / Scatter data +static const int gather_tag = 4040; // Gather / Scatter data +// MPI Error codes +static const int ALLOC_WAVE_FAIL = 1001; +static const int ALLOC_WAVES_FAIL = 1002; +static const int INITIAL_DIST_RECV = 1003; +static const int LAST_DIST_RECV = 1004; + +// Helper macro to check an MPI call and print error if it failed +#define check_mpi(code, err) \ +if (code != MPI_SUCCESS) { \ + printf("\033[0;31m%s\033[0m\n", err); \ + printf("\tError code: \033[0;31m%d\033[0m\n", code); \ + MPI_Abort(MPI_COMM_WORLD, 1337); \ + return EXIT_FAILURE; \ +} + +/** + * Helper method to calculate the exact solution at 'x' with time step 't' and + * speed of sound 'c' + */ +#pragma acc routine seq +double exact (const double x, const double t, const double c) { + return sin (2. * M_PI * (x - c * t)); +} + +/** + * Helper function to calculate the partial derivative du/dt + */ +#pragma acc routine seq +double dudt (const double x, const double t, const double c) { + return -2. * M_PI * c * cos (2. * M_PI * (x - c * t)); +} + +int main (int argc, char** argv) { + // Define variables to use in calculation, initialized to default values + int points = NUM_POINTS; + int steps = NUM_STEPS; + double dt = DEFAULT_DT; + + /************************** Command line handling ***************************/ + if (argc > 1) { + if (strncmp (argv[1], "-h", 3) == 0 || strncmp (argv[1], "--help", 7) == 0) { + printf("Usage: \033[0;32m%s\033[0m \n", argv[0]); + return EXIT_SUCCESS; + } + points = atoi (argv[1]); + if (points < 1) { + printf("\033[0;31mThe number of points must be a positive number larger than '1'!\033[0m\n"); + return EXIT_FAILURE; + } + } + if (argc > 2) { + steps = atoi (argv[2]); + if (steps < 0) { + printf("\033[0;31mThe number of steps must be a positive number!\033[0m\n"); + return EXIT_FAILURE; + } + } + if (argc > 3) { + dt = atof (argv[3]); + if (dt <= 0.) { + printf("\033[0;31mTime interval must be larger than '0.0'!\033[0m\n"); + return EXIT_FAILURE; + } + } + + /*************************** MPI work sharing *******************************/ + // Initialize MPI + check_mpi (MPI_Init(&argc, &argv), "Could not initialize MPI!"); + // Extract MPI size and current rank + int num_processes = 1; + int rank = 0; + check_mpi (MPI_Comm_size(MPI_COMM_WORLD, &num_processes), "Could not fetch COMM_WORLD size"); + check_mpi (MPI_Comm_rank(MPI_COMM_WORLD, &rank), "Could not fetch COMM_WORLD rank"); + if (points % num_processes != 0) { + if (rank == 0) { + printf("\033[0;31m%d points can't be split into %d processes!\033[0m\n", points, num_processes); + } + MPI_Finalize(); + return EXIT_FAILURE; + } + const int equal_share = points / num_processes; + // The first and last rank calculates one additional element, while all other + // ranks calculates two additional points + const int local_points = (rank == 0 || rank == num_processes - 1) ? equal_share + 1 : equal_share + 2; + const int local_start = (rank == 0) ? 
0 : equal_share * rank - 1; + + /*************************** Implementation *********************************/ + // Define pointer to global result so that we can compile, this variable is + // only allocated on the root rank + double* wave = NULL; + if (rank == 0) { + printf("Calculating 1D wave equation with \033[0;35m%d\033[0m points over \033[0;35m%d\033[0m steps with \033[0;35m%f\033[0m time step\n", + points, steps, dt); + printf("\t...split over \033[0;35m%d\033[0m processes, processing \033[0;35m%d\033[0m points each\n", + num_processes, local_points); + // On the root rank we allocate enough space for the full wave, + // it is used as the full result + wave = calloc (points, sizeof (double)); + if (wave == NULL) { + printf("\033[0;31mCould not allocate %d points for wave results\033[0m\n", points); + // No need to check output, we will shortly exit anyway + MPI_Abort(MPI_COMM_WORLD, ALLOC_WAVE_FAIL); + return EXIT_FAILURE; + } + } + // Allocate memory for local work arrays + double* wave0 = calloc (local_points, sizeof (double)); + double* wave1 = calloc (local_points, sizeof (double)); + double* wave2 = calloc (local_points, sizeof (double)); + if (wave0 == NULL || wave1 == NULL || wave2 == NULL) { + printf("\033[0;31mRank %d could not allocate enough space for arrays!\033[0m\n", rank); + MPI_Abort(MPI_COMM_WORLD, ALLOC_WAVES_FAIL); + return EXIT_FAILURE; + } + const double dx = 1. / ((double) points - 1); + const double alpha = SOUND_SPEED * dt / dx; + const double alpha2 = alpha * alpha; + if (rank == 0) { + if (fabs (alpha) >= 1.) { + printf("\033[0;33mComputation will be unstable with the given parameters\033[0m\n"); + printf("\tdt = %f\n", dt); + printf("\tdx = %f (1. / %d)\n", dx, points); + printf("\t|alpha| = %f\n", fabs (alpha)); + } + // Initialize the wave only on the root rank + #pragma acc parallel loop copyout(wave[:points]) + for (int i = 0; i < points; i++) { + const double x = (double) i / (double) (points - 1); + wave[i] = exact (x, 0., SOUND_SPEED); + } + // Distribute computation to all other ranks + for (int r = 1; r < num_processes; r++) { + const int index = r * equal_share - 1; + const int num_points = (r < num_processes - 1) ? equal_share + 2 : equal_share + 1; + check_mpi (MPI_Send(&wave[index], num_points, MPI_DOUBLE, r, scatter_tag, MPI_COMM_WORLD), + "Could not distribute data"); + } + // Distribute data to root rank also + for (int i = 0; i < local_points; i++) { + wave0[i] = wave[i]; + wave1[i] = wave0[i]; + } + } else { + MPI_Status out; + check_mpi (MPI_Recv(wave0, local_points, MPI_DOUBLE, 0, scatter_tag, MPI_COMM_WORLD, &out), + "Could not receive data"); + if (out.MPI_ERROR != MPI_SUCCESS) { + printf("\033[0;31mMPI Recv error!\033[0m count: %ld, cancelled: %d, error: %d\n", + out._ucount / sizeof (double), out._cancelled, out.MPI_ERROR); + MPI_Abort(MPI_COMM_WORLD, INITIAL_DIST_RECV); + return EXIT_FAILURE; + } + for (int i = 0; i < local_points; i++) { + wave1[i] = wave0[i]; + } + } + // Subsequent steps utilize the existing arrays for computation + #pragma acc data copy(wave1[:local_points]) copyin(wave0[:local_points]) \ + create(wave2[:local_points]) + for (int s = 1; s < steps + 1; s++) { + const double t = (double) s * dt; + if (s == 1) { + // First time step we use the initial derivative information to calculate + // the solution + #pragma acc parallel loop + for (int i = 1; i < local_points - 1; i++) { + const double x = (double) (i + local_start) / (double) (points - 1); + wave2[i] = (1. 
- alpha2) * wave1[i] + + 0.5 * alpha2 * (wave1[i - 1] + wave1[i + 1]) + + dt * dudt (x, t, SOUND_SPEED); + } + } else { + // After first step we use previous calculations for future values + #pragma acc parallel loop + for (int i = 1; i < local_points - 1; i++) { + wave2[i] = 2. * (1. - alpha2) * wave1[i] + + alpha2 * (wave1[i - 1] + wave1[i + 1]) + - wave0[i]; + } + } + // Copy data from GPU to CPU to prepare for MPI sharing + #pragma acc update self(wave2[1:1]) + #pragma acc update self(wave2[local_points - 2:1]) + // Share data with neighboors + if (rank > 0) { + MPI_Send(&wave2[1], 1, MPI_DOUBLE, rank - 1, lower_tag, MPI_COMM_WORLD); + MPI_Status out; + MPI_Recv(&wave2[0], 1, MPI_DOUBLE, rank - 1, upper_tag, MPI_COMM_WORLD, &out); + } else { + wave2[0] = exact (0., t, SOUND_SPEED); + } + if (rank < num_processes - 1) { + MPI_Status out; + MPI_Recv(&wave2[local_points - 1], 1, MPI_DOUBLE, rank + 1, lower_tag, MPI_COMM_WORLD, &out); + MPI_Send(&wave2[local_points - 2], 1, MPI_DOUBLE, rank + 1, upper_tag, MPI_COMM_WORLD); + } else { + wave2[local_points - 1] = exact (1., t, SOUND_SPEED); + } + // Copy data we got from MPI neighbors back to GPU + #pragma acc update device(wave2[0:1]) + #pragma acc update device(wave2[local_points - 1:1]) + // Shift data + #pragma acc parallel loop + for (int i = 0; i < local_points; i++) { + wave0[i] = wave1[i]; + wave1[i] = wave2[i]; + } + } + // Synchronize data back to root rank + if (rank == 0) { + printf("Synchronizing results\033[0;33m...\033[0m "); + // Copy root rank data back into result array + for (int i = 0; i < local_points; i++) { + wave[i] = wave1[i]; + } + // Receive data from all other ranks + for (int r = 1; r < num_processes; r++) { + const int index = r * equal_share - 1; + const int num_points = (r < num_processes - 1) ? 
equal_share + 2 : equal_share + 1; + MPI_Status out; + check_mpi (MPI_Recv(&wave[index], num_points, MPI_DOUBLE, r, gather_tag, MPI_COMM_WORLD, &out), + "Could not receive data when gathering result"); + if (out.MPI_ERROR != MPI_SUCCESS) { + printf("\033[0;31mMPI Recv error!\033[0m count: %ld, cancelled: %d, error: %d\n", + out._ucount / sizeof (double), out._cancelled, out.MPI_ERROR); + MPI_Abort(MPI_COMM_WORLD, LAST_DIST_RECV); + return EXIT_FAILURE; + } + } + printf("\033[0;32mcompleted\033[0m!\n"); + printf("Calculation ended \033[0;32msuccesfully\033[0m!\n"); + } else { + check_mpi (MPI_Send(wave1, local_points, MPI_DOUBLE, 0, gather_tag, MPI_COMM_WORLD), + "Could not send data back to root when gathering results"); + } + // Free data before exit + free(wave0); + free(wave1); + free(wave2); + if (rank == 0) { + free(wave); + } + MPI_Finalize(); + return EXIT_SUCCESS; +} diff --git a/_downloads/b962df39385f53106ec8b31b860281b6/jacobi_memcpy.cpp b/_downloads/b962df39385f53106ec8b31b860281b6/jacobi_memcpy.cpp new file mode 100644 index 000000000..8085188f2 --- /dev/null +++ b/_downloads/b962df39385f53106ec8b31b860281b6/jacobi_memcpy.cpp @@ -0,0 +1,89 @@ +/** + * SYCL accelerated implementation of the Jacobi iteration + */ + +#include +#include + +#include + +// Number of rows and columns in our matrix +static const int NUM_ELEMENTS = 2000; +// Total number of elements in our matrix +static const int TOT_ELEMENTS = NUM_ELEMENTS * NUM_ELEMENTS; +// Maximum number of iterations before quiting +static const int MAX_ITER = 10000; +// Error tolerance for iteration +static const float MAX_ERROR = 0.01; +// Seed for random number generator +static const int SEED = 12345; + +int main (int argc, char** argv) { + // Create default SYCL queue and print name of device + auto Q = sycl::queue{sycl::default_selector{}}; + std::cout << "Chosen device: " + << Q.get_device().get_info() + << std::endl; + + // Initialize random number generator + srand (SEED); + + // Create *SHARED* array to store the input/output + float *arr_s = sycl::malloc_shared(TOT_ELEMENTS, Q); + + // Fill *SHARED* array with data + for (int i = 0; i < TOT_ELEMENTS; i++) { + // The following will create random values between [0, 1] + arr_s[i] = (float) rand () / (float) RAND_MAX; + } + + // Create *SHARED* array to calculate on + float *tmp_s = sycl::malloc_shared(TOT_ELEMENTS, Q); + float err = __FLT_MAX__; + + // We copy here to get the boundary elements, which will be copied back and forth unchanged + Q.memcpy(tmp_s, arr_s, TOT_ELEMENTS*sizeof(float)).wait(); + + int iterations = 0; + // Perform Jacobi iterations until we either have low enough error or too many iterations + while (err > MAX_ERROR && iterations < MAX_ITER) { + err = 0.; + // Submit work item to the SYCL queue + Q.submit( + [&](sycl::handler &h) { + // Define work kernel as single loop + h.parallel_for( + sycl::range{(NUM_ELEMENTS - 2) * (NUM_ELEMENTS - 2)}, + [=](sycl::id<1> idx) { + // Retain array indices from single loop variable + int i = (idx[0] / NUM_ELEMENTS) + 1; + int j = (idx[0] % NUM_ELEMENTS) + 1; + // For each element take the average of the surrounding elements + tmp_s[i * NUM_ELEMENTS + j] = 0.25 * (arr_s[i * NUM_ELEMENTS + j+1] + + arr_s[i * NUM_ELEMENTS + j-1] + + arr_s[(i-1) * NUM_ELEMENTS + j] + + arr_s[(i+1) * NUM_ELEMENTS + j]); + } + ); + } + ).wait(); // Wait for completion before moving on + + // Find maximum error (cannot be done in the loop kernel above) + for (int i = 0; i < TOT_ELEMENTS; i++) { + err = std::max(err, std::abs(tmp_s[i] - 
arr_s[i])); + } + + // Transfer new array to old (including boundary, which was untouched in the loop) + Q.memcpy(arr_s, tmp_s, TOT_ELEMENTS*sizeof(float)).wait(); + + iterations++; + } + + std::cout << "Iterations : " << iterations << " | Error : " << err << std::endl; + + // Free *SHARED* memory + sycl::free(arr_s, Q); + sycl::free(tmp_s, Q); + + return EXIT_SUCCESS; +} diff --git a/_downloads/bed068a3099aa3dc83fbde991905f93a/jacobi_reduction.cpp b/_downloads/bed068a3099aa3dc83fbde991905f93a/jacobi_reduction.cpp new file mode 100644 index 000000000..71dbde0fb --- /dev/null +++ b/_downloads/bed068a3099aa3dc83fbde991905f93a/jacobi_reduction.cpp @@ -0,0 +1,90 @@ +/** + * SYCL accelerated implementation of the Jacobi iteration + */ + +#include +#include + +#include + +// Number of rows and columns in our matrix +static const int NUM_ELEMENTS = 2000; +// Total number of elements in our matrix +static const int TOT_ELEMENTS = NUM_ELEMENTS * NUM_ELEMENTS; +// Maximum number of iterations before quiting +static const int MAX_ITER = 10000; +// Error tolerance for iteration +static const float MAX_ERROR = 0.01; +// Seed for random number generator +static const int SEED = 12345; + +int main (int argc, char** argv) { + // Create default SYCL queue and print name of device + auto Q = sycl::queue{sycl::default_selector{}}; + std::cout << "Chosen device: " + << Q.get_device().get_info() + << std::endl; + + // Initialize random number generator + srand (SEED); + + // Create *SHARED* array to store the input/output + float *arr_s = sycl::malloc_shared(TOT_ELEMENTS, Q); + + // Fill *SHARED* array with data + for (int i = 0; i < TOT_ELEMENTS; i++) { + // The following will create random values between [0, 1] + arr_s[i] = (float) rand () / (float) RAND_MAX; + } + + // Create *SHARED* array to calculate on + float *tmp_s = sycl::malloc_shared(TOT_ELEMENTS, Q); + float *err_s = sycl::malloc_shared(1, Q); + *err_s = __FLT_MAX__; + + // We copy here to get the boundary elements, which will be copied back and forth unchanged + Q.memcpy(tmp_s, arr_s, TOT_ELEMENTS*sizeof(float)).wait(); + + int iterations = 0; + // Perform Jacobi iterations until we either have low enough error or too many iterations + while (*err_s > MAX_ERROR && iterations < MAX_ITER) { + *err_s = 0.; + // Submit work item to the SYCL queue + Q.submit( + [&](sycl::handler &h) { + // Attach a reduction operation to the err_s shared variable, to be used in the parallel_for + auto max_err = sycl::reduction(err_s, sycl::maximum()); + + // Define work kernel as single loop + h.parallel_for( + sycl::range{(NUM_ELEMENTS - 2) * (NUM_ELEMENTS - 2)}, max_err, + [=](sycl::id<1> idx, auto &max) { + // Retain array indices from single loop variable + int i = (idx[0] / NUM_ELEMENTS) + 1; + int j = (idx[0] % NUM_ELEMENTS) + 1; + // For each element take the average of the surrounding elements + tmp_s[i * NUM_ELEMENTS + j] = 0.25 * (arr_s[i * NUM_ELEMENTS + j+1] + + arr_s[i * NUM_ELEMENTS + j-1] + + arr_s[(i-1) * NUM_ELEMENTS + j] + + arr_s[(i+1) * NUM_ELEMENTS + j]); + max.combine(std::abs(tmp_s[i * NUM_ELEMENTS + j] - arr_s[i * NUM_ELEMENTS + j])); + } + ); + } + ).wait(); // Wait for completion before moving on + + // Transfer new array to old (including boundary, which was untouched in the loop) + Q.memcpy(arr_s, tmp_s, TOT_ELEMENTS*sizeof(float)).wait(); + + iterations++; + } + + std::cout << "Iterations : " << iterations << " | Error : " << *err_s << std::endl; + + // Free *SHARED* memory + sycl::free(arr_s, Q); + sycl::free(tmp_s, Q); + 
sycl::free(err_s, Q); + + return EXIT_SUCCESS; +} diff --git a/_downloads/c98f994b5f4f63ad7bb1adc39e3a0771/array_howto.sh b/_downloads/c98f994b5f4f63ad7bb1adc39e3a0771/array_howto.sh new file mode 100644 index 000000000..f3f92a3a6 --- /dev/null +++ b/_downloads/c98f994b5f4f63ad7bb1adc39e3a0771/array_howto.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +##################### +# job-array example # +##################### + +## Substitute with your project name: +#SBATCH --account=YourProject + +#SBATCH --job-name=array_example + +# 16 jobs will run in this array at the same time +#SBATCH --array=1-16 + +# each job will run for maximum five minutes +# d-hh:mm:ss +#SBATCH --time=0-00:05:00 + +# you must not place bash commands before the last #SBATCH directive + +## Set safer defaults for bash +set -o errexit +set -o nounset + +module --quiet purge # Clear any inherited modules + +# each job will see a different ${SLURM_ARRAY_TASK_ID} +echo "now processing task id:: " ${SLURM_ARRAY_TASK_ID} +python test.py > output_${SLURM_ARRAY_TASK_ID}.txt diff --git a/_downloads/cadf1dfc1ee4793314d0f5696e4062ad/HeatEq2D_Stencil.tar.gz b/_downloads/cadf1dfc1ee4793314d0f5696e4062ad/HeatEq2D_Stencil.tar.gz new file mode 100644 index 000000000..af192f79d Binary files /dev/null and b/_downloads/cadf1dfc1ee4793314d0f5696e4062ad/HeatEq2D_Stencil.tar.gz differ diff --git a/_downloads/d0f67ca3daceeb6ffa22f97b20008c42/laplace_acc.f90 b/_downloads/d0f67ca3daceeb6ffa22f97b20008c42/laplace_acc.f90 new file mode 100644 index 000000000..aa3eb5fd9 --- /dev/null +++ b/_downloads/d0f67ca3daceeb6ffa22f97b20008c42/laplace_acc.f90 @@ -0,0 +1,94 @@ + program laplace_acc + + use openacc + + implicit none + integer :: i,j,k,ii + integer :: iter,count_rate, count_max,count,nenv_var + integer :: t_start,t_final + integer, parameter :: nx=8192,ny=nx + integer, parameter :: max_iter=525 + double precision, parameter :: pi=4d0*datan(1d0) + real, parameter :: error=0.05 + double precision :: max_err,time_s,& + d2fx,d2fy,max_err_part + double precision, allocatable :: f(:,:),f_k(:,:) + + call system_clock(count_max=count_max, count_rate=count_rate) + + call system_clock(t_start) + + allocate(f(0:nx+1,0:ny+1)); allocate(f_k(1:nx,1:ny)) + + f=0d0; f_k=0d0 + +!Generate the Initial Conditions (ICs) + CALL RANDOM_NUMBER(f) + + iter = 0 + + print*,"" + print*, "--Start iterations",iter + print*,"" + +!Structed data locality +!$acc data copyin(f) copyout(f_k) + + do while (max_err.gt.error.and.iter.le.max_iter) + +!$acc parallel loop present(f,f_k) collapse(2) + do j=1,ny + do i=1,nx + d2fx = f(i+1,j) + f(i-1,j) + d2fy = f(i,j+1) + f(i,j-1) + f_k(i,j) = 0.25*(d2fx + d2fy) + enddo + enddo +!$acc end parallel loop + + max_err=0. 
+ +!$acc parallel loop present(f,f_k) collapse(2) & +!$acc reduction(max:max_err) + do j=1,ny + do i=1,nx + max_err = max(dabs(f_k(i,j) - f(i,j)),max_err) + f(i,j) = f_k(i,j) + enddo + enddo +!$acc end parallel loop + +!max_err is copied back to the CPU-host by default + + if(mod(iter,50).eq.0 )write(*,'(i5,f10.6)')iter,max_err + + iter = iter + 1 + + enddo +!$acc end data + + deallocate(f) + + write(*,'(i5,f10.6)') iter,max_err + + call system_clock(t_final) + + time_s = real(t_final - t_start)/real(count_rate) + + print*, '--Time it takes (s)', time_s + + print*, '--Job is completed successfully--' + print*,'' + +!to check the result + + do j=1,ny + write(111,*)j,sum(f_k(:,j)) + enddo + print*,"--Sum",sum(f_k(:,:))/nx/2 + print*,"--END :)" + + + deallocate(f_k) + + end diff --git a/_downloads/d2023f5c7351a148093c1ae252db63bd/parallel_steps_node.sh b/_downloads/d2023f5c7351a148093c1ae252db63bd/parallel_steps_node.sh new file mode 100644 index 000000000..a72070ef2 --- /dev/null +++ b/_downloads/d2023f5c7351a148093c1ae252db63bd/parallel_steps_node.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +#SBATCH --account=YourProject # Substitute with your project name +#SBATCH --job-name=parallel_tasks_node +#SBATCH --nodes=4 +#SBATCH --time=00:05:00 + +# Safety settings +set -o errexit +set -o nounset + +# Load MPI module +module --quiet purge +module load OpenMPI/4.1.1-GCC-11.2.0 +module list + +# This is needed for job types that hand out whole nodes: +unset SLURM_MEM_PER_NODE +export SLURM_MEM_PER_CPU=1888 # This is for Fram. For betzy, use 1952. + +# This is needed with the current version of Slurm (21.08.x): +export SLURM_JOB_NUM_NODES=1-$SLURM_JOB_NUM_NODES + +# The set of parallel runs: +srun --ntasks=16 --exact ./my-binary & +srun --ntasks=16 --exact ./my-binary & +srun --ntasks=16 --exact ./my-binary & +srun --ntasks=16 --exact ./my-binary & +srun --ntasks=16 --exact ./my-binary & +srun --ntasks=16 --exact ./my-binary & +srun --ntasks=16 --exact ./my-binary & +srun --ntasks=16 --exact ./my-binary & + +wait diff --git a/_downloads/d221ad95285cacc0b949d9e3fdd7cc65/parallel_steps_cpu.sh b/_downloads/d221ad95285cacc0b949d9e3fdd7cc65/parallel_steps_cpu.sh new file mode 100644 index 000000000..ece84bc0b --- /dev/null +++ b/_downloads/d221ad95285cacc0b949d9e3fdd7cc65/parallel_steps_cpu.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +#SBATCH --account=YourProject # Substitute with your project name +#SBATCH --job-name=parallel_tasks_cpu +#SBATCH --ntasks=20 +#SBATCH --time=0-00:05:00 +#SBATCH --mem-per-cpu=2000M + +# Safety settings +set -o errexit +set -o nounset + +# Load MPI module +module --quiet purge +module load OpenMPI/4.1.1-GCC-11.2.0 +module list + +# This is needed with the current version of Slurm (21.08.x): +export SLURM_JOB_NUM_NODES=1-$SLURM_JOB_NUM_NODES + +# The set of parallel runs: +srun --ntasks=4 --exact ./my-binary & +srun --ntasks=4 --exact ./my-binary & +srun --ntasks=4 --exact ./my-binary & +srun --ntasks=4 --exact ./my-binary & +srun --ntasks=4 --exact ./my-binary & + +wait diff --git a/_downloads/dd6c0903cee197775467fce463185d54/laplace_omp.f90 b/_downloads/dd6c0903cee197775467fce463185d54/laplace_omp.f90 new file mode 100644 index 000000000..e22e9f38f --- /dev/null +++ b/_downloads/dd6c0903cee197775467fce463185d54/laplace_omp.f90 @@ -0,0 +1,93 @@ + program laplace_omp + + use omp_lib + + implicit none + integer :: i,j,k,ii + integer :: iter,count_rate, count_max,count,nenv_var + integer :: t_start,t_final + integer, parameter :: nx=2*8192,ny=nx + integer, parameter :: max_iter=525 + double 
precision, parameter :: pi=4d0*datan(1d0) + real, parameter :: error=0.05 + double precision :: max_err,time_s,& + d2fx,d2fy,max_err_part + + double precision, allocatable :: f(:,:),f_k(:,:) + + call system_clock(count_max=count_max, count_rate=count_rate) + + call system_clock(t_start) + + allocate(f(0:nx+1,0:ny+1)); allocate(f_k(1:nx,1:ny)) + + f=0d0; f_k=0d0 + +!Generate the Initial Conditions (ICs) + CALL RANDOM_NUMBER(f) + + iter = 0 + + print*,"" + print*, "--Start iterations",iter + print*,"" + +!Structed data locality +!$omp target data device(myDevice) map(to:f) map(from:f_k) + + do while (max_err.gt.error.and.iter.le.max_iter) + +!$omp target teams distribute parallel do collapse(2) schedule(static,1) + do j=1,ny + do i=1,nx + d2fx = f(i+1,j) + f(i-1,j) + d2fy = f(i,j+1) + f(i,j-1) + f_k(i,j) = 0.25*(d2fx + d2fy) + enddo + enddo +!$omp end target teams distribute parallel do + + max_err=0. + +!$omp target teams distribute parallel do reduction(max:max_err) & +!$omp collapse(2) schedule(static,1) + do j=1,ny + do i=1,nx + max_err = max(dabs(f_k(i,j) - f(i,j)),max_err) + f(i,j) = f_k(i,j) + enddo + enddo +!$omp end target teams distribute parallel do + +!max_err is copied back to the CPU-host by default + + if(mod(iter,50).eq.0 )write(*,'(i5,f10.6)')iter,max_err + + iter = iter + 1 + + enddo +!$omp end target data + + deallocate(f) + + write(*,'(i5,f10.6)') iter,max_err + + call system_clock(t_final) + + time_s = real(t_final - t_start)/real(count_rate) + + print*, '--Time it takes (s)', time_s + + print*, '--Job is completed successfully--' + print*,'' + +!to check the result + do j=1,ny + write(111,*)j,sum(f_k(:,j)) + enddo + print*,"--Sum",sum(f_k(:,:))/nx/2 + print*,"--END :)" + + deallocate(f_k) + + end diff --git a/_downloads/e64ebb0a37aba83bb786e4620140d7c7/optimized.qdrep b/_downloads/e64ebb0a37aba83bb786e4620140d7c7/optimized.qdrep new file mode 100644 index 000000000..10c378d9e Binary files /dev/null and b/_downloads/e64ebb0a37aba83bb786e4620140d7c7/optimized.qdrep differ diff --git a/_downloads/ec7a0f8bcf2cd85bfbf6d0dd5430080c/syclomatic.def b/_downloads/ec7a0f8bcf2cd85bfbf6d0dd5430080c/syclomatic.def new file mode 100644 index 000000000..12d5f364a --- /dev/null +++ b/_downloads/ec7a0f8bcf2cd85bfbf6d0dd5430080c/syclomatic.def @@ -0,0 +1,11 @@ +Bootstrap: docker +From: nvidia/cuda:12.0.1-devel-ubuntu22.04 + +%post + apt-get update && apt-get install wget -y +%post + ls + wget https://github.com/oneapi-src/SYCLomatic/releases/download/20230208/linux_release.tgz + mkdir syclomatic + tar -xvzf linux_release.tgz -C ./syclomatic + echo "export PATH=${SINGULARITY_ROOTFS}/syclomatic/bin:$PATH" >> $SINGULARITY_ENVIRONMENT \ No newline at end of file diff --git a/_downloads/ed6198c78244a7ded352baab0657654f/submit_gpu.sh b/_downloads/ed6198c78244a7ded352baab0657654f/submit_gpu.sh new file mode 100644 index 000000000..2b7c6176e --- /dev/null +++ b/_downloads/ed6198c78244a7ded352baab0657654f/submit_gpu.sh @@ -0,0 +1,18 @@ +#!/bin/bash +#SBATCH --job-name=TestGPUOnSaga +#SBATCH --account=nnk +#SBATCH --time=05:00 +#SBATCH --mem-per-cpu=4G +#SBATCH --qos=devel +#SBATCH --partition=accel +#SBATCH --gpus=1 + +## Set up job environment: +set -o errexit # Exit the script on any error +set -o nounset # Treat any unset variables as an error + +module --quiet purge # Reset the modules to the system default +module load TensorFlow/2.6.0-foss-2021a-CUDA-11.3.1 +module list + +python gpu_intro.py diff --git a/_downloads/f5bd1278cd15a4be009201cf2031a2a3/laplace_mpi.f90 
b/_downloads/f5bd1278cd15a4be009201cf2031a2a3/laplace_mpi.f90 new file mode 100644 index 000000000..938dd553f --- /dev/null +++ b/_downloads/f5bd1278cd15a4be009201cf2031a2a3/laplace_mpi.f90 @@ -0,0 +1,167 @@ + program laplace_mpi + + use mpi + + implicit none + integer status(MPI_STATUS_SIZE) + integer :: i,j,k,ii + integer :: iter,count_rate, count_max,count,nenv_var + integer :: myid,ierr,nproc,nxp,nyp,tag,tag1,tag2,nsend + integer, parameter :: nx=8192,ny=nx + integer, parameter :: max_iter=525 + double precision, parameter :: pi=4d0*datan(1d0) + real, parameter :: error=0.05 + double precision :: max_err,time_s,& + d2fx,d2fy,max_err_part + real :: t_start,t_final + double precision, allocatable :: f(:,:),f_k(:,:) + double precision, allocatable :: f_send(:,:),f_full(:,:) + + !MPI starts + ! Initialise OpenMPI communication. + call MPI_INIT(ierr) + ! Get number of active processes (from 0 to nproc-1). + call MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierr ) + ! Identify the ID rank (process). + call MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr ) + + t_start = MPI_WTIME() + + if (mod(nx,nproc).ne.0) then + if (myid.eq.0) write(*,*) 'nproc has to divide nx' + stop + else + nxp = nx/nproc + endif + if (mod(ny,nproc).ne.0) then + if (myid.eq.0) write(*,*) 'nproc has to divide ny' + stop + else + nyp = ny/nproc + endif + + if(myid.eq.0) then + print*,'--nbr of proc', nproc + write(*,*)'--nbr of points nx,ny',nx,ny + write(*,*)'--nbr of elmts on each proc, nyp=ny/nproc', nyp + endif + +!Generate the Initial Conditions (ICs) +!Distribute the ICs over all processes using the operation MPI_Scatter + allocate(f(0:nx+1,0:nyp+1)) + + f=0d0; tag1=2020; tag2=2021 + + if(myid.eq.0) then + allocate(f_send(1:nx,1:ny)) + CALL RANDOM_NUMBER(f_send) + endif + + call MPI_Scatter(f_send,nx*nyp,MPI_DOUBLE_PRECISION,& + f(1:nx,1:nyp), nx*nyp,MPI_DOUBLE_PRECISION,& + 0,MPI_COMM_WORLD, ierr) + + call MPI_Barrier(MPI_COMM_WORLD, ierr) + + if(myid.eq.0) deallocate(f_send) + + allocate(f_k(1:nx,1:nyp)) + + iter = 0 + + if(myid.eq.0) then + print*,"" + print*, "--Start iterations",iter + print*,"" + endif + + do while (max_err.gt.error.and.iter.le.max_iter) + +!transfer the data at the boundaries to the neighbouring MPI-process +!send f(:,nyp) from myid-1 to be stored in f(:,0) in myid+1 + if(myid.lt.nproc-1) then + call MPI_Send(f(:,nyp),(nx+2)*1,MPI_DOUBLE_PRECISION,myid+1,tag1,& + MPI_COMM_WORLD, ierr) + endif + +!receive f(:,0) from myid-1 + if(myid.gt.0) then + call MPI_Recv(f(:,0),(nx+2)*1,MPI_DOUBLE_PRECISION,myid-1, & + tag1,MPI_COMM_WORLD, status,ierr) + endif + +!send f(:,1) from myid+1 to be stored in f(:,nyp+1) in myid-1 + if(myid.gt.0) then + call MPI_Send(f(:,1),(nx+2)*1,MPI_DOUBLE_PRECISION,myid-1,tag2,& + MPI_COMM_WORLD, ierr) + endif + +!receive f(:,npy+1) from myid-1 + if(myid.lt.nproc-1) then + call MPI_Recv(f(:,nyp+1),(nx+2)*1,MPI_DOUBLE_PRECISION,myid+1,& + tag2,MPI_COMM_WORLD, status,ierr) + endif + + do j=1,nyp + do i=1,nx + d2fx = f(i+1,j) + f(i-1,j) + d2fy = f(i,j+1) + f(i,j-1) + f_k(i,j) = 0.25*(d2fx + d2fy) + enddo + enddo + + max_err=0. 
+ + do j=1,nyp + do i=1,nx + max_err = max(dabs(f_k(i,j) - f(i,j)),max_err) + f(i,j) = f_k(i,j) + enddo + enddo + + call MPI_ALLREDUCE(MPI_IN_PLACE,max_err,1,& + MPI_DOUBLE_PRECISION,MPI_MAX, MPI_COMM_WORLD,ierr ) + + if(myid.eq.0) then + if(mod(iter,50).eq.0 )write(*,'(i5,f10.6)')iter,max_err + endif + + iter = iter + 1 + + enddo + + deallocate(f) + + if(myid.eq.0) write(*,'(i5,f10.6)') iter,max_err + + call MPI_Barrier(MPI_COMM_WORLD, ierr) + + t_final = MPI_WTIME() + time_s = t_final - t_start + + if(myid.eq.0)print*, '--Time it takes (s)', time_s + + if(myid.eq.0) then + print*, '--Job is completed successfully--' + print*,'' + endif + +!to check the result + allocate(f_full(nx,ny)) + call MPI_Gather(f_k, nx*nyp, MPI_DOUBLE_PRECISION, & + f_full, nx*nyp, MPI_DOUBLE_PRECISION, 0, & + MPI_COMM_WORLD, ierr) + + if(myid.eq.0) then + do j=1,ny + write(111,*)j,sum(f_full(:,j)) + enddo + print*,"--Sum",sum(f_full(:,:))/nx/2 + print*,"--END :)" + endif + + deallocate(f_full,f_k) + + call MPI_FINALIZE( ierr ) + + end diff --git a/_downloads/f75ca82541d728076eaba2dc43cfff03/initial.qdrep b/_downloads/f75ca82541d728076eaba2dc43cfff03/initial.qdrep new file mode 100644 index 000000000..65f8ddc61 Binary files /dev/null and b/_downloads/f75ca82541d728076eaba2dc43cfff03/initial.qdrep differ diff --git a/_downloads/fad9b49630c4e58e2cb381240f9951ec/wave_mpi.c b/_downloads/fad9b49630c4e58e2cb381240f9951ec/wave_mpi.c new file mode 100644 index 000000000..c5d23eea1 --- /dev/null +++ b/_downloads/fad9b49630c4e58e2cb381240f9951ec/wave_mpi.c @@ -0,0 +1,259 @@ +/** + * MPI implementation of the 1D wave equation + */ + +#include +#include +#include +#include +#include + +// Default number of points to calculate over, if not given on command line +static const int NUM_POINTS = 400; +// Default number of steps to perform per point, if not given on command line +static const int NUM_STEPS = 4000; +// Default time interval, if not given on command line +static const double DEFAULT_DT = 0.00125; +// Speed of sound used for calculation +static const double SOUND_SPEED = 1.0; + +// Define MPI tags for program +static const int lower_tag = 1010; // Send to lower rank +static const int upper_tag = 2020; // Send to higher rank +static const int scatter_tag = 3030; // Gather / Scatter data +static const int gather_tag = 4040; // Gather / Scatter data +// MPI Error codes +static const int ALLOC_WAVE_FAIL = 1001; +static const int ALLOC_WAVES_FAIL = 1002; +static const int INITIAL_DIST_RECV = 1003; +static const int LAST_DIST_RECV = 1004; + +// Helper macro to check an MPI call and print error if it failed +#define check_mpi(code, err) \ +if (code != MPI_SUCCESS) { \ + printf("\033[0;31m%s\033[0m\n", err); \ + printf("\tError code: \033[0;31m%d\033[0m\n", code); \ + MPI_Abort(MPI_COMM_WORLD, 1337); \ + return EXIT_FAILURE; \ +} + +/** + * Helper method to calculate the exact solution at 'x' with time step 't' and + * speed of sound 'c' + */ +double exact (const double x, const double t, const double c) { + return sin (2. * M_PI * (x - c * t)); +} + +/** + * Helper function to calculate the partial derivative du/dt + */ +double dudt (const double x, const double t, const double c) { + return -2. * M_PI * c * cos (2. 
* M_PI * (x - c * t)); +} + +int main (int argc, char** argv) { + // Define variables to use in calculation, initialized to default values + int points = NUM_POINTS; + int steps = NUM_STEPS; + double dt = DEFAULT_DT; + + /************************** Command line handling ***************************/ + if (argc > 1) { + if (strncmp (argv[1], "-h", 3) == 0 || strncmp (argv[1], "--help", 7) == 0) { + printf("Usage: \033[0;32m%s\033[0m \n", argv[0]); + return EXIT_SUCCESS; + } + points = atoi (argv[1]); + if (points < 1) { + printf("\033[0;31mThe number of points must be a positive number larger than '1'!\033[0m\n"); + return EXIT_FAILURE; + } + } + if (argc > 2) { + steps = atoi (argv[2]); + if (steps < 0) { + printf("\033[0;31mThe number of steps must be a positive number!\033[0m\n"); + return EXIT_FAILURE; + } + } + if (argc > 3) { + dt = atof (argv[3]); + if (dt <= 0.) { + printf("\033[0;31mTime interval must be larger than '0.0'!\033[0m\n"); + return EXIT_FAILURE; + } + } + + /*************************** MPI work sharing *******************************/ + // Initialize MPI + check_mpi (MPI_Init(&argc, &argv), "Could not initialize MPI!"); + // Extract MPI size and current rank + int num_processes = 1; + int rank = 0; + check_mpi (MPI_Comm_size(MPI_COMM_WORLD, &num_processes), "Could not fetch COMM_WORLD size"); + check_mpi (MPI_Comm_rank(MPI_COMM_WORLD, &rank), "Could not fetch COMM_WORLD rank"); + if (points % num_processes != 0) { + if (rank == 0) { + printf("\033[0;31m%d points can't be split into %d processes!\033[0m\n", points, num_processes); + } + MPI_Finalize(); + return EXIT_FAILURE; + } + const int equal_share = points / num_processes; + // The first and last rank calculates one additional element, while all other + // ranks calculates two additional points + const int local_points = (rank == 0 || rank == num_processes - 1) ? equal_share + 1 : equal_share + 2; + const int local_start = (rank == 0) ? 0 : equal_share * rank - 1; + + /*************************** Implementation *********************************/ + // Define pointer to global result so that we can compile, this variable is + // only allocated on the root rank + double* wave = NULL; + if (rank == 0) { + printf("Calculating 1D wave equation with \033[0;35m%d\033[0m points over \033[0;35m%d\033[0m steps with \033[0;35m%f\033[0m time step\n", + points, steps, dt); + printf("\t...split over \033[0;35m%d\033[0m processes, processing \033[0;35m%d\033[0m points each\n", + num_processes, local_points); + // On the root rank we allocate enough space for the full wave, + // it is used as the full result + wave = calloc (points, sizeof (double)); + if (wave == NULL) { + printf("\033[0;31mCould not allocate %d points for wave results\033[0m\n", points); + // No need to check output, we will shortly exit anyway + MPI_Abort(MPI_COMM_WORLD, ALLOC_WAVE_FAIL); + return EXIT_FAILURE; + } + } + // Allocate memory for local work arrays + double* wave0 = calloc (local_points, sizeof (double)); + double* wave1 = calloc (local_points, sizeof (double)); + double* wave2 = calloc (local_points, sizeof (double)); + if (wave0 == NULL || wave1 == NULL || wave2 == NULL) { + printf("\033[0;31mRank %d could not allocate enough space for arrays!\033[0m\n", rank); + MPI_Abort(MPI_COMM_WORLD, ALLOC_WAVES_FAIL); + return EXIT_FAILURE; + } + const double dx = 1. / ((double) points - 1); + const double alpha = SOUND_SPEED * dt / dx; + const double alpha2 = alpha * alpha; + if (rank == 0) { + if (fabs (alpha) >= 1.) 
{ + printf("\033[0;33mComputation will be unstable with the given parameters\033[0m\n"); + printf("\tdt = %f\n", dt); + printf("\tdx = %f (1. / %d)\n", dx, points); + printf("\t|alpha| = %f\n", fabs (alpha)); + } + // Initialize the wave only on the root rank + for (int i = 0; i < points; i++) { + const double x = (double) i / (double) (points - 1); + wave[i] = exact (x, 0., SOUND_SPEED); + } + // Distribute computation to all other ranks + for (int r = 1; r < num_processes; r++) { + const int index = r * equal_share - 1; + const int num_points = (r < num_processes - 1) ? equal_share + 2 : equal_share + 1; + check_mpi (MPI_Send(&wave[index], num_points, MPI_DOUBLE, r, scatter_tag, MPI_COMM_WORLD), + "Could not distribute data"); + } + // Distribute data to root rank also + for (int i = 0; i < local_points; i++) { + wave0[i] = wave[i]; + wave1[i] = wave0[i]; + } + } else { + MPI_Status out; + check_mpi (MPI_Recv(wave0, local_points, MPI_DOUBLE, 0, scatter_tag, MPI_COMM_WORLD, &out), + "Could not receive data"); + if (out.MPI_ERROR != MPI_SUCCESS) { + printf("\033[0;31mMPI Recv error!\033[0m count: %ld, cancelled: %d, error: %d\n", + out._ucount / sizeof (double), out._cancelled, out.MPI_ERROR); + MPI_Abort(MPI_COMM_WORLD, INITIAL_DIST_RECV); + return EXIT_FAILURE; + } + for (int i = 0; i < local_points; i++) { + wave1[i] = wave0[i]; + } + } + // Subsequent steps utilize the existing arrays for computation + for (int s = 1; s < steps + 1; s++) { + const double t = (double) s * dt; + if (s == 1) { + // First time step we use the initial derivative information to calculate + // the solution + for (int i = 1; i < local_points - 1; i++) { + const double x = (double) (i + local_start) / (double) (points - 1); + wave2[i] = (1. - alpha2) * wave1[i] + + 0.5 * alpha2 * (wave1[i - 1] + wave1[i + 1]) + + dt * dudt (x, t, SOUND_SPEED); + } + } else { + // After first step we use previous calculations for future values + for (int i = 1; i < local_points - 1; i++) { + wave2[i] = 2. * (1. - alpha2) * wave1[i] + + alpha2 * (wave1[i - 1] + wave1[i + 1]) + - wave0[i]; + } + } + // Share data with neighboors + if (rank > 0) { + check_mpi (MPI_Send(&wave2[1], 1, MPI_DOUBLE, rank - 1, lower_tag, MPI_COMM_WORLD), + "Could not send lower update"); + MPI_Status out; + check_mpi (MPI_Recv(&wave2[0], 1, MPI_DOUBLE, rank - 1, upper_tag, MPI_COMM_WORLD, &out), + "Could not receive data for lower update"); + } else { + wave2[0] = exact (0., t, SOUND_SPEED); + } + if (rank < num_processes - 1) { + MPI_Status out; + check_mpi (MPI_Recv(&wave2[local_points - 1], 1, MPI_DOUBLE, rank + 1, lower_tag, MPI_COMM_WORLD, &out), + "Could not receive data for upper update"); + check_mpi (MPI_Send(&wave2[local_points - 2], 1, MPI_DOUBLE, rank + 1, upper_tag, MPI_COMM_WORLD), + "Could not send upper update"); + } else { + wave2[local_points - 1] = exact (1., t, SOUND_SPEED); + } + // Shift data + for (int i = 0; i < local_points; i++) { + wave0[i] = wave1[i]; + wave1[i] = wave2[i]; + } + } + // Synchronize data back to root rank + if (rank == 0) { + printf("Synchronizing results\033[0;33m...\033[0m "); + // Copy root rank data back into result array + for (int i = 0; i < local_points; i++) { + wave[i] = wave1[i]; + } + // Receive data from all other ranks + for (int r = 1; r < num_processes; r++) { + const int index = r * equal_share - 1; + const int num_points = (r < num_processes - 1) ? 
equal_share + 2 : equal_share + 1; + MPI_Status out; + check_mpi (MPI_Recv(&wave[index], num_points, MPI_DOUBLE, r, gather_tag, MPI_COMM_WORLD, &out), + "Could not receive data when gathering result"); + if (out.MPI_ERROR != MPI_SUCCESS) { + printf("\033[0;31mMPI Recv error!\033[0m count: %ld, cancelled: %d, error: %d\n", + out._ucount / sizeof (double), out._cancelled, out.MPI_ERROR); + MPI_Abort(MPI_COMM_WORLD, LAST_DIST_RECV); + return EXIT_FAILURE; + } + } + printf("\033[0;32mcompleted\033[0m!\n"); + printf("Calculation ended \033[0;32msuccesfully\033[0m!\n"); + } else { + check_mpi (MPI_Send(wave1, local_points, MPI_DOUBLE, 0, gather_tag, MPI_COMM_WORLD), + "Could not send data back to root when gathering results"); + } + // Free data before exit + free(wave0); + free(wave1); + free(wave2); + if (rank == 0) { + free(wave); + } + MPI_Finalize(); + return EXIT_SUCCESS; +} diff --git a/_downloads/fb81b0e087aaf4bc9ab7f3fd64e490e7/laplace_mpiacc_aware.f90 b/_downloads/fb81b0e087aaf4bc9ab7f3fd64e490e7/laplace_mpiacc_aware.f90 new file mode 100644 index 000000000..a673599b5 --- /dev/null +++ b/_downloads/fb81b0e087aaf4bc9ab7f3fd64e490e7/laplace_mpiacc_aware.f90 @@ -0,0 +1,237 @@ + program laplace_mpiacc_aware + + use mpi + use openacc + + implicit none + integer status(MPI_STATUS_SIZE) + integer :: i,j,k,ii + integer :: iter,count_rate, count_max,count,nenv_var + integer :: myid,ierr,nproc,nxp,nyp,tag,tag1,tag2,nsend + integer, parameter :: nx=20000,ny=nx + integer, parameter :: max_iter=525 + double precision, parameter :: pi=4d0*datan(1d0) + real, parameter :: error=0.05 + double precision :: max_err,time_s,& + d2fx,d2fy,max_err_part + real :: t_start,t_final + double precision, allocatable :: f(:,:),f_k(:,:) + double precision, allocatable :: f_send(:,:),f_full(:,:) + character(len=300) :: env_var + integer(kind=acc_device_kind) deviceType + integer :: myDevice,numDevice,host_rank,host_comm + + !MPI starts + ! Initialise OpenMPI communication. + call MPI_INIT(ierr) + ! Get number of active processes (from 0 to nproc-1). + call MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierr ) + ! Identify the ID rank (process). + call MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr ) + +!check if GPU-aware support is enabled + if(myid.eq.0) then + print*, '' + call getenv("MPICH_GPU_SUPPORT_ENABLED", env_var) + read(env_var, '(i10)' ) nenv_var + if (nenv_var.eq. 1) then + print *, '--MPICH_GPU_SUPPORT_ENABLED is enabled!' + print*, '' + elseif (nenv_var.ne. 1) then + print *, '--MPICH_GPU_SUPPORT_ENABLED is NOT enabled!' 
+ print *, '--I exit' + call exit(1) + endif + endif + + t_start = MPI_WTIME() + + if (mod(nx,nproc).ne.0) then + if (myid.eq.0) write(*,*) 'nproc has to divide nx' + stop + else + nxp = nx/nproc + endif + if (mod(ny,nproc).ne.0) then + if (myid.eq.0) write(*,*) 'nproc has to divide ny' + stop + else + nyp = ny/nproc + endif + + if(myid.eq.0) then + print*,'--nbr of proc', nproc + write(*,*)'--nbr of points nx,ny',nx,ny + write(*,*)'--nbr of elmts on each proc, nyp=ny/nproc', nyp + endif + +!Generate the Initial Conditions (ICs) +!Distribute the ICs over all processes using the operation MPI_Scatter + allocate(f(0:nx+1,0:nyp+1)) + + f=0d0; tag1=2020; tag2=2021 + + if(myid.eq.0) then + allocate(f_send(1:nx,1:ny)) + CALL RANDOM_NUMBER(f_send) + endif + + call MPI_Scatter(f_send,nx*nyp,MPI_DOUBLE_PRECISION,& + f(1:nx,1:nyp), nx*nyp,MPI_DOUBLE_PRECISION,& + 0,MPI_COMM_WORLD, ierr) + + call MPI_Barrier(MPI_COMM_WORLD, ierr) + + if(myid.eq.0) deallocate(f_send) + +!Set a device: Determine which processes are on each node +!such that each process will be connected to a GPU + +!!Split the world communicator into subgroups of commu, each of which +!contains processes that run on the same node, and which can create a +!shared +!memory region (via the type MPI_COMM_TYPE_SHARED). +!The call returns a new communicator "host_comm", which is created by +!each subgroup. + + call MPI_COMM_SPLIT_TYPE(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,& + MPI_INFO_NULL, host_comm,ierr) + call MPI_COMM_RANK(host_comm, host_rank,ierr) + + myDevice = host_rank + +!returns the device type to be used + deviceType = acc_get_device_type() + +!returns the number of devices available on the host + numDevice = acc_get_num_devices(deviceType) + +!sets the device number and the device type to be used + call acc_set_device_num(myDevice, deviceType) + + if(myid.eq.0)print*, "--Number of devices per node:", numDevice + if(myid.eq.0)print*,"" + + print*, "--MPI rank", myid, "is connected to GPU", myDevice + + allocate(f_k(1:nx,1:nyp)) + + iter = 0 + + if(myid.eq.0) then + print*,"" + print*, "--Start iterations",iter + print*,"" + endif + +!Unstructed data locality +!$acc enter data copyin(f) create(f_k) + do while (max_err.gt.error.and.iter.le.max_iter) + +!Performing MPI_send and MPI_Recv between GPUs without passing through +!the host +!$acc host_data use_device(f) + +!transfer the data at the boundaries to the neighbouring MPI-process +!send f(:,nyp) from myid-1 to be stored in f(:,0) in myid+1 + if(myid.lt.nproc-1) then + call MPI_Send(f(:,nyp),(nx+2)*1,MPI_DOUBLE_PRECISION,myid+1,tag1,& + MPI_COMM_WORLD, ierr) + endif + +!receive f(:,0) from myid-1 + if(myid.gt.0) then + call MPI_Recv(f(:,0),(nx+2)*1,MPI_DOUBLE_PRECISION,myid-1, & + tag1,MPI_COMM_WORLD, status,ierr) + endif + +!send f(:,1) from myid+1 to be stored in f(:,nyp+1) in myid-1 + if(myid.gt.0) then + call MPI_Send(f(:,1),(nx+2)*1,MPI_DOUBLE_PRECISION,myid-1,tag2,& + MPI_COMM_WORLD, ierr) + endif + +!receive f(:,npy+1) from myid-1 + if(myid.lt.nproc-1) then + call MPI_Recv(f(:,nyp+1),(nx+2)*1,MPI_DOUBLE_PRECISION,myid+1,& + tag2,MPI_COMM_WORLD, status,ierr) + endif + +!$acc end host_data + +!$acc parallel loop present(f,f_k) collapse(2) + do j=1,nyp + do i=1,nx + d2fx = f(i+1,j) + f(i-1,j) + d2fy = f(i,j+1) + f(i,j-1) + f_k(i,j) = 0.25*(d2fx + d2fy) + enddo + enddo +!$acc end parallel loop + + max_err=0. 
+ +!$acc parallel loop present(f,f_k) collapse(2) & +!$acc reduction(max:max_err) + do j=1,nyp + do i=1,nx + max_err = max(dabs(f_k(i,j) - f(i,j)),max_err) + f(i,j) = f_k(i,j) + enddo + enddo +!$acc end parallel loop + +!max_err is copied back to the CPU-host by default + +!$acc enter data copyin(max_err) +!Performing MPI_Allreduce between GPUs without passing through the host +!$acc host_data use_device(max_err) + call MPI_ALLREDUCE(MPI_IN_PLACE,max_err,1,& + MPI_DOUBLE_PRECISION,MPI_MAX, MPI_COMM_WORLD,ierr ) +!$acc end host_data +!$acc exit data copyout(max_err) + + if(myid.eq.0) then + if(mod(iter,50).eq.0 )write(*,'(i5,f10.6)')iter,max_err + endif + + iter = iter + 1 + + enddo +!$acc exit data copyout(f_k) delete(f) + + deallocate(f) + + if(myid.eq.0) write(*,'(i5,f10.6)') iter,max_err + + call MPI_Barrier(MPI_COMM_WORLD, ierr) + + t_final = MPI_WTIME() + time_s = t_final - t_start + + if(myid.eq.0)print*, '--Time it takes (s)', time_s + + if(myid.eq.0) then + print*, '--Job is completed successfully--' + print*,'' + endif + +!to check the result + allocate(f_full(nx,ny)) + call MPI_Gather(f_k, nx*nyp, MPI_DOUBLE_PRECISION, & + f_full, nx*nyp, MPI_DOUBLE_PRECISION, 0, & + MPI_COMM_WORLD, ierr) + + if(myid.eq.0) then + do j=1,ny + write(111,*)j,sum(f_full(:,j)) + enddo + print*,"--Sum",sum(f_full(:,:))/nx/2 + print*,"--END :)" + endif + + deallocate(f_full,f_k) + + call MPI_FINALIZE( ierr ) + + end diff --git a/_images/Advisor-1.png b/_images/Advisor-1.png new file mode 100644 index 000000000..ebcb94e06 Binary files /dev/null and b/_images/Advisor-1.png differ diff --git a/_images/Advisor-2.png b/_images/Advisor-2.png new file mode 100644 index 000000000..0ed27243d Binary files /dev/null and b/_images/Advisor-2.png differ diff --git a/_images/Advisor-4.png b/_images/Advisor-4.png new file mode 100644 index 000000000..f8c38c61a Binary files /dev/null and b/_images/Advisor-4.png differ diff --git a/_images/Advisor-6.png b/_images/Advisor-6.png new file mode 100644 index 000000000..c42bc6803 Binary files /dev/null and b/_images/Advisor-6.png differ diff --git a/_images/Advisor-7.png b/_images/Advisor-7.png new file mode 100644 index 000000000..11d3d7cea Binary files /dev/null and b/_images/Advisor-7.png differ diff --git a/_images/Advisor-8.png b/_images/Advisor-8.png new file mode 100644 index 000000000..cf90991b8 Binary files /dev/null and b/_images/Advisor-8.png differ diff --git a/_images/Fig1.png b/_images/Fig1.png new file mode 100644 index 000000000..2b9b43703 Binary files /dev/null and b/_images/Fig1.png differ diff --git a/_images/Fig2.png b/_images/Fig2.png new file mode 100644 index 000000000..f5b06d12b Binary files /dev/null and b/_images/Fig2.png differ diff --git a/_images/Fig3.png b/_images/Fig3.png new file mode 100644 index 000000000..c05988dec Binary files /dev/null and b/_images/Fig3.png differ diff --git a/_images/Gather.svg b/_images/Gather.svg new file mode 100644 index 000000000..be63aa1ce --- /dev/null +++ b/_images/Gather.svg @@ -0,0 +1,3 @@ + + +

[Gather.svg — figure text: "Gather Communication Pattern"; labels: MEMORY INPUT, MEMORY OUTPUT, Threads]
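The Gather.svg figure added above (reduced here to its surviving label text) depicts the gather access pattern: each thread reads from a scattered location in the memory input and writes to its own slot in the memory output. The short C sketch below is not part of this repository's diff; it only illustrates the pattern the figure shows, and the names gather, in, idx and out are hypothetical.
/* Gather pattern sketch (illustration only, not part of the diff):
 * each index i writes contiguously to out[i] but reads indirectly
 * through idx[i], so the reads may touch scattered memory locations. */
void gather(const double *in, const int *idx, double *out, const int n) {
    for (int i = 0; i < n; i++) {
        out[i] = in[idx[i]];
    }
}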
\ No newline at end of file diff --git a/_images/Log-in-feide-innsyn.png b/_images/Log-in-feide-innsyn.png new file mode 100644 index 000000000..ae2b88658 Binary files /dev/null and b/_images/Log-in-feide-innsyn.png differ diff --git a/_images/MAP.svg b/_images/MAP.svg new file mode 100644 index 000000000..dbac830d8 --- /dev/null +++ b/_images/MAP.svg @@ -0,0 +1,3 @@ + + +

[MAP.svg — figure text: "MAP Communication Pattern"; labels: MEMORY INPUT, MEMORY OUTPUT, Threads]
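By contrast, the MAP.svg figure (likewise reduced to its label text) depicts the map pattern: each thread reads and writes only its own element, so input and output indices stay aligned. Again a hypothetical C sketch, not part of the diff, using an assumed element-wise squaring operation.
/* Map pattern sketch (illustration only, not part of the diff):
 * element i of the output depends only on element i of the input,
 * so all iterations are independent and trivially parallel. */
void map_square(const double *in, double *out, const int n) {
    for (int i = 0; i < n; i++) {
        out[i] = in[i] * in[i];
    }
}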
\ No newline at end of file diff --git a/_images/MainWindowNNS1.png b/_images/MainWindowNNS1.png new file mode 100644 index 000000000..31f5fb564 Binary files /dev/null and b/_images/MainWindowNNS1.png differ diff --git a/_images/NNSprojectexplorer2.png b/_images/NNSprojectexplorer2.png new file mode 100644 index 000000000..4253ea310 Binary files /dev/null and b/_images/NNSprojectexplorer2.png differ diff --git a/_images/Perf-reports-1.png b/_images/Perf-reports-1.png new file mode 100644 index 000000000..d9f980c8c Binary files /dev/null and b/_images/Perf-reports-1.png differ diff --git a/_images/Perf-reports-2.png b/_images/Perf-reports-2.png new file mode 100644 index 000000000..4503b460e Binary files /dev/null and b/_images/Perf-reports-2.png differ diff --git a/_images/Perf-reports-3.png b/_images/Perf-reports-3.png new file mode 100644 index 000000000..91ab6fea5 Binary files /dev/null and b/_images/Perf-reports-3.png differ diff --git a/_images/Perf-reports-4.png b/_images/Perf-reports-4.png new file mode 100644 index 000000000..e00f57b40 Binary files /dev/null and b/_images/Perf-reports-4.png differ diff --git a/_images/Perf-reports-5.png b/_images/Perf-reports-5.png new file mode 100644 index 000000000..75bea3abf Binary files /dev/null and b/_images/Perf-reports-5.png differ diff --git a/_images/Perf-reports-6.png b/_images/Perf-reports-6.png new file mode 100644 index 000000000..97be27ac6 Binary files /dev/null and b/_images/Perf-reports-6.png differ diff --git a/_images/Perf-reports-7.png b/_images/Perf-reports-7.png new file mode 100644 index 000000000..77dca5046 Binary files /dev/null and b/_images/Perf-reports-7.png differ diff --git a/_images/Picture1.png b/_images/Picture1.png new file mode 100644 index 000000000..8a18696cf Binary files /dev/null and b/_images/Picture1.png differ diff --git a/_images/Picture2.png b/_images/Picture2.png new file mode 100644 index 000000000..503262bd2 Binary files /dev/null and b/_images/Picture2.png differ diff --git a/_images/Picture3.png b/_images/Picture3.png new file mode 100644 index 000000000..ec64d275b Binary files /dev/null and b/_images/Picture3.png differ diff --git a/_images/Picture4.png b/_images/Picture4.png new file mode 100644 index 000000000..8347a9b00 Binary files /dev/null and b/_images/Picture4.png differ diff --git a/_images/Speedup.png b/_images/Speedup.png new file mode 100644 index 000000000..54ce0b3a9 Binary files /dev/null and b/_images/Speedup.png differ diff --git a/_images/X2Go_First.png b/_images/X2Go_First.png new file mode 100644 index 000000000..34be73fe2 Binary files /dev/null and b/_images/X2Go_First.png differ diff --git a/_images/X2Go_SessionSetup.png b/_images/X2Go_SessionSetup.png new file mode 100644 index 000000000..6dc5e6810 Binary files /dev/null and b/_images/X2Go_SessionSetup.png differ diff --git a/_images/X2Go_SessionSetupSSH.png b/_images/X2Go_SessionSetupSSH.png new file mode 100644 index 000000000..611fce025 Binary files /dev/null and b/_images/X2Go_SessionSetupSSH.png differ diff --git a/_images/X2Go_SessionStart.png b/_images/X2Go_SessionStart.png new file mode 100644 index 000000000..379e00f1a Binary files /dev/null and b/_images/X2Go_SessionStart.png differ diff --git a/_images/all2all.png b/_images/all2all.png new file mode 100644 index 000000000..283ac7fa4 Binary files /dev/null and b/_images/all2all.png differ diff --git a/_images/anim_out.gif b/_images/anim_out.gif new file mode 100644 index 000000000..dddba143d Binary files /dev/null and b/_images/anim_out.gif differ diff --git 
a/_images/application_list.png b/_images/application_list.png new file mode 100644 index 000000000..eac1450eb Binary files /dev/null and b/_images/application_list.png differ diff --git a/_images/application_overview.png b/_images/application_overview.png new file mode 100644 index 000000000..a4705de8d Binary files /dev/null and b/_images/application_overview.png differ diff --git a/_images/application_reconfigure.png b/_images/application_reconfigure.png new file mode 100644 index 000000000..708c7b3ce Binary files /dev/null and b/_images/application_reconfigure.png differ diff --git a/_images/application_resources.png b/_images/application_resources.png new file mode 100644 index 000000000..9d6a4ea8d Binary files /dev/null and b/_images/application_resources.png differ diff --git a/_images/async_timeline.png b/_images/async_timeline.png new file mode 100644 index 000000000..29b9aef54 Binary files /dev/null and b/_images/async_timeline.png differ diff --git a/_images/b2access1.png b/_images/b2access1.png new file mode 100644 index 000000000..f92eb33af Binary files /dev/null and b/_images/b2access1.png differ diff --git a/_images/b2access2.png b/_images/b2access2.png new file mode 100644 index 000000000..3a03fed3d Binary files /dev/null and b/_images/b2access2.png differ diff --git a/_images/barrier.png b/_images/barrier.png new file mode 100644 index 000000000..739de7b28 Binary files /dev/null and b/_images/barrier.png differ diff --git a/_images/blade_fram.png b/_images/blade_fram.png new file mode 100644 index 000000000..2677e662c Binary files /dev/null and b/_images/blade_fram.png differ diff --git a/_images/chassis_fram.png b/_images/chassis_fram.png new file mode 100644 index 000000000..f8d5a75f3 Binary files /dev/null and b/_images/chassis_fram.png differ diff --git a/_images/compiler-perf.png b/_images/compiler-perf.png new file mode 100644 index 000000000..e38fb7490 Binary files /dev/null and b/_images/compiler-perf.png differ diff --git a/_images/cpus_20tasks.png b/_images/cpus_20tasks.png new file mode 100644 index 000000000..e84538552 Binary files /dev/null and b/_images/cpus_20tasks.png differ diff --git a/_images/custom-image.png b/_images/custom-image.png new file mode 100644 index 000000000..f868298b2 Binary files /dev/null and b/_images/custom-image.png differ diff --git a/_images/dask_taskgraph.png b/_images/dask_taskgraph.png new file mode 100644 index 000000000..a139f0dbe Binary files /dev/null and b/_images/dask_taskgraph.png differ diff --git a/_images/deep_learning_gpu_machine_type.png b/_images/deep_learning_gpu_machine_type.png new file mode 100644 index 000000000..659ef44fa Binary files /dev/null and b/_images/deep_learning_gpu_machine_type.png differ diff --git a/_images/feide-guest-authentication.png b/_images/feide-guest-authentication.png new file mode 100644 index 000000000..d0c0acea6 Binary files /dev/null and b/_images/feide-guest-authentication.png differ diff --git a/_images/feide-innsyn-groups.png b/_images/feide-innsyn-groups.png new file mode 100644 index 000000000..5433c7b6e Binary files /dev/null and b/_images/feide-innsyn-groups.png differ diff --git a/_images/fig-acc.jpeg b/_images/fig-acc.jpeg new file mode 100644 index 000000000..b25376d85 Binary files /dev/null and b/_images/fig-acc.jpeg differ diff --git a/_images/fig-arch-volta.jpg b/_images/fig-arch-volta.jpg new file mode 100644 index 000000000..7123791b8 Binary files /dev/null and b/_images/fig-arch-volta.jpg differ diff --git a/_images/fig-hardware.jpg b/_images/fig-hardware.jpg new file 
mode 100644 index 000000000..49abef649 Binary files /dev/null and b/_images/fig-hardware.jpg differ diff --git a/_images/fig-omp.jpg b/_images/fig-omp.jpg new file mode 100644 index 000000000..b84768471 Binary files /dev/null and b/_images/fig-omp.jpg differ diff --git a/_images/fig-software.jpg b/_images/fig-software.jpg new file mode 100644 index 000000000..067e5c1ca Binary files /dev/null and b/_images/fig-software.jpg differ diff --git a/_images/fig0.png b/_images/fig0.png new file mode 100644 index 000000000..fdfc46489 Binary files /dev/null and b/_images/fig0.png differ diff --git a/_images/fig00.png b/_images/fig00.png new file mode 100644 index 000000000..c46c8f42b Binary files /dev/null and b/_images/fig00.png differ diff --git a/_images/fig1.png b/_images/fig1.png new file mode 100644 index 000000000..72f3694a0 Binary files /dev/null and b/_images/fig1.png differ diff --git a/_images/fig11.png b/_images/fig11.png new file mode 100644 index 000000000..e59d90e54 Binary files /dev/null and b/_images/fig11.png differ diff --git a/_images/fig2.png b/_images/fig2.png new file mode 100644 index 000000000..66d46fec7 Binary files /dev/null and b/_images/fig2.png differ diff --git a/_images/fig21.png b/_images/fig21.png new file mode 100644 index 000000000..0d19ff891 Binary files /dev/null and b/_images/fig21.png differ diff --git a/_images/fig3.png b/_images/fig3.png new file mode 100644 index 000000000..ece32e776 Binary files /dev/null and b/_images/fig3.png differ diff --git a/_images/fig31.png b/_images/fig31.png new file mode 100644 index 000000000..ded34b9d8 Binary files /dev/null and b/_images/fig31.png differ diff --git a/_images/fig4.png b/_images/fig4.png new file mode 100644 index 000000000..350813031 Binary files /dev/null and b/_images/fig4.png differ diff --git a/_images/fig41.png b/_images/fig41.png new file mode 100644 index 000000000..96071870e Binary files /dev/null and b/_images/fig41.png differ diff --git a/_images/fig5.png b/_images/fig5.png new file mode 100644 index 000000000..16ff3dff5 Binary files /dev/null and b/_images/fig5.png differ diff --git a/_images/fig51.png b/_images/fig51.png new file mode 100644 index 000000000..41075e512 Binary files /dev/null and b/_images/fig51.png differ diff --git a/_images/fig6.png b/_images/fig6.png new file mode 100644 index 000000000..ee6ddcfa2 Binary files /dev/null and b/_images/fig6.png differ diff --git a/_images/figure_10_screenshot_of_the_edit_dataset_page._the_manage_menu_is_indicated_by_a_black_box.png b/_images/figure_10_screenshot_of_the_edit_dataset_page._the_manage_menu_is_indicated_by_a_black_box.png new file mode 100644 index 000000000..4fb3c5c34 Binary files /dev/null and b/_images/figure_10_screenshot_of_the_edit_dataset_page._the_manage_menu_is_indicated_by_a_black_box.png differ diff --git a/_images/figure_11_screenshot_of_the_landing_page._the_manage_link_is_indicated_by_black_box.png b/_images/figure_11_screenshot_of_the_landing_page._the_manage_link_is_indicated_by_black_box.png new file mode 100644 index 000000000..37d70f29a Binary files /dev/null and b/_images/figure_11_screenshot_of_the_landing_page._the_manage_link_is_indicated_by_black_box.png differ diff --git a/_images/figure_12_screenshot_of_the_manage_dataset_menu.png b/_images/figure_12_screenshot_of_the_manage_dataset_menu.png new file mode 100644 index 000000000..192c70d09 Binary files /dev/null and b/_images/figure_12_screenshot_of_the_manage_dataset_menu.png differ diff --git a/_images/figure_13_screenshot_of_the_version_dataset_upload.png 
b/_images/figure_13_screenshot_of_the_version_dataset_upload.png new file mode 100644 index 000000000..02a67e7da Binary files /dev/null and b/_images/figure_13_screenshot_of_the_version_dataset_upload.png differ diff --git a/_images/figure_14_screenshot_of_the_advanced_search_interface.png b/_images/figure_14_screenshot_of_the_advanced_search_interface.png new file mode 100644 index 000000000..596977f20 Binary files /dev/null and b/_images/figure_14_screenshot_of_the_advanced_search_interface.png differ diff --git a/_images/figure_15_screenshot_of_the_landing_page_table_of_contents.png b/_images/figure_15_screenshot_of_the_landing_page_table_of_contents.png new file mode 100644 index 000000000..7c0d48f82 Binary files /dev/null and b/_images/figure_15_screenshot_of_the_landing_page_table_of_contents.png differ diff --git a/_images/figure_17_screenshot_of_landing_page_with_dataset_citation.png b/_images/figure_17_screenshot_of_landing_page_with_dataset_citation.png new file mode 100644 index 000000000..31109fd75 Binary files /dev/null and b/_images/figure_17_screenshot_of_landing_page_with_dataset_citation.png differ diff --git a/_images/figure_1_screenshot_of_the_archive_web_interface_front_page.png b/_images/figure_1_screenshot_of_the_archive_web_interface_front_page.png new file mode 100644 index 000000000..bf9e0bc8e Binary files /dev/null and b/_images/figure_1_screenshot_of_the_archive_web_interface_front_page.png differ diff --git a/_images/figure_2_screenshot_of_the_terms_and_conditions_page.png b/_images/figure_2_screenshot_of_the_terms_and_conditions_page.png new file mode 100644 index 000000000..e3e8276ba Binary files /dev/null and b/_images/figure_2_screenshot_of_the_terms_and_conditions_page.png differ diff --git a/_images/figure_3_screenshot_of_the_publication_form.png b/_images/figure_3_screenshot_of_the_publication_form.png new file mode 100644 index 000000000..c3197619e Binary files /dev/null and b/_images/figure_3_screenshot_of_the_publication_form.png differ diff --git a/_images/figure_4_screenshot_of_the_primary_metadata_form_0.png b/_images/figure_4_screenshot_of_the_primary_metadata_form_0.png new file mode 100644 index 000000000..81110b458 Binary files /dev/null and b/_images/figure_4_screenshot_of_the_primary_metadata_form_0.png differ diff --git a/_images/figure_5_screenshot_of_the_upload_dataset_page.png b/_images/figure_5_screenshot_of_the_upload_dataset_page.png new file mode 100644 index 000000000..ad01dbf85 Binary files /dev/null and b/_images/figure_5_screenshot_of_the_upload_dataset_page.png differ diff --git a/_images/figure_6_screenshot_of_the_upload_for_feide_users.png b/_images/figure_6_screenshot_of_the_upload_for_feide_users.png new file mode 100644 index 000000000..618471554 Binary files /dev/null and b/_images/figure_6_screenshot_of_the_upload_for_feide_users.png differ diff --git a/_images/figure_7_screenshot_of_the_list_of_datasets.png b/_images/figure_7_screenshot_of_the_list_of_datasets.png new file mode 100644 index 000000000..ee133fe3c Binary files /dev/null and b/_images/figure_7_screenshot_of_the_list_of_datasets.png differ diff --git a/_images/figure_8_screenshot_of_the_secondary_metadata_form.png b/_images/figure_8_screenshot_of_the_secondary_metadata_form.png new file mode 100644 index 000000000..08837ce03 Binary files /dev/null and b/_images/figure_8_screenshot_of_the_secondary_metadata_form.png differ diff --git a/_images/figure_9_screenshot_of_the_table_of_contents.png b/_images/figure_9_screenshot_of_the_table_of_contents.png new file 
mode 100644 index 000000000..d40117a83 Binary files /dev/null and b/_images/figure_9_screenshot_of_the_table_of_contents.png differ diff --git a/_images/flowchart.svg b/_images/flowchart.svg new file mode 100644 index 000000000..64279f749 --- /dev/null +++ b/_images/flowchart.svg @@ -0,0 +1,3 @@ + + +
[flowchart.svg text labels: Allocate memory to store cells of grid and next_grid; Apply boundary condition; Calculate 'Temp_Next' for each cell; Initialize constants, like diffusion constant, grid_size, cell_size etc.; Swap grid with next_grid; for simulation time steps; Free memory]
\ No newline at end of file diff --git a/_images/g16-mem.png b/_images/g16-mem.png new file mode 100644 index 000000000..3712b23d9 Binary files /dev/null and b/_images/g16-mem.png differ diff --git a/_images/g16-runtimes.png b/_images/g16-runtimes.png new file mode 100644 index 000000000..2a55e8eea Binary files /dev/null and b/_images/g16-runtimes.png differ diff --git a/_images/g16-speedup.png b/_images/g16-speedup.png new file mode 100644 index 000000000..f568f1588 Binary files /dev/null and b/_images/g16-speedup.png differ diff --git a/_images/grid.svg b/_images/grid.svg new file mode 100644 index 000000000..0c3eec438 --- /dev/null +++ b/_images/grid.svg @@ -0,0 +1,3 @@ + + +
[grid.svg text labels: i,j; i, j-1; i+1, j; i, j+1; i-1, j; h]
\ No newline at end of file diff --git a/_images/grid_block.svg b/_images/grid_block.svg new file mode 100644 index 000000000..c7a6110ac --- /dev/null +++ b/_images/grid_block.svg @@ -0,0 +1,3 @@ + + +
[grid_block.svg text labels: CPU; KERNEL; GPU; GRID; Block (0,0) through Block (2,1); Block(1, 1); Thread (0,0) through Thread (3,2)]
\ No newline at end of file diff --git a/_images/halo.svg b/_images/halo.svg new file mode 100644 index 000000000..cb9ab3ed5 --- /dev/null +++ b/_images/halo.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/_images/initial_timeline_overview.png b/_images/initial_timeline_overview.png new file mode 100644 index 000000000..1a973fff0 Binary files /dev/null and b/_images/initial_timeline_overview.png differ diff --git a/_images/initial_zoom1.png b/_images/initial_zoom1.png new file mode 100644 index 000000000..510e8ecca Binary files /dev/null and b/_images/initial_zoom1.png differ diff --git a/_images/initial_zoom2.png b/_images/initial_zoom2.png new file mode 100644 index 000000000..0e337a02c Binary files /dev/null and b/_images/initial_zoom2.png differ diff --git a/_images/insidenode_fram.png b/_images/insidenode_fram.png new file mode 100644 index 000000000..c619d57f2 Binary files /dev/null and b/_images/insidenode_fram.png differ diff --git a/_images/jobcontrol.png b/_images/jobcontrol.png new file mode 100644 index 000000000..ec9ec51cc Binary files /dev/null and b/_images/jobcontrol.png differ diff --git a/_images/jupyter_package.png b/_images/jupyter_package.png new file mode 100644 index 000000000..5250a3f22 Binary files /dev/null and b/_images/jupyter_package.png differ diff --git a/_images/jupyter_package_install.png b/_images/jupyter_package_install.png new file mode 100644 index 000000000..a0b0ec06a Binary files /dev/null and b/_images/jupyter_package_install.png differ diff --git a/_images/jupyter_pip_install.png b/_images/jupyter_pip_install.png new file mode 100644 index 000000000..e62ccefa6 Binary files /dev/null and b/_images/jupyter_pip_install.png differ diff --git a/_images/jupyter_plotly.png b/_images/jupyter_plotly.png new file mode 100644 index 000000000..0795df575 Binary files /dev/null and b/_images/jupyter_plotly.png differ diff --git a/_images/jupyterhub_admin.png b/_images/jupyterhub_admin.png new file mode 100644 index 000000000..aac9be3ff Binary files /dev/null and b/_images/jupyterhub_admin.png differ diff --git a/_images/jupyterhub_control_panel.png b/_images/jupyterhub_control_panel.png new file mode 100644 index 000000000..9cb293ea3 Binary files /dev/null and b/_images/jupyterhub_control_panel.png differ diff --git a/_images/jupyterhub_jup.png b/_images/jupyterhub_jup.png new file mode 100644 index 000000000..c7c6db98f Binary files /dev/null and b/_images/jupyterhub_jup.png differ diff --git a/_images/jupyterhub_shared_data.png b/_images/jupyterhub_shared_data.png new file mode 100644 index 000000000..28a1e5880 Binary files /dev/null and b/_images/jupyterhub_shared_data.png differ diff --git a/_images/library.png b/_images/library.png new file mode 100644 index 000000000..3874e70b9 Binary files /dev/null and b/_images/library.png differ diff --git a/_images/localscratch.png b/_images/localscratch.png new file mode 100644 index 000000000..10819e236 Binary files /dev/null and b/_images/localscratch.png differ diff --git a/_images/machine_type.png b/_images/machine_type.png new file mode 100644 index 000000000..6a73c2653 Binary files /dev/null and b/_images/machine_type.png differ diff --git a/_images/manage-feide-ad-hoc-group.png b/_images/manage-feide-ad-hoc-group.png new file mode 100644 index 000000000..5fcbf3f95 Binary files /dev/null and b/_images/manage-feide-ad-hoc-group.png differ diff --git a/_images/minio_bucket.png b/_images/minio_bucket.png new file mode 100644 index 000000000..214c6c6f1 Binary files /dev/null and 
b/_images/minio_bucket.png differ diff --git a/_images/minio_my_bucket.png b/_images/minio_my_bucket.png new file mode 100644 index 000000000..7fdc64731 Binary files /dev/null and b/_images/minio_my_bucket.png differ diff --git a/_images/minio_overview.png b/_images/minio_overview.png new file mode 100644 index 000000000..74cc81be8 Binary files /dev/null and b/_images/minio_overview.png differ diff --git a/_images/minio_sharing.png b/_images/minio_sharing.png new file mode 100644 index 000000000..f83a080ca Binary files /dev/null and b/_images/minio_sharing.png differ diff --git a/_images/minio_upload_file.png b/_images/minio_upload_file.png new file mode 100644 index 000000000..c343da77c Binary files /dev/null and b/_images/minio_upload_file.png differ diff --git a/_images/nird-archive-figure-16.png b/_images/nird-archive-figure-16.png new file mode 100644 index 000000000..f71599cf6 Binary files /dev/null and b/_images/nird-archive-figure-16.png differ diff --git a/_images/njobs.png b/_images/njobs.png new file mode 100644 index 000000000..78c305563 Binary files /dev/null and b/_images/njobs.png differ diff --git a/_images/nnsconst8.png b/_images/nnsconst8.png new file mode 100644 index 000000000..db7f07ab4 Binary files /dev/null and b/_images/nnsconst8.png differ diff --git a/_images/nnscuda7.png b/_images/nnscuda7.png new file mode 100644 index 000000000..afbb72d30 Binary files /dev/null and b/_images/nnscuda7.png differ diff --git a/_images/nnshoverzoom_6.png b/_images/nnshoverzoom_6.png new file mode 100644 index 000000000..a900a0268 Binary files /dev/null and b/_images/nnshoverzoom_6.png differ diff --git a/_images/nnskernels_4.png b/_images/nnskernels_4.png new file mode 100644 index 000000000..0a6aaf96c Binary files /dev/null and b/_images/nnskernels_4.png differ diff --git a/_images/nnsslct_kernel_5.png b/_images/nnsslct_kernel_5.png new file mode 100644 index 000000000..ab2593318 Binary files /dev/null and b/_images/nnsslct_kernel_5.png differ diff --git a/_images/nsight_after_data.png b/_images/nsight_after_data.png new file mode 100644 index 000000000..e03630c6e Binary files /dev/null and b/_images/nsight_after_data.png differ diff --git a/_images/nsight_analysis.png b/_images/nsight_analysis.png new file mode 100644 index 000000000..0560c18f6 Binary files /dev/null and b/_images/nsight_analysis.png differ diff --git a/_images/nsight_annotated.png b/_images/nsight_annotated.png new file mode 100644 index 000000000..fb8dcbcbf Binary files /dev/null and b/_images/nsight_annotated.png differ diff --git a/_images/nsight_blank.png b/_images/nsight_blank.png new file mode 100644 index 000000000..7c2b2ecc3 Binary files /dev/null and b/_images/nsight_blank.png differ diff --git a/_images/nsight_bottom_up.png b/_images/nsight_bottom_up.png new file mode 100644 index 000000000..2ce539c30 Binary files /dev/null and b/_images/nsight_bottom_up.png differ diff --git a/_images/nsight_diagnostics.png b/_images/nsight_diagnostics.png new file mode 100644 index 000000000..52b9ae07f Binary files /dev/null and b/_images/nsight_diagnostics.png differ diff --git a/_images/nsight_final_optimized.png b/_images/nsight_final_optimized.png new file mode 100644 index 000000000..67c581b7d Binary files /dev/null and b/_images/nsight_final_optimized.png differ diff --git a/_images/nsight_optimized_zoom.png b/_images/nsight_optimized_zoom.png new file mode 100644 index 000000000..e459e6491 Binary files /dev/null and b/_images/nsight_optimized_zoom.png differ diff --git a/_images/nsight_timeline.png 
b/_images/nsight_timeline.png new file mode 100644 index 000000000..336f48fd1 Binary files /dev/null and b/_images/nsight_timeline.png differ diff --git a/_images/nsight_timeline2.png b/_images/nsight_timeline2.png new file mode 100644 index 000000000..b0bd04202 Binary files /dev/null and b/_images/nsight_timeline2.png differ diff --git a/_images/nsight_zoom1.png b/_images/nsight_zoom1.png new file mode 100644 index 000000000..04adb4e25 Binary files /dev/null and b/_images/nsight_zoom1.png differ diff --git a/_images/nsight_zoom2.png b/_images/nsight_zoom2.png new file mode 100644 index 000000000..65f942934 Binary files /dev/null and b/_images/nsight_zoom2.png differ diff --git a/_images/nsight_zoom3.png b/_images/nsight_zoom3.png new file mode 100644 index 000000000..5d62fddc9 Binary files /dev/null and b/_images/nsight_zoom3.png differ diff --git a/_images/ntasks.png b/_images/ntasks.png new file mode 100644 index 000000000..3a644e9b9 Binary files /dev/null and b/_images/ntasks.png differ diff --git a/_images/open-connection-highlighted.png b/_images/open-connection-highlighted.png new file mode 100644 index 000000000..697607128 Binary files /dev/null and b/_images/open-connection-highlighted.png differ diff --git a/_images/open-connection2.png b/_images/open-connection2.png new file mode 100644 index 000000000..0ada0a842 Binary files /dev/null and b/_images/open-connection2.png differ diff --git a/_images/open-folder.png b/_images/open-folder.png new file mode 100644 index 000000000..251d7f689 Binary files /dev/null and b/_images/open-folder.png differ diff --git a/_images/opslog-banner.png b/_images/opslog-banner.png new file mode 100644 index 000000000..5d878b0f5 Binary files /dev/null and b/_images/opslog-banner.png differ diff --git a/_images/opslog-expand-groups.png b/_images/opslog-expand-groups.png new file mode 100644 index 000000000..4b902f003 Binary files /dev/null and b/_images/opslog-expand-groups.png differ diff --git a/_images/opslog-incident-card.png b/_images/opslog-incident-card.png new file mode 100644 index 000000000..12239b717 Binary files /dev/null and b/_images/opslog-incident-card.png differ diff --git a/_images/opslog-incident-details.png b/_images/opslog-incident-details.png new file mode 100644 index 000000000..540750e36 Binary files /dev/null and b/_images/opslog-incident-details.png differ diff --git a/_images/opslog-motd.png b/_images/opslog-motd.png new file mode 100644 index 000000000..cdebd46a0 Binary files /dev/null and b/_images/opslog-motd.png differ diff --git a/_images/opslog-subscribe.png b/_images/opslog-subscribe.png new file mode 100644 index 000000000..8f5655bc4 Binary files /dev/null and b/_images/opslog-subscribe.png differ diff --git a/_images/optgain.png b/_images/optgain.png new file mode 100644 index 000000000..01c11e3d1 Binary files /dev/null and b/_images/optgain.png differ diff --git a/_images/perf_report_barrier.png b/_images/perf_report_barrier.png new file mode 100644 index 000000000..dd6d06364 Binary files /dev/null and b/_images/perf_report_barrier.png differ diff --git a/_images/perf_report_linpack.png b/_images/perf_report_linpack.png new file mode 100644 index 000000000..c5b5eae6f Binary files /dev/null and b/_images/perf_report_linpack.png differ diff --git a/_images/perf_report_stream.png b/_images/perf_report_stream.png new file mode 100644 index 000000000..1d2cc0c32 Binary files /dev/null and b/_images/perf_report_stream.png differ diff --git a/_images/persistent_storage.png b/_images/persistent_storage.png new file mode 
100644 index 000000000..43548aea2 Binary files /dev/null and b/_images/persistent_storage.png differ diff --git a/_images/plan3.png b/_images/plan3.png new file mode 100644 index 000000000..87aaa141d Binary files /dev/null and b/_images/plan3.png differ diff --git a/_images/process.png b/_images/process.png new file mode 100644 index 000000000..6d48e3029 Binary files /dev/null and b/_images/process.png differ diff --git a/_images/pvserver_overview.png b/_images/pvserver_overview.png new file mode 100644 index 000000000..47f2a69c4 Binary files /dev/null and b/_images/pvserver_overview.png differ diff --git a/_images/requested_resources.png b/_images/requested_resources.png new file mode 100644 index 000000000..0a642567f Binary files /dev/null and b/_images/requested_resources.png differ diff --git a/_images/rstudio_shiny.png b/_images/rstudio_shiny.png new file mode 100644 index 000000000..866e8f0cb Binary files /dev/null and b/_images/rstudio_shiny.png differ diff --git a/_images/rstudio_shiny_app.png b/_images/rstudio_shiny_app.png new file mode 100644 index 000000000..d6ecb2781 Binary files /dev/null and b/_images/rstudio_shiny_app.png differ diff --git a/_images/scaling.png b/_images/scaling.png new file mode 100644 index 000000000..18733aa76 Binary files /dev/null and b/_images/scaling.png differ diff --git a/_images/settings.png b/_images/settings.png new file mode 100644 index 000000000..cab712e34 Binary files /dev/null and b/_images/settings.png differ diff --git a/_images/spark_app_details.png b/_images/spark_app_details.png new file mode 100644 index 000000000..46560b811 Binary files /dev/null and b/_images/spark_app_details.png differ diff --git a/_images/spark_app_overview.png b/_images/spark_app_overview.png new file mode 100644 index 000000000..96ab040b7 Binary files /dev/null and b/_images/spark_app_overview.png differ diff --git a/_images/spark_jobs_overview.png b/_images/spark_jobs_overview.png new file mode 100644 index 000000000..4c270b528 Binary files /dev/null and b/_images/spark_jobs_overview.png differ diff --git a/_images/ssh-plugin-highlighted.png b/_images/ssh-plugin-highlighted.png new file mode 100644 index 000000000..9bd52d005 Binary files /dev/null and b/_images/ssh-plugin-highlighted.png differ diff --git a/_images/statistics.png b/_images/statistics.png new file mode 100644 index 000000000..fbe464159 Binary files /dev/null and b/_images/statistics.png differ diff --git a/_images/stencil.svg b/_images/stencil.svg new file mode 100644 index 000000000..90783010b --- /dev/null +++ b/_images/stencil.svg @@ -0,0 +1,3 @@ + + +
[stencil.svg text labels: in [ ]; x+4h; x+3h; x+2h; x+h; x; x-h; x-2h; x-3h; x-4h; f(x); out [ ]]
\ No newline at end of file diff --git a/_images/terminal.png b/_images/terminal.png new file mode 100644 index 000000000..70e876fa3 Binary files /dev/null and b/_images/terminal.png differ diff --git a/_images/vtune_bottomup.png b/_images/vtune_bottomup.png new file mode 100644 index 000000000..ac728c591 Binary files /dev/null and b/_images/vtune_bottomup.png differ diff --git a/_images/vtune_opt_gcc.png b/_images/vtune_opt_gcc.png new file mode 100644 index 000000000..c79b7f520 Binary files /dev/null and b/_images/vtune_opt_gcc.png differ diff --git a/_images/vtune_opt_intel.png b/_images/vtune_opt_intel.png new file mode 100644 index 000000000..dcde3e9e5 Binary files /dev/null and b/_images/vtune_opt_intel.png differ diff --git a/_images/vtune_summary.png b/_images/vtune_summary.png new file mode 100644 index 000000000..f7e6a37c6 Binary files /dev/null and b/_images/vtune_summary.png differ diff --git a/_images/wave_loop_profile.png b/_images/wave_loop_profile.png new file mode 100644 index 000000000..71bd80169 Binary files /dev/null and b/_images/wave_loop_profile.png differ diff --git a/_images/wave_scaling.svg b/_images/wave_scaling.svg new file mode 100644 index 000000000..b03eaca17 --- /dev/null +++ b/_images/wave_scaling.svg @@ -0,0 +1,203 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +0 +20 +40 +60 +1200000 +2400000 +3600000 +4800000 +6000000 +7200000 +8400000 +9600000 +10800000 +12000000 +Number of points to solve +Time in seconds + + + + + + + + + + + + +OpenACC +MPI + OpenACC +MPI + OpenMP + diff --git a/_images/xAVX2gain.png b/_images/xAVX2gain.png new file mode 100644 index 000000000..468a2d0ce Binary files /dev/null and b/_images/xAVX2gain.png differ diff --git a/_images/yourplans.png b/_images/yourplans.png new file mode 100644 index 000000000..9bc5cafd8 Binary files /dev/null and b/_images/yourplans.png differ diff --git a/_images/yourplans2.png b/_images/yourplans2.png new file mode 100644 index 000000000..50fbc02f2 Binary files /dev/null and b/_images/yourplans2.png differ diff --git a/_sources/about_us.md.txt b/_sources/about_us.md.txt new file mode 100644 index 000000000..2af495c2b --- /dev/null +++ b/_sources/about_us.md.txt @@ -0,0 +1,35 @@ +--- +orphan: true +--- + +(about-us)= + +# Norwegian Research Infrastructure Services + +The Norwegian research high-performance computing and storage infrastructure is +maintained by [NRIS](https://sigma2.no/nris) (formerly known as *the +Metacenter*), which is a joint collaboration between [UiO](https://www.uio.no), +[UiB](https://www.uib.no), [NTNU](https://www.ntnu.no), [UiT](https://uit.no), +and [Sigma2](https://www.sigma2.no/). + +We provide valuable resources for the research communities. Not only do we +provide state-of-the art {ref}`compute ` and {ref}`storage +facilities `, backed by {ref}`support ` and a guarantee +that your data always stays in Norway. But possibly more important is +{ref}`easy access ` to a wide selection of competences that +can assist, realize or take your project to the next level. + +This website () primarily holds documentation +of resources we provide. For more general information and service overview, +please also see . 
+
+
+## Compute, storage, pre/post-processing, visualization, machine learning
+
+We offer compute resources {ref}`betzy`, {ref}`fram`, and {ref}`saga`, storage
+resources {ref}`nird`, as well as the [NIRD
+Toolkit](https://www.sigma2.no/nird-toolkit) platform for pre- and
+post-processing analysis, data intensive processing, visualization, artificial
+intelligence, and machine learning.
+
+Researchers also have access to {ref}`lumi` through the LUMI consortium. diff --git a/_sources/code-of-conduct.md.txt b/_sources/code-of-conduct.md.txt new file mode 100644 index 000000000..139d51f02 --- /dev/null +++ b/_sources/code-of-conduct.md.txt @@ -0,0 +1,50 @@
+# Code of Conduct
+
+## Ethics and values
+
+We strive to cultivate fair, efficient, professional and welcoming working environments and relationships, regardless of education, socio-economic status, gender, nationality, age, personality, religion, disability, size, appearance, ethnicity, political views, identity or sexual orientation.
+
+In all possible relations we should be honest, unprejudiced and unbiased. We should strive to focus on objective facts.
+
+## Scope
+
+Staff of NRIS are expected to follow this code of conduct, as are any users or other entities collaborating with staff or the legal entities that are partners in NRIS in a more formal manner.
+
+## Standards of operation
+
+We expect all parties to follow these standards:
+
+- Use a welcoming and respectful tone when communicating.
+
+- State and accept constructive criticism.
+
+- Do not withhold any relevant information.
+
+- Think before you write or speak.
+
+- Always assume the person you communicate with has the intention to help or wants to be helped.
+
+- Always assume the person you communicate with is pressed for time.
+
+- Always show empathy.
+
+- Refrain from letting political viewpoints or other personal beliefs, such as religion, bias decisions, communications or other work performed.
+
+- Always follow defined standards for personal information, GDPR, GoFAIR etc. If in doubt or in gray zones, always ask for approval.
+
+- Always try to consult existing documentation, i.e. this or other sources, before asking questions, giving comments or making statements.
+
+
+A few examples that are considered a breach of this code of conduct:
+
+- Use of insulting language or behavior, e.g. sexual, personal, professional or private.
+
+- Any direct or indirect harassment in all contexts.
+
+- Carelessness with personal data.
+
+## Standards of practice for users
+All users are obliged to comply with the [Sigma2 user policy](https://www.sigma2.no/acceptable-use-policy).
+
+## Enforcement
+Suspected instances of unacceptable behavior or breaches of this code of conduct should be [reported](mailto:contact@sigma2.no). NRIS and its staff are obligated to maintain confidentiality with regard to the reporter of an incident. diff --git a/_sources/code_development/Calling-fortran-from-Python.md.txt b/_sources/code_development/Calling-fortran-from-Python.md.txt new file mode 100644 index 000000000..914fc52ad --- /dev/null +++ b/_sources/code_development/Calling-fortran-from-Python.md.txt @@ -0,0 +1,469 @@
+# Calling fortran routines from Python
+
+## Introduction
+While Python is an effective language for development, it is not very fast
+at executing code. There are several tricks available to get high
+numerical performance, of which calling fortran routines is one.
+
+While library functions in both numpy and scipy perform nicely in
+many cases, one often needs to write routines for which no library
+exists, either writing them from scratch or using fortran routines from
+co-workers or other sources. In any case it's a good way of getting
+high performance for the time consuming parts of the run.
+
+Below the usage of the following is covered:
+- Plain fortran with GNU fortran (the default)
+- Fortran with calls to a math library (MKL)
+- The Intel fortran compiler to compile your fortran source code
+- Optimising performance for fortran code, compiler flags
+- Intel fortran and MKL
+- Intel fortran and multithreaded MKL
+- Python with multithreaded OpenMP fortran routines
+
+```{note}
+A short disclaimer: with regard to matrix matrix multiplication the library
+in numpy is comparable in performance to the Intel MKL.
+```
+
+```{note}
+Another disclaimer is that this has been tested on Saga. There might be some
+minor issues on Betzy with AMD processors, which do not have 512-bit AVX.
+```
+
+## Using the numpy interface
+The package [numpy](https://numpy.org/) contains tools to facilitate
+calling fortran routines directly from Python. The utility f2py3 can
+be used directly, or indirectly by launching Python with the f2py module, to
+process the fortran source code. In both cases the fortran code
+containing definitions of subroutines will be compiled using a fortran
+compiler into object files which subsequently are linked into a
+single shared object library file (an .so file).
+
+A nice introduction by NTNU is
+[available](https://www.numfys.net/howto/F2PY/). It covers some basics
+and should be read as an introduction. Issues with array arguments
+and assumed shapes are explained.
+
+Modern fortran uses «magic» constants (they can be any number; often
+they are equal to the number of bytes, but not always, so don't rely on this)
+to set attributes like the size or range of variables, normally specified as
+the number of bits for a given variable. This can be done using self
+specified ranges with the help of the `kind` selection functions:
+```fortran
+subroutine foo
+ implicit none
+ integer, parameter :: int32 = selected_int_kind(8)
+ integer, parameter :: int64 = selected_int_kind(16)
+ integer, parameter :: real32 = selected_real_kind(p=6,r=20)
+ integer, parameter :: real64 = selected_real_kind(p=15,r=307)
+
+ integer(int32) :: int
+ integer(int64) :: longint
+```
+or a simpler solution is to use a standard fortran module:
+```fortran
+subroutine foo
+ use iso_fortran_env
+ implicit none
+
+ real(real32) :: float
+ real(real64) :: longfloat
+```
+While the first one is more pedagogic, the second one is simpler and
+ [iso_fortran_env](https://fortranwiki.org/fortran/show/iso_fortran_env)
+contains a lot more information.
+
+Python supports both 32 and 64 bit integers and floats. However, the mapping
+between the fortran specification and Python/Numpy is not set by default.
+In order to map from fortran standard naming to C naming, a map needs to be
+provided. The map file needs to reside in the working directory and must
+have the name `.f2py_f2cmap`. An example mapping fortran syntax to C syntax
+for simple integers and floats can look like :
+```python
+dict(real=dict(real64='double', real32='float'),
+ complex=dict(real32='complex_float', real64='complex_double'),
+ integer=dict(int32='int', int64='long')
+ )
+```
+This helps f2py3 map the fortran data types into the
+corresponding C data types. An alternative is to use [C mapping directly](https://gcc.gnu.org/onlinedocs/gfortran/ISO_005fC_005fBINDING.html#ISO_005fC_005fBINDING).
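
A small sketch of one way to put this mapping file in place is shown below; the file name `.f2py_f2cmap` and its contents come from the text above, while the use of a heredoc is simply one convenient option.

```bash
# Sketch: create the .f2py_f2cmap file in the directory where f2py3 will be run
cat > .f2py_f2cmap << 'EOF'
dict(real=dict(real64='double', real32='float'),
     complex=dict(real32='complex_float', real64='complex_double'),
     integer=dict(int32='int', int64='long')
     )
EOF
```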
+
+For complex variables the same logic applies; the size is measured in bits to fit
+two numbers (real and imaginary parts) occupying 64 bits each, hence 128 bits.
+```python
+x=np.zeros((n), dtype=np.complex128, order='F')
+y=np.zeros((n), dtype=np.complex128, order='F')
+```
+and the corresponding fortran code, where each number is specified as 64 bits:
+```fortran
+complex(real64), dimension(n), intent(in) :: x
+complex(real64), dimension(n), intent(inout):: y
+```
+
+The importance of keeping control over data types and their ranges cannot
+be stressed enough; see the [Ariane-5 failure](https://en.wikipedia.org/wiki/Ariane_flight_V88) or, even worse because people were killed, the
+[Therac-25](https://en.wikipedia.org/wiki/Therac-25) incident.
+
+### Compiling fortran code
+To start using Python with fortran code a module needs to be loaded,
+`module load Python/3.9.6-GCCcore-11.2.0`
+
+The command line to generate the Python importable module can be one of the
+following, where the second can be used if f2py3 is not available:
+- `f2py3 -c pi.f90 -m pi`
+- `python3 -m numpy.f2py -c pi.f90 -m pi`
+In both cases a module will be generated which can be imported as a
+normal Python module. The `-m pi` gives the name for the module; here it's
+identical to the name of the subroutine, but it doesn't need to be.
+
+A simple fortran routine to calculate Pi :
+```fortran
+subroutine pi(p,n)
+ use iso_fortran_env
+ implicit none
+ real(real64), intent(out) :: p
+ integer(int64), intent(in) :: n
+
+ integer(int64) :: j
+ real(real64) :: h, x, sum
+
+ sum=0.0_real64 ! set accumulating vars to 0.
+ h = 1.0_real64/n
+ do j = 1,n
+ x = h*(j-0.5_real64)
+ sum = sum + (4.0_real64/(1.0_real64+x*x))
+ end do
+ p = h*sum
+ return
+ end subroutine pi
+```
+Be aware that the intent of parameters is important. Also note that variables are
+not reinitialised during repeated calls, hence set accumulating variables to zero
+in the body, not in the declaration. Once the routine is loaded into memory
+the variables reside in memory. There is no magic initialisation for each
+subsequent call (look into the [save statement](https://stackoverflow.com/questions/2893097/fortran-save-statement) in fortran).
+
+This fortran routine can be called from a Python script like:
+ ```python
+import pi
+
+p=pi.pi(1000)
+
+print("Pi calculated ",p)
+ ```
+With a result like:
+ ```
+Pi calculated 3.1415927369231227
+```
+We import the generated module; the name is pi, which corresponds to the last
+`-m ` argument, while the function call to `pi` uses the same name as
+the fortran routine.
+
+### Performance issues
+While Python is easy to write and has many very nice features and applications,
+numerical performance is not among them.
+
+In the following examples matrix matrix multiplication is used as an
+example; this is a well known routine, making it a good candidate for
+performance comparison.
+
+
+The following code is used to illustrate the performance using Python:
+```python
+print("Matrix multiplication example")
+x=np.zeros((n, n), dtype=np.float64, order='F')
+y=np.zeros((n, n), dtype=np.float64, order='F')
+z=np.zeros((n, n), dtype=np.float64, order='F')
+x.fill(1.1)
+y.fill(2.2)
+
+start = time.perf_counter()
+for j in range(n):
+ for l in range(n):
+ for i in range(n):
+ z[i,j] = z[i,j] + x[i,l]*y[l,j]
+print(f"Python code {time.perf_counter() - start:2.4f} secs")
+print(z)
+```
+
+
+The following fortran code is used for matrix matrix multiplication:
+```fortran
+subroutine mxm(a,b,c,n)
+ implicit none
+ integer, parameter :: real64 = selected_real_kind(p=15,r=307)
+ integer, parameter :: int32 = selected_int_kind(8)
+
+ real(real64), dimension(n,n), intent(in) :: a,b
+ real(real64), dimension(n,n), intent(inout) :: c
+ integer(int32), intent(in) :: n
+ integer(int32) :: i,j,l
+
+ do j = 1,n
+ do l = 1,n
+ do i = 1,n
+ c(i,j) = c(i,j) + a(i,l)*b(l,j)
+ enddo
+ enddo
+ enddo
+
+end subroutine mxm
+```
+We compare Python with fortran by building the module with the following command:
+```
+f2py3 --opt="-Ofast -fomit-frame-pointer -march=skylake-avx512" -c mxm.f90 -m mxm
+```
+and running the Python script with
+`python3 mxm.py`
+
+The Python script used to call the fortran code is:
+
+```python
+a=np.zeros((n, n), dtype=np.float64, order='F')
+b=np.zeros((n, n), dtype=np.float64, order='F')
+c=np.zeros((n, n), dtype=np.float64, order='F')
+a.fill(1.1)
+b.fill(2.2)
+start = time.perf_counter()
+mxm.mxm(a,b,c,n)
+print(f"f90 mxm {time.perf_counter() - start:2.4f} secs")
+```
+
+The results are staggering: for the matrix matrix multiplication the simple
+fortran implementation performs over 2000 times faster than the Python code.
+
+| Language | Run time in seconds |
+|-----------|---------------------|
+| Python | 757.2706 |
+| f90 | 0.3099 |
+
+This is expected, as the compiled fortran code is quite efficient while the
+Python code is interpreted.
+
+
+### Using libraries, MKL
+The Intel Math Kernel Library is assumed to be well known for its
+performance. It contains routines that, in most cases, exhibit very
+high performance. The routines are also for the most part threaded to
+take advantage of multiple cores.
+
+In addition to the module already loaded
+`module load Python/3.9.6-GCCcore-11.2.0`
+one more module is needed to use Intel MKL:
+`module load imkl/2022.2.1`
+(This module sets many environment variables; we use `$MKLROOT` to
+set the correct path for the MKL library files.)
+
+As f2py3 is a wrapper, some extra information is needed to link with
+the MKL libraries. The simplest is to use static linking:
+```bash
+f2py3 --opt="-Ofast -fomit-frame-pointer -march=skylake-avx512"\
+ ${MKLROOT}/lib/intel64/libmkl_gf_lp64.a\
+ ${MKLROOT}/lib/intel64/libmkl_sequential.a\
+ ${MKLROOT}/lib/intel64/libmkl_core.a\
+ -c mxm.f90 -m mxm
+```
+The above commands link in the `dgemm` routine from MKL. The fortran wrapper routine calling `dgemm` is:
+```fortran
+subroutine mlib(c,a,b,n)
+ implicit none
+ integer, parameter :: real32 = selected_real_kind(p=6,r=20)
+ integer, parameter :: real64 = selected_real_kind(p=15,r=307)
+ integer, parameter :: int32 = selected_int_kind(8)
+ integer, parameter :: int64 = selected_int_kind(16)
+
+ real(real64), dimension(n,n), intent(in) :: a,b
+ real(real64), dimension(n,n), intent(out) :: c
+ integer(int32), intent(in) :: n
+ real(real64) :: alpha=1.0_real64, beta=1.0_real64
+
+ call dgemm('n', 'n', n, n, n, alpha, a, n, b, n, beta, c, n)
+
+end subroutine mlib
+```
+and a Python script to call it:
+```python
+a=np.zeros((n, n), dtype=np.float64, order='F')
+b=np.zeros((n, n), dtype=np.float64, order='F')
+c=np.zeros((n, n), dtype=np.float64, order='F')
+a.fill(1.1)
+b.fill(2.2)
+start = time.perf_counter()
+mxm.mlib(a,b,c,n)
+print(f"mxm MKL lib {time.perf_counter() - start:2.4f} secs")
+```
+Running the Python script with n=5000 we get the results below.
+
+| Routine | Run time in seconds |
+|--------------|---------------------|
+| Fortran code | 88.566 |
+| MKL library | 2.90 |
+
+
+### Using a different fortran compiler, Intel
+While the gfortran compiler used by default generates nice executable code, it does not
+always match the Intel fortran compiler when it comes to performance.
+It might be beneficial to switch to the Intel compiler.
+
+In order to have Python, the Intel compiler and MKL together, load the module:
+`SciPy-bundle/2022.05-intel-2022a`
+
+Then we compile the fortran code:
+```bash
+f2py3 --fcompiler=intelem --opt="-O3 -xcore-avx512"\
+ -c mxm.f90 -m mxm
+```
+Running the same Python script with n=5000 we arrive at the following
+run times:
+
+| Compiler/library | Run time in seconds |
+|-------------------|-------------------|
+| GNU fortran | 88.566 |
+| Intel ifort | 9.5695 |
+
+The Intel compiler is known for its performance when compiling
+matrix matrix multiplication.
+
+We can also use the MKL library in conjunction with the Intel
+compiler, but it's a bit more work. First static linking:
+
+```bash
+f2py3 --fcompiler=intelem --opt="-O3 -xcore-avx512"\
+ ${MKLROOT}/lib/intel64/libmkl_intel_lp64.a\
+ ${MKLROOT}/lib/intel64/libmkl_sequential.a\
+ ${MKLROOT}/lib/intel64/libmkl_core.a\
+ -c mxm.f90 -m mxm
+```
+
+| Compiler/library | Run time in seconds |
+|-------------------|-------------------|
+| GNU fortran | 88.566 |
+| Intel ifort | 9.5695 |
+| MKL dgemm | 2.712 |
+
+It's also possible to use dynamic linking:
+```bash
+f2py3 --fcompiler=intelem --opt="-O3 -xcore-avx512"\
+ -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core -lmkl_avx512\
+ -c mxm.f90 -m mxm
+ ```
+ Then it is launched as before. Performance is comparable as it's the
+ same library.
+
+Testing for even higher performance using the Intel compiler `ifort`,
+we can try more optimising flags (runs with n=10000):
+
+| ifort flags | Run time |
+|-------------------------------------------|-------------|
+| Defaults (no flags given) | 1122 secs. |
+| -O2 | 1110 secs. |
+| -O3 | 153 secs. |
+| -O3 -xavx2 | 81.8 secs. |
+| -O3 -xcore-avx512 | 72.5 secs. |
+| -O3 -xcore-avx512 -qopt-zmm-usage=high | 54.1 secs. |
+| -Ofast -xcore-avx512 -qopt-zmm-usage=high | 53.9 secs. |
+| -Ofast -unroll -xcore-avx512 -qopt-zmm-usage=high -heap-arrays -fno-alias | 53.7 secs. |
+| -fast -unroll -xcore-avx512 -qopt-zmm-usage=high | 53.6 secs. |
+
+Selecting the _right_ flags can have a dramatic effect on performance.
Adding to this, the
+optimal flags for one routine might not be right for another.
+
+### Using many cores with MKL library
+
+As the MKL libraries are multithreaded they can be run on multiple cores.
+
+To achieve this it is enough to build using multithreaded versions of the library,
+using static linking:
+ ```bash
+f2py3 --fcompiler=intelem --opt="-O3 -xcore-avx512"\
+ ${MKLROOT}/lib/intel64/libmkl_intel_lp64.a\
+ ${MKLROOT}/lib/intel64/libmkl_intel_thread.a\
+ ${MKLROOT}/lib/intel64/libmkl_core.a\
+ -c mxm.f90 -m mxm
+ ```
+or dynamic linking:
+```bash
+f2py3 --fcompiler=intelem --opt="-O3 -xcore-avx512"\
+ -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lmkl_avx512 -liomp5\
+ -c mxm.f90 -m mxm
+```
+The OpenMP `OMP_NUM_THREADS` environment variable can then be used to
+control the number of cores to use.
+
+This time we run the Python script with a somewhat larger size, n=10000,
+using `export OMP_NUM_THREADS=2` and higher thread counts.
+
+| Threads | Run time in seconds |
+|---------|----------------------|
+| 1 | 21.2914 |
+| 2 | 12.5923 |
+| 4 | 7.0082 |
+| 8 | 4.1504 |
+
+While scaling is not perfect there is a significant speedup from using
+extra cores.
+
+
+### Using many cores with fortran and OpenMP
+It's possible to call fortran functions containing OpenMP directives,
+getting a speedup by using several cores. This is a nice alternative when dealing with
+real world code for which no library exists.
+
+Consider the following fortran OpenMP code:
+```fortran
+subroutine piomp(p, n)
+ use iso_fortran_env
+ implicit none
+ real(real64), intent(out) :: p
+ integer(int64), intent(in) :: n
+ integer(int64) :: i
+ real(real64) :: sum, x, h
+
+ h = 1.0_real64/n
+ sum = 0.0_real64
+!$omp parallel do private(i,x) reduction(+:sum)
+!This OpenMP directive informs the compiler to generate a multithreaded loop
+ do i = 1,n
+ x = h*(i-0.5_real64)
+ sum = sum + (4.0_real64/(1.0_real64+x*x))
+ enddo
+ p = h*sum
+end subroutine piomp
+```
+
+Building the module for Python using:
+```bash
+f2py3 --fcompiler=intelem --opt="-qopenmp -O3 -xcore-avx512"\
+ -D__OPENMP -liomp5 -c pi.f90 -m pi
+```
+The OpenMP library is linked explicitly with `-liomp5` (for GNU it's `-lgomp`).
+
+Running using the following Python script:
+```python
+import time
+import pi
+
+n=50000000000
+
+start = time.perf_counter()
+p=pi.pi(n)
+print("Pi calculated ",p," ",time.perf_counter() - start," seconds")
+
+start = time.perf_counter()
+p=pi.piomp(n)
+print("Pi calculated ",p," ",time.perf_counter() - start," seconds")
+```
+
+Scaling performance is nice:
+
+|Cores | Run time in seconds |
+|------|---------------------|
+| 1 | 31.26 |
+| 2 | 16.28 |
+| 4 | 8.528 |
+| 8 | 4.217 |
+| 16 | 2.547 |
+| 32 | 1.900 |
+
+
+
+ diff --git a/_sources/code_development/betzy.md.txt b/_sources/code_development/betzy.md.txt new file mode 100644 index 000000000..bdd1e97a0 --- /dev/null +++ b/_sources/code_development/betzy.md.txt @@ -0,0 +1,109 @@
+# Software environment on Betzy
+
+As on Fram and Saga, scientific software on Betzy will be installed using the
+EasyBuild system, and the Lmod modules tool will be used for changing
+environment setup via modulefiles.
+
+The two *common toolchains* `foss` and `intel` will be installed on Betzy.
+
+## foss toolchain
+* GCC compilers (`gcc`, `g++`, `gfortran`)
+* Open MPI library
+* OpenBLAS (including LAPACK) + ScaLAPACK
+* FFTW library
+
+## intel toolchain
+* Intel compilers (`icc`, `icpc`, `ifort`)
+* Intel MPI library
+* Intel MKL library (including BLAS, LAPACK, ScaLAPACK, FFT)
+
+Regarding compiler optimization, this needs to be investigated case by case.
Aggressive optimization should be added only to files
+where it makes a difference, as it increases the probability of the compiler generating wrong code or exposing
+floating-point issues in the application. A few starting suggestions are:
+
+## Compiler flags
+A set of suggested compiler flags is given below. These have been tested and used, but users are
+advised to read the documentation and try other combinations, as not all code is alike.
+
+### gcc/gfortran
+* -O3
+* -O3 -march=znver2 -mtune=znver2 (recommended)
+* -O3 -march=znver2 -mtune=znver2 -mfma -mavx2 -m3dnow -fomit-frame-pointer
+
+### ifort/icc/icpc
+* -O3
+* -O3 -march=core-avx2
+* -O3 -xavx
+* -O3 -xavx2 (recommended, for everything except main() )
+* -O3 -xcore-avx2
+
+The above applies to GCC 9.3 and Intel 2019b, and the choices are listed in
+order of increasing performance as obtained with a DGEMM matrix multiplication test
+(which shows significant performance improvement); this was also verified using one
+of the major benchmarks. Please note that ifort performs substantially
+better (2-4x) than gfortran with the dgemm.f test. Also, building the main
+routine in the program file with *-xcore-avx2*, *-xavx* or *-xavx2* is not
+recommended. It's known that building main() (C and Fortran) with these
+flags triggers the Intel processor run time check, causing the application to
+abort.
+
+
+## MPI libraries
+
+Both OpenMPI and Intel MPI are installed and supported, and both are built for the
+GNU and Intel compilers. Experience has shown that performance varies.
+
+### OpenMPI
+* mpicc
+* mpicxx
+* mpif90
+
+For running with OpenMPI, processor binding is beneficial; adding *-bind-to core* is generally a good idea.
+
+### Intel MPI
+* mpiicc
+* mpiicpc
+* mpiifort
+
+For running Intel MPI, these settings are good starting points:
+* I_MPI_PIN=1
+* I_MPI_PIN_PROCESSOR_EXCLUDE_LIST=128-255 (run only on physical cores)
+
+In a correct Slurm job script file, the only command needed to launch MPI programs is:
+* OpenMPI : mpirun -bind-to core ./a.out
+* Intel MPI: mpirun ./a.out
+
+
+### Running Hybrid models
+
+For hybrid models it's important to set up Slurm to provide access to all
+available cores. An example could look like this:
+
+```
+#SBATCH --ntasks=2048
+#SBATCH --nodes=64
+#SBATCH --ntasks-per-node=32
+#SBATCH --cpus-per-task=4
+#SBATCH --exclusive
+```
+
+This will request 32 MPI ranks per node, and leave cores for 4 threads per
+rank, i.e. each of the 32 ranks can schedule 4 threads, yielding a total of 128
+cores, which is the maximum number of cores on each compute node. The
+*exclusive* option is important; if it is not set, Slurm will only allow the 32
+allocated cores to be used (this will place all 4 threads onto one core). In order to
+have free access to all cores, *exclusive* needs to be set.
+
+
+## MKL library
+
+MKL performs a run time check to select the correct code to invoke. On Betzy this test fails
+to find an Intel processor and hence selects code compatible with all x86-64
+processors. Setting the environment flag *MKL_DEBUG_CPU_TYPE=5* will force MKL
+to select code that uses *AVX2* instructions, which increases performance
+significantly.
+
+It is possible to use the Intel MKL library with the GNU compilers. This requires
+some work to resolve the symbol names at link time and the library path at run time,
+but it can provide a nice performance boost to both linear algebra and Fourier
+transforms.
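
As a small illustration of the note above, a job-script fragment along the following lines could be used; this is only a sketch, and the binary name `./a.out` is a placeholder for the actual MKL-linked application.

```bash
# Sketch: force MKL to pick AVX2 code paths on Betzy's AMD CPUs (see the MKL library note above)
export MKL_DEBUG_CPU_TYPE=5
mpirun ./a.out
```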
diff --git a/_sources/code_development/building.md.txt b/_sources/code_development/building.md.txt new file mode 100644 index 000000000..0bda5c90b --- /dev/null +++ b/_sources/code_development/building.md.txt @@ -0,0 +1,435 @@
+# Building scientific software
+
+## Introduction
+This is just a short guide on the topic. For more in-depth documentation please
+check out the [PRACE Best Practice Guides](https://prace-ri.eu/training-support/best-practice-guides/).
+
+[Most relevant for Betzy](https://prace-ri.eu/training-support/best-practice-guides/best-practice-guide-amd-epyc/)
+is the AMD EPYC guide; an update covering AMD Rome named "Best Practice Guide Modern Processors"
+is soon to be published.
+
+
+## Compilers
+
+### Introduction
+Include paths in C/C++ and Fortran are handled distinctly differently. The module system sets the variable CPATH for us, which contains a ':' separated list
+of directories to be searched for include files. This is handled behind the scenes for us when using C/C++. However, with Fortran this is another story.
+Fortran compilers use a set of '-I' options, each with a single directory as argument. This prevents us from using `$CPATH` directly as an include path. One might think
+that FPATH should be a solution (so did Intel some years ago), but it can interfere with some shells (ksh) and should be avoided as a general setting. However, that does not
+prevent us from doing it locally (avoiding ksh or other shells that might be affected).
+
+The FPATH can be set with: `export FPATH="-I"${CPATH//:/ -I}`
+
+Then `$FPATH` can be used in Makefiles and on the command line like `gfortran $FPATH file.f90`.
+On the command line a direct syntax can also be used, like: `gfortran -I${CPATH//:/ -I/} file.f90`
+
+### Intel
+#### Introduction
+The Intel compiler suite is supported on all Sigma2 systems. On the
+systems Saga and Fram the processors are from Intel, while the
+processors on Betzy are from AMD. As the Intel compiler is primarily a
+compiler written for Intel processors, there are some minor issues
+when using it to build code for the AMD processors.
+
+
+#### Documentation
+The documentation of the Intel compiler is found at
+[Intel compiler](https://software.intel.com/content/www/us/en/develop/tools/compilers.html).
+The web site is comprehensive and some browsing is required to find the needed documents.
+Most users will want to review the reference manuals:
++ [C/C++ reference](https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top.html)
++ [Fortran reference](https://software.intel.com/content/www/us/en/develop/tools/compilers/fortran-compilers/documentation.html)
+
+
+#### Compiler flags
+The single most common request is a set of suggested
+compiler flags. The Intel development team has already selected a
+very good set of defaults, and just a simple *-O3* flag will provide a quite
+good choice. The compiler comes with a set of default optimisation flags already
+set, so just invoking the compiler without any such flags will generate reasonably good code.
+
+The OpenMP flag *-qopenmp* is very often needed and must be used for both compiling and linking.
+
+Asking the compiler to generate optimised code has a huge impact on performance.
+The following graph shows the observed speed using the
+[NASA NPB MPI](https://en.wikipedia.org/wiki/NAS_Parallel_Benchmarks) benchmarks built
+using the Intel compiler and run using OpenMPI at 64 ranks.
+
+![Optimisation gain](optgain.png)
+
+The benefit of selecting optimisation flags is obvious.
The effect of vectorisation is
+less pronounced with these benchmarks, which are extracts from real applications and run with datasets of
+serious size. The compiler can recognise some types of code and generate excellent code, often related
+to cache and TLB issues. Just looking at the generated code will not tell what the compiler actually did.
+See an extreme case with matrix multiplication below. Tuning tools can help looking for cache and
+[TLB](https://en.wikipedia.org/wiki/Translation_lookaside_buffer) issues.
+
+Some optimisation flags are a bit more tricky. As all processors
+support AVX2, this can always be used. A suggested set of flags
+that can be tried might include:
+* -O3
+* -O3 -xHost
+* -Ofast
+* -O3 -march=core-avx
+* -O3 -march=core-avx2 -mtune=core-avx2
+* -O3 -xavx
+* -O3 -xavx2
+* -O3 -xcore-avx2
+
+The flags above have been tested and yield good results. On Betzy the
+flags involving *-xavx*, *-xavx2* and *-xcore-avx2* can cause
+problems. As the -x prefix implies, they only generate code for
+processors supporting AVX and AVX2. Intel has implemented a run time
+processor check for any program compiled with these flags, which will result
+in a message like this:
+
+ Please verify that both the operating system and the processor support
+ Intel(R) X87, CMOV, MMX, FXSAVE, SSE, SSE2, SSE3, SSSE3, SSE4_1, SSE4_2,
+ MOVBE, POPCNT, AVX, F16C, FMA, BMI, LZCNT and AVX2 instructions.
+
+This only applies to the main routine. If the main() function is not compiled
+with ``-xavx``/``-xavx2`` flags the test is not inserted and performance
+is as expected.
+
+The safe option is ``-O3 -march=core-avx2 -mtune=core-avx2``, which mostly provides fair performance.
+
+| Vectorisation flag | Single core performance |
+|:---------------------:|:-----------------------:|
+| -O3 | 4.33 Gflops/sec |
+| -O3 -march=core-avx2 | 4.79 Gflops/sec |
+| -O3 -xavx | 17.97 Gflops/sec |
+| -O3 -xavx2 | 26.39 Gflops/sec |
+| -O3 -xcore-avx2 | 26.38 Gflops/sec |
+
+```{warning}
+The ``-xavx2`` flag is quite intrusive: it builds only AVX2 vector
+instructions, and if the processor does not support them, you'll get an illegal
+instruction error.
+```
+The example above is a best case where the Intel compiler manages to analyse
+the code and apply special optimisation for matrix multiplication. Checking the
+generated code shows that it does not call external functions like the matmul in MKL.
+
+For codes that are more realistic and closer to scientific codes, like the NPB benchmarks, the effect
+is much smaller. In some cases there is still a significant gain from using ``-xAVX2``; the figure
+below illustrates this.
+
+![xAVX2 gain](xAVX2gain.png)
+
+There is a large range of other flags, and while the web
+documentation is very good it can be overwhelming. A simple trick is
+to issue the following command `icc -help > icc.hpl`, then open the file
+in an editor and search for and read the relevant paragraphs. Apart from
+language specific flags, most of the flags are similar for C/C++ and
+Fortran.
+
+The flags related to optimisation reports, *-qopt-report*, can be useful.
+To generate a nice optimisation report some of the following flags could
+be used:
+
+* -qopt-report-help
+* -qopt-report=1 (any number from 1 through 5 is valid, 0 turns it off)
+* -qopt-report-file=
+* -qopt-report-annotate
+
+An example is `-qopt-report=5 -O3 -xavx2 -g -S`, which will generate
+a comprehensive report and a file containing the generated
+code.
+Reviewing this report and the code can be of great help in cases
+where the compiler fails to optimise as expected.
+
+
+
+### GNU
+#### Introduction
+The GNU compilers are an integral part of any Linux distribution. However,
+the compiler versions that come with the distribution are
+generally not the newest. Look for modules that supply a more
+recent version. The compilers support C/C++ and Fortran.
+
+#### Documentation
+The compilers have good man pages covering most of what is commonly needed. More in-depth
+documentation is found here : https://gcc.gnu.org/onlinedocs/ .
+
+
+#### Compiler flags
+The default settings of gcc/gfortran are not optimal for performance. A set of optimising flags is needed. The flag for OpenMP is *-fopenmp*.
+
+There are a lot of optimisation options available; a list can be generated using the command
+`gcc --help=optimizers`
+
+Some sets of flags for optimisation include:
+* -O2 (often used for memory intensive applications)
+* -O3
+* -O3 -mfma -mavx2
+* -O3 -march=znver2 -mtune=znver2 (for AMD)
+* -O3 -march=skylake-avx512 (for Intel Skylake)
+
+When the gfortran include paths are given by the CPATH variable, the following bash command line substitution can be beneficial:
+`gfortran -O3 -I${CPATH//:/ -I/}`
+
+
+### AMD AOCC/llvm
+#### Introduction
+AMD supports the development of compilers based on llvm. The software development kit can be found at : https://developer.amd.com/tools-and-sdks/ .
+C/C++ and Fortran are supported.
+
+#### Documentation
+The AMD documentation is limited. Documentation can be found at the AMD developer
+web site given above.
+
+#### Compiler flags
+The llvm compilers have a huge range of compiler flags; the AMD
+documentation provides a nice subset of relevant flags. The flag for
+OpenMP is *-fopenmp*. Suggested flags to try are given below.
+* -O3
+* -Ofast
+* -Ofast -march=znver2 -mtune=znver2 (for AMD)
+* -Ofast -march=znver2 -mavx2 -m3dnow (for AMD)
+
+
+
+### PGI
+#### Introduction
+The Portland Group compiler, known as the PGI compiler, is now part of NVIDIA. The PGI web page
+is still available : https://www.pgroup.com/index.htm .
+
+#### Documentation
+Documentation can be found at : https://www.pgroup.com/resources/docs/20.4/x86/index.htm
+
+#### Compiler flags
+Please review the documentation for an updated list of the suggested compiler flags.
+
+A set of suggested flags is:
+* -O3 -tp zen -Mvect=simd -Mcache_align -Mprefetch -Munroll (for AMD)
+
+
+### Performance of compilers
+The well known reference implementation of matrix-matrix multiplication
+([dgemm](http://www.netlib.org/lapack/explore-html/d7/d2b/dgemm_8f_source.html))
+is used for a simple test of the different compilers.
+
+| Compiler | Flags | Performance |
+|:--------------|:-----------------------------------:|:-----------------:|
+| GNU gfortran | -O3 -march=znver2 -mtune=znver2 | 4.79 Gflops/s |
+| AOCC flang | -Ofast -march=znver2 -mavx2 -m3dnow | 5.21 Gflops/s |
+| Intel ifort | -O3 -xavx2 | 26.39 Gflops/s |
+
+The Intel Fortran compiler does a remarkable job with this nested loop problem.
+As we have seen above, matrix-matrix multiplication is a special case. For more
+realistic examples the performance is more comparable.
+
+![Compiler performance](compiler-perf.png)
+
+It turns out that for the EP benchmark (generating independent Gaussian random variates using the Marsaglia polar method)
+the Intel compiler manages to do something smart.
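
If you want to repeat such a comparison yourself, a minimal sketch could look like the following. It assumes you have the Netlib reference `dgemm.f` and a small timing driver called `dgemm-test.f90` (both names are placeholders for your own files) in the current directory, and that the compilers are available, e.g. via modules; the flag sets are the ones suggested in the sections above.

```bash
# Build the same source with the three compilers and the suggested flags
gfortran -O3 -march=znver2 -mtune=znver2     -o dgemm-gnu.x   dgemm.f dgemm-test.f90
flang    -Ofast -march=znver2 -mavx2 -m3dnow -o dgemm-aocc.x  dgemm.f dgemm-test.f90
ifort    -O3 -xavx2                          -o dgemm-intel.x dgemm.f dgemm-test.f90

# Run each executable and compare the performance reported by the driver
for exe in dgemm-gnu.x dgemm-aocc.x dgemm-intel.x; do
    echo "== $exe =="
    ./"$exe"
done
```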
+ +## Performance libraries + +### Intel MKL +#### Introduction +The Intel Math Kernel Library comes with the compiler suite and is well known as +high performance library. It comes in both sequential and multi threaded functions +and is know for its very high performance. + +MKL have wrappers for FFTW so no rewrite is needed to link any applications using +FFTW with MKL. Both Include files and library functions are provided. + +When using the Intel compiler the compiling and linking is very simple, most +of the times is enough to just add *-mkl*. Adding *=sequential* or *=parallel*. + +When using MKL with the GNU compilers some more work is often needed, both include paths and linking paths. +An example can provide some hints: +`-L$MKLROOT/lib/intel64 -lmkl_gnu_thread -lmkl_avx2 -lmkl_core -lmkl_rt` +The variable *MKLROOT* is set when the Intel module is loaded. + +In many cases the include files are needed and since the CPATH is set by module scripts the following command like might easy the process of +translating a colon separated string of directories to something that the Fortran compiler will accept. +`gfortran -O3 -I${CPATH//:/ -I/} fftw-3d.f90 ${MKLROOT}/lib/intel64/libfftw3xf_intel.a -lmkl_sequential -lmkl` +The above example is an example og using the FFTW wrapper in MKL, using only environment variables set by the module scripts it will +be portable with different versions of MKL. + +The following command can be of help when encounter missing symbols: +`nm -A $MKLROOT/lib/intel64/* | grep ` +Look for symbols with *T* (T means text,global - e.g. it's available, U means undefined). + + +#### Forcing MKL to use best performing routines +MKL issue a run time test to check for genuine Intel processor. If this test fail it will select a generic x86-64 set of routines yielding +inferior performance. This is well documented in [Wikipedia](https://en.wikipedia.org/wiki/Math_Kernel_Library) and remedies in +[Intel MKL on AMD Zen](https://danieldk.eu/Posts/2020-08-31-MKL-Zen.html). + +Research have discovered that MKL call a function called *mkl_serv_intel_cpu_true()* to check the current CPU. If a genuine Intel processor is +found it simply return 1. The solution is simply to override this function by writing a dummy functions which always return 1 and place this +early in the search path. The function is simply: +```c +int mkl_serv_intel_cpu_true() { + return 1; +} +``` +Compiling this file into a shared library using the following command: +`gcc -shared -fPIC -o libfakeintel.so fakeintel.c` + +To put the new shared library first in the search path we can use a preload environment variable: +`export LD_PRELOAD=` +A suggestion is to place the new shared library in `$HOME/lib64` and using +`export LD_PRELOAD=$HOME/lib64/libfakeintel.so` to insert the fake test function. + +In addition the environment variable *MKL_ENABLE_INSTRUCTIONS* can also have a significant effect. +Setting the variable to AVX2 is advised. Just changing it to AVX have a significant negative impact. + +For performance impact and more about running software with MKL please see +{ref}`using-mkl-efficiently`. + + +#### Documentation +Online documentation can be found at : https://software.intel.com/content/www/us/en/develop/documentation/mkl-linux-developer-guide/top.html + +There is a link line helper available : https://software.intel.com/content/www/us/en/develop/articles/intel-mkl-link-line-advisor.html , this can often be of help. 
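
The steps above can be collected into a small recipe. The sketch below assumes the override function has been saved as `fakeintel.c` (as shown earlier) and that `my-app.x` is a placeholder for your own MKL-linked binary:

```bash
# Build the override library containing the dummy mkl_serv_intel_cpu_true()
mkdir -p $HOME/lib64
gcc -shared -fPIC -o $HOME/lib64/libfakeintel.so fakeintel.c

# Preload it and request AVX2 code paths before running the application
export LD_PRELOAD=$HOME/lib64/libfakeintel.so
export MKL_ENABLE_INSTRUCTIONS=AVX2
./my-app.x
```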
+ + +### AMD AOCL +#### Introduction +The AMD performance library provide a set of library functions optimised for the AMD processor. +The web page is : https://developer.amd.com/amd-aocl/ . + +#### Documentation +Documentation can be found at https://developer.amd.com/amd-aocl/ . + + +### Performance +Using the MLK library with AMD is straightforward. + +In order to get MKL to select the correct AVX2 enabled routine a flag +need to be set, use : `export MKL_DEBUG_CPU_TYPE=5`. However, this flag +is no longer used in the 2020 version of the MKL. For this newer version +a different workaround is needed. + +For more about MKL performance and AMD see above about +"Forcing MKL to use best performing routines", where usage of a cheating +library is explained. + + +The well known top500 test HPL is using linear algebra library functions, +the following performance data were obtained using a single node. + +| Library | Environment flag |Performance | +|:---------------|:-----------------------------|:-------------:| +| AMD BLIS-mt | none | 3.14 Tflops/s | +| MKL-2019.5.281 | none | 1.71 Tflops/s | +| MKL-2019.5.281 | MKL_DEBUG_CPU_TYPE=5 | 3.23 Tflops/s | +| MKL-2020.4.304 | none | 2.54 Tflops/s | +| MKL-2020.4.304 | MKL_DEBUG_CPU_TYPE=5 | 2.54 Tflops/s | +| MKL-2020.4.304 | MKL_ENABLE_INSTRUCTIONS=AVX2 | 2.54 Tflops/s | +| MKL-2020.4.304 | LD_PRELOAD=./libfakeintel.so | 3.23 Tflops/s | + + +The test below using matrix matrix multiplication, Level 3 BLAS +function dgemm is used to test single core performance of the +libraries. The tests are run on a single node using a single core on +Betzy. + +| Library | Link line | Performance | +|:--------|:-----------------------------------------------------------:|:---------------:| +| AOCL | `gfortran -o dgemm-test.x -O3 dgemm-test.f90 -L$LIB -lblis` | 50.13 Gflops/s | +| AOCL | `flang -o dgemm-test.x -O3 dgemm-test.f90 -L$LIB -lblis` | 50.13 Gflops/s | +| MKL | `ifort -o dgemm-test.x -O3 dgemm-test.f90 -mkl=sequential` | 51.53 Gflops/s | + +At 50 Gflops/s per core the aggregate number is 6.4 Tflops/s quite a +bit more than what's expected from these nodes. This is a nice example +of clock boost when using only a few cores, or in this case only one. + +While linear algebra is widely used Fourier Transform is also heavily used. +The performance data below is obtained for a 3d-complex forward FT with a footprint +of about 22 GiB using a single core. + +| Library | Environment flag |Performance | +|:---------------|:-----------------------------|:----------:| +| FFTW 3.3.8 | none | 62.7 sec. | +| AMD/AOCL 2.1 | none | 61.3 sec. | +| MKL-2020.4.304 | none | 52.8 sec. | +| MKL-2020.4.304 | LD_PRELOAD=./libfakeintel.so | 27.0 sec. | +| MKL-2020.4.304 | LD_PRELOAD=./libfakeintel.so | | +| | MKL_ENABLE_INSTRUCTIONS=AVX | 40.5 sec. | +| MKL-2020.4.304 | LD_PRELOAD=./libfakeintel.so | | +| | MKL_ENABLE_INSTRUCTIONS=AVX2 | 27.0 sec. | + +With the 2020 version of MKL the instruction set variable has a significant effect. + +The performance of MKL is significantly higher than both FFTW and the AMD library. + +For applications spending a lot of time executing library function code a review of +libraries used and some testing using the specific library functions actually used. +Not all library functions are implemented equally good by the authors. + + +## MPI libraries + +### OpenMPI +#### Introduction +The OpenMPI library are based on the old LAM MPI from Ohio +Supercomputing Center. This one of the most widely used MPI +implementations today. 
The web site is : https://www.open-mpi.org/ . + +OpenMPI is supported on all the Sigma2 systems, with versions for both +GNU and Intel compilers, and in some cases some support for other +compilers. + +#### Usage +The compiler wrappers hiding the include and link environment are called: +* mpicc for C +* mpicxx for C++ +* mpif90 for Fortran +* mpff77 for Fortran + +In practice both mpif90 and mpif77 points to the same Fortran compiler. A quick check for +compiler versions is `mpif90 -v`. + +Compiler flags are propagated to the underlaying compiler. + +To run programs the launched application mpirun is used (Slurm srun is +an option also). There are a range of options to OpenMPI's mpirun of +which `--bind-to` and `--map-by` a the most important when running on +the Sigma2 systems using Slurm as the queue system set the number of +ranks and other run time parameters like list of hosts etc. This is normal +for MPI libraries built and installed with Slurm support. + + + +### Intel MPI +#### Introduction +The Intel MPI is part of the Intel compiler suite and is a widely used MPI implementation. +More information is found on-line at : https://software.intel.com/content/www/us/en/develop/tools/mpi-library.html . + +Intel MPI is supported on all Sigma2 systems, but mostly for use with +the Intel compiler, it can however, to some extent be used with +GNU. The support is present. + +#### Usage +The compiler wrappers have different naming then many other MPI implementations. +* mpiicc for C +* mpiicpc for C++ +* mpiifort for Fortran +* mpicc GNU C +* mpigcc GNU C +* mpicxx GNU C++ +* mpifc GNU Fortran + +There are a lot of environment variables to be used with Intel MPI, they all start with *I_MPI* +* I_MPI_PIN +* I_MPI_PIN_DOMAIN +* I_MPI_PIN_PROCESSOR_EXCLUDE_LIST + +The variable *I_MPI_PIN_DOMAIN* is good when running hybrid codes, +setting it to the number of threads per rank will help the launcher to +place the ranks correct. +Setting *I_MPI_PIN_PROCESSOR_EXCLUDE_LIST=128-255* will make sure only +physical cores 0-127 are used for MPI ranks. This ensures that no two +ranks share the same physical core. + +As with any of these variable and other please review the +documentation pointed to above and do some testing yourself before +employing in large production scale. + +Running applications with Intel MPI is just like a simple as for +OpenMPI as Intel MPI also has support for Slurm. Just `mpirun ./a.out` +is normally enough. diff --git a/_sources/code_development/building_gpu.md.txt b/_sources/code_development/building_gpu.md.txt new file mode 100644 index 000000000..caee836fe --- /dev/null +++ b/_sources/code_development/building_gpu.md.txt @@ -0,0 +1,34 @@ +# Building GPU software + +The login nodes on Betzy and Saga currently do not allow to compile software for the GPUs +as the cuda driver is not installed. +In order to compile the GPU software one needs an interactive session on a GPU node. +If the GPU is not needed, one can ask for a CPU-only allocation, e.g.: + +``` +salloc --nodes=1 --time=00:30:00 --partition= --mem-per-cpu=8G --account=<...> +``` + +or, if GPUs are required, e.g., for testing purposes: + +``` +salloc --nodes=1 --time=00:30:00 --partition= --mem-per-cpu=8G --account=<...> --gpus=1 +``` + +## Saga + +There are two types of GPU nodes on Saga, located in two distinct SLURM partitions: + +* Intel CPU with 4X Tesla P100, 16GB, `--partition=accel` +* AMD CPUs with 4XA100, 80GB, `--partition=a100` + +These are different architectures. 
+By default, Saga loads the Intel software environment. If you want to run or compile software
+for the nodes with the AMD CPUs and A100 GPUs,
+you need to get an allocation on the `a100` partition. Then, inside the allocation,
+or inside your job script, switch the module environment:
+
+```
+module --force swap StdEnv Zen2Env
+```
+
+Note that the installed modules can vary between the two node types.
diff --git a/_sources/code_development/compilers.md.txt b/_sources/code_development/compilers.md.txt
new file mode 100644
index 000000000..a31f4edbf
--- /dev/null
+++ b/_sources/code_development/compilers.md.txt
@@ -0,0 +1,92 @@
+# Compilers
+
+## Compiling MPI Applications
+
+### Intel MPI
+
+The following table shows the available Intel MPI compiler commands, the underlying Intel and GNU compilers, and ways to override the underlying compilers with environment variables or command line options:
+
+| Language | Wrapper script | Default compiler | Environment variable | Command line      |
+|----------|----------------|------------------|----------------------|-------------------|
+| C        | `mpiicc`       | `icc`            | `I_MPI_CC`           | `-cc=<compiler>`  |
+|          | `mpigcc`       | `gcc`            |                      |                   |
+| C++      | `mpiicpc`      | `icpc`           | `I_MPI_CXX`          | `-cxx=<compiler>` |
+|          | `mpigxx`       | `g++`            |                      |                   |
+| Fortran  | `mpiifort`     | `ifort`          | `I_MPI_FC`           | `-fc=<compiler>`  |
+|          | `mpifc`        | `gfortran`       |                      |                   |
+
+Specify the option `-show` with one of the compiler wrapper scripts to see the underlying compiler together with the compiler options, link flags and libraries.
+
+The Intel MPI toolchain is loaded by using `module load`:
+
+    module load intel/2017a
+
+Please see also {ref}`running-mpi-applications`.
+
+
+### Open MPI
+
+The Open MPI compiler wrapper scripts listed in the table below add all relevant compile and link flags and then invoke the underlying compiler, i.e. the compiler the Open MPI installation was built with.
+
+| Language | Wrapper script              | Environment variable |
+|----------|-----------------------------|----------------------|
+| C        | `mpicc`                     | `OMPI_CC`            |
+| C++      | `mpiCC`, `mpicxx`, `mpic++` | `OMPI_CXX`           |
+| Fortran  | `mpifort`                   | `OMPI_FC`            |
+
+It is possible to change the underlying compiler invoked when calling the compiler wrappers using the environment variables listed in the table. Use the option `-showme` to see the underlying compiler, the compile and link flags, and the libraries that are linked.
diff --git a/_sources/code_development/debugging.md.txt b/_sources/code_development/debugging.md.txt
new file mode 100644
index 000000000..4095c9f78
--- /dev/null
+++ b/_sources/code_development/debugging.md.txt
@@ -0,0 +1,286 @@
+# Debugging
+
+## Compiler Debug Options
+
+The table below shows a list of debugging options for the Intel and GCC
+compilers.
+
+| Compiler | Option                                            | Action                                       |
+|----------|---------------------------------------------------|----------------------------------------------|
+| Intel    | -g                                                | Generate symbolic debugging information      |
+| GCC      | -g                                                |                                              |
+| Intel    | -check bounds (Fortran only)                      | Add runtime array bounds checking            |
+| GCC      | -fcheck=bounds (Fortran only)                     |                                              |
+| Intel    | -check=uninit (C/C++), -check uninit (Fortran)    | Check for uninitialized variables            |
+| GCC      | -Wuninitialized                                   |                                              |
+| Intel    | -fp-trap-all=common (C/C++), -fpe-all=0 (Fortran) | Trap floating point exceptions: divide by zero, invalid operands, floating point overflow |
+| GCC      | -ffpe-trap=zero,invalid,overflow (Fortran only)   |                                              |
+| Intel    | -traceback                                        | Add debug information for runtime traceback  |
+| GCC      | -fbacktrace (Fortran only)                        |                                              |
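
The options are typically combined on the compile line. As an illustration, where `prog.f90` is a placeholder for your own source file, the Fortran checks from the table could be enabled like this:

    $ ifort    -g -check bounds -traceback -fpe-all=0 -o prog prog.f90
    $ gfortran -g -fcheck=bounds -fbacktrace -ffpe-trap=zero,invalid,overflow -o prog prog.f90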
+ + +## GNU GDB + +GDB, the GNU Project debugger, is a free software debugger that supports +several programming languages including C, C++ and Fortran. GDB has a +command-line interface and do not contain its own graphical user interface +(GUI). + + +### GDB commands + +To begin a debug session compile the code with the `-g` option to add +debugging information, and start GDB by running the `gdb` command adding the +executable program as argument: + + $ gdb prog + +Once inside the GDB environment, indicated by the `(gdb)` prompt, you can issue +commands. The following shows a list of selected GDB commands: + + +* `help` – display a list of named classes of commands +* `run` – start the program +* `attach` – attach to a running process outside GDB +* `step` - go to the next source line, will step into a function/subroutine +* `next` – go to the next source line, function/subroutine calls are executed without stepping into them +* `continue` – continue executing +* `break` – set breakpoint +* `watch` – set a watchpoint to stop execution when the value of a variable or an expression changes +* `list` – display (default 10) lines of source surrounding the current line +* `print` – print value of a variable +* `backtrace` - display a stack frame for each active subroutine +* `detach` – detach from a process +* `quit` – exit GDB + +Commands can be abbreviated to one or the first few letters of the command +name if that abbreviation is unambiguous or in some cases where a single +letter is specifically defined for a command. E.g. to start a program: + + (gdb) r + Starting program: /path/to/executable/prog + +To execute shell commands during the debugging session issue shell in front of +the command, e.g. + + (gdb) shell ls -l + + +### Attaching to running processes + +GDB can attach to already running processes using the attach *[process-id]* command. After attaching to a process GDB will stop it from running. This allows you to prepare the debug session using GDB commands, e.g. setting breakpoints or watchpoints. Then use the `continue` command to let the process continue running. + +Although GDB is a serial debugger you can examine parallel programs by attaching to individual processes of the program. For instance, when running batch jobs you can log into one of the compute nodes of the job and attach to one of the running processes. + +The listing below displays a sample debug session attaching to one of the +processes of a running MPI job for examining data (lines starting with # are +comments): + + $ gdb + + (gdb) # List the processes of the MPI program + (gdb) shell ps -eo pid,comm | grep mpi_prog + 14957 mpi_prog + 14961 mpi_prog + 14962 mpi_prog + ...etc. + + (gdb) # Attach to one of the MPI processes + (gdb) attach 14961 + Attaching to process 14961 + Reading symbols from /path/to/executable/mpi_prog...done. + ...etc + + (gdb) # Set a watchpoint to stop execution when the variable Uc is updated + (gdb) watch Uc + Hardware watchpoint 1: Uc + + (gdb) # Continue the execution of the program + (gdb) continue + Continuing. + + Hardware watchpoint 1: Uc + Old value = -3.33545399 + New value = -2.11184907 + POTTEMP::ptemp (ldiad=...etc) at ptemp1.f90:298 + 298 Vc= dsdx(2,1,ie2)*u0 + dsdx(2,2,ie2)*v0 + + dsdx(2,3,ie2)*w0 + + (gdb) # Set the list command to display 16 lines... 
+ (gdb) set listsize 16 + (gdb) # ...and display the source backwards starting 2 lines below the current one + (gdb) list +2 + 284 do k= 1, 8 + 285 kp= lnode2(k,ie2) + 286 u0= u0 + u12(kp) + 287 v0= v0 + u22(kp) + 288 w0= w0 + u32(kp) + 289 vt= vt + vtef2(kp) + 290 enddo + 291 + 292 u0= 0.125*u0; v0= 0.125*v0; w0= 0.125*w0; vt= 0.125*vt + 293 + 294 ! + 295 !---- Contravariant velocity + 296 ! + 297 Uc= dsdx(1,1,ie2)*u0 + dsdx(1,2,ie2)*v0 + dsdx(1,3,ie2)*w0 + 298 Vc= dsdx(2,1,ie2)*u0 + dsdx(2,2,ie2)*v0 + dsdx(2,3,ie2)*w0 + 299 Wc= dsdx(3,1,ie2)*u0 + dsdx(3,2,ie2)*v0 + dsdx(3,3,ie2)*w0 + + (gdb) # Print a 5 element slice of the variable u12 + (gdb) print u12(3006:3010) + $1 = (0.0186802763, 0.0188683271, 0.0145201795, 0.00553302653, -0.00918145757) + + (gdb) # Release the process from GDB control + (gdb) detach + Detaching from program: /path/to/executable/mpi_prog, process 14961 + + (gdb) quit + + +### Examining core files + +Core files can be examined specifying both an executable program and the core +file: + + $ gdb prog core + +One can also produce a core file from within the GDB session to preserve a +snapshot of a program’s state using the command: + + (gdb) generate-core-file + + +(totalview_debugging)= + +## TotalView + +TotalView is a GUI-based cource code debugger from [Rogue Wave Software](https://www.roguewave.com) +It allows for debugging of serial and parallel codes. Program execution is +controlled by stepping line by line through the code, setting breakpoints, or +by setting watchpoints on variables. It is also efficient for debugging of +memory errors and leaks, and diagnostic problems like deadlocks. + +TotalView works with C, C++ and Fortran applications, and supports OpenMP and +several MPI implementations including Open MPI and Intel MPI. + + +### Starting Totalview + +After compiling your MPI code with the `-g` flag, load the TotalView module and +start `totalview` with your executable, e.g. *mpi_prog*, by issuing the command + +Open MPI: + + $ mpirun -tv -np ./mpi_prog + +Intel MPI: + + $ totalview mpiexec -a -n ./mpi_prog + +Three windows, the TotalView Root window, the Startup Parameters Dialog Box and +the Process Window, will appear. Click the **_OK_** button in the Startup +Parameters Dialog Box. Now click the **_Go_** button from the execution control +commands in the Process Window. A popup window will ask whether you want to +start the job in a stopped state. Click **_Yes_**, and the source code of your +program will show in the source pane of the Process Window. + +```{figure} process.png +:alt: TotalView process window + +Fig. 1 - TotalView process window +``` + +You are now ready to start the debugging session doing different actions, e.g.: + +* Click the **_Step_** or **_Next_** buttons to go through the code statement by statement. For function calls **_Step_** goes into the function, while **_Next_** executes the function. +* Create a breakpoint by clicking the line number displayed to the left in the Process Window. Click the **_Go_** button to run to this line. +* Monitor a variable's value by creating a watchpoint, select **_Action Points_** → **_Create Watchpoint_**. A watchpoint stops execution when the variable's data changes. +* Examine variables: Dive into a variable by clicking **_View_** → **_Lookup_** or double-click the variable name using the left mouse button. The Variable Window appears. +* Visualize variable across processes by diving into a variable and click **_View_** → **_Show Across_** → **_Processes_** in the Variable Window. 
+* Examine array data: Dive into an arrray variable. Display array subsections by editing the slice field in the array Varible Window. Show statistics information about the array (or a slice of the array) by clicking **_Tools_** → **_Statistics_** in the Variable Window. + +```{figure} statistics.png +:alt: Examining data + +Fig. 2 - Examining data +``` + + +### Interactive Batch System Debugging + +When running TotalView in the batch system, first start an interactive Slurm +batch job session: + + $ salloc --account= --time -N bash + salloc: Granted job allocation + +Start TotalView with the executable + +Open MPI: + + $ mpirun -tv ./mpi_prog + +Intel MPI: + + $ totalview srun -a --ntasks-per-node= ./mpi_prog + +Your program will now execute within TotalView on the number of nodes specified +in the Slurm job allocation. + +**Note:** Be sure to exit the shell created by the `salloc` command when +finishing the debugging session + + $ exit + salloc: Relinquishing job allocation + + +### Further Information + +For more information see the [TotalView +Documentation](https://www.roguewave.com/help-support/documentation/totalview) +page. diff --git a/_sources/code_development/guides/async_openacc.md.txt b/_sources/code_development/guides/async_openacc.md.txt new file mode 100644 index 000000000..18a8ce0dc --- /dev/null +++ b/_sources/code_development/guides/async_openacc.md.txt @@ -0,0 +1,305 @@ +--- +orphan: true +--- + +```{index} GPU; Async and Multi-GPU OpenACC, OpenACC; Async and Multi-GPU OpenACC, Nvidia Nsight; Async and Multi-GPU OpenACC, Multi-GPU; Async and Multi-GPU OpenACC +``` +(asyncopenacc)= + +# Async and Multi-GPU OpenACC +In this guide we will go over a few advance topics regarding OpenACC. The guide +will cover asynchronous operations, which can overlap memory transfers and +compute for higher throughput, and how to utilize multiple GPUs. + +```{tip} +If you are not sure what OpenACC is we have an [introductory +guide](./openacc.md) which explains the basics. +``` + +## Introduction +Asynchronous programming in OpenACC is a way to schedule work so that the GPU +can work _concurrently_ with the tasks given. Note that this does not mean that +the GPU will necessarily run multiple kernels simultaneously. Often, +asynchronous programming with OpenACC will allow us to overlap memory transfer +with kernel execution. This can improve efficiency since the GPU does not sit +idle while transferring memory back-and-forth, resulting in improved throughput. +If you are just beginning to translate an existing code base to OpenACC, +asynchronous operations should be some of the last optimizations to apply and +can be tricky when the problem is _not_ [embarrassingly +parallel](https://en.wikipedia.org/wiki/Embarrassingly_parallel). + +After reading this guide you should be familiar with the following topics and +ideas. + - Understand how asynchronous programming with OpenACC works. + - How memory and kernels can be overlapped. + - How different OpenACC blocks can be made dependent on each other. + - Which problems are suitable for asynchronous OpenACC. + - Know the basics of utilizing multiple GPUs. + +--- + +To get started we will need some piece of code that we would like to accelerate. +This time we have chosen to accelerate the visualization of the [Mandelbrot +set](https://en.wikipedia.org/wiki/Mandelbrot_set). + +Since this code requires a bit more setup than before we have created a [`meson` +project](https://mesonbuild.com/) that can be used to build the code. 
Below we +have attached the full project, and we will offer `zip` archives of the full +project as we make changes, but will focus on the main code in `mandelbrot.c`. + +```{eval-rst} +:download:`Serial version of project as 'zip' archive <./async_openacc/serial.zip>` +``` + +Below we have attached the full version of `mandelbrot.c` and highlighted the +two main areas of computation where we will focus our efforts. + +```{eval-rst} +.. literalinclude:: async_openacc/mandelbrot_serial.c + :language: c + :emphasize-lines: 30-47, 125-131 +``` + +To build the project on Saga we first need to load a few modules before using +`meson` to build the project. + +```bash +$ module load Python/3.8.2-GCCcore-9.3.0 +$ pip3 install --user meson +# To download the project directly +# wget https://documentation.sigma2.no/_downloads/bdfbca90a90a8d1b824fc6b1154ceee7/serial.zip +$ unzip serial.zip +$ cd AccelBrot-master +``` + +```{note} +We need to manually install `meson` above since we require version `0.56.0` +which only exist as a `pip` package at the time of writing. Check with `module +--show_hidden avail Meson` to see if a sufficiently new version is installed. +``` + +Then to build the project load `NVHPC` + `Ninja` and run the following `meson` +commands + +```bash +$ module load NVHPC/20.7 Ninja/1.10.0-GCCcore-9.3.0 +$ CC=nvc meson setup builddir --buildtype=debugoptimized +$ meson compile -C builddir +``` + +Afterwards, as long as `NVHPC` and `Ninja` is loaded, only the last command +`meson compile -C builddir` is required when making changes. + +To run this on Saga (_without_ GPU) the following `srun` command can be used +```bash +$ srun --account= --time=02:00 --mem-per-cpu=1G ./builddir/src/mandelbrot +``` + +```{tip} +Try different image size or iteration parameters to see how much time the CPU vs +GPU will take, `./builddir/src/mandelbrot 4k`. +``` + +```{tip} +Use this opportunity to try to optimize the above code with OpenACC directives +without focusing on asynchronous programming or multi-GPU setups. +``` + +## Initial translation to OpenACC +To run the above code on a GPU using OpenACC we first need to introduce the +`#pragma acc routine` directive. This directive tells OpenACC that we would like +the function following the directive to be translated into GPU code. When +looking at the code above we can see that the `mandelbrot()` function is used to +separate the calculation of the set and iteration over the image. To be able to +optimize the loop we therefore need to translate the `mandelbrot()` function. + +```{eval-rst} +.. literalinclude:: async_openacc/mandelbrot_initial.c + :language: c + :lines: 21-49 + :emphasize-lines: 2, 11 +``` + +```{note} +In the above code we added `seq` to the end of the directive. This tells OpenACC +that the `routine` must run **seq**uentially on the GPU and no additional +parallelization is possible. Adding `seq` is not necessary, but can be a good +way to ensure that your thinking is correct as the compiler would complain if it +is not correct. See the [quick +reference](https://www.openacc.org/sites/default/files/inline-files/API%20Guide%202.7.pdf) +for further explanation of the possible additions to `#pragma acc routine`. +``` + +After this we can add the `#pragma acc parallel loop` directives around the +image computation. + +```{eval-rst} +.. 
literalinclude:: async_openacc/mandelbrot_initial.c + :language: c + :lines: 127-137 + :emphasize-lines: 2-5 +``` +```{eval-rst} +:download:`Initial translation of 'mandelbrot.c' to OpenACC <./async_openacc/mandelbrot_initial.c>` +``` + +This initial translation is already quite a lot better than the serial CPU +version (clocking in at around `10x` improvement when generating a `4k` image). +Let's see what insight we can gain from running with `Nsight`. + +--- + +To run with `Nsight` use the following invocation of `srun` +```bash +$ srun --account= --time=02:00 --mem-per-cpu=1G --partition=accel --gpus=1 nsys profile -t cuda,openacc,osrt -o initial ./builddir/src/mandelbrot 4k +``` + +```{eval-rst} +:download:`Nsight profile <./async_openacc/initial.qdrep>` +``` + +![Timeline of initial OpenACC +translation](./async_openacc/initial_timeline_overview.png) + +As we can see, most of the timeline is taken with doing other work and not the +actual compute. We will therefore zoom into the desired range (which we can +identify by following the `CUDA API` row until a cluster of yellow boxes +appear). + +![Selection of what to zoom into on initial +timeline](./async_openacc/initial_zoom1.png) +![Result of zoom on initial timeline](./async_openacc/initial_zoom2.png) + +From the above screenshot we can see that we are running on the GPU, but +computation and memory copies are sequential. For large amounts of data this is +less than optimal and we can try to improve the situation with asynchronous +scheduling of compute and memory transfer. + +## Async OpenACC +Translating a piece of OpenACC code to run asynchronously requires us to split +our work into smaller tasks that we know could run concurrently. Looking at our +main loop we can see that every computation is independent of any other +iteration or computation. This make the Mandelbrot example quite simple to +translate, but that does not mean asynchronous operations is only for +embarrassingly parallel problems. + +```{tip} +One way to quickly utilize asynchronous OpenACC is to identify blocks of code +that are run sequentially (e.g. one loop that does something to `A` and another +loop that does something to `B`) and does not involve data from one another. +Loops like that can be run asynchronously which can increase throughput by +overlapping computation and memory transfer. +``` + +In our case we can split the computation on rows and process the image in +blocks. This will allow us to use the `#pragma acc update` directive to copy +data from the GPU per image block instead of doing this all at the end. + +```{eval-rst} +:download:`Async OpenACC project as 'zip' <./async_openacc/async.zip>` +``` + +To split the image into blocks we will create a new command line parameter and +add an additional loop around our computation. + +```{eval-rst} +.. literalinclude:: async_openacc/mandelbrot_async.c + :language: c + :lines: 134-150 + :emphasize-lines: 2, 8, 15, 17 + :linenos: +``` + +In the above code the `num_blocks` value divides our image into a given number +of blocks. Then we create and copy the necessary data before beginning our +actual computation. Notice in particular the `async` directive added on line +`8`. This directive tells OpenACC that it should launch the kernel and +immediately continue working. The parameter given to `async` is the queue +number, kernels submitted to the same queue must wait for previous work in that +queue to finish before being launched. 
Notice therefore that we, on line `15`, +ensure that we have the same variable `block` which means that we do not update +the data before the computation is complete. Lastly, on line `17` we wait for +all previously launched asynchronous work to finish before continuing. + +We will run this again with `Nsight` to see if we were able to perform the work +asynchronously. Use the following command on Saga (don't forget to compile with +`meson compile -C builddir`) + +```bash +$ srun --account= --time=02:00 --mem-per-cpu=1G --partition=accel --gpus=1 nsys profile -t cuda,openacc,osrt -o async ./builddir/src/mandelbrot 4k +``` + +```{eval-rst} +:download:`Nsight async profile <./async_openacc/async.qdrep>` +``` + +This new code runs about `1.25x` faster than the initial translation, which +shows the value in overlapping memory and compute. Below we have attached the +zoomed in view of the `Nsight` timeline to show how asynchronous OpenACC looks. + +![View of Nsight's timeline after the asynchronous +transition](./async_openacc/async_timeline.png) + +## Utilizing multiple GPUs +To utilize multiple GPUs on Saga we will have to dip into the OpenACC runtime +calls to query and set which GPU we want to run on. We will use the +`acc_get_num_devices` and `acc_set_device_num` methods to assign work to GPUs. + +```{tip} +To see all the directives and runtime methods of OpenACC consult the [quick +reference](https://www.openacc.org/sites/default/files/inline-files/API%20Guide%202.7.pdf). +``` + +```{tip} +For most real world code it can be very difficult to find places to split the +work over multiple GPUs and so a different technique might be called for. + +Instead of trying to split loops into work for multiple GPUs, try to see already +existing possibilities to split the work. One natural way is to give individual +MPI ranks their own GPU to utilize. + +This can be easily accomplished through Slurm with the `--gpus-per-task` flag +which will allocate a number of GPUs appropriate for the number of Slurm tasks. +``` + +```{eval-rst} +:download:`Multi-GPU OpenACC project as 'zip' <./async_openacc/multi.zip>` +``` + +```{eval-rst} +.. literalinclude:: async_openacc/mandelbrot_multi.c + :language: c + :lines: 134-161 + :emphasize-lines: 1, 5-8, 13, 24-28 +``` + +Notice that we copy the data to each GPU and then assign a device after the +current block number. We also have to take special care when exiting all loops +that we wait for all GPUs to finish and remove the data since the `#pragma acc +enter data` directive keeps the data in GPU memory until otherwise stated (in +contrast to `#pragma acc data` which keeps the data only for the extent of the +following block of code). + +This last iterations is about `1.5x` faster using `--gpus=2` with +diminishing, or even negative, returns for additional GPUs. + +## Summary +We have shown how to translate the Mandelbrot calculation into OpenACC code, how +such code can be made asynchronous to overlap computation and memory transfer, +and how to utilize multiple GPUs on Saga. + +Below is the summary of speedup where the improvement is shown relative to the +previous entry in the table (take the measured times with a grain of salt, they +are more an illustration of possible speedup, not guaranteed speedup). 
+ +| Version | Time in milliseconds | Speedup | +| ------- | -------------------- | ------- | +| Serial | `10757`| N/A | +| OpenMP `--cpus-per-task=6`\* | `3313` | `3.24x` | +| Initial OpenACC | `1020` | `3.25x` | +| Async | `811` | `1.25x` | +| Multi-GPU `--gpus=2` | `547` | `1.48x` | +| Multi-GPU `--gpus=4` | `2932` | `0.18x` | +**\*** To keep the comparison as fair as possible we compare the CPU resources +that would be the equivalent to [the billing resources of 1 GPU on +Saga](../../jobs/projects_accounting.md). diff --git a/_sources/code_development/guides/container_env.md.txt b/_sources/code_development/guides/container_env.md.txt new file mode 100644 index 000000000..81f211f7f --- /dev/null +++ b/_sources/code_development/guides/container_env.md.txt @@ -0,0 +1,262 @@ +--- +orphan: true +--- + +# Container with build environment + +```{note} +To follow this tutorial you need to have root access to a Linux computer +with Singularity installed, e.g. your personal laptop/workstation. +Please follow the installation +[instructions](https://sylabs.io/guides/3.7/user-guide/quick_start.html) +from the Singularity documentation. +``` + +Sometimes we encounter applications that have system dependencies which are incompatible +with the global environment on the cluster. This can happen for instance if you download +a precompiled binary from an online archive which has been built for a specific OS version +or depends on a system library which is not available, or if you want to compile your own +application with some non-standard dependencies. One way to resolve such issues is to +containerize the appropriate environment and run/compile your application _through_ this +container on the cluster. In the following examples we will demonstrate such a work flow. + +## Hello world example + +This example demonstrates: +1. how to write a simple Singularity definition file +2. how to install system packages on top of a standard OS base image +3. how to build the container on your laptop +4. how to run commands through the container environment on the cluster + +In this example we will create a very simple container environment with a Ubuntu-16.04 +operating system and a GNU compiler. We will then use this environment to compile a +simple program on the cluster. + +**Writing the definition file** + +We start with the following definition file (we call it `example.def`) +``` +Bootstrap: docker +From: ubuntu16.04 + +%post + apt-get update && apt-get install -y g++ +``` + +This recipe will pull the `ubuntu16.04` image from the [docker](https://hub.docker.com) +registry and install the GNU C++ compiler using the Ubuntu package manager. Any system +package that is available for the base OS can be installed in this way. Other common +`Bootstrap` options include +`library` for the Singularity [Container Library](https://cloud.sylabs.io/library), +`shub` for [Singularity-Hub](https://singularity-hub.org) or +`localimage` if you want to build on top of another image located on your computer. + +```{tip} +You can find much more on Singularity definition files [here](https://sylabs.io/guides/3.7/user-guide/definition_files.html). +``` + +**Building the container** + +We can now build the container with the following command (you need sudo rights for this step): +```console +[me@laptop]$ sudo singularity build example.sif example.def + +[... lots of output ...] + +INFO: Adding environment to container +INFO: Creating SIF file... 
+INFO: Build complete: example.sif +``` + +**Running the container** + +Once `example.sif` is generated, we can `scp` the container file to the cluster: + +```console +[me@laptop]$ scp example.sif me@saga.sigma2.no +``` + +First we check the default `g++` compiler on the cluster: +```console +[me@login-1.SAGA ~]$ g++ --version +g++ (GCC) 4.8.5 20150623 (Red Hat 4.8.5-44) +Copyright (C) 2015 Free Software Foundation, Inc. +This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +``` + +Then we check the `g++` version in the container by running the command through +`singularity exec`: +```console +[me@login-1.SAGA ~]$ singularity exec example.sif g++ --version +g++ (Ubuntu 5.4.0-6ubuntu1~16.04.12) 5.4.0 20160609 +Copyright (C) 2015 Free Software Foundation, Inc. +This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +``` + +We write a simple `hello-world.cpp` program: +``` +#include + +int main() { + std::cout << "Hello World!" << std::endl; + return 0; +} +``` +and compile it _through_ the container environment: +```console +[me@login-1.SAGA ~]$ singularity exec example.sif g++ hello-world.cpp +[me@login-1.SAGA ~]$ singularity exec example.sif ./a.out +Hello World! +``` + +Remember that you also need to *run* the program through the container if it is +dynamically linked to some of the containerized libraries. + + +## Real world example: pdflatex + +This example demonstrates: +1. how to build a container from a definition file +2. how to set environment variables inside the container +4. how to document your container +3. how to make your container look like an executable application +4. how to run your container application on the cluster + +[Latex](https://www.latex-project.org) is a software package with a plethora of different +package options which can easily mess up your global environment. It is something that is +typically not installed on compute clusters, but could still be useful e.g. for building +code documentation. In this example we will create a fully functional container for the +`pdflatex` command for building PDFs from `tex` files. + +**Writing the definition file** + +``` +Bootstrap: library +From: ubuntu:20.04 + +%post + apt-get install -y software-properties-common + add-apt-repository universe + apt-get update -y + apt-get install -y texlive texlive-fonts-extra + +%environment + export LC_ALL=C + +%runscript + pdflatex $@ + +%labels + Author Me + Description PDF latex on a Ubuntu-20.04 base image + Version v1.0.0 + +%help + How to run the container on a tex file: + $ ./.sif .tex +``` + +Here we use the Ubuntu package manager to install a few `texlive` packages on top of a +Ubuntu-20.04 base image, and we set the `LC_ALL` environment variable inside the container +at run time. The `%runscript` section specifies the commands to be run inside the container +when you launch the image file as an executable, where the `$@` will capture an argument string. 
+In this particular example it means that we can run the image as +```console +$ ./.sif .tex +``` +which will be equivalent of running the given `%runscript` command (`pdflatex` in this case) +through the container with `singularity exec`: +```console +$ singularity exec .sif pdflatex .tex +``` + +Finally, we add a few labels (accessible through `singularity inspect .sif`) and a help +string (accessible through `singularity run-help .sif`) for documentation. + +**Building the container** + +We build the container on a local computer (requires sudo rights), where we have called the +definition and image files `pdflatex.def` and `pdflatex.sif`, respectively: +```console +[me@laptop]$ sudo singularity build pdflatex.sif pdflatex.def + +[... lots of output ...] + + This may take some time... done. +INFO: Adding help info +INFO: Adding labels +INFO: Adding environment to container +INFO: Adding runscript +INFO: Creating SIF file... +INFO: Build complete: pdflatex.sif +``` + +**Inpecting the container** + +When the image is ready we can inspect the metadata that we put into it + +```console +[me@laptop]$ singularity inspect pdflatex.sif +Author: Me +Description: PDF latex on a Ubuntu-20.04 base image +Version: v1.0.0 +org.label-schema.build-arch: amd64 +org.label-schema.build-date: Thursday_10_June_2021_13:12:27_CEST +org.label-schema.schema-version: 1.0 +org.label-schema.usage: /.singularity.d/runscript.help +org.label-schema.usage.singularity.deffile.bootstrap: library +org.label-schema.usage.singularity.deffile.from: ubuntu:20.04 +org.label-schema.usage.singularity.deffile.mirrorurl: http://us.archive.ubuntu.com/ubuntu/ +org.label-schema.usage.singularity.deffile.osversion: focal +org.label-schema.usage.singularity.runscript.help: /.singularity.d/runscript.help +org.label-schema.usage.singularity.version: 3.7.0 +``` + +```console +[me@laptop]$ singularity run-help pdflatex.sif + How to run the container on a tex file: + $ ./.sif .tex +``` + +**Running the container** + +When we are happy with the container we can move it to any machine where we would like +to run `pdflatex`. Here we `scp` to Saga and log in with `-X` in order to browse the +produced PDF: +```console +[me@laptop]$ scp pdflatex.sif me@saga.sigma2.no +[me@laptop]$ ssh -X me@saga.sigma2.no +``` +We write a simple `hello-world.tex` file +``` +\documentclass[12pt]{article} +\begin{document} +Hello World! +\end{document} +``` +and run our container on it: +```console +[me@login-1.SAGA ~]$ ./pdflatex.sif hello-world.tex +This is pdfTeX, Version 3.14159265-2.6-1.40.20 (TeX Live 2019/Debian) (preloaded format=pdflatex) restricted \write18 enabled. +entering extended mode +(./hello-world.tex +LaTeX2e <2020-02-02> patch level 2 +L3 programming layer <2020-02-14> +(/usr/share/texlive/texmf-dist/tex/latex/base/article.cls +Document Class: article 2019/12/20 v1.4l Standard LaTeX document class +(/usr/share/texlive/texmf-dist/tex/latex/base/size12.clo)) +(/usr/share/texlive/texmf-dist/tex/latex/l3backend/l3backend-pdfmode.def) +No file hello-world.aux. +[1{/var/lib/texmf/fonts/map/pdftex/updmap/pdftex.map}] (./hello-world.aux) ) +Output written on hello-world.pdf (1 page, 9893 bytes). +Transcript written on hello-world.log. +``` + +Finally, you can inspect the produced file e.g. in a browser: +```console +[me@login-1.SAGA ~]$ firefox hello-world.pdf +``` +where you will hopefully see an almost blank page with the words "Hello World!" written. 
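
Building documents this way also works in a batch job, for instance as part of a documentation build step. A minimal sketch, where the account name, file names and resource values are placeholders to adapt:

```bash
#!/bin/bash
#SBATCH --account=<project>
#SBATCH --job-name=pdflatex
#SBATCH --ntasks=1
#SBATCH --mem-per-cpu=1G
#SBATCH --time=00:10:00

# Run pdflatex through the container on a tex file in the submit directory
./pdflatex.sif hello-world.tex
```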
diff --git a/_sources/code_development/guides/container_mpi.md.txt b/_sources/code_development/guides/container_mpi.md.txt new file mode 100644 index 000000000..eac4497f0 --- /dev/null +++ b/_sources/code_development/guides/container_mpi.md.txt @@ -0,0 +1,112 @@ +--- +orphan: true +--- + +# Building MPI containers + +The following example will demonstrate how to _build_ Singularity containers on +your laptop which are suitable for execution on our HPC systems (Saga, Fram, Betzy). +If you are only interested in _running_ existing containers, +see {ref}`Running containers `. + +## Creating a Singularity container from a definition file + +**We do this step on our own laptop/computer, not on the cluster**. +This is because Singularity needs root rights to build the container. +We will later +upload the generated container file to the cluster. + +You need to have Singularity installed on your laptop for this to work +(follow e.g. https://sylabs.io/guides/3.3/user-guide/installation.html). + +We start with the following definitions file (`example.def`; this is a simplified +version based on https://sylabs.io/guides/3.3/user-guide/mpi.html and the example provided there): +``` +Bootstrap: docker +From: ubuntu:latest + +%environment + export OMPI_DIR=/opt/ompi + +%post + apt-get update && apt-get install -y wget git bash gcc gfortran g++ make file + export OMPI_DIR=/opt/ompi + export OMPI_VERSION=4.0.1 + export OMPI_URL="https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-$OMPI_VERSION.tar.bz2" + mkdir -p /tmp/ompi + mkdir -p /opt + cd /tmp/ompi && wget -O openmpi-$OMPI_VERSION.tar.bz2 $OMPI_URL && tar -xjf openmpi-$OMPI_VERSION.tar.bz2 + cd /tmp/ompi/openmpi-$OMPI_VERSION && ./configure --prefix=$OMPI_DIR && make install +``` + +From this we build a container (we are still on the laptop, not on the cluster): +``` +$ sudo singularity build example.sif example.def +``` + +This takes a couple of minutes: +``` +[... lots of output ...] + +INFO: Adding environment to container +INFO: Creating SIF file... +INFO: Build complete: example.sif +``` + +Once `example.sif` is generated, we can `scp` +the container file to the cluster. + + +## Running the container on multiple nodes + +We assume that we have the container file `example.sif` from the step before on +the cluster. We will also fetch `mpi_hello_world.c` from +https://mpitutorial.com/tutorials/mpi-hello-world/. + +We are ready to test it out with the following job script on Saga (adjust +"myaccount"; on Fram/Betzy you will need to remove the line containing `#SBATCH +--mem-per-cpu=1000M` but the rest should work as is): +```bash +#!/bin/bash + +#SBATCH --account=myaccount +#SBATCH --job-name=singularity-test +#SBATCH --nodes=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --mem-per-cpu=1000M +#SBATCH --time=00:03:00 + +singularity exec example.sif /opt/ompi/bin/mpirun --version +singularity exec example.sif /opt/ompi/bin/mpicc mpi_hello_world.c + +module purge +module load foss/2020a + +mpirun --bind-to core -n ${SLURM_NTASKS} singularity exec example.sif ./a.out +``` + +The interesting part of the output is: +``` +mpirun (Open MPI) 4.0.1 + +[...] 
+ +Hello world from processor c2-4, rank 2 out of 16 processors +Hello world from processor c2-4, rank 3 out of 16 processors +Hello world from processor c2-4, rank 0 out of 16 processors +Hello world from processor c2-4, rank 1 out of 16 processors +Hello world from processor c2-21, rank 14 out of 16 processors +Hello world from processor c2-21, rank 13 out of 16 processors +Hello world from processor c2-21, rank 12 out of 16 processors +Hello world from processor c2-21, rank 15 out of 16 processors +Hello world from processor c2-18, rank 8 out of 16 processors +Hello world from processor c2-18, rank 11 out of 16 processors +Hello world from processor c2-18, rank 9 out of 16 processors +Hello world from processor c2-18, rank 10 out of 16 processors +Hello world from processor c2-11, rank 6 out of 16 processors +Hello world from processor c2-11, rank 4 out of 16 processors +Hello world from processor c2-11, rank 7 out of 16 processors +Hello world from processor c2-11, rank 5 out of 16 processors +``` + +Looks like it's working! diff --git a/_sources/code_development/guides/container_openacc.md.txt b/_sources/code_development/guides/container_openacc.md.txt new file mode 100644 index 000000000..ac71e580e --- /dev/null +++ b/_sources/code_development/guides/container_openacc.md.txt @@ -0,0 +1,97 @@ +--- +orphan: true +--- + +# Container with GPU support (OpenACC) + +```{note} +To follow this tutorial you need to have root access to a Linux computer +with Singularity installed, e.g. your personal laptop/workstation. +Please follow the installation +[instructions](https://sylabs.io/guides/3.7/user-guide/quick_start.html) +from the Singularity documentation. +``` + +This example demonstrates: +1. how to build a (Nvidia) GPU container using a Nvidia GPU Cloud (NGC) base image +2. how to copy a file into the container +3. how to compile a simple OpenACC program inside the container +4. how to run the container on a cluster with GPU resourses + +This example is based on the {ref}`OpenACC tutorial `, and we will simply copy +the fully optimized source code for the Jacobi iteration to serve as our GPU application: + +```{eval-rst} +:download:`jacobi_optimized.c <./openacc/jacobi_optimized.c>` +``` + +**Writing the definition file** + +For this example we will use a base image from the +[Nvidia GPU Cloud](https://ngc.nvidia.com/), +which includes everything we need for this simple application. The NGC hosts a +wide variety of different Nvidia based container images, supporting different +CUDA versions and operating systems. We will choose the +[NVIDIA HPC SDK](https://ngc.nvidia.com/catalog/containers/nvidia:nvhpc) +container since we need the `nvc` compiler for OpenACC (the standard NVIDIA CUDA +image does not have this). We select the latest development package on Ubuntu-20.04: +``` +Bootstrap: docker +From: nvcr.io/nvidia/nvhpc:21.5-devel-cuda11.3-ubuntu20.04 + +%post + apt-get update + +%files + jacobi_optimized.c /work/jacobi_optimized.c + +%post + nvc -g -fast -acc -Minfo=accel -o /usr/local/bin/jacobi /work/jacobi_optimized.c +``` + +Here we assume that we have downloaded the source file `jacobi_optimized.c` from the link above +and put it in the same directory as the definition file. We then copy the source file into the +container with the `%files` section, and then compile it with `nvc` using the same string of options +as in the original tutorial while putting the output executable into a runtime path (`/usr/local/bin`). 
+ +```{tip} +We need the `devel` base image in order to compile the application inside the container, +but once it's built we can get away with a `runtime` base container for running it. +Check the official documentation on how to do +[multi-stage builds](https://sylabs.io/guides/3.7/user-guide/definition_files.html#multi-stage-builds), +which can significantly reduce the size of the final container image. +``` + +**Building the container** + +We can now build the container with the following command (you need sudo rights for this step): +```console +[me@laptop]$ sudo singularity build example-openacc.sif example-openacc.def + +[... lots of output ...] + +INFO: Creating SIF file... +INFO: Build complete: example-openacc.sif +``` + +**Running the container** + +Once `example-openacc.sif` is generated, we can `scp` the container file to the cluster: + +```console +[me@laptop]$ scp example-openacc.sif me@saga.sigma2.no +``` + +We test the container in an interactive job on a GPU node: +```console +[me@login-1.SAGA]$ srun --ntasks=1 --gpus-per-task=1 --mem=2G --time=10:00 --partition=accel --account= --pty bash +[me@c7-8.SAGA]$ time singularity exec --nv example-openacc.sif jacobi +real 0m3.748s +user 0m1.715s +sys 0m1.770s +``` + +Remember the `--nv` flag to expose the Nvidia GPU resources to the container. +Here we can see that the execution time is comparable to what is reported in the +original {ref}`tutorial `. + diff --git a/_sources/code_development/guides/containers.md.txt b/_sources/code_development/guides/containers.md.txt new file mode 100644 index 000000000..50c26b4be --- /dev/null +++ b/_sources/code_development/guides/containers.md.txt @@ -0,0 +1,309 @@ +--- +orphan: true +--- + +(running-containers)= +# Containers on NRIS HPC systems + +```{note} +Currently, [Singularity](https://sylabs.io/singularity/) is the only supported container +solution on our HPC systems (Saga, Fram, Betzy). However, since Singularity can build +containers from Docker images, it is also possible to run [Docker](https://www.docker.com/) +containers through Singularity. +``` +## What is a container image +Container image is a package with all cargo needed for a software to work. This +includes the operating system, system packages, libraries and applications as +a single unit. It only uses the host operating systems kernel. + + +## What is not covered in this document +We are showing how to use existing container images on our systems as regular users. +Operations that require root access, like building or making changes to an images +not discussed. + +## When to use containers on NRIS HPC systems + +```{note} +Please let us know if you find more reasons for using containers +``` + - If you have a software stack or a pipeline already setup somewhere else and + you want to bring it as it is to one of the HPC systems + - Containers give the users the flexibility to bring a full software stack to the + cluster which has already been set up, which can make software installations and + dependencies more reproducible and more portable across clusters. + - The software you want to use is only available as a container image + - You need to use system level installations, e.g. the procedure involved + `apt-get install SOMETHING` or similar (`yum`, `rpm`, etc). + - You have a old software that needs some older dependencies. + - You need a specific version of a software to run another software, e.g. CUDA. 
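
The same run can also be submitted as a regular batch job instead of an interactive session; a minimal sketch, where the account and resource values are placeholders:

```bash
#!/bin/bash
#SBATCH --account=<project>
#SBATCH --job-name=container-openacc
#SBATCH --partition=accel
#SBATCH --gpus-per-task=1
#SBATCH --ntasks=1
#SBATCH --mem-per-cpu=2G
#SBATCH --time=00:10:00

# --nv exposes the Nvidia GPU driver and devices inside the container
singularity exec --nv example-openacc.sif jacobi
```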
+
+## When not to use containers on NRIS HPC systems
+ - If the software you are planning to use is already installed as a module (or a set of
+   modules), then it is better to use those modules
+ - Windows containers: on NRIS HPC systems only containers that use a Linux kernel will
+   work
+ - If you do not know what the container actually does, e.g. you found a command on
+   the internet and just want to try it out
+
+## How to access singularity on NRIS HPC systems
+Singularity is already installed globally on all our systems, and should be
+immediately available on your command line (no `module load` necessary):
+```console
+[SAGA]$ singularity --version
+singularity version 3.6.4-1.el7
+```
+## How to find container images
+ - [Docker hub](https://hub.docker.com/)
+ - [NVidia](https://ngc.nvidia.com/catalog/containers)
+ - [Singularity Cloud](https://cloud.sylabs.io/library)
+ - [Singularity Hub](https://singularity-hub.org/)
+ - [RedHat](https://quay.io/)
+ - [BioContainers](https://biocontainers.pro/)
+ - [AMD](https://www.amd.com/en/technologies/infinity-hub)
+ - From software developers
+
+
+## How to get container images
+
+Singularity images can be fetched from the web using the `singularity pull` command,
+which will download a SIF (Singularity Image Format) file to your current directory.
+Notice that with Singularity, an image is just a simple binary file, and there's nothing
+special about the directory in which you run the `singularity pull` command. This means
+that you can move your image around as you please, and even `scp` it to a different
+machine and execute it there (as long as you have Singularity installed, of course).
+
+There are a number of different online repositories for hosting images; some of the
+more common ones are listed below. Notice how you can pull Docker images
+directly from Docker-Hub using Singularity.
+
+```console
+#Fetching from a [Singularity](https://singularityhub.github.io/) registry:
+$ singularity pull --name hello-world.sif shub://vsoch/hello-world
+
+#Fetching from a [Sylabs](https://cloud.sylabs.io/library) registry:
+$ singularity pull --name alpine.sif library://alpine:latest
+
+#Fetching from a [Docker-Hub](https://hub.docker.com/) registry:
+$ singularity pull --name alpine.sif docker://alpine:latest
+
+#Fetching from a [Quay](https://quay.io) registry:
+$ singularity pull --name openmpi-i8.sif docker://quay.io/bast/openmpi-i8:4.0.4-gcc-9.3.0
+```
+
+```{note}
+
+`singularity run` vs `singularity exec`
+- [singularity exec](https://sylabs.io/guides/3.1/user-guide/cli/singularity_exec.html):
+  Run a command within a container
+- [singularity run](https://sylabs.io/guides/3.1/user-guide/cli/singularity_run.html):
+  Run the user-defined default command within a container
+
+```
+
+Example:
+```console
+
+[SAGA]$ singularity pull --name hello-world.sif shub://vsoch/hello-world
+The image created (hello-world.sif) has a user defined command called "rawr.sh"
+
+[SAGA]$ singularity exec hello-world.sif cat /singularity
+#!/bin/sh
+
+exec /bin/bash /rawr.sh
+[SAGA]$ singularity exec hello-world.sif cat /rawr.sh
+#!/bin/bash
+
+echo "RaawwWWWWWRRRR!! Avocado!"
+
+
+[SAGA]$ singularity run hello-world.sif
+RaawwWWWWWRRRR!! Avocado!
+
+With run, the default command is what is executed, even if we ask for something else:
+[SAGA]$ singularity run hello-world.sif cat /etc/os-release
+RaawwWWWWWRRRR!! Avocado!
+So we need to use exec to get the expected result:
+[SAGA]$ singularity exec hello-world.sif cat /etc/os-release
+NAME="Ubuntu"
+VERSION="14.04.6 LTS, Trusty Tahr"
+...
+..
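+
+# (Added illustration) 'singularity inspect --runscript' prints the same default
+# run-script that 'cat /singularity' showed above, without executing the container:
+[SAGA]$ singularity inspect --runscript hello-world.sif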
+
+```
+
+
+Following are some example use cases we have seen on NRIS HPC systems.
+
+```{note}
+Example 1: A user wants to use a different version of TensorFlow than
+the one installed on SAGA. So she googles and ends up here:
+[https://www.tensorflow.org/install](https://www.tensorflow.org/install)
+There she finds the following command sequence
+```
+```console
+ docker pull tensorflow/tensorflow:latest  # Download latest stable image
+ docker run -it -p 8888:8888 tensorflow/tensorflow:latest-jupyter  # Start Jupyter server
+```
+But she knows that we do not have Docker on SAGA, so she uses Singularity to pull
+the image instead (yes, it is possible to pull Docker images using Singularity):
+```console
+[SAGA]$ singularity pull docker://tensorflow/tensorflow:latest
+```
+To test it, she prints the TensorFlow version:
+```console
+[SAGA]$ singularity run tensorflow_latest.sif python -c "import tensorflow as tf;print(tf.__version__)"
+```
+
+
+```{note}
+Example 2:
+A user needs to run software that only works on a specific version of Ubuntu
+```
+
+```console
+[SAGA]$ singularity pull docker://bioperl/bioperl
+[SAGA]$ singularity exec bioperl_latest.sif cat /etc/os-release
+  NAME="Ubuntu"
+  VERSION="14.04.5 LTS, Trusty Tahr"
+
+[SAGA]$ singularity exec bioperl_latest.sif perl -e 'use Bio::SeqIO; print join "\n", %INC; print "\n"'
+  base.pm
+  /usr/share/perl/5.18/base.pm
+  File/Path.pm
+  /usr/share/perl/5.18/File/Path.pm
+
+
+```
+```{warning}
+If a ready-made image with the software is not available, you need to pull an
+Ubuntu image to a machine where you have root access, install the software,
+repackage the image and copy it to SAGA. This step is not covered here.
+If you want to learn how to _build_ your own containers,
+see our code development {ref}`guides `.
+```
+
+## Singularity in Job scripts
+
+This example demonstrates:
+1. how to run a container in a job script
+2. how to execute a command from inside a container: `singularity exec <image-name>.sif <command>`
+3. that the container runs its own operating system (using the same kernel as the host)
+4. that your `$HOME` directory is mounted in the container by default, which means that it
+will have access to input files etc. located somewhere in this directory (you will have read/write
+permissions according to your own user)
+
+First we pull a "hello world" Singularity image from Singularity-Hub. We need to do this
+from the login node, before the job is submitted, i.e. we do not pull
+images from within a job.
+```console
+[SAGA]$ singularity pull --name hello-world.sif shub://vsoch/hello-world
+```
+
+Once we have the SIF file, we can test it out with the following
+job script on Saga (adjust `<project-number>`; on Fram/Betzy you will need to remove
+the line containing `#SBATCH --mem-per-cpu=1000M` but the rest should work as
+is):
+
+```bash
+#!/bin/bash
+
+#SBATCH --account=<project-number>
+#SBATCH --job-name=singularity-test
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --mem-per-cpu=1000M
+#SBATCH --time=00:03:00
+
+echo "check that we can read the current directory from the container:"
+singularity exec hello-world.sif ls
+
+echo
+echo "what is the operating system on the host?"
+cat /etc/os-release
+
+echo
+echo "what is the operating system in the container?"
+singularity exec hello-world.sif cat /etc/os-release
+```
+
+This produces the following output. Notice how in the container we are on an
+Ubuntu operating system while the host is CentOS:
+```
+check that we can read the current directory from the container:
+hello-world.sif
+run.sh
+slurm-1119935.out
+
+what is the operating system on the host?
+NAME="CentOS Linux"
+VERSION="7 (Core)"
+ID="centos"
+ID_LIKE="rhel fedora"
+VERSION_ID="7"
+PRETTY_NAME="CentOS Linux 7 (Core)"
+ANSI_COLOR="0;31"
+CPE_NAME="cpe:/o:centos:centos:7"
+HOME_URL="https://www.centos.org/"
+BUG_REPORT_URL="https://bugs.centos.org/"
+
+CENTOS_MANTISBT_PROJECT="CentOS-7"
+CENTOS_MANTISBT_PROJECT_VERSION="7"
+REDHAT_SUPPORT_PRODUCT="centos"
+REDHAT_SUPPORT_PRODUCT_VERSION="7"
+
+what is the operating system in the container?
+NAME="Ubuntu"
+VERSION="14.04.6 LTS, Trusty Tahr"
+ID=ubuntu
+ID_LIKE=debian
+PRETTY_NAME="Ubuntu 14.04.6 LTS"
+VERSION_ID="14.04"
+HOME_URL="http://www.ubuntu.com/"
+SUPPORT_URL="http://help.ubuntu.com/"
+BUG_REPORT_URL="http://bugs.launchpad.net/ubuntu/"
+```
+
+```{note}
+The behavior described in the above example is only accurate if you run it from somewhere within your `$HOME`
+directory. If you run it from somewhere else, like `/cluster/projects/` or `/cluster/work/` you will *not*
+enter the container environment from the current directory, but rather from your root `$HOME` directory, i.e.
+the output from the first `ls` command in the script will be equivalent to `$ ls $HOME`. If you want to access
+files that are *not* located in your `$HOME` you'll need to `--bind` that directory explicitly as described below.
+```
+
+## Access project area from the container
+
+A Singularity container can access your home directory,
+but to access the project directory we need to bind mount it first.
+
+Let's try it out:
+```console
+[SAGA]$ head -n2 data/input.txt
+1
+2
+[SAGA]$ singularity exec hello-world.sif head -n2 data/input.txt
+/usr/bin/head: cannot open 'data/input.txt' for reading: No such file or directory
+[SAGA]$ pwd
+/cluster/projects/nnxxxxk/containers
+[SAGA]$ singularity exec hello-world.sif head -n2 /cluster/projects/nnxxxxk/containers/data/input.txt
+/usr/bin/head: cannot open '/cluster/projects/nnxxxxk/containers/data/input.txt' for reading: No such file or directory
+```
+
+Now we bind the project directory into the container, and the file becomes accessible:
+
+```console
+[SAGA]$ singularity exec --bind /cluster/projects/nnxxxxk/containers/data:/data hello-world.sif head -n2 /data/input.txt
+1
+2
+
+```
+
+## Real world container examples
+
+```{eval-rst}
+.. toctree::
+   :maxdepth: 1
+
+   containers/bigdft.md
+```
+## Other notes
+### singularity cache
diff --git a/_sources/code_development/guides/containers/bigdft.md.txt b/_sources/code_development/guides/containers/bigdft.md.txt
new file mode 100644
index 000000000..0eb6e0a22
--- /dev/null
+++ b/_sources/code_development/guides/containers/bigdft.md.txt
@@ -0,0 +1,188 @@
+(bigdft-cuda-example)=
+# BigDFT with MPI and CUDA
+
+```{note}
+Parts of the following example require access to NVIDIA GPU resources. It has been tested
+successfully on Saga, but there's no guarantee it will work seamlessly on other systems.
+```
+
+This example demonstrates:
+1. how to bind mount a work directory into the container
+2. how to copy files from the container to the host
+3. how to run an interactive shell inside the container
+4. how to launch a hybrid MPI+OpenMP container using the host MPI runtime
+5. how to launch a CUDA container
+
+[BigDFT](https://bigdft-suite.readthedocs.io/en/latest) is an electronic structure code targeting large molecular
+systems with density functional theory. The program is written for heterogeneous computing
+environments with support for MPI, OpenMP and CUDA. This makes for a good test case
+as a more advanced container application.
All the following is based on the official +tutorial which can be found [here](https://ngc.nvidia.com/catalog/containers/hpc:bigdft). + +A BigDFT Docker image with CUDA support is provided by the +[NVIDIA GPU Cloud (NGC)](https://ngc.nvidia.com/catalog) +and can be built using Singularity with the following command (here into a folder called +`$HOME/containers`, but this is arbitrary): +```console +[SAGA]$ singularity pull --name $HOME/containers/bigdft-cuda.sif docker://nvcr.io/hpc/bigdft:cuda10-ubuntu1804-ompi4-mkl +``` + +```{warning} +Container images are typically a few GiB in size, so you might want to keep your +containers in a project storage area to avoid filling up your limited `$HOME` disk quota. +Also beware that pulled images are cached, by default under `$HOME/.singularity/cache`. +This means that if you pull the same image twice, it will be immediately available from +the cache without downloading/building, but it also means that it will consume disk space. +To avoid this you can either add `--disable-cache` to the `pull` command, change the cache +directory with the `SINGULARITY_CACHEDIR` environment variable, or clean up the cache +regularly with `singularity cache clean`. +``` + +## MPI + OpenMP version + +The BigDFT container comes bundled with a couple of test cases that can be used to verify +that everything works correctly. We will start by extracting the necessary input files +for a test case called FeHyb which can be found in the `/docker/FeHyb` directory _inside_ +the container (starting here with the non-GPU version): +```console +[SAGA]$ mkdir $HOME/bigdft-test +[SAGA]$ singularity exec --bind $HOME/bigdft-test:/work-dir $HOME/containers/bigdft-cuda.sif /bin/bash -c "cp -r /docker/FeHyb/NOGPU /work-dir" +``` +Here we first create a job directory for our test calculation on the host called `$HOME/bigdft-test` +and then bind mount this to a directory called `/work-dir` _inside_ the container. Then we execute +a bash command in the container to copy the example files from `/docker/FeHyb/NOGPU` into this +work directory, which is really the `$HOME/bigdft-test` directory on the host. You should now see a +`NOGPU` folder on the host file system with the following content: +```console +[SAGA]$ ls $HOME/bigdft-test/NOGPU +input.yaml log.ref.yaml posinp.xyz psppar.Fe tols-BigDFT.yaml +``` + +```{note} +Container images are read-only, so it is not possible to copy things _into_ the container +or change it in any other way without sudo access on the host. This is why all container +_construction_ needs to be done on your local machine where you have such privileges, see [guides](code_development) for more info on building containers. +``` + +The next thing to do is to write a job script for the test calculation, we call it +`$HOME/bigdft-test/NOGPU/FeHyb.run`: +```bash +#!/bin/bash + +#SBATCH --account= +#SBATCH --job-name=FeHyb-NOGPU +#SBATCH --ntasks=2 +#SBATCH --cpus-per-task=10 +#SBATCH --mem-per-cpu=1G +#SBATCH --time=00:10:00 + +# Need to provide a compatible MPI library on the host for launching the calculation +module load OpenMPI/4.0.3-GCC-9.3.0 + +# Run the bigdft command inside the bigdft-cuda.sif container +# We assume we are already in the folder containing the input.yaml +# file and we bind the current directory into the container +mpirun --bind-to none singularity exec --bind $PWD $HOME/containers/bigdft-cuda.sif bigdft + +exit 0 +``` + +The `--bind-to none` option is necessary to avoid all OpenMP threads landing on the +same CPU core. 
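+
+If you want to pin the number of OpenMP threads per MPI rank explicitly, one optional
+addition (a sketch, not part of the original recipe) is to export `OMP_NUM_THREADS`
+from the Slurm allocation before the `mpirun` line in the job script above:
+```bash
+# one OpenMP thread per allocated core for each of the MPI ranks
+export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
+```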
Now set `` to something appropriate and launch the job +```console +[SAGA]$ sbatch FeHyb.run +``` +It should not take more than a minute to finish. After completion, the Slurm output +file should contain the following line (in addition to the usual Slurm statistics output): +```console + log of the run will be written in logfile: ./log.yaml +``` +To check that the calculation succeeded, we can inspect the `log.yaml` output file, +or we can run a test script provided by the container. First start an interactive shell +inside the container (you should run this command in the job directory containing the +`log.yaml` file so that we find it when we bind to the current directory): +```console +[SAGA]$ singularity shell --bind $PWD $HOME/containers/bigdft-cuda.sif +``` +Now you have stepped into the container and your shell prompt should have changed from `$` +to `Singularity>`. Now run the command: +```console +Singularity> python /usr/local/bigdft/lib/python2.7/site-packages/fldiff_yaml.py -d log.yaml -r /docker/FeHyb/NOGPU/log.ref.yaml -t /docker/FeHyb/NOGPU/tols-BigDFT.yaml +``` +which hopefully reports success, something like this: +``` +--- +Maximum discrepancy: 2.4000000000052625e-09 +Maximum tolerance applied: 1.1e-07 +Platform: c1-33 +Seconds needed for the test: 11.92 +Test succeeded: True +Remarks: !!map + Report: {Document: 0, Elapsed Time (s): 11.923177122, Failed_checks: 0, Max_Diff: 2.4000000000052625e-09, + Memory_leaks (B): 0, Missed_items: 0} +``` +To exit the container, type `exit` or press `Ctrl-D`. + +## CUDA version + +We will now run the same example using the CUDA version of BigDFT. We again copy the +bundled input files from within the container, this time the `GPU` directory (see +example above for explanation of the commands): + +```console +[SAGA]$ singularity exec --bind $HOME/bigdft-test:/work-dir $HOME/containers/bigdft-cuda.sif /bin/bash -c "cp -r /docker/FeHyb/GPU /work-dir" +``` +which should contain the following files: +```console +[SAGA]$ ls $HOME/bigdft-test/GPU +input.yaml posinp.xyz psppar.Fe +``` +In order to run this example correctly we need to ask for GPU resources in the job +script, here we call it `$HOME/bigdft-test/GPU/FeHyb.run`. We request a single CPU +core (`--ntasks=1`) with an associated GPU accelerator (`--gpus=1`). Also remember +to use the `accel` partition: +```bash +#!/bin/bash + +#SBATCH --account= +#SBATCH --job-name=FeHyb-GPU +#SBATCH --ntasks=1 +#SBATCH --mem-per-cpu=10G +#SBATCH --gpus=1 +#SBATCH --partition=accel +#SBATCH --time=00:10:00 + +# Run the bigdft command inside the bigdft-cuda.sif container +# We assume we are already in the folder containing the input.yaml +# file and we bind the current directory into the container +singularity exec --nv --bind $PWD $HOME/containers/bigdft-cuda.sif bigdft + +exit 0 +``` +With BigDFT, the CUDA request is handled through the input file, so we run the same +`bigdft` executable as before. There is an extra `--nv` option for the `singularity exec` +command though, which will make the container aware of the available NVIDIA hardware. 
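+
+Before submitting, it can be useful to check that the container actually sees the GPU.
+A quick check (a sketch; run it on a GPU node, e.g. in an interactive job on the `accel`
+partition, since the login nodes have no GPU):
+```console
+[SAGA]$ singularity exec --nv $HOME/containers/bigdft-cuda.sif nvidia-smi
+```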
+
+Set `<project-number>` to something appropriate and launch the job
+```console
+[SAGA]$ sbatch FeHyb.run
+```
+We can again check that the calculation completed successfully by shell-ing into the
+container and running the diff script (note that we still compare against the `NOGPU`
+reference as there is no specific GPU reference available in the container):
+```console
+[SAGA]$ singularity shell --bind $PWD $HOME/containers/bigdft-cuda.sif
+Singularity> python /usr/local/bigdft/lib/python2.7/site-packages/fldiff_yaml.py -d log.yaml -r /docker/FeHyb/NOGPU/log.ref.yaml -t /docker/FeHyb/NOGPU/tols-BigDFT.yaml
+---
+Maximum discrepancy: 2.4000000000052625e-09
+Maximum tolerance applied: 1.1e-07
+Platform: c7-6
+Seconds needed for the test: 11.85
+Test succeeded: True
+Remarks: !!map
+  Report: {Document: 0, Elapsed Time (s): 11.84816281, Failed_checks: 0, Max_Diff: 2.4000000000052625e-09,
+    Memory_leaks (B): 0, Missed_items: 0}
+```
+As we can see from the timings, this small test case runs more or less equally fast (11-12 sec)
+on a single GPU as on 2x10 CPU cores. For comparison, the same example takes about 90 sec
+to complete on a single CPU core.
+
diff --git a/_sources/code_development/guides/converting_acc2omp/openacc2openmp.md.txt b/_sources/code_development/guides/converting_acc2omp/openacc2openmp.md.txt
new file mode 100644
index 000000000..65de2acad
--- /dev/null
+++ b/_sources/code_development/guides/converting_acc2omp/openacc2openmp.md.txt
@@ -0,0 +1,413 @@
+---
+orphan: true
+---
+
+(acc2omp)=
+
+# Porting OpenACC to OpenMP offloading
+
+# Summary
+
+This documentation is designed for beginners in Graphics Processing Unit (GPU) programming who want to get familiar with the OpenACC and OpenMP offloading models. Here we present an overview of these two programming models as well as of the GPU-architectures. Specifically, we provide some insight into the functionality of these models and perform experiments involving different directives, and we discuss their performance. This is achieved through the use of a mini-application based on solving the Laplace equation numerically. The experiments reveal the benefit of using GPUs, which in our case manifests as a performance increase by almost a factor of 52. We further carry out a comparative study between the OpenACC and OpenMP models with the aim of porting OpenACC to OpenMP on heterogeneous systems. In this context, we present a short overview of the open-source OpenACC compiler Clacc, which is designed around translating OpenACC to OpenMP in Clang.
+
+This documentation ultimately aims at initiating developers'/users' interest in GPU-programming. By the end of this documentation, we therefore expect developers/users to be able to:
+
+* Recognise the benefits of GPU-programming.
+* Acquire some basic knowledge of the GPU-architecture and the functionality of the underlying models.
+* Use appropriate constructs and clauses in either programming model to offload compute regions to a GPU device.
+* Identify and assess differences and similarities between the OpenACC and OpenMP offload features.
+* Convert an OpenACC mini-application to OpenMP offloading.
+* Get some highlights of available open-source OpenACC compilers.
+
+
+```{contents} Table of Contents
+```
+
+
+# Introduction
+
+[OpenACC](https://www.openacc.org/tools) and [OpenMP](https://www.openmp.org/updates/openmp-accelerator-support-gpus/) are the most widely used programming models for heterogeneous computing on modern HPC architectures.
OpenACC was developed a decade ago and was designed for parallel programming of heterogeneous systems (i.e. a CPU host and a GPU device), whereas OpenMP was historically directed at shared-memory multi-core programming and has only recently introduced support for heterogeneous systems. OpenACC and OpenMP are directive-based programming models for offloading compute regions from a CPU host to GPU devices. These models are referred to as Application Programming Interfaces (APIs), which enable communication between the two heterogeneous systems and, specifically, enable offloading to target devices. The offloading process is controlled by a set of compiler directives, runtime library routines and environment variables. These components will be addressed in the following for both models, with a special focus on directives and clauses. Furthermore, differences and similarities will be assessed with the aim of converting OpenACC to OpenMP.
+
+*Motivation:* NVIDIA-based programming models are bound by barriers related to the GPU-architecture: such models do not have direct support on devices from other vendors, nor in the corresponding compilers. Removing such barriers is one of the bottlenecks in GPU-programming, and this is the case for OpenACC. The latter is one of the most popular programming models, but it requires special attention in terms of support on the available architectures.
+
+As far as we know, the only compiler that fully supports OpenACC for offloading to both NVIDIA and AMD devices is GCC. GCC's performance, however, suffers from some weaknesses and poses some [challenges](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8639349), which limit its wider adoption. Although the Cray Compilation Environment [(CCE)](https://support.hpe.com/hpesc/public/docDisplay?docId=a00115296en_us&page=OpenACC_Use.html) has full support for OpenACC 2.0 and partial support for OpenACC 2.6, this support is limited to Fortran, with no support for C or C++. This lack of support for OpenACC calls for an alternative that goes beyond the GCC compiler and ensures higher performance. OpenMP offloading, on the other hand, is supported on multiple devices by a set of compilers, such as *Clang/Flang*, *Cray* and *Icx/Ifx*, which are known to provide higher performance than GCC. Therefore, converting OpenACC to OpenMP becomes a necessity to overcome the lack of stable implementations for all relevant hardware vendors, and to extend the OpenACC implementations to cover various GPU-architectures. In this context, there has been a project funded by the [Exascale Computing Project](https://www.exascaleproject.org/highlight/clacc-an-open-source-openacc-compiler-and-source-code-translation-project/) and published [here](https://ieeexplore.ieee.org/document/8639349), which aims at developing an open-source OpenACC compiler. This documentation is inspired by that project and is motivated by the need to document how to translate OpenACC to OpenMP on heterogeneous systems.
+
+
+This documentation is organised as follows. In {ref}`sec. II <computational-model>`,
+we provide a computational model, which is based on solving the Laplace equation.
+{ref}`Section III <comparative-study-openacc-versus-openmp>` is devoted to the analysis
+of experiments performed using the OpenACC and OpenMP offload features and to a
+one-to-one mapping of these two models.
+{ref}`Section IV <open-source-openacc-compilers>` discusses open-source OpenACC
+compilers. Finally, conclusions are given in {ref}`Sec. V <conclusion>`.
+
+
+(computational-model)=
+
+# Computational model
+
+We give a brief description of the numerical model used to solve the Laplace equation Δf=0. For the sake of simplicity, we solve the equation on a two-dimensional (2D) uniform grid according to
+
+
+```{math} \Delta f(x,y)=\frac{\partial^{2} f(x,y)}{\partial x^{2}} + \frac{\partial^{2} f(x,y)}{\partial y^{2}}=0. \ \ \ \ (1)
+```
+Here we use the finite-difference method to approximate the partial derivatives of the form $`\frac{\partial^{2} f(x,y)}{\partial x^{2}}`$. The spatial discretization in the second-order scheme can be written as
+
+```{math} \frac{\partial^{2} f(x,y)}{\partial x^{2}}=\frac{f(x_{i+1},y_{j}) - 2f(x_{i},y_{j}) + f(x_{i-1},y_{j})}{\Delta x^{2}}. \ \ \ \ (2)
+```
+
+Inserting Eq. (2), together with the analogous expression for the $`y`$-derivative, into Eq. (1) and assuming a uniform grid spacing ($`\Delta x=\Delta y`$) leads to the final expression
+
+```{math} f(x_{i},y_{j})=\frac{f(x_{i+1},y_{j}) + f(x_{i-1},y_{j}) + f(x_{i},y_{j+1}) + f(x_{i},y_{j-1})}{4}. \ \ \ \ (3)
+```
+
+Eq. (3) can be solved iteratively by defining some initial conditions that reflect the geometry of the problem at hand. This can be done using either the Gauss–Seidel method or the Jacobi method. Here, we opt for the Jacobi algorithm due to its simplicity. The corresponding compute code is written in *Fortran 90* and is given below in a serial form. Note that a *C*-based code can be found {ref}`here `.
+
+```fortran
+do while (max_err.gt.error.and.iter.le.max_iter)
+   do j=2,ny-1
+      do i=2,nx-1
+         d2fx = f(i+1,j) + f(i-1,j)
+         d2fy = f(i,j+1) + f(i,j-1)
+         f_k(i,j) = 0.25*(d2fx + d2fy)
+      enddo
+   enddo
+
+   max_err=0.
+
+   do j=2,ny-1
+      do i=2,nx-1
+         max_err = max(dabs(f_k(i,j) - f(i,j)),max_err)
+         f(i,j) = f_k(i,j)
+      enddo
+   enddo
+
+   iter = iter + 1
+enddo
+```
+
+(comparative-study-openacc-versus-openmp)=
+
+# Comparative study: OpenACC versus OpenMP
+
+In the following we first provide a short description of GPU accelerators and then perform experiments covering both the OpenACC and OpenMP implementations of the Jacobi algorithm, with the aim of conducting a comparative study between the two programming models. The experiments are systematically performed with a fixed number of grid points (i.e. 8192 points in both the $`x`$ and $`y`$ directions) and a fixed number of iterations that ensures the convergence of the algorithm. This is found to be 240 iterations, resulting in an error of 0.001.
+
+## GPU architecture
+
+We focus in this section on describing the [NVIDIA GPU accelerator](https://images.nvidia.com/content/technologies/volta/pdf/volta-v100-datasheet-update-us-1165301-r5.pdf), as it is among the most powerful accelerators used for artificial intelligence (AI) and high-performance computing (HPC). An NVIDIA GPU-device consists of a set of Streaming Multiprocessors (SMs), each of which is organized as a matrix of CUDA cores, as shown in *Fig. 1*. As an example, the [NVIDIA P100 GPU-accelerators](https://images.nvidia.com/content/tesla/pdf/nvidia-tesla-p100-PCIe-datasheet.pdf) have 56 SMs, each with 64 CUDA cores, for a total of 3584 CUDA cores/GPU, while the [NVIDIA V100](https://images.nvidia.com/content/technologies/volta/pdf/volta-v100-datasheet-update-us-1165301-r5.pdf) has 80 SMs, each with 64 CUDA cores, for a total of 5120 CUDA cores/GPU.
+
+ +![Fig1](figs/fig-hardware.jpg) + +**Fig. 1.** *A simplified representation of a NVIDIA GPU-architecture.* +
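+
+If you want to see which GPU model and how much GPU memory is available on the node you
+are running on, a quick query (a sketch; run on a GPU node, e.g. inside an interactive
+job on the `accel` partition) is:
+```console
+$ nvidia-smi --query-gpu=name,memory.total --format=csv
+```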
+
+Various NVIDIA [GPU-architectures](https://gpltech.com/wp-content/uploads/2018/11/NVIDIA-Turing-Architecture-Whitepaper.pdf) exist. As an example, we present in *Fig. 2* the characteristics of the NVIDIA V100 Volta architecture. As shown in the figure, the peak performance of the NVIDIA Volta depends on the specific variant: V100 PCIe, V100 SXM2 and V100S PCIe, which in turn depends, in particular, on the memory bandwidth. For instance, the double precision performance associated with each variant is, respectively, 7, 7.8 and 8.2 TFLOPS (or TeraFLOPS). Here 1 TFLOPS = $`10^{12}`$ calculations per second, where FLOPS (Floating-Point Operations Per Second) is, in general, a measure of how fast a computer performs arithmetic operations. The peak performance can be calculated theoretically based on the following [expression](https://en.wikipedia.org/wiki/FLOPS#cite_note-en.community.dell.com-5) for a single processor
+
+FLOPS = (Clock speed)$`\times`$(cores)$`\times`$(FLOP/cycle),
+
+ where FLOP here refers to the floating-point format used to encode real numbers (i.e. FP64, FP32, ...). One can check the validity of the expression by calculating, for instance, the peak performance of the V100 PCIe, for which the clock speed (or GPU Boost Clock) is [1.38 GHz](https://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf). The total is (1.38 $`10^9`$ cycles/second) x 5120 cores x (FLOP/cycle), which gives 7.065 $`10^{12}`$ FLOP per second, i.e. 7.065 TFLOPS, in accordance with the peak performance indicated in *Fig. 2*.
+
+ +![Fig2](figs/fig-software.jpg) + +**Fig. 2.** *Specification of the architecture of the NVIDIA Volta GPU taken from [here](https://images.nvidia.com/content/technologies/volta/pdf/volta-v100-datasheet-update-us-1165301-r5.pdf).* +
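+
+As a quick sanity check of the peak-performance arithmetic above (an illustrative sketch,
+taking one FLOP per cycle as in the text), the same number can be reproduced on the
+command line:
+```console
+$ echo "1.38 * 10^9 * 5120 / 10^12" | bc -l
+7.06560000000000000000
+```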
+
+(experiment-on-openacc-offloading)=
+
+## Experiment on OpenACC offloading
+
+We begin by illustrating the functionality of the [OpenACC model](https://www.openacc.org/sites/default/files/inline-files/OpenACC_Programming_Guide_0_0.pdf) in terms of parallelism, which is specified by the directives `kernels` or `parallel loop`. The concept of parallelism is defined precisely by the generic directives `gang`, `worker` and `vector`, as schematically depicted in *Fig. 3*. Here, the compiler initiates the parallelism by generating parallel gangs, in which each gang consists of a set of workers represented by a matrix of threads, as depicted in the inset of *Fig. 3*. This group of threads within a gang executes the same instruction (SIMT, Single Instruction Multiple Threads) via a vectorization process. In other words, a block of loops is assigned to each gang, which gets vectorized and executed by a group of threads. Specifically, each thread executes the same kernel program but operates on different parts of the offloaded region.
+
+By combining the two pictures displayed in *Fig. 1* and *Fig. 3*, one can say that the execution of the parallelism, which is specified by the `parallel loop` construct, is mapped onto the GPU-device in the following way: each streaming multiprocessor is associated with one gang of threads generated by the directive `gang`, to which a block of loops is assigned. In addition, this block of loops is run in parallel on the CUDA cores via the directive `vector`. The description of these directives and of the others implemented in our OpenACC mini-application is summarized in *Table 1*.
+
+ +![Fig3](figs/fig-arch-volta.jpg) + +**Fig. 3.** *Schematic representation of the concept of parallelism (see text for more details).* +
+
+We move now to discuss our OpenACC experiment, in which we evaluate the performance of different compute constructs and clauses and interpret their role. The OpenACC-based code is shown below. On the left-hand side of the code, only the directive `parallel loop` is introduced. Here the construct `parallel` indicates that the compiler will generate a number of parallel gangs to execute the compute region redundantly. When it is combined with the clause `loop`, the compiler will distribute the parallelism over all the generated gangs for the offloaded region. In this case the compiler first copies the data to the device at the beginning of the loop and then copies it back to the host at the end of the loop. This process repeats itself at each iteration, which makes it time consuming, thus rendering the GPU-acceleration inefficient. This inefficiency is shown in *Fig. 4* and manifests as an increase of the computing time: 111.2 s compared to 101.77 s in the serial case. This low performance is also observed when using the construct `kernels`.
+
+To overcome this issue, one needs to copy the data to the device only at the beginning of the iteration process and to copy them back to the host at the end, once the result converges. This can be done by introducing the data-locality concept via the directives `data`, `copyin` and `copyout`, as shown in the code (right-hand side). Here, the clause `copyin` transfers the data to the GPU-device, while the clause `copyout` copies the data back to the host. Implementing this approach shows a vast improvement in performance: the computing time is reduced by almost a factor of 53, decreasing from 111.2 s to 2.12 s. One can further tune the process by adding additional control, for instance by introducing the `collapse` clause. Collapsing two or more loops into a single loop is beneficial for the compiler, as it exposes more parallelism when mapping the compute region onto the device. In addition, one can specify the clause `reduction`, which allows the maximum error to be computed in parallel. These additional clauses slightly affect the computing time: it goes from 2.12 s to 1.95 s.
+
+For completeness, we compare in *Fig. 4* the performance of the compute constructs `kernels` and `parallel loop`. Both directives tell the compiler to transfer control of a compute region to the GPU-device and execute it as a sequence of operations. Although these two constructs have a similar role, they differ in terms of how the parallelism is mapped onto the device. When specifying the `kernels` construct, the compiler itself decides how to partition the parallelism, choosing what it considers the optimal numbers of gangs and workers and the vector length. The `parallel loop` construct, on the other hand, offers additional functionality: it allows the programmer to control the execution on the device by specifying additional clauses. In the end, the performance remains roughly the same, as shown in *Fig. 4*: the computing time is 1.97 s for the `kernels` directive and 1.95 s for the `parallel loop` directive.
+ +```bash + **OpenACC without data locality** | **OpenACC with data locality** + | !$acc data copyin(f) copyout(f_k) + do while (max_err.gt.error.and.iter.le.max_iter) | do while (max_err.gt.error.and.iter.le.max_iter) +!$acc parallel loop gang worker vector | !$acc parallel loop gang worker vector collapse(2) + do j=2,ny-1 | do j=2,ny-1 + do i=2,nx-1 | do i=2,nx-1 + d2fx = f(i+1,j) + f(i-1,j) | d2fx = f(i+1,j) + f(i-1,j) + d2fy = f(i,j+1) + f(i,j-1) | d2fy = f(i,j+1) + f(i,j-1) + f_k(i,j) = 0.25*(d2fx + d2fy) | f_k(i,j) = 0.25*(d2fx + d2fy) +        enddo | enddo +      enddo | enddo +!$acc end parallel | !$acc end parallel + | +      max_err=0. | max_err=0. + | +!$acc parallel loop | !$acc parallel loop collapse(2) reduction(max:max_err) + do j=2,ny-1 | do j=2,ny-1 +        do i=2,nx-1 | do i=2,nx-1 +         max_err = max(dabs(f_k(i,j)-f(i,j)),max_err)| max_err = max(dabs(f_k(i,j)-f(i,j)),max_err) +         f(i,j) = f_k(i,j) | f(i,j) = f_k(i,j) + enddo | enddo + enddo | enddo +!$acc end parallel | !$acc end parallel  + | +       iter = iter + 1 | iter = iter + 1 + enddo | enddo + | !$acc end data +``` + +
+ +![Fig4](figs/fig-acc.jpeg) + +**Fig. 4.** *Performance of different OpenACC directives.* +
+
+```{note}
+- When incorporating the constructs `kernels` or `parallel loop`, the compiler will generate arrays that will be copied back and forth
+  between the host and the device if they are not already present on the device.
+
+- Different gangs operate independently.
+```
+
+
+### Compiling and running OpenACC-program
+
+We run our OpenACC-program on the NVIDIA P100 GPU. The syntax of the compilation process is
+```bash
+$ nvfortran -fast -acc -Minfo=accel -o laplace_acc.exe laplace_acc.f90
+or
+$ nvfortran -acc -gpu=tesla:cc60 -Minfo=accel -o laplace_acc.exe laplace_acc.f90
+```
+where the flags `-acc` and `-gpu=[target]` enable OpenACC directives. The option `[target]` reflects the name of the GPU device: it is set to `[tesla:cc60]` for the Tesla P100 device and `[tesla:cc70]` for the Tesla V100 device. This information can be viewed by running the command `pgaccelinfo`. Last, the flag `-Minfo` enables the compiler to print out feedback messages on optimizations and transformations.
+
+The generated binary (i.e. `laplace_acc.exe`) can be launched with the use of a Slurm script as follows
+```bash
+#!/bin/bash
+#SBATCH --account=<project-number>
+#SBATCH --job-name=laplace_acc
+#SBATCH --partition=accel --gpus=1
+#SBATCH --qos=devel
+#SBATCH --time=00:01:00
+#SBATCH --mem-per-cpu=2G
+#SBATCH -o laplace_acc.out
+
+# load modules
+module purge
+module load NVHPC/21.2
+
+srun ./laplace_acc.exe
+```
+In the script above, the option `--partition=accel` enables access to a GPU device connected to a node, as shown {ref}`here `. One can also use the command `sinfo` to get information about which nodes are connected to the GPUs.
+
+```{note}
+The compilation process requires loading an NVHPC module, e.g. `NVHPC/21.2` or another version.
+```
+
+
+## Experiment on OpenMP offloading
+
+In this section, we carry out an experiment on [OpenMP](https://www.openmp.org/wp-content/uploads/OpenMP-API-Specification-5-1.pdf) offloading by adopting the same scenario as in the previous {ref}`section ` but with the use of a different GPU-architecture: the AMD MI100 accelerator. The functionality of OpenMP is similar to that of OpenACC, although the terminology is different [cf. *Fig. 1*]. In the OpenMP concept, a block of loops is offloaded to a device via the construct `target`. A set of threads is then created on each compute unit (CU) (analogous to a streaming multiprocessor in NVIDIA terminology) [cf. *Fig. 1*] by means of the directive `teams` to execute the offloaded region. Here, the offloaded region (e.g. a block of loops) gets assigned to teams via the clause `distribute`, and gets executed on the processing elements (PEs), also called stream processors (analogous to CUDA cores), by means of the directive `parallel do simd`. These directives define the concept of parallelism in OpenMP.
+
+The concept of parallelism is implemented using the same model described in {ref}`Section II `. The implementation is presented below for two cases: (i) OpenMP without the data directive and (ii) OpenMP with the data directive. This comparison allows us to identify the benefit of data management during the data transfer between the host and a device. This in turn provides some insight into the performance of the OpenMP offload features. On the left-hand side of the OpenMP application, the arrays **f** and **f_k**, which define the main components of the compute region, are copied from the host to the device and back, respectively, via the clause `map`.
Note that specifying the `map` clause in this case is optional. Once the data are offloaded to a device, the parallelism gets executed according to the scenario described above. This scheme repeats itself at each iteration, which causes a low performance as shown in *Fig. 5*. Here the computing time is 119.6 s, which is too high compared to 76.52 s in the serial case. A similar behavior is observed in the OpenACC mini-application. + +The OpenMP performance, however is found to be improved when introducing the directive `data` in the beginning of the iteration. This implementation has the advantage of keeping the data in the device during the iteration process and copying them back to the host only at the end of the iteration. By doing so, the performance is improved by almost a factor of 22, as depicted in *Fig. 5*: it goes from 119.6 s in the absence of the data directive to 5.4 s when the directive is introduced. As in the OpenACC application, the performance can be further tuned by introducing additional clauses, specifically, the clauses `collapse` and `schedule` which are found to reduce the computing time from 5.4 s to 2.15 s. + +The description of the compute constructs and clauses used in our OpenMP mini-application is provided in the *Table 1* together with those of OpenACC. For further OpenMP tutorials, we refer to a different scenario implemented in C, which can be found {ref}`here `. + +```bash + **OpenMP without data directive** | **OpenMP with data directive** + | !$omp target data map(to:f) map(from:f_k) + do while (max_err.gt.error.and.iter.le.max_iter) | do while (max_err.gt.error.and.iter.le.max_iter) +!$omp target teams distribute parallel do simd | !$omp target teams distribute parallel do simd collapse(2) + map(to:f) map(from:f_k) | schedule(static,1) + do j=2,ny-1 | do j=2,ny-1 + do i=2,nx-1 | do i=2,nx-1 + d2fx = f(i+1,j) + f(i-1,j) | d2fx = f(i+1,j) + f(i-1,j) + d2fy = f(i,j+1) + f(i,j-1) | d2fy = f(i,j+1) + f(i,j-1) + f_k(i,j) = 0.25*(d2fx + d2fy) | f_k(i,j) = 0.25*(d2fx + d2fy) +        enddo | enddo +      enddo | enddo +!$omp end target teams distribute parallel do simd | !$omp end target teams distribute parallel do simd + | +      max_err=0. | max_err=0. + | +!$omp target teams distribute parallel do simd | !$omp target teams distribute parallel do simd collapse(2) + reduction(max:max_err) | schedule(static,1) reduction(max:max_err) + do j=2,ny-1 | do j=2,ny-1 +        do i=2,nx-1 | do i=2,nx-1 +         max_err = max(dabs(f_k(i,j)-f(i,j)),max_err)| max_err = max(dabs(f_k(i,j)-f(i,j)),max_err) +         f(i,j) = f_k(i,j) | f(i,j) = f_k(i,j) + enddo | enddo + enddo | enddo +!$omp end target teams distribute parallel do simd | !$omp end target teams distribute parallel do simd + | +       iter = iter + 1 | iter = iter + 1 + enddo | enddo + | !$omp end target data +``` + +
+ +![Fig5](figs/fig-omp.jpg) + +**Fig. 5.** *Performance of different OpenMP directives.* +
+ +### Compiling and running OpenMP-program + +Our OpenMP benchmark test runs on AMD Mi100 accelerator. The syntax of the compilation process can be written in the following form: + +```bash +flang -fopenmp=libomp -fopenmp-targets= -Xopenmp-target= -march= laplace_omp.f90 +``` + +The flag `-fopenmp` activates the OpenMP directives (i.e. !$omp [construct] in Fortran). The option `-fopenmp-targets=` is used to enable the target offloading to GPU-accelerators and tells the Flang compiler to use `=amdgcn-amd-amdhsa` as the AMD target. The `-Xopenmp-target` flag enables options to be passed to the target offloading toolchain. In addition, we need to specify the architecture of the GPU to be used. This is done via the flag `-march=`, where `` specifies the name of the GPU-architecture. This characteristic feature can be extracted from the machine via the command `rocminfo`. For instance, the AMD Mi100 accelerator architecture is specified by the flag `-march=gfx908 amd-arch`.    + +```{note} +The compilation process requires loading a AOMP module, e.g. `AOMP/13.0-2-GCCcore-10.2.0` or a newer version. +``` + + +## Mapping OpenACC to OpenMP + +In this section, we present a direct comparison between the OpenACC and OpenMP offload features. This comparison is illustrated in the code below. A closer look at OpenACC and OpenMP codes reveals some similarities and differences in terms of constructs and clauses. The meaning of these directives is summarized in the *Table 1*. Here, evaluating the behavior of OpenACC and OpenMP by one-to-one mapping is a key feature for an effort of porting OpenACC to OpenMP on heterogeneous systems. Based on this comparison, it is seen that the syntax of both programming models is so similar, thus making the implementation of a translation procedure at the syntactic level straightforward. Therefore, carrying out such a comparison is critical for determining the correct mappings to OpenMP offloading. + +```bash + **OpenACC** | **OpenMP** +!$acc data copyin(f) copyout(f_k) | !$omp target data map(to:f) map(from:f_k) + do while (max_err.gt.error.and.iter.le.max_iter) | do while (max_err.gt.error.and.iter.le.max_iter) +!$acc parallel loop gang worker vector collapse(2) | !$omp target teams distribute parallel do simd collapse(2) + | schedule(static,1) + do j=2,ny-1 | do j=2,ny-1 + do i=2,nx-1 | do i=2,nx-1 + d2fx = f(i+1,j) + f(i-1,j) | d2fx = f(i+1,j) + f(i-1,j) + d2fy = f(i,j+1) + f(i,j-1) | d2fy = f(i,j+1) + f(i,j-1) + f_k(i,j) = 0.25*(d2fx + d2fy) | f_k(i,j) = 0.25*(d2fx + d2fy) +        enddo | enddo +      enddo | enddo +!$acc end parallel | !$omp end target teams distribute parallel do simd + | +      max_err=0. | max_err=0. 
+ | +!$acc parallel loop collapse(2) reduction(max:max_err) | !$omp target teams distribute parallel do simd collapse(2) + | schedule(static,1) reduction(max:max_err) + do j=2,ny-1 | do j=2,ny-1 +        do i=2,nx-1 | do i=2,nx-1 +         max_err = max(dabs(f_k(i,j)-f(i,j)),max_err)| max_err = max(dabs(f_k(i,j)-f(i,j)),max_err) +         f(i,j) = f_k(i,j) | f(i,j) = f_k(i,j) + enddo | enddo + enddo | enddo +!$acc end parallel | !$omp end target teams distribute parallel do simd + | +       iter = iter + 1 | iter = iter + 1 + enddo | enddo +!$acc end data | !$omp end target data +``` + +OpenACC | OpenMP | interpretation | +-- | -- | -- | +acc parallel | omp target teams | to execute a compute region on a device| +acc kernels | No explicit counterpart | - -| +acc parallel loop gang worker vector | omp target teams distribute parallel do simd | to parallelize a block of loops on a device| +acc data | omp target data | to share data between multiple parallel regions in a device| +-- | -- | -- | +acc loop | omp teams distribute | to workshare for parallelism on a device| +acc loop gang | omp teams(num_teams) | to partition a loop across gangs/teams| +acc loop worker | omp parallel simd | to partition a loop across threads| +acc loop vector | omp parallel simd | - - | +num_gangs | num_teams | to control how many gangs/teams are created | +num_workers | num_threads | to control how many worker/threads are created in each gang/teams | +vector_length | No counterpart | to control how many data elements can be operated on | +-- | -- | -- | +acc create() | omp map(alloc:) | to allocate a memory for an array in a device| +acc copy() | omp map(tofrom:) | to copy arrays from the host to a device and back to the host| +acc copyin() | omp map(to:) | to copy arrays to a device| +acc copyout()| omp map(from:) | to copy arrays from a device to the host| +-- | -- | -- | +acc reduction(operator:var)| omp reduction(operator:var) | to reduce the number of elements in an array to one value | +acc collapse(N) | omp collapse(N) | to collapse N nested loops into one loop | +No counterpart | omp schedule(,) | to schedule the work for each thread according to the collapsed loops| +private(var) | private(var) | to allocate a copy of the variable `var` on each gang/teams| +firstprivate | firstprivate | to allocate a copy of the variable `var` on each gang/teams and to initialise it with the value of the local thread| + +**Table 1.** *Description of various directives and clauses: OpenACC vs OpenMP.* + +```{note} +Details about library routines can be found [here](https://gcc.gnu.org/onlinedocs/libgomp/OpenACC-Runtime-Library-Routines.html) for OpenACC and [here](https://www.intel.com/content/www/us/en/develop/documentation/get-started-with-cpp-fortran-compiler-openmp/top.html) for OpenMP. +``` + +(open-source-openacc-compilers)= + +# Open-source OpenACC compilers + + For completeness, we provide in this section some highlights of the available open-source OpenACC compilers. According to the work of [J. Vetter et al.](https://ieeexplore.ieee.org/document/8639349) and the [OpenACC website](https://www.openacc.org/tools), the only open-source compiler that supports OpenACC offloading to NVIDIA and AMD accelerators is GCC 10. Recently, there has been an effort in developing an open-source compiler to complement the existing one, thus allowing to perform experiments on a broad range of architectures. 
The compiler is called [Clacc](https://ieeexplore.ieee.org/document/8639349); its development is funded by the Exascale Computing Project ([Clacc project](https://www.exascaleproject.org/highlight/clacc-an-open-source-openacc-compiler-and-source-code-translation-project/)) and it is further described by [J. Vetter et al.](https://ieeexplore.ieee.org/document/8639349). We thus focus here on providing some basic features of the Clacc compiler platform, without addressing the fundamental aspects of the compiler in depth, which is beyond the scope of this documentation.
+
+Clacc is an open-source OpenACC compiler platform that has support for [Clang](https://clang.llvm.org/) and [LLVM](https://llvm.org/), and aims at facilitating the broad use of GPU-programming. The key idea behind the design of Clacc is to translate OpenACC to OpenMP, taking advantage of the existing OpenMP debugging tools so that they can be re-used for OpenACC. Clacc was designed to mimic the exact behavior of OpenMP as explicitly as possible. The Clacc strategy for interpreting OpenACC is based on a one-to-one mapping of [OpenACC directives to OpenMP directives](https://ieeexplore.ieee.org/document/8639349), as we have already shown in *Table 1* above.
+
+Despite this recent development of the Clacc compiler platform, there is still a major need to further extend the compiler, as it suffers from some limitations, [mainly](https://ieeexplore.ieee.org/document/8639349): (i) in Clacc's design, translating OpenACC to OpenMP in Clang/Flang is currently supported only in C and Fortran, but not yet in C++. (ii) Clacc has so far focused primarily on compute constructs, and thus lacks support for data-sharing between the CPU host and a GPU device. These limitations, however, are expected to be overcome in the near future. So far, Clacc has been tested and benchmarked against a series of different configurations, and it is found to provide acceptable GPU-performance, as stated [here](https://www.exascaleproject.org/highlight/clacc-an-open-source-openacc-compiler-and-source-code-translation-project/). Note that Clacc is publicly available [here](https://github.com/llvm-doe-org/llvm-project/wiki).
+
+
+(conclusion)=
+
+# Conclusion
+
+In conclusion, we have presented an overview of the GPU-architecture as well as of the OpenACC and OpenMP offload features via an application based on solving the Laplace equation on a 2D uniform grid. This benchmark application was used to experiment with the performance of some of the basic directives and clauses, in order to highlight the gain from the use of GPU-accelerators. The performance here was found to be improved by almost a factor of 52. We have also presented an evaluation of differences and similarities between the OpenACC and OpenMP programming models. Furthermore, we have illustrated a one-to-one mapping of OpenACC directives to OpenMP directives with the aim of porting OpenACC to OpenMP. In this context, we have emphasized the recent development of the Clacc compiler platform, which is an open-source OpenACC compiler, although the platform's support is so far limited to C and Fortran and lacks support for host-device data transfer.
+
+Last but not least, writing an efficient GPU-based program requires some basic knowledge of the GPU architecture and of how the regions of a program are mapped onto a target device. This documentation was thus designed to provide such basic knowledge, with the aim of triggering the interest of developers/users in GPU-programming.
It thus functions as a benchmark for future advanced GPU-based parallel programming models. + + +# Relevant links + +[Various NVIDIA GPU-architectures](https://gpltech.com/wp-content/uploads/2018/11/NVIDIA-Turing-Architecture-Whitepaper.pdf). + +[NVIDIA P100 GPU-accelerator](https://images.nvidia.com/content/tesla/pdf/nvidia-tesla-p100-PCIe-datasheet.pdf). + +[NVIDIA V100 GPU-accelerator](https://images.nvidia.com/content/technologies/volta/pdf/volta-v100-datasheet-update-us-1165301-r5.pdf). + +[Detailed description about the NVIDIA V100 GPU-accelerator](https://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf) + +[OpenACC programming guide](https://www.openacc.org/sites/default/files/inline-files/OpenACC_Programming_Guide_0_0.pdf). + +[OpenMP offloading programming guide](https://www.openmp.org/wp-content/uploads/OpenMP-API-Specification-5-1.pdf). + +[OpenACC 2.7 Syntax Reference Guide](https://www.openacc.org/sites/default/files/inline-files/API%20Guide%202.7.pdf). + +[OpenMP 5.0 API Syntax Reference Guide](https://www.openmp.org/wp-content/uploads/OpenMPRef-5.0-111802-web.pdf). + +[OpenACC library routines](https://gcc.gnu.org/onlinedocs/libgomp/OpenACC-Runtime-Library-Routines.html). + +[OpenMP library routines](https://www.intel.com/content/www/us/en/develop/documentation/get-started-with-cpp-fortran-compiler-openmp/top.html). + +[The Clacc compiler platform](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8639349). + +[The Cray Compilation Environment (CCE)](https://support.hpe.com/hpesc/public/docDisplay?docId=a00115296en_us&page=OpenACC_Use.html). + + +```{note} +Users who are interested in porting their applications may contact {ref}`the NRIS GPU team ` for assistance. +``` + + + + + + + diff --git a/_sources/code_development/guides/cuda_translating-tools.md.txt b/_sources/code_development/guides/cuda_translating-tools.md.txt new file mode 100644 index 000000000..743072645 --- /dev/null +++ b/_sources/code_development/guides/cuda_translating-tools.md.txt @@ -0,0 +1,278 @@ +--- +orphan: true +--- + +(cuda2hip0sycl)= +# Translating GPU-accelerated applications + +We present different tools to translate CUDA and OpenACC applications to target various GPU (Graphics Processing Unit) architectures (e.g. AMD and Intel GPUs). A special focus will be given to [`hipify`](https://docs.amd.com/en-US/bundle/HIPify-Reference-Guide-v5.1/page/HIPify.html), [`syclomatic`](https://www.intel.com/content/www/us/en/developer/articles/technical/syclomatic-new-cuda-to-sycl-code-migration-tool.html#gs.o5pj6f) and [`clacc`](https://csmd.ornl.gov/project/clacc). These tools have been tested on the supercomputer [LUMI-G](https://lumi-supercomputer.eu/lumi_supercomputer/) in which the GPU partitions are of [AMD MI250X GPU](https://www.amd.com/en/products/server-accelerators/instinct-mi250x) type. + +The aim of this tutorial is to guide users through a straightforward procedure for converting CUDA codes to HIP and SYCL, and OpenACC codes to OpenMP offloading. By the end of this tutorial, we expect users to learn about: + +- How to use the `hipify-perl` and `hipify-clang` tools to translate CUDA sources to HIP sources. +- How to use the `syclomatic` tool to convert CUDA source to SYCL. +- How to use the `clacc` tool to convert OpenACC application to OpenMP offloading. +- How to compile the generated HIP, SYCL and OpenMP applications. 
+ +```{contents} +:depth: 2 +``` +(cuda2hip)= +## Translating CUDA to HIP with Hipify + +In this section, we cover the use of `hipify-perl` and `hipify-clang` tools to translate a CUDA application to HIP. + +### Hipify-perl + +The `hipify-perl` tool is a script based on perl that translates CUDA syntax into HIP syntax (see .e.g. [here](https://docs.amd.com/en-US/bundle/HIPify-Reference-Guide-v5.1/page/HIPify.html#perl). As an example, in a CUDA code that makes use of the CUDA functions `cudaMalloc` and `cudaDeviceSynchronize`, the tool will replace `cudaMalloc` by the HIP function `hipMalloc`. Similarly for the CUDA function `cudaDeviceSynchronize`, which will be replaced by `hipDeviceSynchronize`. We list below the basic steps to run `hipify-perl` + +- **Step 1**: loading modules + +On LUMI-G, the following modules need to be loaded: + +```console +$module load CrayEnv +``` + +```console +$module load rocm +``` +- **Step 2**: generating `hipify-perl` script + +```console +$hipify-clang --perl +``` +- **Step 3**: running `hipify-perl` + +```console +$perl hipify-perl program.cu > program.cu.hip +``` +- **Step 4**: compiling with `hipcc` the generated HIP code + +```console +$hipcc --offload-arch=gfx90a -o exec_hip program.cu.hip +``` +Despite of the simplicity of the use of `hipify-perl`, the tool might not be suitable for large applications, as it relies heavily on substituting CUDA strings with HIP strings (e.g. it replaces *cuda* with *hip*). In addition, `hipify-perl` lacks the ability of [distinguishing device/host function calls](https://docs.amd.com/bundle/HIPify-Reference-Guide-v5.1/page/HIPify.html#perl). The alternative here is to use `hipify-clang` as we shall describe in the next section. + +(hipify-clang)= +### Hipify-clang + +As described [here](https://docs.amd.com/en-US/bundle/HIPify-Reference-Guide-v5.1/page/HIPify.html#perl), the `hipify-clang` tool is based on clang for translating CUDA sources into HIP sources. The tool is more robust for translating CUDA codes compared to the `hipify-perl` tool. Furthermore, it facilitates the analysis of the code by providing assistance. + +In short, `hipify-clang` requires `LLVM+CLANG` and `CUDA`. Details about building `hipify-clang` can be found [here](https://github.com/ROCm-Developer-Tools/HIPIFY). Note that `hipify-clang` is available on LUMI-G. The issue however might be related to the installation of CUDA-toolkit. To avoid any eventual issues with the installation procedure we opt for CUDA singularity container. Here we present a step-by-step guide for running `hipify-clang`: + +- **Step 1**: pulling a CUDA singularity container e.g. + +```console +$singularity pull docker://nvcr.io/nvidia/cuda:11.4.0-devel-ubuntu20.04 +``` +- **Step 2**: loading a ROCM module before launching the container. + +```console +$ml rocm +``` + +During our testing, we used the rocm version `rocm-5.0.2`. + +- **Step 3**: launching the container + +```console +$singularity shell -B $PWD,/opt:/opt cuda_11.4.0-devel-ubuntu20.04.sif +``` + +where the current directory `$PWD` in the host is mounted to that of the container, and the directory `/opt` in the host is mounted to the that inside the container. 
- **Step 4**: setting the environment variable `$PATH`

In order to run `hipify-clang` from inside the container, set the environment variable `$PATH` so that it includes the directory containing the `hipify-clang` binary:

```console
$export PATH=/opt/rocm-5.0.2/bin:$PATH
```

- **Step 5**: running `hipify-clang`

```console
$hipify-clang program.cu -o hip_program.cu.hip --cuda-path=/usr/local/cuda-11.4 -I /usr/local/cuda-11.4/include
```

Here the CUDA path and the path to the *includes* and *defines* files must be specified. The CUDA source code and the generated output code are `program.cu` and `hip_program.cu.hip`, respectively.

- **Step 6**: the generated HIP code is compiled with the same syntax as described in the previous section (see the `hipify-perl` section).

(cuda2sycl)=
## Translating CUDA to SYCL with Syclomatic

[SYCLomatic](https://github.com/oneapi-src/SYCLomatic) is another conversion tool. However, instead of converting CUDA code to HIP syntax, SYCLomatic converts the code to SYCL/DPC++. Using SYCLomatic requires the CUDA libraries, which can either be installed directly in an environment or extracted from a CUDA container. As in the previous section, we use a Singularity container. Here is a step-by-step guide for using `SYCLomatic`:

**Step 1** Download `SYCLomatic`, e.g. the latest release from [here](https://github.com/oneapi-src/SYCLomatic/releases):

```console
wget https://github.com/oneapi-src/SYCLomatic/releases/download/20230208/linux_release.tgz
```

**Step 2** Decompress the tarball into the desired location:

```console
$tar -xvzf linux_release.tgz -C [desired install location]
```

**Step 3** Add the executable `c2s`, which is located in `[install location]/bin`, to your path, either by setting the environment variable `$PATH`:

```console
$export PATH=[install location]/bin:$PATH
```

or by creating a symbolic link in a local `bin` folder:

```console
$ln -s [install location]/bin/dpct /usr/bin/c2s
```

**Step 4** Launch `SYCLomatic` by running `c2s` from inside a CUDA container. This is similar to steps 1, 3 and 5 in the previous {ref}`section `.

```console
$c2s [file to be converted]
```

This will create a folder in the current directory called `dpct_output`, in which the converted file is generated.

**Step 5** Compile the generated SYCL code

**_Step 5.1_** Look for errors in the converted file

In some cases, `SYCLomatic` might not be able to convert parts of the code. It will then add comments on the parts it is unsure about. For example, these comments might look something like this:
```
/*
  DPCT1003:1: Migrated API does not return error code. (*, 0) is inserted. You
  may need to rewrite this code.
*/
```
Before compiling, these sections need to be checked manually for errors.

**_Step 5.2_**
Once you have a valid file, you can compile it with the SYCL compiler of your choice. There are many such compilers, and the choice depends on the devices you are compiling for. Please consult the [Intel SYCL documentation](https://www.intel.com/content/www/us/en/developer/articles/technical/compiling-sycl-with-different-gpus.html) if you are unsure which compiler to use.

*PS: SYCLomatic generates Data Parallel C++ (DPC++) code instead of pure SYCL code.
This means that you either need to manually convert the DPC++ code to SYCL if you want to use a pure SYCL compiler, or use the Intel oneAPI toolkit to compile the DPC++ code directly.*

**_Compiling pure SYCL code_**
To compile SYCL code on our clusters you need access to a SYCL compiler. On Saga and Betzy this is straightforward and is discussed in this tutorial: [What is SYCL](https://documentation.sigma2.no/code_development/guides/hipsycl.html). At the time of writing, LUMI does not have a global installation of `hipSYCL`, so we must use EasyBuild to get access to it. The guideline for installing `hipSYCL` on LUMI can be found [here](https://lumi-supercomputer.github.io/LUMI-EasyBuild-docs/h/hipSYCL/). We assume that this is done in the path `/project/project_xxxxxxx/EasyBuild`. The following modules can then be loaded:

```
$export EBU_USER_PREFIX=/project/project_xxxxxxx/EasyBuild
$module load LUMI/22.08
$module load partition/G
$module load rocm
$module load hipSYCL/0.9.3-cpeCray-22.08
```
To test `hipSYCL`, the tutorial [mentioned above](https://documentation.sigma2.no/code_development/guides/hipsycl.html) can be used.

### Launching SYCLomatic through a Singularity container

An alternative to the steps mentioned above is to create a Singularity `.def` file (see an example [here](./syclomatic_doc/syclomatic.def)). This is done as follows.

First, build a container image:

_OBS: On most systems you need sudo privileges to build the container. You do not have these on our clusters, so consider building the container locally and then copying it over to the cluster using [scp](https://documentation.sigma2.no/getting_started/getting_started.html#transferring-files) or something similar._

```console
$singularity build syclomatic.sif syclomatic.def
```

Then execute the `SYCLomatic` tool from inside the container:

```console
$singularity exec syclomatic.sif c2s [file to be converted]
```

This will create the same `dpct_output` folder as mentioned in _Step 4_.

(transl-acc-2-omp)=
## Translate OpenACC to OpenMP with Clacc

[`Clacc`](https://github.com/llvm-doe-org/llvm-project/tree/clacc/main) is a tool to translate `OpenACC` to `OpenMP` offloading with the Clang/LLVM compiler environment. As indicated in the [GitHub repository](https://github.com/llvm-doe-org/llvm-project/tree/clacc/main), the `Clacc` compiler is the `clang` executable in the subdirectory `/bin` of the `/install` directory, as described below.

In the following we present a step-by-step guide for building and using [`Clacc`](https://github.com/llvm-doe-org/llvm-project/tree/clacc/main):

**_Step 1.1_**
Load the following modules to be able to build `Clacc` (for LUMI-G):

```console
module load CrayEnv
module load rocm
```
**_Step 1.2_**
Build and install [`Clacc`](https://github.com/llvm-doe-org/llvm-project/tree/clacc/main).
The build process takes about 5 hours.

```console
$ git clone -b clacc/main https://github.com/llvm-doe-org/llvm-project.git
$ cd llvm-project
$ mkdir build && cd build
$ cmake -DCMAKE_INSTALL_PREFIX=../install \
    -DCMAKE_BUILD_TYPE=Release \
    -DLLVM_ENABLE_PROJECTS="clang;lld" \
    -DLLVM_ENABLE_RUNTIMES=openmp \
    -DLLVM_TARGETS_TO_BUILD="host;AMDGPU" \
    -DCMAKE_C_COMPILER=gcc \
    -DCMAKE_CXX_COMPILER=g++ \
    ../llvm
$ make
$ make install
```
**_Step 1.3_**
Set up the environment variables so that you can work from the `/install` directory, which is the simplest approach.
For more advanced usage, which includes for instance modifying `Clacc`, we refer readers to ["Usage from Build directory"](https://github.com/llvm-doe-org/llvm-project/blob/clacc/main/README.md) + +```console +$ export PATH=`pwd`/../install/bin:$PATH +$ export LD_LIBRARY_PATH=`pwd`/../install/lib:$LD_LIBRARY_PATH +``` +**_Step 2_** +To compile the ported `OpenMP` code, one needs first to load these modules: + +```console +module load CrayEnv +module load PrgEnv-cray +module load craype-accel-amd-gfx90a +module load rocm +``` +**_Step 2.1_** +Compile & run an `OpenACC` code on a CPU-host: +```console +$ clang -fopenacc openACC_code.c && ./executable +``` +**_Step 2.2_** +Compile & run an `OpenACC` code on AMD GPU: +```console +$ clang -fopenacc -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx90a openACC_code.c && ./executable +``` +**_Step 2.3_** +Source to source mode with `OpenMP` port printed out to the console: +```console +$ clang -fopenacc-print=omp OpenACC_code.c +``` +**_Step 3_** +Compile the code with the [`cc` compiler wrapper](https://docs.lumi-supercomputer.eu/development/compiling/prgenv/) +```console +cc -fopenmp -o executable OpenMP_code.c +``` + +# Conclusion + +We have presented an overview of the usage of available tools to convert CUDA codes to HIP and SYCL, and OpenACC codes to OpenMP offloading. In general the translation process for large applications might cover about 80% of the source code and thus requires manual modification to complete the porting process. It is however worth noting that the accuracy of the translation process requires that applications are written correctly according to the CUDA and OpenACC syntaxes. + +# Relevant links + +[Hipify GitHub](https://github.com/ROCm-Developer-Tools/HIPIFY) + +[HIPify Reference Guide v5.1](https://docs.amd.com/en-US/bundle/HIPify-Reference-Guide-v5.1/page/HIPify.html) + +[HIP example](https://github.com/olcf-tutorials/simple_HIP_examples/tree/master/vector_addition) + +[Porting CUDA to HIP](https://www.admin-magazine.com/HPC/Articles/Porting-CUDA-to-HIP) + +[SYCLomatic Github](https://github.com/oneapi-src/SYCLomatic) + +[Installing SYCLamatic](https://github.com/oneapi-src/SYCLomatic/releases) + +[Clacc Main repository README](https://github.com/llvm-doe-org/llvm-project/blob/clacc/main/README.md) diff --git a/_sources/code_development/guides/dask.md.txt b/_sources/code_development/guides/dask.md.txt new file mode 100644 index 000000000..5cf3297f6 --- /dev/null +++ b/_sources/code_development/guides/dask.md.txt @@ -0,0 +1,331 @@ +--- +orphan: true +--- + +(dask-tutorial)= + +# Using Dask to scale your Python program +Dask is a Python library that allows you to scale your existing Python code for optimal use on HPC systems. More information on Dask can be found [here](https://dask.org/). Here, we demostrate a simple example of how the Dask `delayed` function can be used to parallelize your code and how to create a Dask cluster using the `SLURMCluster`. + +```{contents} Table of Contents +``` +## Parallelize your code with dask.delayed + +### dask.delayed +This example is adapted from the Dask documentation on dask.delayed found [here](https://docs.dask.org/en/stable/delayed.html). + +Imagine you have a large amount of data that needs to be processed before you can do any analysis. If the data can be separated into smaller chuncks that can be processed independently, you can use the `dask.delayed` function to parallelize your code. 
In this example we consider such a scenario where we have a list of data (`all_data`) that can be processed independently. Using a for-loop, we process all the data independently using the function `increment_data`. Once the for-loop is complete, we do the final analysis using the function `do_analysis`. +```{code-block} python +def increment_data(data): + return data + 1 + +def do_analysis(result): + return sum(result) + +all_data = [1, 2, 3, 4, 5] +results = [] +for data in all_data: + data_incremented = increment_data(data) + results.append(data_incremented) + +analysis = do_analysis(results) +``` + +We will now parallize this code by using `dask.delayed` as a decorator to turn the function `increment_data` into a delayed function. The function will now behave *lazy* and return a `Delayed` object instead of the actual result. The `Delayed` object holds your function and its arguments in order to run it in parallel later using a Dask cluster. The actual computation is delayed until the `compute` method is called, here done by calling `analysis.compute()`. + +```{code-block} python +--- +emphasize-lines: 1, 3, 16, 17, 18 +--- +import dask + +@dask.delayed +def increment_data(data): + return data + 1 + +def do_analysis(result): + return sum(result) + +all_data = [1, 2, 3, 4, 5] +results = [] +for data in all_data: + data_incremented = increment_data(data) + results.append(data_incremented) + +analysis = dask.delayed(do_analysis)(results) # dask.delayed can also be used in + # in this manner +final_result = analysis.compute() # execute all delayed functions +print(f"and the final result is: {final_result}") +``` + +### SLURMCluster and dask-jobqueue +```{warning} +Different HPC clusters often operate with different policies on how to queue jobs. For example, Fram allocate whole nodes to jobs while Saga allocates cores. Sometimes the default configuration of the `SLURMCluster` fits badly with the policy for a given HPC cluster. This is the case for Fram: the `SLURMCluster` needs to know how much memory each worker requires (the `memory` argument when initiating the class) but it will also pass the `--mem` argument to Slurm when initiating a worker. The `--mem` argument is not supported on Fram and will cause the job to fail. To avoid this, use the `job_directives_skip` argument when initiating the SLURMCluster class to specify which Slurm arguments should be skipped. For Fram users, see the example tested on Fram in the section on installing Dask in a virtual environment below. +``` +Next, we will use the `SLURMCluster` class from the `dask-jobqueue` package. This class is used to create a Dask cluster by deploying Slurm jobs as Dask workers. The class takes arguments needed to queue a *single* Slurm job/worker, not the characteristics of your computation as a whole. The arguments are similar to the #SBATCH commands in a Slurm script. Using the `scale` method, we can scale the cluster to the desired number of workers. This example is tested on *Saga*. Remember to replace the Slurm parameters with your own and make sure the Slurm commands are suitable for the HPC system you are using. 
+ +```{code-block} python +--- +emphasize-lines: 5, 6, 7,8,9,10,12, 14, 33, 34 +--- +import dask +from dask.distributed import Client +from dask_jobqueue import SLURMCluster + +cluster = SLURMCluster(cores=1, + processes=1, + memory="500M", + walltime="00:05:00", + project="nn9999k", + interface='ib0') + +cluster.scale(5) # scale cluster to 5 workers + +client = Client(cluster) # connect to the cluster + +@dask.delayed +def increment_data(data): + return data + 1 + +def do_analysis(result): + return sum(result) + +all_data = [1, 2, 3, 4, 5] +results = [] +for data in all_data: + data_incremented = increment_data(data) + results.append(data_incremented) + +analysis = dask.delayed(do_analysis(results)) +final_result = analysis.compute() +print(f"and the final result is: {final_result}") + +cluster.close() # shutdown the cluster +client.close() # shutdown the client +``` + +Here, we configured each worker to have 1 core, 500 MB of memory and a walltime of 5 minutes. Using `cluster.scale(5)`, we scaled the the cluster to contain 5 workers. running `squeue -u "username"` after executing your main Slurm script will show that 5 additional Slurm jobs that were created. The figure below shows the task graph created by Dask for this specific Python example. +![task graph using DASK](dask_taskgraph.png) + +## Executing your Python script on the HPC system +First, find available versions of Dask on the HPC system. If there is no Dask installed globally on the cluster, install Dask yourself in a virtual environment. If this is the case, you can skip to the following section on {ref}`venv_and_visualizing`. + +```console +$ module spider dask +``` +Then load the module for the version you want to use: +```console +$ module load dask/your_version +``` + +An example Slurm job script *tested on Saga* is found below. Remember to replace the Slurm parameters with your own and make sure the Slurm commands are suitable for the HPC system you are using. + +```{code-block} bash +--- +emphasize-lines: 15 +--- +#!/bin/bash + +#SBATCH --account=nn9999k +#SBATCH --job-name=dask_example +#SBATCH --ntasks=1 +#SBATCH --mem-per-cpu=1GB +#SBATCH --time=0-00:10:00 + +## Recommended safety settings: +set -o errexit # Make bash exit on any error +set -o nounset # Treat unset variables as errors + +# Loading Software modules +module --quiet purge # Restore loaded modules to the default +module load dask/your_version +module list + +python dask_example.py +exit 0 +``` + +```{note} +It is possible to manually configure a distributed Dask cluster without using the `SLURMCluster` class. This is more advance use of Dask and is not covered in this tutorial. The benefit of this approach is that workers are created inside a main job instead of spawning individual Slurm jobs, thus your calculations are confined to one Slurm job. More information can be found [here](https://docs.dask.org/en/stable/deploying-python-advanced.html). +``` + +(venv_and_visualizing)= +## Installing Dask in a virtual environment and visualizing the task graph + +### Installing Dask in a virtual environment +Installing Dask in a virtual environments enables you to use your preferred version of Dask and to install optional dependencies. Dask can be installed in a virtual environment using `pip`. More information about virtual environments and installing Python packages can be found here: {ref}`installing-python-packages`. Dask-jobqueue has to be installed together with Dask if you are using `SLURMCluster`. 
+```console +$ module load Python/your_version +$ python -m venv my_new_pythonenv +$ source my_new_pythonenv/bin/activate +$ python -m pip install dask dask-jobqueue +``` +Below you find an example slurm job *tested on Fram*. Remember to replace the Slurm parameters with your own and make sure the Slurm commands are suitable for the HPC system you are using. +```{code-block} bash +--- +emphasize-lines: 15, 16, 17 +--- +#!/bin/bash + +#SBATCH --account=nn9999k +#SBATCH --job-name=dask_example +#SBATCH --ntasks=1 +#SBATCH --time=0-00:10:00 + +## Recommended safety settings: +set -o errexit # Make bash exit on any error +set -o nounset # Treat unset variables as errors + +module --quiet purge +module load Python/your_version + +export PS1=\$ +source my_new_pythonenv/bin/activate # replace my_new_pythonenv with the name + # of your virtual environment + +# It is also recommended to to list loaded modules, for easier debugging +module list + +python dask_example.py +exit 0 +``` + +In the case of Fram, you need to use the `job_directives_skip` argument when configuring the SLURMCluster, if not the job will fail due to forbidden arguments being passed to Slurm. Pass the arguments you want to skip as a list to the `job_directives_skip` argument. Below is an example of the python code tested on Fram. Note that here `project` is replaced with `account`, as `project` is deprecated and replaced with `account` in newer versions of Dask-jobqueue. +```{code-block} python +--- +emphasize-lines: 8 +--- +import dask +from dask.distributed import Client +from dask_jobqueue import SLURMCluster + +cluster = SLURMCluster(cores=1, + processes=1, + memory="500M", + job_directives_skip=['--mem'], + walltime="00:05:00", + account="nn9999k", + interface='ib0') + +cluster.scale(5) # scale cluster to 5 workers + +client = Client(cluster) # connect to the cluster + +@dask.delayed +def increment_data(data): + return data + 1 + +def do_analysis(result): + return sum(result) + +all_data = [1, 2, 3, 4, 5] +results = [] +for data in all_data: + data_incremented = increment_data(data) + results.append(data_incremented) + +analysis = dask.delayed(do_analysis(results)) +final_result = analysis.compute() + +cluster.close() # shutdown the cluster +client.close() # shutdown the client +``` + +### Visualizing the task graph + +Here we install Dask in a virtual environment together with the Graphviz library, which is an optional dependency for needed for visualizing the task graph. You need both the Graphviz system library *and* the Graphviz Python library installed. You can load the Graphviz system library using the `module load` command if it is installed globally. + +First, find available versions of graphviz on the HPC system: +```console +$ module spider graphviz +``` +Then load the module for the version you want to use: +```console +$ module load graphviz/your_version +``` +If the Graphviz module is not installed globally on the HPC system, you can install it yourself using EasyBuild. More information about EasyBuild and how to load manually installed software can be found here: {ref}`easybuild`. + +Now you can create a virtual environment and install Dask, Graphviz and, if you are using `SLURMCluster`, Dask-jobqueue. Here, we will use `pip`. More information about virtual environments and installing Python packages can be found here: {ref}`installing-python-packages`. 
+```console +$ python -m venv my_new_pythonenv +$ source my_new_pythonenv/bin/activate +$ python -m pip install dask graphviz dask-jobqueue +``` + +```{warning} +If you are using a virtual environment, you need to make sure that the virtual environment is created with the same Python version that the Graphviz module uses. For example, if you are using Graphviz/2.47.2-GCCcore-10.3.0, you need to create the virtual environment with Python 3.9.5. To find the correct Python version, load Graphviz with `module load graphviz/your_version` then run `module list` and look for the Python version which was just loaded with Graphviz. If you create the virtual environment straight after loading Graphviz, the correct Python version will be used. +``` +Below you find an example slurm job and python script using Graphviz *tested on Saga*. Remember to replace the Slurm parameters with your own and make sure the Slurm commands are suitable for the HPC system you are using. + +```{code-block} bash +--- +emphasize-lines: 14, 16, 17, 18 +--- +#!/bin/bash + +#SBATCH --account=nn9999k +#SBATCH --job-name=dask_example +#SBATCH --ntasks=1 +#SBATCH --mem-per-cpu=1GB +#SBATCH --time=0-00:10:00 + +## Recommended safety settings: +set -o errexit # Make bash exit on any error +set -o nounset # Treat unset variables as errors + +module --quiet purge +module load Graphviz/your_version # replace with the version you want to use + +export PS1=\$ +source my_new_pythonenv/bin/activate # replace my_new_pythonenv with the name + # of your virtual environment + +# It is also recommended to to list loaded modules, for easier debugging +module list + +python dask_example.py +exit 0 +``` + +You can now safely include the `visualize` function in your script, which is the function which will produce an image of the task graph. + +```{code-block} python +--- +emphasize-lines: 9, 10, 32 +--- +import dask +from dask.distributed import Client +from dask_jobqueue import SLURMCluster + +cluster = SLURMCluster(cores=1, + processes=1, + memory="500M", + walltime="00:05:00", + account="nn9999k", # NB: in newer versions of dask-jobqueue, "project" + # has been renamed to "account" + interface='ib0') +cluster.scale(5) + +client = Client(cluster) + +@dask.delayed +def increment_data(data): + return data + 1 + +def do_analysis(result): + return sum(result) + +all_data = [1, 2, 3, 4, 5] +results = [] +for data in all_data: + data_incremented = increment_data(data) + results.append(data_incremented) + +analysis = dask.delayed(do_analysis)(results) +final_result = analysis.compute() + +analysis.visualize(filename="visualize_taskgraph.svg") + +cluster.close() +client.close() +``` diff --git a/_sources/code_development/guides/external_libraries.md.txt b/_sources/code_development/guides/external_libraries.md.txt new file mode 100644 index 000000000..065e695f6 --- /dev/null +++ b/_sources/code_development/guides/external_libraries.md.txt @@ -0,0 +1,367 @@ +--- +orphan: true +--- + +# Calling GPU accelerated libraries + +One of the best ways to get the benefit of GPU acceleration is to call an +external library that is already accelerated. All of the major GPU hardware +vendors create such libraries and the advantage of their use is that you will +get the best performance possible for the available hardware. 
Examples of GPU +accelerated libraries include BLAS (Basic Linear Algebra Subprograms) libraries +such as [`cuBLAS` from Nvidia](https://developer.nvidia.com/cublas), [`rocBLAS` +from AMD](https://rocblas.readthedocs.io/en/latest/) and [`oneMKL` from +Intel](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/api-based-programming/intel-oneapi-math-kernel-library-onemkl.html). + +One challenge with calling an external library is related to its integration +with user accelerated code and how to compile the code so that everything is +linked. To address these issues this tutorial will go through: +- How to call different GPU accelerated libraries from both C/C++ and Fortran. +- How to combine external accelerated libraries and custom offloading code. + - Focusing on OpenACC and OpenMP offloading +- How to compile your code so that the external libraries are linked. + +```{contents} +:depth: 2 +``` + +(cublas_openacc)= +## Calling `cuBLAS` from OpenACC + +> The BLAS (Basic Linear Algebra Subprograms) are routines that provide +> standard building blocks for performing basic vector and matrix operations. - +> [netlib](https://www.netlib.org/blas/) + +As noted in the introduction to this tutorial, all of the major GPU hardware +vendors provide specialised BLAS routines for their own hardware. These +libraries offers the best in class performance and thanks to the shared +interface, one can easily abstract over multiple libraries from different +vendors. Here we will show how to integrate OpenACC with [`cuBLAS` from +Nvidia](https://developer.nvidia.com/cublas). The `cuBLAS` library is a BLAS +implementation for Nvidia GPUs which is compatible with the hardware found on +Saga and Betzy. + +As an example we will use `cuBLAS` to perform a simple vector addition and then +calculate the sum of the vector in our own custom loop. The example allows us +to show how to combine `cuBLAS` and OpenACC, and our recommendation is to +always use BLAS libraries when performing mathematical computations. + +```{eval-rst} +.. literalinclude:: external_libraries/cublas/openacc.c + :language: c +``` +```{eval-rst} +:download:`cublas_openacc.c <./external_libraries/cublas/openacc.c>` +``` + +The main focus of our changes are in the following lines, where we call the +SAXPY routine within the already established OpenACC data region. + +```{eval-rst} +.. literalinclude:: external_libraries/cublas/openacc.c + :language: c + :lines: 46-67 +``` + +In the above section one can see that we first create an OpenACC data region +(`#pragma acc data`) so that our compute vectors are available on the GPU +device. Within this region we would normally have accelerated loops that do +calculations on the data, but when integrating with `cuBLAS` we only need the +address of the memory (`#pragma acc host_data`). After the SAXPY routine is +called we use the data to calculate the sum as a normal OpenACC kernel. + +Combining `cuBLAS` and OpenACC in this manner allows us to call accelerated +libraries without having to perform low-level memory handling as one would +normally do with such a library. + +--- + +To compile this code we will first need to load a few modules. 
+ +`````{tabs} +````{group-tab} Saga + +```console +$ module load NVHPC/21.11 CUDA/11.4.1 +``` +```` +````{group-tab} Betzy + +```console +$ module load NVHPC/21.7 CUDA/11.4.1 +``` +```` +````` + +We first load `NVHPC` which contains the OpenACC C compiler (`nvc`), then we +load `CUDA` which contains the `cuBLAS` library which we will need to link to. + +To compile we can use the following: + +`````{tabs} +````{group-tab} Saga + +```console +$ nvc -acc -Minfo=acc -gpu=cc60 -lcublas -o cublas_acc cublas_openacc.c +``` +```` +````{group-tab} Betzy + +```console +$ nvc -acc -Minfo=acc -gpu=cc80 -lcublas -o cublas_acc cublas_openacc.c +``` +```` +````` + +Finally, we can run the program using the `srun` command which works on both +Saga and Betzy: + +```console +$ srun --account=nnk --ntasks=1 --time=02:00 --mem=1G --partition=accel --gpus=1 ./cublas_acc +srun: job queued and waiting for resources +srun: job has been allocated resources +Starting SAXPY + OpenACC program + Initializing vectors on CPU + Creating cuBLAS handle + Starting calculation + Calculation produced the correct result of '4 * 10000 == 40000'! +Ending SAXPY + OpenACC program +``` + +(cublas_openmp)= +## Calling `cuBLAS` from OpenMP offloading + +OpenMP support offloading to GPUs in the same way as OpenACC. We will therefore +use the same example as above, but this time use OpenMP's offloading +capabilities. + +Since the program has not changed much from above we have highlighted the major +differences from the OpenACC version. + +```{eval-rst} +.. literalinclude:: external_libraries/cublas/omp.c + :language: c + :emphasize-lines: 7,46,54,63 +``` +```{eval-rst} +:download:`cublas_omp.c <./external_libraries/cublas/omp.c>` +``` + +As can be seen in the code above, our interaction with the `cuBLAS` library did +not have to change, we only had to change the directives we used to make the +compute vectors available. As with OpenACC, in OpenMP we start by creating a +data region to make our compute vectors accessible to the GPU (done with +`#pragma omp target data map(...)`). We then make the pointers to this data +available for our CPU code so that we can pass valid pointers to `cuBLAS` +(pointers made available with `#pragma omp target data use_device_ptr(...)`). +Finally we show that we can also use the vectors we uploaded in custom +offloading loops. + +--- + +To compile the above OpenMP code we first need to load the necessary modules: + +`````{tabs} +````{group-tab} Saga + +```console +$ module load Clang/13.0.1-GCCcore-11.2.0-CUDA-11.4.1 +``` + +Since the GPUs on Saga are a couple of generation older we can't use `NVHPC` +for OpenMP offloading. We instead use `Clang` to show that it works on Saga as +well. +```` +````{group-tab} Betzy + +```console +$ module load NVHPC/21.7 CUDA/11.4.1 +``` +```` +````` + +And then we compile with: + +`````{tabs} +````{group-tab} Saga + +```console +$ clang -o cublas_omp cublas_omp.c -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target=nvptx64-nvidia-cuda -march=sm_60 -lcublas +``` + +Since the GPUs on Saga are a couple of generation older we can't use `NVHPC` +for OpenMP offloading. We instead use `Clang` to show that it works on Saga as +well. 
+```` +````{group-tab} Betzy + +```console +$ nvc -mp=gpu -Minfo=mp -gpu=cc80 -lcublas -o cublas_omp cublas_omp.c +``` +```` +````` + +Finally we can run the program with the following call to `srun` (note that +this call works on both Saga and Betzy): + +```console +$ srun --account=nnk --ntasks=1 --time=02:00 --mem=1G --partition=accel --gpus=1 ./cublas_omp +srun: job queued and waiting for resources +srun: job has been allocated resources +Starting SAXPY + OpenMP offload program + Initializing vectors on CPU + Creating cuBLAS handle + Starting calculation + Calculation produced the correct result of '4 * 10000 == 40000'! +Ending SAXPY + OpenMP program +``` + +(cufft_openacc)= +## Calling `cuFFT` from OpenACC + +(summary)= +### Summary + +In this section we provide an overview on how to implement a GPU-accelerated library FFT (Fast Fourier Transform) in an OpenACC application and the serial version of the FFTW library. Here we distinguish between two GPU-based FFT libraries: [`cuFFT`](https://docs.nvidia.com/cuda/cufft/index.html) and [`cuFFTW`](https://docs.nvidia.com/cuda/cufft/index.html#fftw-supported-interface). The `cuFFT` library is the NVIDIA-GPU based design, while `cuFFTW` is a porting version of the existing [`FFTW`](https://www.fftw.org/) library. In this tutorial, both libraries will be addressed with a special focus on the implementation of the `cuFFT` library. Specifically, the aim of this tutorial is to: + +* Show how to incorporate the `FFTW` library in a serial code. +* Describe how to use the `cuFFTW` library. +* Show how to incorporate the `cuFFT` library in an OpenACC application interface. +* Describe briefly how to enable `cuFFT` to run on OpenACC stream. +* Describe the compilation process of `FFTW` and `cuFFT`. + +The implementation will be illustrated for a one-dimensional (1D) scenario and will be further described for 2D and 3D cases. + +(generality-fft)= +### Generality of FFT + +In general, the implementation of an FFT library is based on three major steps as defined below: + +- Creating plans (initialization). + +- Executing plans (create a configuration of a FFT plan having a specified dimension and data type). + +- Destroying plans (to free the ressources associated with the FFT plans). + +These steps necessitate specifying the direction, in which the FFT algorithm should be performed: forward or backward (or also inverse of FFT), and the dimension of the problem at hands as well as the precision (i.e. double or single precision); this is in addition to the nature of the data (real or complex) to be transformed. + +In the following, we consider a one-dimensional (1D) scenario, in which the execution is specified for a double precision complex-to-complex transform plan in the forward and backward directions. The implementation is illustrated via a Fortran code. The latter can be adjusted to run calculations of a single precision as well as of real-to-real/complex transform and can be further extended to multi-dimension cases (i.e. 2D and 3D). We first start with the FFT implementation in a serial-CPU scheme and further extend it to a GPU-accelerated case. The implementation is illustrated for a simple example of a function defined in time-domain. Here we choose a sinus function (i.e. f(t)=sin(ωt) with ω is fixed at the value 2), and its FFT should result in a peak around the value ω=2 in the frequency domain. 
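As a minimal sketch of these three steps, the skeleton below shows how a 1D double-precision complex-to-complex forward transform is set up. This is an illustrative example assuming the legacy FFTW Fortran interface and a made-up grid; the complete `fftw_serial.f90` used in this tutorial is listed in the next section.

```fortran
! Illustrative skeleton only: 1) create plan -> 2) execute plan -> 3) destroy plan.
! The complete example used in this tutorial is listed in the next section.
program fft_sketch
  implicit none
  include "fftw3.f"              ! defines FFTW_FORWARD, FFTW_BACKWARD, FFTW_ESTIMATE, ...
  integer, parameter :: n = 1024
  double complex     :: in(n), out(n)
  integer*8          :: plan
  integer            :: i

  do i = 1, n                    ! time-domain signal f(t) = sin(2t) on an illustrative grid
     in(i) = dcmplx(sin(2.0d0*0.01d0*dble(i-1)), 0.0d0)
  end do

  call dfftw_plan_dft_1d(plan, n, in, out, FFTW_FORWARD, FFTW_ESTIMATE)  ! 1) create plan
  call dfftw_execute_dft(plan, in, out)                                  ! 2) execute plan
  call dfftw_destroy_plan(plan)                                          ! 3) destroy plan
end program fft_sketch
```

The backward (inverse) transform is obtained by passing `FFTW_BACKWARD` instead of `FFTW_FORWARD`, and, as noted below, its output must be normalised by dividing by the size of the data.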
+ +(implementation-fftw)= +### Implementation of `FFTW` + +The implementation of the `FFTW` library is shown below and a detailed description of the library can be found [here](https://www.fftw.org/). + +As described in the code, one needs to initialize the FFT by creating plans. Executing the plans requires specifying the transform direction: *FFTWFORWARD* for the forward direction or *FFTWBACKWARD* for the backward direction (inverse FFT). These two parameters should be defined as an integer parameter. An alternative is to include the `fftw3.f` file as a header (i.e. `include "fftw3.f"`), which contains all parameters required for a general use of `FFTW`. In the case the file is included, the value of the direction parameter does not need to be defined. + +The argument *FFTW_MEASURE* in the function `dfftw_plan_dft_1d` means that `FFTW` measures the execution time of several FFTs in order to find the optimal way to compute the FFT, which might be time-consuming. An alternative is to use *FFTW_ESTIMATE*, which builds a reasonable plan without any computation. This procedure might be less optimal (see [here](https://www.fftw.org/) for further details). + +Note that when implementing the `FFTW` library, the data obtained from the backward direction need to be normalized by dividing the output array by the size of the data, while those of forward direction do not. This is only valid when using the `FFTW` library. + +To check the outcome of the result in the forward direction, one can plot the function in the frequency-domain, which should display a peak around the value ω=+2 and -2 as the function is initially symmetric. By performing the backward FFT of the obtained function, one should obtain the initial function displayed in time-domain (i.e. sin(2t)). This checking procedure holds also when implementing a GPU version of the FFT library. + +For completeness, porting the `FFTW` library to [`cuFFTW`](https://docs.nvidia.com/cuda/cufft/index.html#fftw-supported-interface) does not require modifications in the code - it is done by replacing the file `fftw3.h` with `cufftw.h`. + +```{eval-rst} +.. literalinclude:: external_libraries/fft/fftw_serial.f90 + :language: fortran +``` +```{eval-rst} +:download:`fftw_serial.f90 <./external_libraries/fft/fftw_serial.f90>` +``` + +(compilation-process-fftw)= +### Compilation process of `FFTW` + +The `FFTW` library should be linked with fftw3 (i.e. `-lfftw3`) for the double precision, and fftw3f (i.e. `-lfftw3f`) for the single precision case. + +Here is an example of a module to be loaded. + +On Saga: +```console +$ module load FFTW/3.3.9-intel-2021a +``` +The same module is available on Betzy. + +To compile: +```console +$ ifort -lfftw3 -o fftw.serial fftw_serial.f90 +``` + +In the case of using the `cuFFTW` library, the linking in the compilation syntaxt should be provided for both `cuFFT` and `cuFFTW` libraries. + +(implementation-cufft)= +### Implementation of `cuFFT` + +We consider the same scenario as described in the previous section but this time the implementation involves the communication between a CPU-host and GPU-device by calling the `cuFFT` library. The `cuFFT` implementation is shown below. + +Similarly to the `FFTW` library, the implementation of the GPU-accelerated `cuFFT` library is conceptually based on creating plans, executing and destroying them. The difficulty here however is how to call the `cuFFT` library, which is written using a low-level programming model, from an OpenACC application interface. 
In this scenario, there are steps that are executed by the `cuFFT` library and other steps are executed by OpenACC kernels. Executing all these steps requires sharing data. In other words, it requires making OpenACC aware of the GPU-accelerated `cuFFT` library. This is done in OpenACC by specifying the directive `host_data` together with the clause `use_device(list-of-arrays)`. This combination permits to access the device address of the listed arrays in the `use_device()` clause from the [host](https://www.nvidia.com/docs/IO/116711/OpenACC-API.pdf). The arrays, which should be already present on the device memory, are in turn passed to the `cuFFT` functions (i.e. `cufftExecZ2Z()` in our example). The output data of these functions is not normalized, and thus it requires to be normalized by dividing by the size of the array. The normalisation may be followed by the function `cufftDestroy()` to free all GPU resources associated with a `cuFFT` plan and destroy the internal plan data structure. + +It is worth noting that the `cuFFT` library uses CUDA streams for an asynchronous execution, which is not the case for OpenACC. It is therefore necessary to make the `cuFFT` runs on OpenACC streams. This is done by calling the routine `cufftSetStream()`, which is part of the `cuFFT` module. The routine includes the function `acc_get_cuda_stream()`, which enables identifying the CUDA stream. + +Note that the use of the OpenACC runtime routines and the `cuFFT` routines requires including the header lines `use openacc` and `use cufft`. + +The tables below summarize the calling functions in the case of a multi-dimension data having a simple or double complex data type (see [here](https://docs.nvidia.com/hpc-sdk/compilers/fortran-cuda-interfaces/index.html) for more details). + +```{eval-rst} +.. literalinclude:: external_libraries/fft/cufft_acc.f90 + :language: fortran +``` +```{eval-rst} +:download:`cufft_acc.f90 <./external_libraries/fft/cufft_acc.f90>` +``` + +Dimension| Creating a FFT plan| +--- |--- | +1D | cufftPlan1D( plan, nx, FFTtype,1) | +2D | cufftPlan2d( plan, ny, nx, FFTtype) | +3D | cufftPlan3d( plan, nz, ny, nx, FFTtype) | + +**Table 1.** *Creating FFT plans in 1D, 2D and 3D dimensions. nx is the size of a 1D array, nx and ny the size of a 2D array, and nx, ny, nz define the size of a 3D array. The FFTtype specifies the data type stored as described in the **Table 2**.* + + +Precision of the transformed plan | subroutine | FFTtype | +--- | --- | --- | +Double precision complex-to-complex | cufftExecZ2Z( plan, in, out, direction ) | ”CUFFT_Z2Z” | +Single precision complex-to-complex | cufftExecC2C( plan, in, out, direction ) | ”CUFFT_C2C” | + +**Table 2.** *Executing a double precision/single-precision complex-to-complex transform plan in a FFT direction to be specified: “CUFFT_FORWARD” for forward FFT and “CUFFT_INVERSE” for backward FFT. The input data are stored in the array **in**, and the results of FFT for a specific direction are stored in the array **out**.* + +(compilation-process-cufft)= +### Compilation process of `cuFFT` + +The `cuFFT` library is part of the CUDA toolkit, and thus it is supported by the NVIDIA-GPU compiler. Therefore, the only modules are required to be load are NVHPC and CUDA modules. + +Modules to be loaded: + +`````{tabs} +````{group-tab} Saga + +```console +$ module load NVHPC/21.11 CUDA/11.4.1 +``` +```` +````{group-tab} Betzy + +```console +$ module load NVHPC/21.7 CUDA/11.4.1 +``` +```` +````` + +We compile using the NVIDIA Fortran compiler `nvfortran`. 
The compilation process requires linking the `cuFFT` library (`-lcufft`) and adding the CUDA version library to the syntax of the compilation (`-cudalib=cufft`). + +```console +$ nvfortran -lcufft -cudalib=cufft -acc -Minfo=accel -o cufft.acc cufft_acc.f90 +``` +Here the flag `-acc` enables OpenACC on NVIDIA-GPU. It is possible to specify the compute capability e.g. `-gpu=cc80` for Betzy and `-gpu=cc60` for Saga. + +To run: +```console +$ srun --partition=accel --gpus=1 --time=00:01:00 --account=nnXXXXX --qos=devel --mem-per-cpu=1G ./cufft.acc +``` + +(conclusion-ext-lib)= +# Conclusion + +In conclusion, we have provided a description of the implementation of the GPU-accelerated `cuBLAS` and `cuFFT` libraries targeting NVIDIA-GPU. The implementation illustrates the capability of calling a GPU-accelerated library written in a low-level programming model from an OpenACC or OpenMP application interface. We have also documented the implementation of the `FFTW` library for a serial case scenario and emphasized its porting version referred to as the `cuFFTW` library. For the `FFTW` and `cuFFT` libraries, although the implementation has been done for a 1D problem, an extension to 2D and 3D scenarios is straightforward. diff --git a/_sources/code_development/guides/gpu.md.txt b/_sources/code_development/guides/gpu.md.txt new file mode 100644 index 000000000..a198a695a --- /dev/null +++ b/_sources/code_development/guides/gpu.md.txt @@ -0,0 +1,158 @@ +--- +orphan: true +--- + +(gpu-intro)= +# Introduction to using GPU compute + +A GPU, or **G**raphics **P**rocessing **U**nit, is a computational unit, which +as the name suggest, is optimized to work on graphics tasks. Nearly every +computer device that one interacts with contains a GPU of some sort, responsible +for transforming the information we want to display into actual pixels on our +screens. + +One question that might immediately present itself is, **if GPUs are optimized +for graphics - why are they interesting in the context of computational +resources?** The answer to that is of course complicated, but the short +explanation is that many computational tasks have a lot in common with +graphical computations. The reason for this is that GPUs are optimized for +working with pixels on the screen, and a lot of them. Since all of these +operations are almost identical, mainly working on floating point values, they +can be run in parallel on dedicated hardware (i.e. the GPU) that is tailored and +optimized for this particular task. This already sounds quite a bit like working +with a discrete grid in e.g. atmospheric simulation, which points to the reason +why GPUs can be interesting in a computational context. + +Since GPUs are optimized for working on grids of data and how to transform this +data, they are quite well suited for matrix calculations. For some indication of +this we can compare the theoretical performance of one GPU with one CPU +. 
+ +| | AMD Epyc 7742 (Betzy) | Nvidia P100 (Saga) | Nvidia A100 (Betzy)| +|-|-----------------------|--------------------|-------------| +| Half Precision | N/A | 18.7 TFLOPS | 78 TFLOPS | +| Single Precision | 1,3 TFLOPS | 9,3 TFLOPS | 19.5 TFLOPS | +| Double Precision | N/A | 4.7 TFLOPS | 9.7 TFLOPS | + +Based on this it is no wonder why tensor libraries such as +[`TensorFlow`](https://www.tensorflow.org/) and [`PyTorch`](https://pytorch.org/) +[report **speedup**](https://blog.tensorflow.org/2018/04/speed-up-tensorflow-inference-on-gpus-tensorRT.html) +on accelerators between **`23x` and `190x`** compared to using only a CPU. + + +## Getting started + +To get started we first have to {ref}`ssh` into Saga: +```console +[me@mylaptop]$ ssh @saga.sigma2.no +``` + +From the {ref}`hardware specification ` we see that there should be 8 GPU +nodes available on Saga, and from the available {ref}`job types ` +we identify `--partition=accel` as the relevant hardware partition for GPU jobs. +You can run the `sinfo` command to check the available partitions on Saga: + +```console +[me@login.SAGA]$ sinfo +``` +```{eval-rst} +.. literalinclude:: gpu/outputs/sinfo.out + :emphasize-lines: 9-11 +``` + +Here we see that the `accel` partition contains 8 nodes in total, 2 of which are +unused at the moment (`idle`), 4 are fully occupied (`alloc`) and 2 are partially +occupied (`mix`). We can also read from this that the maximum time limit for a GPU +job is 14 days, which might be relevant for your production calculations. + +To select the correct partition use the `--partition=accel` flag with either +`salloc` ({ref}`interactive `) +or +`sbatch` ({ref}`job script `). +This flag will ensure that your job is only run on machines in the `accel` partition +which have attached GPUs. However, to be able to actually interact with one or more +GPUs we will have to also add `--gpus=N` which tells Slurm that we would also like +to use `N` GPUs (`N` can be a number between 1 and 4 on Saga since each node has 4 +GPUs). + +```{tip} +There are multiple ways of requesting GPUs a part from `--gpus=N`, such as +`--gpus-per-task` to specify the number of GPUs that each task should get +access to. Checkout the official [Slurm +documentation](https://slurm.schedmd.com/srun.html) for more on how to specify +the number of GPUs. +``` + + +## Interactive testing + +All projects should have access to GPU resources, and to that end we will start +by simply testing that we can get access to a single GPU. To do this we will run +an interactive job using the `salloc` command, on the `accel` partition and asking +for a single GPU: + +```console +[me@login.SAGA]$ salloc --ntasks=1 --mem-per-cpu=1G --time=00:02:00 --partition=accel --gpus=1 --qos=devel --account= +salloc: Pending job allocation 4318997 +salloc: job 4318997 queued and waiting for resources +salloc: job 4318997 has been allocated resources +salloc: Granted job allocation 4318997 +salloc: Waiting for resource configuration +salloc: Nodes c7-7 are ready for job +``` + +Once we land on the compute node we can inspect the GPU hardware with +the `nvidia-smi` command (this is kind of the `top` equivalent for Nvidia GPUs): + +```console +[me@c7-8.SAGA]$ nvidia-smi +``` +```{eval-rst} +.. 
literalinclude:: gpu/outputs/nvidia-smi.out + :emphasize-lines: 3,9,19 +``` + +Here we can find useful things like CUDA library/driver version and the name of the +graphics card (`Tesla P100-PCIE...`), but also information about currently +running processes that are "GPU aware" (none at the moment). If you don't get any +useful information out of the `nvidia-smi` command (e.g. `command not found` or +`No devices were found`) you likely missed the `--partition=accel` and/or `--gpus=N` +options in your Slurm command, which means that you won't actually have access to any +GPU (even if there might be one physically on the machine). + +```{tip} +In the above Slurm specification we combined `--qos=devel` with GPUs and +interactive operations so that we can experiment with commands interactively. +This can be a good way to perform short tests to ensure that libraries correctly +pick up GPUs when developing your experiments. Read more about `--qos=devel` +in our guide on {ref}`interactive jobs `. +``` + +## Simple GPU test runs + +In the following we present a few minimal standalone code examples using different +acceleration strategies and programming languages. The purpose of all these examples +is the same (compile, run and verify), so you can choose the version that suites you best. + +```{eval-rst} +.. toctree:: + :maxdepth: 1 + + gpu/tensorflow.md + gpu/cuda.md + gpu/cuda-container.md +``` + + +## Next steps + +Transitioning your application to GPU can be a daunting challenge. We have +documented a few ways to get started in our development [here](code_development), +but if you are unsure please don't hesitate to contact us at +[support@nris.no](mailto:support@nris.no). + +We also have a few tutorials on specific GPU related topics: +- {ref}`openacc` +- {ref}`hipsycl-start` +- {ref}`Running TensorFlow on GPUs ` +- {ref}`Running containers w/CUDA support: BigDFT example ` diff --git a/_sources/code_development/guides/gpu/cuda-container.md.txt b/_sources/code_development/guides/gpu/cuda-container.md.txt new file mode 100644 index 000000000..662271c89 --- /dev/null +++ b/_sources/code_development/guides/gpu/cuda-container.md.txt @@ -0,0 +1,189 @@ +(byo-cuda-container)= +# BYO CUDA environment with containers + +```{note} +This example assumes basic knowledge on Singularity containers, +on the level presented in our container {ref}`intro `. +``` + +This example demonstrates: + +1. how to pull a CUDA container image from the NGC +2. how to build your CUDA application through the container environment +3. how to run your CUDA application through the container environment + +In this example we will use the same code as we used in the basic CUDA +{ref}`tutorial `, but we will try to build the code in a different +CUDA environment than what we currently have available on Saga. In particular, +we will fetch a container image with a _newer_ version of CUDA from the Nvidia GPU +Cloud ([NGC](https://catalog.ngc.nvidia.com/)). + +First, let's revisit the source code from the other example: + +```{eval-rst} +.. literalinclude:: cuda/vec_add_cuda.cu + :language: c +``` + +```{eval-rst} +:download:`vec_add_cuda.cu <./cuda/vec_add_cuda.cu>` +``` + + +## Step 1: Pull container image from the NGC + +We start by browsing the [NGC](https://catalog.ngc.nvidia.com/containers) for a suitable +container image, which should be from the [CUDA](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/cuda) +collection. 
For development work (compilation etc) we should choose a tag with `devel` +in its name, but the CUDA version and operating system can be whatever you like. +We download one such container with the following command (this might take a few minutes): + +```console +[me@login.SAGA]$ singularity pull docker://nvcr.io/nvidia/cuda:11.4.0-devel-ubuntu20.04 +INFO: Converting OCI blobs to SIF format +INFO: Starting build... +Getting image source signatures +Copying blob 35807b77a593 done +Copying blob 2f02693dc068 done +Copying blob 903c09d5b94e done +Copying blob 205c053b80d7 done +Copying blob 3da463f4fa89 done +Copying blob 6ae79230f62a done +Copying blob 43b3e972ee6d done +Copying blob 93f128a4f293 done +Copying blob c8078b8bb166 done +Copying config c3f63d2c90 done +Writing manifest to image destination +Storing signatures +2021/11/03 22:40:19 info unpack layer: sha256:35807b77a593c1147d13dc926a91dcc3015616ff7307cc30442c5a8e07546283 +2021/11/03 22:40:20 info unpack layer: sha256:2f02693dc0685e3a6de01df36887f5d358f48a48886e688251ac3ef04c410362 +2021/11/03 22:40:20 info unpack layer: sha256:903c09d5b94ea239fc1a5a7fd909c3afe62912ce90c86c8924d57a2f71055d34 +2021/11/03 22:40:21 info unpack layer: sha256:205c053b80d7b029905054aac222afacbb17ec8266df623e0bcea36ce5d88d37 +2021/11/03 22:40:21 info unpack layer: sha256:3da463f4fa89a36aa543d72065de871d22072cd139e6e85b2fb7bd91473a4409 +2021/11/03 22:40:21 info unpack layer: sha256:6ae79230f62a71f8abb1c6aaefbaa48e6cf6f38a671ba511bf19678882d747c2 +2021/11/03 22:40:43 info unpack layer: sha256:43b3e972ee6de26010ac81c65aa5f37612caa6c1f0c9eb9c114f841f67e154a3 +2021/11/03 22:40:43 info unpack layer: sha256:93f128a4f293c7f83d182fda8740bb51ea5c1c7508c97f6b563e08c12c3fca07 +2021/11/03 22:41:12 info unpack layer: sha256:c8078b8bb1668a188a782de56e7d1e6faff012f45a932f1c43b145c2b61ea0d3 +INFO: Creating SIF file... +``` + +We should now have the following file in the current directory + +```console +[me@login.SAGA]$ ls -lh +-rwxrwxr-x 1 me me_g 2.8G Nov 3 12:41 cuda_11.4.0-devel-ubuntu20.04.sif +``` + +which is the Singularity image file. Notice the size (2.8G) of this image, and keep in +mind your `$HOME` disk quota of 20GiB. + +Once we have the container image we can verify that we have the `nvcc` CUDA compiler +available inside it. First we check on the host (no modules load at this point): + +```console +[me@login.SAGA]$ nvcc --version +-bash: nvcc: command not found +``` + +but if we run the same command _through_ the container we should find a compiler: + +```console +[me@login.SAGA]$ singularity exec cuda_11.4.0-devel-ubuntu20.04.sif nvcc --version +nvcc: NVIDIA (R) Cuda compiler driver +Copyright (c) 2005-2021 NVIDIA Corporation +Built on Sun_Aug_15_21:14:11_PDT_2021 +Cuda compilation tools, release 11.4, V11.4.120 +Build cuda_11.4.r11.4/compiler.30300941_0 +``` + +Notice the CUDA version, which is more recent than any currently supported module +on Saga (which is CUDA/11.1.1 at the time of writing). + +```{warning} +Container images are typically a few GiB in size, so you might want to keep your +containers in a project storage area to avoid filling up your limited `$HOME` disk quota. +Also beware that pulled images are cached, by default under `$HOME/.singularity/cache`. +This means that if you pull the same image twice, it will be immediately available from +the cache without downloading/building, but it also means that it will consume disk space. 
+To avoid this you can either add `--disable-cache` to the `pull` command, change the cache +directory with the `SINGULARITY_CACHEDIR` environment variable, or clean up the cache +regularly with `singularity cache clean`. +``` + +## Step 2: Compile the code through the container environment + +Now that we found a `nvcc` compiler inside the container, we can try to compile our code +_through_ the container environment. In order to make this work we need to add one option +to the compilation command that we used {ref}`before `. In the "native" build on +Saga we did not have to specify the `gpu-architecture` that we compile for, as it was able +to pick that up automatically. In the container environment, however, the CUDA compiler +has not been set up with this information so we have to provide it as a compiler option, +which makes the full compilation string: + +``` +nvcc --gpu-architecture=sm_60 vec_add_cuda.cu -o vec_add_cuda +``` + +and we just have to pass it to the `singularity exec` command: + +```console +[me@login.SAGA]$ singularity exec --bind $PWD cuda_11.4.0-devel-ubuntu20.04.sif nvcc --gpu-architecture=sm_60 vec_add_cuda.cu -o vec_add_cuda +``` + +Notice also the `--bind $PWD` option, which makes sure that the files in the current directory +is visible to the container. The compilation should hopefully finish without any errors/warnings, +and there should now be a `vec_add_cuda` executable file in your current directory (also +accessible by the host, even if it was created from within the container). + +This executable will however not run successfully on the login node, since there are no +GPUs available here. We thus have to request GPU resources through Slurm. + +```{note} +The origin of the `sm_60` architecture flag is a bit hard to explain, and you really need +to dig deep into the hardware specs to figure out the correct option here. +For our machines at NRIS we have: +- Saga: Nvidia Pascal (P100) - `sm_60` +- NIRD: Nvidia Volta (V100) - `sm_70` +- Betzy: Nvidia Ampere (A100) - `sm_80` +``` + +## Step 3: Run the code through the container environment + +We will test the code in an interactive session, so we ask for a single GPU: + +```console +[me@login.SAGA]$ salloc --nodes=1 --gpus=1 --time=0:10:00 --mem=1G --partition=accel --account= +salloc: Pending job allocation 4320527 +salloc: job 4320527 queued and waiting for resources +salloc: job 4320527 has been allocated resources +salloc: Granted job allocation 4320527 +salloc: Waiting for resource configuration +salloc: Nodes c7-8 are ready for job +``` + +Now, the CUDA environment inside the container is probably backward compatible with the +CUDA version on the cluster, so our newly created executable will _probably_ run smoothly +even if we don't run it through the container, but for the sake of consistency we will +launch the program with `singularity exec` + +```console +[me@c7-8]$ singularity exec --bind $PWD cuda_11.4.0-devel-ubuntu20.04.sif ./vec_add_cuda +ENTER MAIN +Segmentation fault +``` + +The reason this failed is that Singularity has not been made aware of the GPU resources +on the host (kind of like the situation we had with `salloc --partition=accel` _without_ +the `--gpus=1` option). 
The magic keyword to make this work in Singularity is `--nv` +(for Nvidia, for AMD GPUs the keyword is `--rocm`): + +```console +[me@c7-8]$ singularity exec --nv --bind $PWD cuda_11.4.0-devel-ubuntu20.04.sif ./vec_add_cuda +ENTER MAIN +c[0] : 1.000000 +c[1] : 1.000000 +c[42] : 1.000000 +EXIT SUCCESS +``` + +We have now successfully compiled and run a CUDA program _through_ a container environment. diff --git a/_sources/code_development/guides/gpu/cuda.md.txt b/_sources/code_development/guides/gpu/cuda.md.txt new file mode 100644 index 000000000..dc1b9e55e --- /dev/null +++ b/_sources/code_development/guides/gpu/cuda.md.txt @@ -0,0 +1,185 @@ +(cuda-c)= +# Using CUDA in C + +This example demonstrates: + +1. how to compile a simple CUDA program +2. how to request GPU resources and run the program +3. how to monitor the GPU utilization + +In this example we will use [CUDA](https://en.wikipedia.org/wiki/CUDA) to +facilitate offloading of a simple vector addition to be performed by a GPU, +and we will try to verify that the code is _actually_ executed on the device. +We will compile and run the following code on Saga: + +```{eval-rst} +.. literalinclude:: cuda/vec_add_cuda.cu + :language: c +``` + +```{eval-rst} +:download:`vec_add_cuda.cu <./cuda/vec_add_cuda.cu>` +``` + +```{note} +The purpose of this example is _not_ to understand the details in the code snippet +above, but rather to have a working code example that we can compile, run and verify +on a GPU. +``` + +## Step 1: Compiling the code + +In order to compile this code we need a CUDA-aware compiler, and on Saga we get this +by loading a `CUDA` module (choosing here the most recent version at the time of writing): + +```console +[me@login.SAGA]$ module load CUDA/11.1.1-GCC-10.2.0 +``` + +After the module is loaded you should have the `nvcc` CUDA compiler available: + +```console +[me@login.SAGA]$ nvcc --version +nvcc: NVIDIA (R) Cuda compiler driver +Copyright (c) 2005-2020 NVIDIA Corporation +Built on Mon_Oct_12_20:09:46_PDT_2020 +Cuda compilation tools, release 11.1, V11.1.105 +Build cuda_11.1.TC455_06.29190527_0 +``` + +We can now compile the code with the following command (never mind optimization flags +etc, they are not important at this point): + +```console +[me@login.SAGA]$ nvcc vec_add_cuda.cu -o vec_add_cuda +``` + +This command should hopefully finish without any error/warning. We can try to run the +resulting executable (which we called `vec_add_cuda`): + +```console +[me@login.SAGA]$ ./vec_add_cuda +ENTER MAIN +Segmentation fault (core dumped) +``` + +But it will fail because we are here still running on the `login` node, and there are +no GPU hardware and drivers available here. The next step is thus to request GPU resources +for running our program. + +```{note} +In order to run a CUDA program you must have CUDA hardware drivers installed on the +machine. On Saga, these are _only_ available on the GPU compute nodes, _not_ on the +login nodes. However, the drivers are not necessary for the compilation step (only the +CUDA library, which comes with `module load CUDA/...`), so this can be done on the login node. 
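+
+A quick way to tell the two situations apart: `nvcc --version` works wherever the `CUDA` module is loaded, while `nvidia-smi` only gives useful output on a node that actually has a GPU and its driver installed (i.e. the GPU compute nodes).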
+``` + +## Step 2: Running the code + +We will first test the code in an interactive session, so we ask for a single GPU: + +```console +[me@login.SAGA]$ salloc --nodes=1 --gpus=1 --time=0:10:00 --mem=1G --partition=accel --account= +salloc: Pending job allocation 4320527 +salloc: job 4320527 queued and waiting for resources +salloc: job 4320527 has been allocated resources +salloc: Granted job allocation 4320527 +salloc: Waiting for resource configuration +salloc: Nodes c7-8 are ready for job +``` + +Remember to load the `CUDA` module if not already loaded from Step 1. You can also verify +that you actually have access to a GPU using the `nvidia-smi` command. If all goes well, +your program should now run and exit successfully: + +```console +[me@c7-8]$ ./vec_add_cuda +ENTER MAIN +c[0] : 1.000000 +c[1] : 1.000000 +c[42] : 1.000000 +EXIT SUCCESS +``` + +We here see the expected output of $c[i] = sin^2(i) + cos^2(i) = 1$ for any $i$, which +means that the code runs correctly. + +```{note} +For this particular example we have actually now already verified that the code was executed +**on the GPU**. As the code is written, there is no "fallback" implementation that runs +on the CPU in case no GPU is found, which means that `EXIT SUCCESS` == "the code executed +on the GPU". +``` + +## Step 3: Monitor the GPU utilization + +We will now try to capture some stats from the execution using the `nvidia-smi` tool +to verify that we were able to utilize a few percent of the GPUs capacity. To get a +reasonable reading from this tool we need an application that runs for at least a few +seconds, so we will first make the following change to our source code: + +```{eval-rst} +.. literalinclude:: cuda/loop_add_cuda.cu + :language: c + :lines: 41-46 + :emphasize-lines: 1,6 +``` +```{eval-rst} +:download:`loop_add_cuda.cu <./cuda/loop_add_cuda.cu>` +``` + +i.e. we loop over the vector addition 100 000 times. This should hopefully give sufficient +run time to be picked up by our tool. We then compile and run our new code with the following +job script: + +```{eval-rst} +.. literalinclude:: cuda/run.sh + :language: bash +``` +```{eval-rst} +:download:`run.sh <./cuda/run.sh>` +``` + +Submit the job using `sbatch` (remember to set the `--account` option, and note that we are +back on the `login` node): + +```console +[me@login.SAGA]$ sbatch run.sh +Submitted batch job 4320512 +``` + +Wait for the job to finish and verify from the `slurm-xxxxx.out` file that the +calculation still finished successfully, and that it ran for at least a few seconds. + +We can then add the following lines to the script in order to monitor the GPU +utilization using `nvidia-smi`: + +```{eval-rst} +.. literalinclude:: cuda/monitor.sh + :language: bash + :lines: 17-32 + :emphasize-lines: 4-7,12-13 +``` +```{eval-rst} +:download:`monitor.sh <./cuda/monitor.sh>` +``` + +Submit the job: + +```console +[me@login.SAGA]$ sbatch monitor.sh +Submitted batch job 4320513 +``` + +Wait for the job to complete and inspect the `monitor-xxxx.csv` file we just created: + +```console +[me@login.SAGA]$ cat monitor-4320513.csv +timestamp, utilization.gpu [%], utilization.memory [%] +2021/11/03 21:42:44.210, 0 %, 0 % +2021/11/03 21:42:45.211, 82 %, 76 % +2021/11/03 21:42:46.211, 82 %, 69 % +2021/11/03 21:42:47.211, 82 %, 69 % +``` + +We see here that the GPU utilization reached 82% of the GPUs capacity. 
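+
+For reference, the monitoring part of such a job script essentially amounts to running
+`nvidia-smi` in query mode alongside the application. The following is only a minimal
+sketch of that idea (the downloadable `monitor.sh` above is the complete version), using
+standard `nvidia-smi` options and assuming the executable is called `loop_add_cuda`:
+
+```bash
+# Sample GPU and memory utilization once per second into a CSV file
+# named after the Slurm job ID, while the application is running
+nvidia-smi --query-gpu=timestamp,utilization.gpu,utilization.memory \
+           --format=csv --loop=1 > "monitor-${SLURM_JOB_ID}.csv" &
+NVIDIA_MONITOR_PID=$!
+
+# Run the actual application
+./loop_add_cuda
+
+# Stop the monitoring loop once the application has finished
+kill $NVIDIA_MONITOR_PID
+```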
diff --git a/_sources/code_development/guides/gpu/tensorflow.md.txt b/_sources/code_development/guides/gpu/tensorflow.md.txt new file mode 100644 index 000000000..4e565c21b --- /dev/null +++ b/_sources/code_development/guides/gpu/tensorflow.md.txt @@ -0,0 +1,120 @@ +# Using TensorFlow in Python + +In this example we will try to utilize the +`TensorFlow/2.6.0-foss-2021a-CUDA-11.3.1` library to execute a very simple +computation on the GPU. We could do the following interactively in Python, but +we will instead use a Slurm script, which will make it a bit more reproducible +and in some sense a bit easier, since we don't have to sit and wait for the +interactive session to start. + +We will use the following simple calculation in Python and `TensorFlow` to test +the GPUs: + +```{eval-rst} +.. literalinclude:: tensorflow/gpu_intro.py + :language: python +``` + +```{eval-rst} +:download:`gpu_intro.py <./tensorflow/gpu_intro.py>` +``` + +To run this we will first have to create a Slurm script in which we will request +resources. A good place to start is with a basic job +script (see {ref}`job-scripts`). +Use the following to create `submit_cpu.sh` (remember to substitute your project +number under `--account`): + +`````{tabs} +````{group-tab} Saga + +```{eval-rst} +.. literalinclude:: tensorflow/submit_cpu.sh + :language: bash +``` +```{eval-rst} +:download:`submit_cpu.sh <./tensorflow/submit_cpu.sh>` +``` +```` +````` + +If we just run the above Slurm script with `sbatch submit_cpu.sh` the output +(found in the same directory as you executed the `sbatch` command with a name +like `slurm-.out`) will contain several errors as `Tensorflow` attempts +to communicate with the GPU, however, the program will still run and give the +following successful output: + +```bash +Num GPUs Available: 0 +tf.Tensor( +[[22. 28.] + [49. 64.]], shape=(2, 2), dtype=float32) +``` + +So the above, eventually, ran fine, but did not report any GPUs. The reason for +this is of course that we never asked for any GPUs in the first place. To remedy +this we will change the Slurm script to include the `--partition=accel` and +`--gpus=1`, as follows: + +`````{tabs} +````{group-tab} Saga + +```{eval-rst} +.. literalinclude:: tensorflow/submit_gpu.sh + :language: bash + :emphasize-lines: 7,8 +``` +```{eval-rst} +:download:`submit_gpu.sh <./tensorflow/submit_gpu.sh>` +``` +```` +````` + +We should now see the following output: + +```bash +Num GPUs Available: 1 +tf.Tensor( +[[22. 28.] + [49. 64.]], shape=(2, 2), dtype=float32) +``` + +However, with complicated libraries such as `Tensorflow` we are still not +guaranteed that the above actually ran on the GPU. There is some output to +verify this, but we will check this manually as that can be applied more +generally. + + +## Monitoring the GPUs + +To do this monitoring we will start `nvidia-smi` before our job and let it run +while we use the GPU. We will change the `submit_gpu.sh` Slurm script above to +`submit_monitor.sh`, shown below: + +`````{tabs} +````{group-tab} Saga + +```{eval-rst} +.. literalinclude:: tensorflow/submit_monitor.sh + :language: bash + :emphasize-lines: 19-21,25 +``` +```{eval-rst} +:download:`submit_monitor.sh <./tensorflow/submit_monitor.sh>` +``` +```` +````` + +```{note} +The query used to monitor the GPU can be further extended by adding additional +parameters to the `--query-gpu` flag. Check available options +[here](http://developer.download.nvidia.com/compute/DCGM/docs/nvidia-smi-367.38.pdf). 
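+
+For example, adding fields such as `memory.used`, `power.draw` or `temperature.gpu` to the comma-separated list given to `--query-gpu` (just a small selection of the available properties) would also record memory consumption, power draw and temperature for each sample.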
+``` + +Run this script with `sbatch submit_monitor.sh` to test if the output +`gpu_util-.csv` actually contains some data. We can then use this data +to ensure that we are actually using the GPU as intended. Pay specific attention +to `utilization.gpu` which shows the percentage of how much processing the GPU +is doing. It is not expected that this will always be `100%` as we will need to +transfer data, but the average should be quite high. + diff --git a/_sources/code_development/guides/gpu_usage.md.txt b/_sources/code_development/guides/gpu_usage.md.txt new file mode 100644 index 000000000..6a06bc7e1 --- /dev/null +++ b/_sources/code_development/guides/gpu_usage.md.txt @@ -0,0 +1,193 @@ +--- +orphan: true +--- + +(gpuusage)= +# Basic commands for GPU-usage + +We present some basic command-lines that provide statistics about GPU utilization. A special focus here will be the commands [`nvidia-smi`](https://developer.nvidia.com/nvidia-system-management-interface) and [`rocm-smi`](https://sep5.readthedocs.io/en/latest/ROCm_System_Managment/ROCm-System-Managment.html), which can be used for monitoring GPU devices on heterogenous systems involving CPUs and GPUs. This guide is motivated by the increase use of software with GPU support, and in which the access to GPU usage is not often trivial. It thus represents an initial step towards improving the utilization of GPUs. + +This guide should be useful for users who are running GPU-based applications. By the end of this guide, users will learn about: +- How to run [`nvidia-smi`](https://developer.nvidia.com/nvidia-system-management-interface) and [`rocm-smi`](https://sep5.readthedocs.io/en/latest/ROCm_System_Managment/ROCm-System-Managment.html) commands on HPC systems. +- How to access specific information related to hardware and software; in particular: + - GPU and memory utilizations + - Device statistics + - Device monitoring + - Device topology + +```{contents} +:depth: 2 +``` + +## How to run `nvidia-smi` and `rocm-smi` commands +The commands [`nvidia-smi`](https://developer.nvidia.com/nvidia-system-management-interface) and [`rocm-smi`](https://sep5.readthedocs.io/en/latest/ROCm_System_Managment/ROCm-System-Managment.html) are used in general to monitor and manage GPU applications; and they will be discussed here in the context of HPC systems. These commands should be launched while a submitted application is running. This is necessary in order to collect real-time activities of GPU utilization and memory usage among other metrics. These commands can also be used to access information about GPU-based systems (NVIDIA and AMD), regardless of whether GPU applications are running or not. In the following we present two ways how to run these commands: + +The command `nvidia-smi` is available from an NVIDIA GPU node, and can be accessed in [NRIS clusters](hardware-overview) by following these steps: +- Submit a job: ```$ sbatch job.slurm ``` +- Display which node: ```$ squeue –u username ``` +- Ssh to the listed node e.g. ```$ ssh c7-8``` on Saga and ```$ ssh b5304``` on Betzy. +- Run the command: ```$ nvidia-smi``` +- For more options: ```$ nvidia-smi -h``` + +Information about GPU nodes can be displayed via the command ```$ sinfo –p [name-of-partition]```. In [NRIS clusters](hardware-overview), the partition is specified by `accel`. + +```{note} +Note that access to a GPU node without having active jobs will be denied and will result in *Authentication failed*. 
+``` + +The command-lines defined above are also valid on an AMD GPU node, in which SLURM is used as a workload manager. Here `rocm-smi` will be used instead. + +An alternative to the first method, is to run the commands `nvidia-smi` and `rocm-smi` interactively, as described below. This interactive way permits displaying GPU usage in real-time. + +`````{tabs} +````{group-tab} nvidia-smi + +```console +$ for j in {1..10}; do srun --jobid=JobID --interactive --pty nvidia-smi; sleep 2; done +``` +```` +````{group-tab} rocm-smi + +```console +$ for j in {1..10}; do srun --jobid=JobID --interactive --pty rocm-smi; sleep 2; done +``` +```` +````` + +where the *JobID* needs to be specified. In this example, the command `nvidia-smi`/`rocm-smi` runs for 10 times as defined by the range {1..10}, in which each run is delayed with 2 seconds, as defined by the option `sleep 2`. Here additional options can be specified in this syntax to display selective metrics as described in the next section. + +## Command `nvidia-smi` + +The command utility [`nvidia-smi`](https://developer.nvidia.com/nvidia-system-management-interface) is provided by NVIDIA and stands for “NVIDIA System Management Interface”. As the name indicates, the tool is useful for monitoring and managing GPU applications. + +In this section, we cover the following [options](https://developer.download.nvidia.com/compute/DCGM/docs/nvidia-smi-367.38.pdf): + +- Overview of GPU usage +- Device statistics +- Device monitoring +- Device topology + +In particular, we show how to display certain statistical information based on the `nvidia-smi` command and other related [options](https://developer.download.nvidia.com/compute/DCGM/docs/nvidia-smi-367.38.pdf). + +### Overview of GPU usage + +The command `nvidia-smi` provides a general overview of GPU usage. The output of the command is shown in *Fig. 1*. The figure contains two tables: The first one provides information about available GPUs and additional technical information related to the hardware, while the 2nd one contains information about different processes running on GPU. We summarize this information in the *Table 1.*. Here among other information displayed are the driver and cuda versions, the GPU name, memory and GPU utilization. These last two metrics indicate well-utilization of GPUs. The example displayed in *Table. 1* (also *Fig. 1.*) shows that the running application uses 325 MiB of memory (the GPU-device memory is ~16 GiB) and 39% of GPU. Note that the percent refers to the percent of time in the past sampling period, when one or more kernels were executed on the GPU. + +In the following we present additional options to complement the information provided by the command `nvidia-smi`. Such options allow displaying selective information. + +
+ +![Fig1](gpu_usage/fig1.png) + +**Fig. 1.** *Overview of GPU usage in a NVIDIA's system - Output from the command `nvidia-smi`.* +
+ +
+ +![Fig2](gpu_usage/fig2.png) + +**Table 1.** *Description of GPU usage metrics extracted from Fig. 1 (see [here](https://medium.com/analytics-vidhya/explained-output-of-nvidia-smi-utility-fc4fbee3b124 +) for more details).* +
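+
+Roughly the same information as in the two tables of *Fig. 1* can also be extracted in a script-friendly CSV form, which is convenient for logging. A small example using standard `nvidia-smi` query options (the selected fields below are just a suggestion):
+
+```console
+$ nvidia-smi --query-gpu=name,driver_version,memory.used,memory.total,utilization.gpu --format=csv
+$ nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv
+```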
+ +### Device statistics + +Displaying statistics of a device is provided by the command `nvidia-smi stats`. In particular, one can specify additional options to select statistics about GPU utilization (left-hand side) and/or memory utilization (right-hand side), as shown in *Fig. 2*. This is provided by the commands `nvidia-smi stats -d gpuUtil` and `nvidia-smi stats -d memUtil` respectively. The output of the commands is shown in *Fig.2*. Here the first column indicates the GPU index and the second one displays either the GPU or memory utilization, while the last column indicates the percent of time of either the GPU or memory utilization. More information can be displayed by the command `nvidia-smi stats -h`. + +
+ +![Fig3](gpu_usage/fig3.png) + +**Fig. 2.** *Device statistics - Output generated from the command `nvidia-smi stats -d gpuUtil` (left-hand side) and `nvidia-smi stats -d memUtil` (right-hand side).* +
+ +### Device monitoring + +The device monitoring option provides additional metrics about a GPU-device; in particular, SM (Streaming Multiprocessor) utilization, memory utilization, temperature, power consumption, memory clock rate (mclk), processor clock rate (pclk) (see *Fig. 3.*). This information is provided by the command `nvidia-smi dmon`. Here one can also specify additional options to select desired metrics: e.g. the command `nvidia-smi pmon -s u` displays the GPU utilization together with other metrics mentioned above, while the command `nvidia-smi pmon -s m` displays the memory utilization combined with various metrics. + +
+ +![Fig4](gpu_usage/fig4.png) + +**Fig. 3.** *Device monitoring - Output generated from the command `nvidia-smi dmon`.* +
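+
+The sampling interval and the number of samples can be controlled as well. As a small example (these are standard `dmon` options; see `nvidia-smi dmon -h` for the full list), the following samples the utilization and memory metrics every 2 seconds, 30 times, and adds a time stamp to each line:
+
+```console
+$ nvidia-smi dmon -s um -d 2 -c 30 -o T
+```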
+ +### Device topology + +The device topology option provides information about the nature of interconnect, in particular, in GPU-GPU and GPU-mlx5 (mlx5 refers to [Mellanox ConnectX-5](https://docs.nvidia.com/networking/display/MLNXOFEDv451010/Introduction)) networks as well as CPU affinity and NUMA (Non-Uniform Memory Access) affinity in an HPC system architecture. This is provided by the command-line `nvidia-smi topo -m`, and is useful for optimizing GPU applications that run on multiple GPUs. The output of the command is shown in *Fig. 4*. Here the figure represents a matrix composed of four GPU devices (GPU0, GPU1, GPU2 and GPU3) and two Mellanox devices (mlx5_0, mlx5_1), in which each pair is connected via different type of interconnects. In particular, GPUs are interconnected via [NVLink](https://www.nvidia.com/en-us/data-center/nvlink/), which allows high-bandwidth communication between GPUs. The NVLink in NVIDIA A100, which is displayed in *Fig. 4.* is the third generation NVLink, and is expected to provide higher performance compared to the first and second generations i.e. P100 and V100, respectively. On the other hand, the interconnect between a GPU and mlx5 is established either through SYS (e.g. GPU0-mlx5_0) or PIX (e.g. GPU1-mlx5_0) connections (see Legend in *Fig. 4*). + +In short, understanding the device topology is useful for ensuring the functionality of, for instance, the [GPUDirect RDMA](https://docs.nvidia.com/cuda/gpudirect-rdma/#:~:text=GPUDirect%20RDMA%20is%20a%20technology,video%20acquisition%20devices%2C%20storage%20adapters.) (Remote Direct Memory Access) communication. The RDMA technology permits direct data transfer between a GPU and a third party device (e.g. network interface cards - NICs) through the PCIe (Peripheral Component Interconnect Express) bus and without passing by the CPU host, thus resulting in higher speed data transfer and lower latency. + +
+ +![Fig5](gpu_usage/fig5.png) + +**Fig. 4.** *Device topology - Output generated from the command `nvidia-smi topo -m`.* +
+ +For completeness, we provide the command `lscpu | grep NUMA`, which lists NUMA nodes. The output of this command e.g. from the node `b5301` in our cluster [Betzy](betzy) is + +```console +NUMA node(s): 8 +NUMA node0 CPU(s): 0-7,64-71 +NUMA node1 CPU(s): 8-15,72-79 +NUMA node2 CPU(s): 16-23,80-87 +NUMA node3 CPU(s): 24-31,88-95 +NUMA node4 CPU(s): 32-39,96-103 +NUMA node5 CPU(s): 40-47,104-111 +NUMA node6 CPU(s): 48-55,112-119 +NUMA node7 CPU(s): 56-63,120-127 +``` + +Additional options that can be combined with the command `nvidia-smi` are summarized in the *Table. 2* (see [here](https://developer.download.nvidia.com/compute/DCGM/docs/nvidia-smi-367.38.pdf) for more details), and can also be displayed using the command `nvidia-smi topo -h`. + +| Options | Description | +|---|---| +| nvidia-smi topo | Display topological information about the system| +| nvidia-smi topo --matrix | Display the GPUDirect communication matrix for the system | +| nvidia-smi topo --cpu | CPU number for which to display all GPUs with an affinity | +| nvidia-smi topo --matrix_pci | Display the GPUDirect communication matrix for the system (PCI Only)| + +**Table. 2** *Various options that can be combined with the command [`nvidia-smi topo`](https://developer.download.nvidia.com/compute/DCGM/docs/nvidia-smi-367.38.pdf) to display specific metrics related to the topology of the used system.* + + +## Command `rocm-smi` +The command-line [`rocm-smi`](https://sep5.readthedocs.io/en/latest/ROCm_System_Managment/ROCm-System-Managment.html) is the counterpart of the NVIDIA’s [`nvidia-smi`](https://developer.download.nvidia.com/compute/DCGM/docs/nvidia-smi-367.38.pdf) tool and is provided by AMD as part of the [ROCm](https://sep5.readthedocs.io/en/latest/ROCm_System_Managment/ROCm-System-Managment.html) software stack. The command can be used in the same way as the `nvidia-smi` command for displaying various metrics. + +The command [`rocm-smi`](https://sep5.readthedocs.io/en/latest/ROCm_System_Managment/ROCm-System-Managment.html) can be combined with specific options to display more technical information. A summary of selective options is provided in the *Table. 3*. We refer readers to the [ROCm](https://sep5.readthedocs.io/en/latest/ROCm_System_Managment/ROCm-System-Managment.html) documentation for further details. + +| Options | Description | +|---|---| +| –showhw | Display details of hardware| +| -u, --showuse | Display GPU utilization| +| --showmemuse | Display GPU memory utilization | +| -b, --showbw | Display estimated PCIe use | +| |(i.e. estimated number of bytes sent and received by a GPU through the PCIe bus) | +| --showtoponuma | Display device topology including NUMA nodes| +| -P, --showpower | Display current Average Graphics Package Power Consumption | +| -t, –showtemp | Display current temperature | +| -g, --showgpuclocks | Display current GPU clock frequencies | + +**Table. 3** *Various options that can be combined with the command [`rocm-smi`](https://sep5.readthedocs.io/en/latest/ROCm_System_Managment/ROCm-System-Managment.html) to display specific metrics.* + +# Conclusion +In conclusion, we have presented an overview of the command-lines [`nvidia-smi`](https://developer.nvidia.com/nvidia-system-management-interface) and [`rocm-smi`](https://sep5.readthedocs.io/en/latest/ROCm_System_Managment/ROCm-System-Managment.html) for monitoring and managing GPU-based applications. In addition, we have presented various options that can be combined with these commands to display specific metrics. 
We have also shown how to run them interactively on a cluster.
+
+Overall, these commands are useful for revealing information about, in particular, the GPU and memory utilization, which is a key indicator of how well the GPUs are used. Additional options can be specified, such as the GPU-device topology option, which provides an overview of the different interconnects between GPUs and Mellanox devices as well as of the CPU and NUMA affinities. Displaying such information helps improve performance. Although the [`nvidia-smi`](https://developer.nvidia.com/nvidia-system-management-interface) and [`rocm-smi`](https://sep5.readthedocs.io/en/latest/ROCm_System_Managment/ROCm-System-Managment.html) commands provide real-time metrics, they are limited to being a statistical indicator of GPU usage. Therefore, more advanced techniques are needed to identify bottlenecks in GPU applications. Here, code profiling becomes a necessity to help optimize performance and to ensure that the GPUs are well utilized.
+
+
+# Relevant links
+
+[NVIDIA-SMI](https://developer.nvidia.com/nvidia-system-management-interface) (see also [here](https://developer.download.nvidia.com/compute/DCGM/docs/nvidia-smi-367.38.pdf))
+
+[ROCm-SMI](https://sep5.readthedocs.io/en/latest/ROCm_System_Managment/ROCm-System-Managment.html)
+
+[NVLink](https://www.nvidia.com/en-us/data-center/nvlink/)
+
+[NVIDIA Mellanox device](https://docs.nvidia.com/networking/display/MLNXOFEDv451010/Introduction)
+
+[NVIDIA networking](https://www.nvidia.com/en-us/networking/)
+
+[GPUDirect RDMA](https://docs.nvidia.com/cuda/gpudirect-rdma/#:~:text=GPUDirect%20RDMA%20is%20a%20technology,video%20acquisition%20devices%2C%20storage%20adapters.)
+
+[Network on AMD](http://developer.amd.com/wp-content/resources/56354_1.00.pdf)
diff --git a/_sources/code_development/guides/gpuaware_mpi.md.txt b/_sources/code_development/guides/gpuaware_mpi.md.txt
new file mode 100644
index 000000000..6852c40e6
--- /dev/null
+++ b/_sources/code_development/guides/gpuaware_mpi.md.txt
@@ -0,0 +1,550 @@
+---
+orphan: true
+---
+
+(gpu-aware-mpi)=
+
+# Incorporating MPI into GPU-directive models with a GPU-awareness approach
+
+
+# Summary
+
+We present a descriptive implementation of a hybrid approach in which the MPI (message passing interface) communication framework is combined with either OpenACC or OpenMP application programming interfaces (APIs). The implementation is based on solving the 2D (two-dimensional) Laplace equation in a mini-application form. A special focus will be on performing point-to-point (e.g. `MPI_Send` and `MPI_Recv`) and collective (e.g. `MPI_Allreduce`) operations either between a pair of GPU-devices with GPU-hardware support or by passing through a CPU-host memory. These two scenarios are referred to as GPU-aware MPI and GPU-non-aware MPI, respectively. Both scenarios will be addressed in the hybrid **MPI-OpenACC** and **MPI-OpenMP** models and their performance will be evaluated and analysed. Interestingly, the performance is found to increase by a factor of 10 when enabling the GPU-aware support on the
+[supercomputer LUMI-G Early Access Platform](https://docs.lumi-supercomputer.eu/eap/) and by almost a factor of 30 compared to the case when MPI alone is considered.
+
+By the end of this tutorial, we expect the readers to learn about
+
+- Implementing a pure MPI application using a blocking mode of communication.
+- Implementing the hybrid **MPI-OpenACC** and **MPI-OpenMP** models, and specifically:
+ - Defining the concept of direct memory access.
+ - Setting up a GPU-device to be assigned to an MPI rank.
+ - Implementing MPI operations between GPUs with and without using a CPU-host memory as a staging point.
+ - Compiling the hybrid **MPI-OpenACC** and **MPI-OpenMP** applications on different HPC systems.
+
+
+```{note}
+The source codes discussed in this tutorial are provided at the end in the section {ref}`Source codes <source-codes>`, from which they can be directly downloaded.
+```
+
+
+# Introduction
+
+Parallel computing involving communication between heterogeneous systems, especially CPU (central processing unit) and GPU (graphics processing unit), makes it possible to significantly improve the performance of computations on modern HPC (high-performance computing) systems. This in turn allows us to address large scientific computational problems, which would not be possible using conventional CPU-based approaches. Such computational problems can benefit from available GPU-programming models to further accelerate the computation over multiple GPU-devices. Here, although the asynchronous OpenACC and OpenMP models offer the potential to carry out computations across multiple GPUs, the partitioning of the computation is limited to a single GPU node. Note that the asynchronous OpenMP model relies on compiler support. Extending the computation to multiple GPU nodes requires combining MPI (message passing interface) with additional GPU-programming models, such as the OpenACC and OpenMP application programming interfaces (APIs) and CUDA. In this tutorial, we focus on the hybrid **MPI-OpenACC** and **MPI-OpenMP** applications.
+
+Combining MPI with the OpenACC or OpenMP offloading APIs offers the potential to fully utilize the capacity of multiple GPUs within multiple GPU partitions in modern clusters and supercomputers. Moreover, it has the advantage of reducing the computing time spent transferring data via the host memory during heterogeneous communications, thus rendering HPC applications more efficient. In this context, it has been shown that integrating [GPU-awareness](https://dl.acm.org/doi/10.1109/ICPP.2013.17) into an MPI library improves the performance of scientific applications. This tutorial is thus motivated by the need to guide readers, who are familiar with MPI, in porting their MPI-based codes to heterogeneous systems and towards exploring exascale platforms, such as the [supercomputer LUMI](https://www.lumi-supercomputer.eu/).
+
+In this tutorial, we will cover two scenarios: a scenario in which an MPI library can directly access a GPU-device memory (i.e. GPU-aware MPI); and a scenario in which there is no interaction between an MPI library and a GPU-device (i.e. GPU-non-aware MPI). The implementation will be provided for both the hybrid **MPI-OpenACC** and **MPI-OpenMP** APIs.
+
+This descriptive tutorial is organized as follows: In {ref}`section I <implementation-of-mpi-alone>`, we describe the implementation of the low-level MPI alone using an application based on solving the Laplace equation. In {ref}`section II <implementation-of-a-gpu-awareness-approach>`, we extend the MPI-application to incorporate a GPU-awareness approach. This is done by combining MPI with the OpenACC/OpenMP APIs. Here we will address both the GPU-aware and the GPU-non-aware MPI library (i.e. MPI with direct memory access vs MPI without direct memory access).
+{ref}`Section III <performance-analysis-on-lumi-g-eap>` is devoted to the performance analysis. {ref}`Section IV <gpuaware-conclusion>` concludes the tutorial.
+ +```{contents} Table of Contents +``` + +(implementation-of-mpi-alone)= + +# Implementation of MPI alone + +The MPI programming model is widely used in the scientific community for intensive parallel computing that requires distributed memory among multiple nodes. In this section, we implement the low-level [MPI standard](https://www.mpi-forum.org/docs/mpi-4.0/mpi40-report.pdf) approach to parallelise our `Fortran` application, which is based on solving the Laplace equation in a uniform 2D-grid. Details about the numerical method can be found [here](https://documentation.sigma2.no/code_development/guides/converting_acc2omp/openacc2openmp.html). + +
+ +![Fig1](GPUawareMPI/Fig1.png) + +**Fig. 1.** *Schematic representation of an MPI-scheme, in which the data that are initially stored in an array of size (nx,ny) (**Fig. 1(a)**) are distributed among different MPI-processes ranging from 0 to nproc-1 (**Fig. 1(b)**). Here nproc is the total number of MPI-processes and npy is the size of the array on y-direction on each MPI-process (nyp=ny/nproc).* +
+ +In this tutorial, we consider a simplest scenario, in which the data, which are initially of dimension {math}`n_{x}` x {math}`n_{y}` (cf. **Fig. 1(a)**) are subdivided only along the y-direction. This results into sub-arrays of dimension {math}`n_{x}` x {math}`n_{yp}`, where {math}`n_{yp}`={math}`n_{y}`/nproc. Here each sub-array is assigned to an MPI process as shown in **Fig. 1(b)**. In a realistic problem, it is recommended to carry out a 2D domain decomposition along both x and y directions. Our simplified scenario, however has the advantage of transforming a 2D-problem into a 1D-problem in terms of implementing a parallel algorithm. Such a simplification is a key element when making tutorials. + +A starting point in this implementation is to generate initial conditions (ICs) and distribute them among different processes in a communicator group. The simplest way to do so is to use the `MPI_Scatter` operation, which distributes the generated data from the process 0 (root) to processes labeled *myid* in the source code (i.e. the rank of an MPI process). This is defined in the range [*myid=0*, *myid=nproc-1*], where *nproc* is the total number of processes. It is worth mentioning that initiating the data in parallel among all MPI-ranks is recommended for scalability reasons. Therefore, an alternative to the `MPI_Scatter` operation is to use a blocking/non-blocking mode of communication as described below + +```Fortran +if(myid.eq.0) then + allocate(f_send(nx,nyp)) + + do k=1,nproc-1 + Call RANDOM_NUMBER(f_send(:,:)) + + call MPI_Send(f_send(:,:),nsend,MPI_DOUBLE_PRECISION,k,tag,& + MPI_COMM_WORLD, ierr) + enddo + deallocate(f_send) + +else + call MPI_Recv(f(:,:),nsend,MPI_DOUBLE_PRECISION,0, & + tag,MPI_COMM_WORLD, status,ierr) +endif +``` +This piece of code should be adapted to fit a specific scenario. + +The ICs we consider are random and are generated by the routine `RANDOM_NUMBER`. Note that MPI-programs, in general, require incorporating the mpi module (i.e. `use mpi`) or including the header file mpif.h (i.e. `include ‘mpif.h’`). In our source code, the meaning of each MPI function is included briefly as a comment in the code itself. + +A subsequent step is to iterate the ICs using an appropriate iterative scheme as described in our previous [tutorial](https://documentation.sigma2.no/code_development/guides/converting_acc2omp/openacc2openmp.html) (see also [here](https://arxiv.org/abs/2201.11811)). In an iterative scheme, the distributed data along the y-direction needs to be updated (i.e. halo exchange); this is because the data at the boundaries of each sub-array in each MPI process are initially set to zero. For instance, computing the new array *f_k(:,1)* for each MPI-process requires updating the elements *f(:,0)* initially set for each process; similarly for *f(:,nyp+1)* (see the lines 82-91 in the code below and the equation in **Fig. 2**). A key element here is to transfer the data at the boundaries between the neighboring MPI processes at each iteration. This is schematically illustrated in **Fig. 2**. This is transformed into a few MPI lines using a blocking communication mode characterized by the MPI functions `MPI_Send()` and `MPI_Recv()`, as described in the source code below. The blocking mode here means that the **send** and **receive** operations do not return until the message data is available to be re-used. In other words, the operations are completed once the message is buffered. Note that there are three additional blocking modes for the **send** operation. 
These modes, however, are not addressed in the present tutorial. We thus refer readers to the [MPI documentation](https://www.mpi-forum.org/docs/mpi-4.0/mpi40-report.pdf) for further description. + +```{eval-rst} +.. literalinclude:: GPUawareMPI/src/laplace_mpi.f90 + :language: Fortran + :lines: 80-103 +``` + +The inconvenient of the blocking mode is related to the possibility of causing the program to deadlock (i.e. the MPI message cannot be completed). An alternative to the blocking mode that avoids MPI deadlock is to use a [non-blocking](https://www.mpi-forum.org/docs/mpi-4.0/mpi40-report.pdf) concept. The latter has another advanatge, which relies on enabling overlapping between communication and computation. In this type of mode, the MPI-functions `MPI_Send()` and `MPI_Recv()` are replaced with `MPI_Isend()` and `MPI_Irecv()` respectively, and should be followed by the function `MPI_Wait()`. + +
+ +![Fig2](GPUawareMPI/Fig2.png) + +**Fig. 2.** *Schematic representation of an MPI-scheme, in which the boundary data are transferred between the neighbouring MPI-processes in a point-to-point operation before (**Fig. 2(a)**) and after collecting theme (**Fig. 2(b)**). The form of the equation is also displayed in the same figure for the sake of clarity*. +
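+
+To make the non-blocking alternative mentioned above more concrete, a minimal sketch of such a halo exchange is shown below. Note that this is not the downloadable source code: the neighbour ranks `up` and `down` (set to `MPI_PROC_NULL` at the ends of the domain), the integer request array `req(4)` and the assumed declaration `f(1:nx,0:nyp+1)` are illustrative and must match the actual code.
+
+```Fortran
+! Receive the halo rows from the neighbours and send our boundary rows,
+! without blocking, then wait for all four operations to complete.
+call MPI_Irecv(f(:,0),     nx, MPI_DOUBLE_PRECISION, down, tag, &
+               MPI_COMM_WORLD, req(1), ierr)
+call MPI_Irecv(f(:,nyp+1), nx, MPI_DOUBLE_PRECISION, up,   tag, &
+               MPI_COMM_WORLD, req(2), ierr)
+call MPI_Isend(f(:,nyp),   nx, MPI_DOUBLE_PRECISION, up,   tag, &
+               MPI_COMM_WORLD, req(3), ierr)
+call MPI_Isend(f(:,1),     nx, MPI_DOUBLE_PRECISION, down, tag, &
+               MPI_COMM_WORLD, req(4), ierr)
+! Computation that only involves interior points could be overlapped here
+call MPI_Waitall(4, req, MPI_STATUSES_IGNORE, ierr)
+```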
+ +Updating the data at the boundaries is a key difficulty in this example, as it requires re-collecting the data from the neighbouring MPI processes to reconstruct a new array at each iteration. The computed maximum between the new and the old arrays is done using the `MPI_Allreduce` operation, in which the result is returned to all MPI processes of the specified communicator group. + +To check the correctness of the results, one can compute the sum of all the elements or eventually display the converged data either in 1D or 2D for comparison. For this reason, we introduce the `MPI_Gather` operation, which allows aggregating the data from each MPI process and make them available only in the root process. This option, however, might become time consuming and eventually might lead to segmentation error when increasing the size of the data. + +## Compilation process of an MPI-application + +Here we describe the compilation process of a pure MPI-application on different HPC systems using the OpenMPI and Intel MPI compilers on the clusters [Saga](https://documentation.sigma2.no/hpc_machines/saga.html) and [Betzy](https://documentation.sigma2.no/hpc_machines/betzy.html) and the Cray compiler on the [supercomputer LUMI](https://www.lumi-supercomputer.eu/). The compiler wrappers associated with the OpenMPI, Intel MPI and Cray compilers are `mpif90`, `mpiifort` and `ftn`, respectively. + +### On the Saga and Betzy clusters + +The following procedure is valid for both Saga and Betzy clusters. Here is an example of modules to be loaded. + +`````{tabs} +````{group-tab} OpenMPI module + +```console +$ module load OpenMPI/4.1.1-GCC-11.2.0 +``` +```` +````{group-tab} Intel MPI module + +```console +$ module load impi/2021.4.0-intel-compilers-2021.4.0 +``` +```` +````` + +The compilation process is described according to the chosen compiler. + +`````{tabs} +````{group-tab} OpenMPI compiler + +```console +$ mpif90 -o laplace.mpi.ompi laplace_mpi.f90 +``` +```` +````{group-tab} Intel MPI compiler + +```console +$ mpiifort -o laplace.mpi.intel laplace_mpi.f90 +``` +```` +````` + +Here is an example of a batch script to launch an MPI job. 
+
+```console
+#SBATCH --job-name=lap-mpi_saga
+#SBATCH --account=nnxxxxx
+#SBATCH --time=00:01:00
+#SBATCH --qos=devel
+#SBATCH --nodes=1 #Total nbr of nodes
+#SBATCH --ntasks-per-node=4 #Nbr of tasks per node
+#SBATCH --mem-per-cpu=2G #Host memory per CPU core
+ #On Betzy the mem should not be specified for a pure MPI-code
+srun ./laplace.mpi.ompi
+```
+
+### On the supercomputer LUMI
+
+On the supercomputer LUMI, an MPI module is provided by the `cray-mpich` environment (as described
+[here](https://docs.lumi-supercomputer.eu/development/compiling/prgenv/#compile-an-mpi-program))
+
+```console
+$ module load cray-mpich
+```
+The syntax of the compilation process of an MPI code using the Cray compiler can be expressed as:
+
+```console
+$ ftn -o laplace.mpi.cray laplace_mpi.f90
+```
+
+To launch an MPI job, the following batch script can be used
+(see also [here](https://docs.lumi-supercomputer.eu/computing/jobs/batch-job/#example-batch-scripts))
+
+```console
+#!/bin/bash -l
+#SBATCH --job-name=lap-mpi
+#SBATCH --account=project_xxxxx
+#SBATCH --time=00:02:00
+#SBATCH --nodes=1
+#SBATCH --ntasks=4
+#SBATCH --ntasks-per-node=4
+#SBATCH --partition=standard
+
+srun ./laplace.mpi.cray
+```
+
+(implementation-of-a-gpu-awareness-approach)=
+
+# Implementation of a GPU-awareness approach
+
+In this section we extend our MPI-application to incorporate the OpenACC and OpenMP offloading APIs, targeting both NVIDIA and AMD GPU-accelerators. A special focus here is on the concept of a **GPU-aware MPI** library (or MPI with GPU-direct memory access) versus a **GPU-non-aware MPI** library (or MPI without GPU-direct access). In the following we implement this concept for both the hybrid **MPI-OpenACC** and **MPI-OpenMP** APIs. Details about the implementation of the OpenACC and OpenMP APIs alone are provided in our previous
+[tutorial](https://documentation.sigma2.no/code_development/guides/converting_acc2omp/openacc2openmp.html).
+
+The GPU-awareness approach simply refers to whether or not the MPI library is made aware of a GPU-device memory, such that direct access to that memory (rather than access via the host) can be accomplished. Before addressing this concept, it is worthwhile *(i)* defining the mechanism of direct memory access and *(ii)* introducing how to establish a connection between each MPI rank and a specific GPU-device. Here, we are in the situation in which a host and a device have distinct memories (i.e. a non-shared-memory device).
+
+## Direct memory access
+
+[Direct memory access](https://www.sciencedirect.com/topics/computer-science/direct-memory-access) (DMA) (see also this [Ref.](https://www.akkadia.org/drepper/cpumemory.pdf), which provides an overview of memory) is a mechanism by which data can be transferred between an I/O device and a memory system without involving the processor itself. It thus allows two separate processors to directly access each other's memory via a network. This has the advantage of reducing latency and increasing throughput, which is particularly relevant for modern HPC systems. As an example, the DMA mechanism is used in data management between a CPU-host and a GPU-device, as we shall see later.
+
+## Assigning an MPI rank to a GPU device
+
+Managing multiple GPU-devices by combining MPI and the OpenACC or OpenMP APIs requires, as a first step, assigning each MPI rank to a single GPU-device. In other words, one needs to determine which processes are within a specific CPU-node that is connected to the nearest GPU-node.
This permits to minimize latency, and it is particularly relevant when running an application on multiple nodes. This procedure can be done by splitting the world communicator into subgroups of communicators (or sub-communicators), which is done via the routine `MPI_COMM_SPLIT_TYPE()`. Here each sub-communicator contains processes running on the same node. These processes have a shared-memory region defined via the argument `MPI_COMM_TYPE_SHARED` (see [here](https://www.mpi-forum.org/docs/mpi-4.0/mpi40-report.pdf) for more details). Calling the routine `MPI_COMM_SPLIT_TYPE()` returns a sub-communicator (labelled "host_comm" in the source code) created by each subgroup, in which each MPI-rank can be assigned to a single GPU-device (see the lines 97-101 in the code below). + +```{eval-rst} +.. literalinclude:: GPUawareMPI/src/laplace_mpiacc_noaware.f90 + :language: Fortran + :lines: 97-110 +``` + +In **OpenACC** API, the host-device connection is established by specifying the runtime library routine `acc_set_device_num(host_rank,deviceType)`. The latter contains two arguments "host_rank" and "deviceType": the first argument determines which device an MPI rank will be assigned to, and the second one returns the GPU-device type to be used. These are indicated by the lines 104-110 in the source code described above. Similarly in **OpenMP** API, the connection is defined via the function `omp_set_default_device(host_rank)`. + +## GPU-non-aware MPI library + +The MPI implementation without GPU-direct memory access or GPU-non-aware MPI means that calling an MPI routine from an OpenACC or OpenMP API requires updating the data before and after an MPI call. In this scenario, the data are copied back and forth between a host and a device before and after each MPI call. In the hybrid **MPI-OpenACC**, the procedure is defined by specifying the directive `update host()` (see the code line 132) for copying the data from a device to a host before an MPI call, and by the directive `update device()` specified after an MPI call for copying the data back to a device (see the code line 160). The implementation is shown in this piece of code described below +```{eval-rst} +.. literalinclude:: GPUawareMPI/src/laplace_mpiacc_noaware.f90 + :language: Fortran + :lines: 131-169 +``` + +A similar concept is adopted in the hybrid **MPI-OpenMP**. Here, updating the data in connection with an MPI call is done by specifying the directives `update device() from()` ( see the line 128) and `update device() to()` (see the line 162), respectively, for copying the data from a device to a host and back to the device. This is illustrated in the lines of code shown below. +```{eval-rst} +.. literalinclude:: GPUawareMPI/src/laplace_mpiomp_noaware.f90 + :language: Fortran + :lines: 127-163 +``` + +Although this approach is simple to implement, it might lead to a lower performance caused by an explicit transfer of data between a host and a device before and after calling an MPI routine. Furthermore, the approach is synchronous, which does not allow overlapping between MPI-based computation and OpenACC/OpenMP operations. An alternative to this approach is to use the GPU-aware MPI as described in the next section. + +## GPU-aware MPI library + +The concept of the GPU-awareness MPI relies on the possibility of moving data that reside in a GPU-device memory without necessarily using a CPU-host memory as an intermediate buffer (see e.g. [here](https://dl.acm.org/doi/pdf/10.1145/2642769.2642773)). 
This approach enables an MPI library to directly access a GPU-device memory, which in turn permits to transfer data from one GPU to another GPU, thus reducing the communication and computing time of data between different MPI processes. + +In our example discussed above, the data at the boundaries of each MPI process reside in a GPU-device, as they have already been copied to. In the GPU-non-aware MPI concept, these data must be updated on a CPU-host and copyied back to a GPU-device at each iteration. In the GPU-aware MPI, however, these data can be communicated between a pair of GPUs witout necessarily passing through a CPU-host memory. This approach is supported by recent versions of MPI libraries such as [Open MPI](https://www.open-mpi.org/). The idea here is that when a pointer to a GPU-device is passed to an MPI call, the MPI library automatically sets up a GPU memory for processing data. This implementation might require a newer version of [CUDA driver and toolkit](https://www.open-mpi.org/faq/?category=runcuda). + +In the hybrid **MPI-OpenACC**, the concept is defined by combining the directive `host_data` together with the clause `use_device(list_array)`. This combination enables the access to the arrays listed in the the clause `use_device(list_array)` from the +[host](https://www.nvidia.com/docs/IO/116711/OpenACC-API.pdf). The list of arrays, which should be already present in a GPU-device memory, are directly passed to an MPI routine without a need of a staging host-memory for copying the data. Note that for copying data, we use here [unstructured data](https://www.openacc.org/sites/default/files/inline-files/OpenACC_Programming_Guide_0_0.pdf) blocks characterized by the directives `enter data` and `exit data`, unlike in the previous section, in which the structured data locality is considered. The unstructured data has the advantage of allowing to allocate and deallocate arrays within a data region. + +In our example, the GPU-aware MPI support with OpenACC is illustrated in connection with the MPI operations `MPI_Send` and `MPI_Recv` as described in lines 127-160 (see the code below) and the operation `MPI_Allreduce` in lines 184-192 (see the code below). Note that not all MPI functions are supported by the GPU-awareness concept (see [here](https://www.open-mpi.org/faq/?category=runcuda) for more details). In the lines 133-160, the boundary data stored in the array *f(:,:)* are present in GPUs and are passed directly to the `MPI_Send()` and `MPI_Recv()` functions. Therefore, the operations `MPI_Send` and `MPI_Recv` are performed between GPUs without passing through a CPU-host. A similar picture occurs in connection with the `MPI_Allreduce()` function, in which the `MPI_Allreduce` operation is performed between a pair of GPUs. In the latter picture, we have noticed a slight increase of the computing time (a few ms) compared to the case when the `MPI_Allreduce` operation is carried out between CPUs instead of GPUs. This is because the computed maximum (see the lines 174-182), which is present in a CPU-host is copied back to a GPU-device at each iteration before perfoming the `MPI_Allreduce` operation between GPUs. + +```{eval-rst} +.. literalinclude:: GPUawareMPI/src/laplace_mpiacc_aware.f90 + :language: Fortran + :lines: 127-160 +``` + +```{eval-rst} +.. literalinclude:: GPUawareMPI/src/laplace_mpiacc_aware.f90 + :language: Fortran + :lines: 174-182 +``` + +```{eval-rst} +.. 
literalinclude:: GPUawareMPI/src/laplace_mpiacc_aware.f90 + :language: Fortran + :lines: 184-192 +``` + +The same concept is adopted in the hybrid **MPI-OpenMP** API. The GPU-aware MPI support with OpenMP can be implemented via the directive [`target data use_device_ptr(ptr-list)`](https://www.openmp.org/spec-html/5.1/openmpsu65.html). Here each array specified in the clause `use_device_ptr()` is a pointer to an object that is accessible on a GPU-device. The implementation is shown in the lines 127-160 of the code below, in which the MPI functions `MPI_Send()` and `MPI_Recv()` can be performed between a pair of GPUs. Similarly for the `MPI_Allreduce` operation shown in the lines 184-192. + +By comparing the syntax of the hybrid **MPI-OpenACC** API with that of the **MPI-OpenMP** API, one can see that the porting procedure of one API to another is straightforward. + +```{eval-rst} +.. literalinclude:: GPUawareMPI/src/laplace_mpiomp_aware.f90 + :language: Fortran + :lines: 127-160 +``` + +```{eval-rst} +.. literalinclude:: GPUawareMPI/src/laplace_mpiomp_aware.f90 + :language: Fortran + :lines: 184-192 +``` + +```{note} +Note that the GPU-aware MPI support needs to be enabled by setting the environment variable `MPICH_GPU_SUPPORT_ENABLED=1` to 1 or to 0 to not enable it, as described in the compilation process. +``` + +## Compilation process of **MPI-OpenACC** and **MPI-OpenMP** applications + +Our hybrid **MPI-OpenACC** and **MPI-OpenMP** applications have been tested on both the cluster +[Betzy](https://documentation.sigma2.no/hpc_machines/betzy.html) (4xNVIDIA A100 GPUs connected by NVLink) and the supercomputer +[LUMI-EAP](https://docs.lumi-supercomputer.eu/eap/) (Early Access Platform) (4xAMD MI100 GPUs connected by the Infinity Fabric Link). The compilation process is thus described below according to which HPC system is used. + +### On the cluster Betzy + +We use a version of OpenMPI library (MPI-3 implementation), which has some supports for GPUs and which enables moving data residing on GPU-memory, in which a GPU-awareness concept is supported in the [Betzy](https://documentation.sigma2.no/hpc_machines/betzy.html) cluster. Note that this concept is not supported in the [Saga](https://documentation.sigma2.no/hpc_machines/saga.html) cluster, and therefore, only the GPU-non-aware MPI concept is supported. For completeness, we refer readers to a tutorial, in which a +[GPU-non-aware MPI](https://documentation.sigma2.no/code_development/guides/openacc_mpi.html) was implemented in the `C` language. + +The modules to be loaded are listed here according to which cluster is considered. + +`````{tabs} +````{group-tab} Betzy + +```console +$ module load OpenMPI/4.1.1-NVHPC-22.1-CUDA-11.4.1 +``` +```` +````{group-tab} Saga + +```console +$ module load OpenMPI/4.0.3-PGI-20.4-GCC-9.3.0 +``` +```` +````` + +The compilation process of the hybrid **MPI-OpenACC** and **MPI-OpenMP** applications is described below + +`````{tabs} +````{group-tab} **MPI-OpenACC** + +```console +$ mpif90 -fast -acc -Minfo=accel -o laplace.mpiacc laplace_mpiacc.f90 +``` +```` +````{group-tab} **MPI-OpenMP** + +```console +$ mpifort -mp=gpu -Minfo=mp -o laplace.mpiomp laplace_mpiomp.f90 +``` +```` +````` + +Where the flag `-mp=gpu` enables **OpenMP** targeting GPU. The option `-Minfo=mp` provides compiler diagnostic of **OpenMP**. 
It is also optional to specify the compute capability by adding the flag `-gpu=cc60` for NVIDIA P100 GPU +([Saga](https://documentation.sigma2.no/hpc_machines/saga.html)) and `-gpu=cc80` for A100 GPU +([Betzy](https://documentation.sigma2.no/hpc_machines/betzy.html)). + +One can check if the OpenMPI library is built with the GPU-aware support by running the following command: +```console +$ ompi_info --parsable --all | grep mpi_built_with_cuda_support:value +``` + +The output of the command is either **value:true** or **value:false** as expressed below: + +```console +mca:mpi:base:param:mpi_built_with_cuda_support:value:true +or +mca:mpi:base:param:mpi_built_with_cuda_support:value:false +``` + +The output message containing **value:true** means that the NVIDIA GPU-aware support in OpenMPI is enabled by default. + +Here is an example of a batch script to launch a hybrid application on Saga and Betzy clusters. + +```console +#SBATCH --job-name=lap-mpiacc_betz +#SBATCH --account=nnxxxxx +#SBATCH --time=00:01:00 +#SBATCH --qos=devel +#SBATCH --partition=accel --gpus=8 +#SBATCH --nodes=2 #Total nbr of nodes +#SBATCH --ntasks-per-node=4 #Nbr of tasks per node +#SBATCH --gpus-per-node=4 #Nbr of GPUs per node +#SBATCH --mem-per-cpu=2G #Host memory per CPU core + +srun ./laplace.mpiacc +``` + +### On the supercomputer LUMI-EAP + +We list below the modules to be loaded before compiling the application. We refer readers to the original documentation about the [supercomputer LUMI](https://www.lumi-supercomputer.eu/) for further details about modules and the compilation process: + +```console +module load craype-accel-amd-gfx908 +module load cray-mpich +module load LUMI/21.12 partition/EAP +module load rocm/4.5.2 +``` + +Here we compile the hybrid **MPI-OpenACC** and **MPI-OpenMP** applications on LUMI-EAP using a Cray compiler of the wrapper `ftn` as described in the following: + +`````{tabs} +````{group-tab} **MPI-OpenACC** + +```console +$ ftn -hacc -o laplace.mpiacc laplace_mpiacc.f90 +``` +```` +````{group-tab} **MPI-OpenMP** + +```console +$ ftn -homp -o laplace.mpiomp laplace_mpiomp.f90 +``` +```` +````` + +Here, the flags `hacc` and `homp` enable the OpenACC and OpenMP directives in the hybrid **MPI-OpenACC** and **MPI-OpenMP** applications, respectively. + +The following batch script can be used to launch a hybrid application on LUMI-EAP. + +```console +#!/bin/bash -l +#SBATCH --job-name=lap-mpiomp_eap +#SBATCH --account=project_xxxxx +#SBATCH --time=00:01:00 +#SBATCH --partition=eap +#SBATCH --nodes=2 #Total nbr of nodes +#SBATCH --ntasks-per-node=4 #Nbr of tasks per node +#SBATCH --gpus=8 #Total nbr of GPUs +#SBATCH --gpus-per-node=4 #Nbr of GPUs per node + +##In the case a GPU-aware MPI is implemented +export MPICH_GPU_SUPPORT_ENABLED=1 + +srun ./laplace.mpiomp +``` +Note that the GPU-aware support in MPICH is enabled by setting the environment `export MPICH_GPU_SUPPORT_ENABLED=1` on Cray before running the hybrid application. + +(performance-analysis-on-lumi-g-eap)= + +# Performance analysis on LUMI-G EAP + +Our computational tests are performed on the supercomputer [LUMI-G EAP](https://docs.lumi-supercomputer.eu/eap/) (Early Access Platform) (4xAMD MI250x GPUs connected by the Infinity Fabric Link, see also the GPU specifications [here](https://www.amd.com/en/products/server-accelerators/instinct-mi250x)). We carry out experiments based on the hybrid **MPI-OpenACC** and **MPI-OpenMP** APIs in the aim of illustrating the benefit of implementing the GPU-aware MPI library. 
+ +We first begin with the effect of the GPU-aware MPI using the **MPI-OpenACC** API. This is shown in **Fig. 3**, in which the computations are performed on 4 Slurm GPUs. For reference, the computations based on a pure MPI is also shown (blue curve). Interesting enough, we can see clearly that the computing time is reduced by a factor of 10 when the GPU-aware support is enabled (black curve) compared to the case of the GPU-non-aware MPI (green curve). Moreover, the comparison with a pure MPI API shows a further increase of the performance by a factor of 30. + +
+ +![Fig3](GPUawareMPI/Fig3.png) + +**Fig. 3.** *Comparison of the performance of the computations as a function of the number of points nx along the x-axis. Note that we use a unifore 2D-grid. The computations are carried out on a single node in which a total of 4 Slurm GPUs are allocated (i.e. 2xAMD MI250x GPUs on the [superocmputer LUMI](https://docs.lumi-supercomputer.eu/eap/)) using: (Black curve) **MPI-OpenACC** with the GPU-aware MPI support; (green curve) **MPI-OpenACC** with the GPU-non-aware MPI; (bleu curve) **MPI-alone** with 4 CPU-cores. The square symbols are used for guidance.* +
For completeness, we present in Table 1 a comparison of the performance of the hybrid **MPI-OpenACC** and **MPI-OpenMP** APIs. The performance is evaluated for both the GPU-aware and the GPU-non-aware MPI and is shown for different sizes of the spatial grid. The comparison, summarised in the table below, shows roughly similar performance for the two hybrid models. These results indicate, on the one hand, the benefit of implementing the GPU-awareness approach independently of the GPU-directive model; on the other hand, they highlight the similar performance of the **MPI-OpenACC** and **MPI-OpenMP** APIs.

Hybrid model \ 2D grid (nx,ny) | 8192x8192 | 16384x16384 | 20000x20000 |
-- | -- | -- | -- |
**MPI-OpenACC** with GPU-aware MPI | 1.46 | 5.11 | 7.92 |
**MPI-OpenMP** with GPU-aware MPI | 1.35 | 4.72 | 7.09 |
**MPI-OpenACC** with GPU-non-aware MPI | 14.90 | 58.03 | 86.25 |
**MPI-OpenMP** with GPU-non-aware MPI | 14.84 | 61.21 | 85.58 |

**Table 1** *Comparison of the performance (compute times, cf. Fig. 3) of the hybrid **MPI-OpenACC** and **MPI-OpenMP** APIs for three grids of increasing size. The comparison is performed for both the GPU-aware and the GPU-non-aware MPI.*

(gpuaware-conclusion)=

# Conclusion

We have presented an overview of GPU-hybrid programming that integrates GPU-directive models (i.e. the OpenACC and OpenMP APIs) with the MPI library. This was implemented in an application that solves the 2D Laplace equation. The approach adopted here makes it possible, in general, to utilise multiple GPU devices, not only within a single GPU node but also across multiple GPU partitions. It thus supports intra-process communication (i.e. GPU-to-CPU) as well as inter-process communication (i.e. GPU-to-GPU through GPU interconnects). In particular, we have addressed both the GPU-non-aware and the GPU-aware MPI library approaches. The latter has the advantage of enabling direct interaction between an MPI library and GPU-device memory. In other words, it permits performing MPI operations between a pair of GPUs, thus reducing the computing time associated with data movement. We have carried out experiments on the [supercomputer LUMI-G Early Access Platform](https://docs.lumi-supercomputer.eu/eap/) and have observed a performance increase by a factor of 10 when implementing the GPU-aware MPI scheme, and by almost a factor of 30 when compared to the case with MPI alone.

(source-codes)=

# Source codes

We provide here the source codes discussed in this tutorial; they can be downloaded directly below.
+ +**Pure MPI** +```{eval-rst} +:download:`laplace_mpi.f90 <./GPUawareMPI/src/laplace_mpi.f90>` +``` + +**Hybrid MPI-OpenACC** without the GPU-aware MPI support +```{eval-rst} +:download:`laplace_mpiacc_noaware.f90 <./GPUawareMPI/src/laplace_mpiacc_noaware.f90>` +``` + +**Hybrid MPI-OpenACC** with the GPU-aware MPI support +```{eval-rst} +:download:`laplace_mpiacc_aware.f90 <./GPUawareMPI/src/laplace_mpiacc_aware.f90>` +``` + +**Hybrid MPI-OpenMP** without the GPU-aware MPI support +```{eval-rst} +:download:`laplace_mpiomp_noaware.f90 <./GPUawareMPI/src/laplace_mpiomp_noaware.f90>` +``` + +**Hybrid MPI-OpenMP** with the GPU-aware MPI support +```{eval-rst} +:download:`laplace_mpiomp_aware.f90 <./GPUawareMPI/src/laplace_mpiomp_aware.f90>` +``` + +**OpenACC** offloading +```{eval-rst} +:download:`laplace_acc.f90 <./GPUawareMPI/src/laplace_acc.f90>` +``` + +**OpenMP** offloading +```{eval-rst} +:download:`laplace_omp.f90 <./GPUawareMPI/src/laplace_omp.f90>` +``` + +(compilation-process)= + +# Compilation process + +In this section we summarise the compilation process of the source code **laplace_mpigpu.f90**. In this code, we use the preprocessor directive `#ifdef` (or also `#if defined`) to enable compiling the same code ported to multiple programming models. Here different options can be used to specify preprocessing of source files and that according to the used HPC system, as described below: + +## On the cluster [Betzy](https://documentation.sigma2.no/hpc_machines/betzy.html) + +`````{tabs} +````{group-tab} **MPI-OpenACC** + +```console +$ mpif90 -cpp -D_OPENACC -fast -acc -Minfo=accel -o laplace.mpiacc laplace_mpigpu.f90 +``` +```` +````{group-tab} **MPI-OpenMP** + +```console +$ mpifort -cpp -D_OPENMP -mp=gpu -Minfo=mp -o laplace.mpiomp laplace_mpigpu.f90 +``` +```` +````` + +Where we use `-cpp` to manually invoke a preprocessor macro `_OPENACC` or `_OPENMP`. + +## On the supercomputer [LUMI-EAP](https://docs.lumi-supercomputer.eu/eap/) + +`````{tabs} +````{group-tab} **MPI-OpenACC** + +```console +$ ftn -eZ -D_OPENACC -hacc -o laplace.mpiacc laplace_mpigpu.f90 +``` +```` +````{group-tab} **MPI-OpenMP** + +```console +$ ftn -eZ -D_OPENMP -homp -o laplace.mpiomp laplace_mpigpu.f90 +``` +```` +````` + +As described in the previous section, we use the conditional compilation with the macros `_OPENACC` and `_OPENMP`. This is enabled in the Cray compiler by specifying the option `-eZ` followed by either `-D_OPENACC` to enable **OpenACC** directives or `-D_OPENMP` to enable **OpenMP** directives. 
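To illustrate the pattern (the actual `laplace_mpigpu.f90` applies it in Fortran; the loop body and variable names below are placeholders), a C/C++ sketch of such a conditionally compiled kernel could look like this:

```cpp
// Hypothetical C/C++ sketch of the conditional-compilation pattern described
// above: the macro defined via -D_OPENACC or -D_OPENMP selects which
// directive is compiled in. Variable names are illustrative only.
void jacobi_sweep(int nx, int ny, const double *u, double *unew) {
#if defined(_OPENACC)
  #pragma acc parallel loop collapse(2) copyin(u[0:nx*ny]) copy(unew[0:nx*ny])
#elif defined(_OPENMP)
  #pragma omp target teams distribute parallel for collapse(2) \
      map(to: u[0:nx*ny]) map(tofrom: unew[0:nx*ny])
#endif
  for (int i = 1; i < nx - 1; ++i)
    for (int j = 1; j < ny - 1; ++j)
      unew[i*ny + j] = 0.25 * (u[(i-1)*ny + j] + u[(i+1)*ny + j]
                             + u[i*ny + j - 1] + u[i*ny + j + 1]);
}
```

Compiled with `-D_OPENACC`, only the OpenACC directive survives preprocessing; with `-D_OPENMP`, only the OpenMP one does, so a single source file can serve both programming models.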
+ +(references)= + +# References + +Here are some references, in which the tutorial is based on: + +- [MPI documentation](https://www.mpi-forum.org/docs/mpi-4.0/mpi40-report.pdf) + +- [About Memory: DMA,...](https://www.akkadia.org/drepper/cpumemory.pdf) + +- [GPU-aware MPI_Allreduce](https://dl.acm.org/doi/pdf/10.1145/2642769.2642773) + +- [OpenACC for Programmers: Concepts and Strategies](https://www.oreilly.com/library/view/openacc-for-programmers/9780134694306/) + +- [OpenMP API specification 2021](https://www.openmp.org/wp-content/uploads/OpenMP-API-Specification-5-2.pdf) + +- [OpenMP API reference guide](https://www.openmp.org/wp-content/uploads/OpenMPRefCard-5-2-web.pdf) + +- [OpenACC API specification 2021](https://www.openacc.org/sites/default/files/inline-images/Specification/OpenACC-3.2-final.pdf) + +- [OpenACC API reference guide](https://www.openacc.org/sites/default/files/inline-files/API%20Guide%202.7.pdf) + +- [Tutorials on OpenACC and OpenMP offloading](https://documentation.sigma2.no/code_development/guides/converting_acc2omp/openacc2openmp.html) + +- [OpenACC course](https://github.com/HichamAgueny/GPU-course) diff --git a/_sources/code_development/guides/hipsycl.md.txt b/_sources/code_development/guides/hipsycl.md.txt new file mode 100644 index 000000000..8e297ef2e --- /dev/null +++ b/_sources/code_development/guides/hipsycl.md.txt @@ -0,0 +1,146 @@ +--- +orphan: true +--- + +(sycl)= + +# What is SYCL? + +> [SYCL](https://www.khronos.org/sycl/) is a higher-level programming model to improve programming +> productivity on various hardware accelerators. It is a single-source domain-specific embedded +> language based on pure C++17. It is a standard developed by Khronos Group, announced in March 2014. +> +>>>>>>>>>>>>>>>>>>> ---[_Wikipedia_](https://en.wikipedia.org/wiki/SYCL) + +(hipsycl)= +## What is hipSYCL? + +[hipSYCL](https://github.com/illuhad/hipSYCL) is one of a few currently available implementations +of the SYCL standard (none of which is feature complete wrt SYCL-2020 at the time of writing). +hipSYCL provides backends for offloading to OpenMP (any type of CPU), CUDA (Nvidia GPUs) and +HIP/ROCm (AMD GPUs), as well as experimental support for Level Zero (Intel GPUs). This particular +SYCL implementation is interesting for us at NRIS because it provides a unified tool for all our +current hardware, which constitutes Intel and AMD CPUs, Nvidia GPUs on Saga and Betzy and AMD GPUs +on LUMI. Other available SYCL implementations are [Codeplay](https://developer.codeplay.com/home/)'s +ComputeCpp and [Intel oneAPI](https://software.intel.com/content/www/us/en/develop/tools/oneapi.html)'s +DPC++, which are currently more geared towards Intel hardware (CPUs, GPUs and FPGAs). + +(hipsycl-start)= +# Getting started with hipSYCL + +```{note} +In this tutorial we will use the global installation of hipSYCL on Saga. +If you want to use another SYCL implementation or you need to install it +on a different machine, please refer to the installation instructions in the +[SYCL Academy](https://github.com/codeplaysoftware/syclacademy) documentation. +``` + +## Hello world example + +This example demonstrates: + +1. how to compile a minimal SYCL application using the global hipSYCL installation on Saga +2. how to compile for different target architectures +3. how to run the example on a GPU node on Saga + +In this example we will write a very simple program that queries the system for information +about which device it runs on. 
We will then compile the code for both CPU and (Nvidia) GPU +targets, and verify that it is able to find both devices. This can be achieved with +just a few lines of code: + +```{eval-rst} +.. literalinclude:: hipsycl/hello_world.cpp + :language: cpp +``` + +Here we first include the main SYCL header file, which in our case will be provided by the +`hipSYCL` module on Saga. In the main function we simply initialize a `sycl::queue` using the +so-called `default_selector`, and then we print out which device was picked up for this particular +queue (more on queues later). The `default_selector` will choose an accelerator if one is found +on the host, otherwise it will fall back to run as (traditional) OpenMP on the host CPU. +By specifying different types of `selectors` it is possible to e.g. force the code to always +run on the CPU, or to choose a particular device if more than one accelerator is available. + +### Compiling for CPUs + +In order to compile the code we need to have a SYCL implementation available, which we will +get by loading the `hipSYCL` module on Saga (check with `module avail hipsycl` to see which +versions are currently available): + +```console +[me@login-1.SAGA ~]$ module load hipSYCL/0.9.1-gcccuda-2020b +``` + +```{note} +If you want to compile for Nvidia GPUs you need a `gcccuda` version of `hipSYCL`. +With EasyBuild there exists also a CPU-only version based on the `GCC` toolchain without +the CUDA backend (and hopefully soon a ROCm version for targeting AMD GPUs). +``` + +After loading the `hipSYCL` module you should have the `syclcc` compiler wrapper available +on your command line (try e.g. `syclcc --help`). We will first compile the code only for +CPU by specifying the `omp` target: + +```console +[me@login-1.SAGA ~]$ syclcc --hipsycl-targets=omp -O3 -o hello_world hello_world.cpp +``` + +This step should hopefully pass without any errors or warnings. If we run the resulting +executable on the login node we will however see a warning: + +```console +[me@login-1.SAGA ~]$ ./hello_world +[hipSYCL Warning] backend_loader: Could not load backend plugin: /cluster/software/hipSYCL/0.9.1-gcccuda-2020b/bin/../lib/hipSYCL/librt-backend-cuda.so +[hipSYCL Warning] libcuda.so.1: cannot open shared object file: No such file or directory +Chosen device: hipSYCL OpenMP host device +``` + +The reason for the warning is that we use a `hipSYCL` version that is compiled with the +CUDA backend, but we don't have the CUDA drivers available when we run the program on the +login nodes. But no worries, the last line that is printed is the actual output of our +program, which tells us that the code was executed on the OpenMP (CPU) host device, which +is exactly as expected since (1) we don't have any accelerator available on the login node +and (2) we only compiled a CPU target for the code. This means that if you run the same +binary on one of the GPU nodes, you will no longer see the `[hipSYCL Warning]` (since +CUDA drivers are now available), but you will still get the same program output `Chosen +device: hipSYCL OpenMP host device` (since the code is still only compiled for CPU targets). + +### Compiling for Nvidia GPUs + +The only thing we need to change when compiling for GPUs is to add new target options to +the compiler string. 
The only complicating issue here might be to figure out which target +architecture corresponds to the hardware at hand, but for the P100 GPUs on Saga the +name of the target should be `cuda:sm_60` (`cuda:sm_80` for Betzy's A100 cards): + +```console +[me@login-1.SAGA ~]$ syclcc --hipsycl-targets='omp;cuda:sm_60' -O3 -o hello_world hello_world.cpp +clang-11: warning: Unknown CUDA version. cuda.h: CUDA_VERSION=11010. Assuming the latest supported version 10.1 [-Wunknown-cuda-version] +``` + +CUDA drivers is _not_ a prerequisite for compiling the CUDA target, so this can be done on +the login node. We see that we get a `Clang` warning due to the CUDA version, but this does +not seem to be a problem. The resulting executable can still be run on a pure CPU host (e.g. +the login node) with the same result as before: + +```console +[me@login-1.SAGA ~]$ ./hello_world +[hipSYCL Warning] backend_loader: Could not load backend plugin: /cluster/software/hipSYCL/0.9.1-gcccuda-2020b/bin/../lib/hipSYCL/librt-backend-cuda.so +[hipSYCL Warning] libcuda.so.1: cannot open shared object file: No such file or directory +Chosen device: hipSYCL OpenMP host device +``` + +but if we instead run the code on a GPU node (here through an interactive job; remember to ask +for GPU resources on the `accel` partition) we see that the program is actually able to pick up +the GPU device: + +```console +[me@login-1.SAGA ~]$ srun --account= --time=0:10:00 --ntasks=1 --gpus-per-task=1 --partition=accel --mem=1G --pty bash +srun: job 3511513 queued and waiting for resources +srun: job 3511513 has been allocated resources +[me@c7-8.SAGA]$ ./hello_world +Chosen device: Tesla P100-PCIE-16GB +``` + +Note that no code is actually executed on the device in this example, since the `sycl::queue` +remains empty, but at least we know that the hardware is visible to our application. Now the +next step will be to add some work that can be offloaded to the device. diff --git a/_sources/code_development/guides/offloading-using-fortran.md.txt b/_sources/code_development/guides/offloading-using-fortran.md.txt new file mode 100644 index 000000000..58cd2c683 --- /dev/null +++ b/_sources/code_development/guides/offloading-using-fortran.md.txt @@ -0,0 +1,177 @@ +--- +orphan: true +--- + +(offload_fortran_concurrent)= + +# Offloading to GPU using Fortran 2008 + +## Introduction + +In 2010 the ISO Standard Fortran 2008 introduced the `do concurrent` construct +which allows to express loop-level parallelism. The current compilers support multicore +parallel execution, but presently only NVIDIA offer a compiler that support offload to their GPUs. + +The NVIDIA HPC Fortran (Formerly PGI Fortran) supports using the `do +concurrent` construct to offload to NVIDIA GPU accelerators. + +It provides a simple way of using accelerators without any extra +libraries nor deviation from the standard language. No compiler +directives or any other kind of libraries are needed. By using plain +standard Fortran the portability is not an issue. The code will use whatever +means of parallel execution available on the current platform, it might be multicore +or in this example offloading to highly parallel GPU execution. + +The 2008 standard will be supported in the foreseeable future, just like many compiler +still support the Fortran 66 standard. + +This approach provides a simple, user friendly, future proof and portable approach to +offloading to accelerators. 
+ +Example code used can be found [NRIS example repo](https://gitlab.sigma2.no/training/sample_code/-/tree/master/Offload-ISO-Languages). + +## Example code using SAXPY + +Writing the actual Fortran 2008 standard code is surprisingly easy. Here is a +simple example using SAXPY (Single precision Z = A*X + Y). + +There are two Fortran approaches one using indexing addressing +element by element : +```fortran +do concurrent (i = 1:n) + y(i) = y(i) + a*x(i) +end do +``` +or vector syntax introduced in the Fortran 90 standard: +```fortran +do concurrent (i = 1:1) + y = y + a*x +end do +``` + +The vector syntax does not actually need a loop, but in order to use the +parallel `do concurrent` it needs to have a loop, but in this usage only a +single pass. + +The parallel loop can be compiled for a threaded multicore architecture using: +```console +$ nvfortran -o saxpy.x -stdpar=multicore saxpy.f90 +``` +or for GPU offload by using: +```console +$ nvfortran -o saxpy.x -stdpar=gpu saxpy.f90 +``` + +As SAXPY is mostly data movement and little computation the gain in using GPU +is small as copying of data from main memory to device memory is a limiting +factor. + + +## Example with more computation + +Here we use an example with a bit more computation. + +Using indexed syntax: + +```fortran +do concurrent (i=1:M, j=1:M, k=1:M) + Z(k,j,i)=X(k,j,i) * Z(k,j,i)**2.01_real64 + Z(k,j,i)=sin(log10(X(k,j,i) / Z(k,j,i))) +end do +``` + +or Fortran 90 vector syntax, where the loop has only one iteration: +```fortran +do concurrent (i=1:1) + Z = X * Z**2.01_real64 + Z = sin(log10(X / Z)) +end do +``` + +| Run | Run time [seconds] | +|----------------------------|--------------------| +| Indexed syntax CPU 1 core | 14.5285 | +| Vector syntax CPU 1 core | 14.5234 | +| Indexed syntax GPU A100 | 0.4218 | +| Vector syntax GPU A100 | 0.4149 | + +With more flops per byte transferred the speedup by offloading to +GPU is higher. A speedup of 34 compared to a single core is nice. + +The NVfortran compiler is capable of generating code +to offload both using the index addressing syntax as well as +the vector syntax. + + +## Old legacy code example + +We can also look at a matrix-matrix multiplication reference +implementation (DGEMM) code from 8-February-1989. This is found at: +[Basic Linear Algebra, level 3 matrix/matrix operations](http://www.netlib.org/blas/index.html#_level_3) or download the +[Fortran 77 reference implementation](http://www.netlib.org/blas/blas.tgz) +which contains DGEMM and also contain support files needed. + +Download the legacy code, change the comment character to fit +Fortran 90 standard: +```console +$ cat dgemm.f | sed s/^*/\!/>dgemm.f90 +``` + +The BLAS routines multiplication comes in 4 flavors: +- S single (32 bit) precision +- D double (64 bit) precision +- C complex single precision +- Z complex double precision + + +Assume well behaved matrices `C := alpha*A*B + beta*C` and a call to dgemm like: +`call dgemm('n', 'n', N, N, N, alpha, a, N, b, N, beta, c, N)` + +Locate the line below the line highlighted above, about line 228. +Change : +```fortran +DO 90 J = 1,N +``` +with: +```fortran +DO concurrent (J = 1 : N) +``` +and change the +```fortran +90 CONTINUE +``` +with +```fortran +end do +``` + +This is all that is needed to use GPU for offloading, the rest is up to the +compiler. + +Building this code needs some extra files `lsame.f` and `xerrbla.c`. 
+One example of how to build a threaded version for a multicore CPU could look +like: +```console +$ nvfortran -O3 -stdpar=multicore dgemm-test.f90 dgemm.f90 xerrbla.o lsame.f +``` +or to build an offloaded GPU version: +```console +$ nvfortran -O3 -stdpar=gpu dgemm-test.f90 dgemm.f90 xerrbla.o lsame.f +``` + +| Run | Build flags | Cores | Performance | +| --------------- | ----------------------|-------|-----------------| +| Reference f77 | -O3 | 1 | 4.41 Gflops/s | +| Reference f90 | -O3 -stdpar=multicore | 2 | 6.27 Gflops/s | +| Reference f90 | -O3 -stdpar=multicore | 16 | 24.67 Gflops/s | +| Reference f90 | -O3 -stdpar=gpu |RTX2080| 43.81 Gflops/s | +| Reference f90 | -O3 -stdpar=gpu | A100 | 112.23 Gflops/s | + +The results are stunning: changing only one line in the old legacy +code from `do` to `do concurrent` can speed up from 4 Gflops/s to 112 +Gflops/s a 25x increase in performance. + +An intersting test is to compare this more then 30 year old reference code +with a call to a modern library, the syntax is still the same. +The scientific application fortran code will probably behave like the 30 year old example +while libraries generally show far higher performance. diff --git a/_sources/code_development/guides/offloading.md.txt b/_sources/code_development/guides/offloading.md.txt new file mode 100644 index 000000000..8396d7fbe --- /dev/null +++ b/_sources/code_development/guides/offloading.md.txt @@ -0,0 +1,173 @@ +--- +orphan: true +--- + +# Offloading to GPUs + +In high-performance computing offloading is the act of moving a computation +from the main processor to one or more accelerators. In many cases **the +computation does not need to be explicitly programmed** but can be a standard +`for` (or `do` in Fortran) loop. + +This document shows how to use the standard compilers available on {ref}`saga` +and {ref}`betzy` to offload computation to the attached GPUs. This document is +not considered a comprehensive guide for how to perform offloading, but rather +as a compendium on the different compiler flags required with different +compilers. For guidance on different programming models for offloading please +[see our guides](code_development). + +Below we have listed the necessary flags to enable GPU offloading for the +different systems NRIS users have access to. Both {ref}`saga` and {ref}`betzy` +are Nvidia systems, while {ref}`lumi` is an AMD based system. + +A brief description of their GPU architectures is given in the tabs below. + +````{tabs} +```{tab} Betzy + +Betzy has `Nvidia A100` accelerators which support CUDA version `8.0`. The +generational identifier for the GPU is either `sm_80` or `cc80` depending on +the compiler. +``` +```{tab} Saga + +Saga has `Nvidia P100` accelerators which support CUDA version `6.0`. The +generational identifier for the GPU is either `sm_60` or `cc60` depending on +the compiler. +``` +```{tab} LUMI-G + +LUMI-G has `AMD MI250X` accelerators which is supported by ROCm. The identifier +for the GPU is `gfx90a`. +``` +```` + +## OpenMP + +OpenMP gained support for accelerator offloading in version `4.0`. Most +compilers that support version `4.5` and above should be able to run on +attached GPUs. However, their speed can vary widely so it is recommended to +compare the performance. + +If you are interested in learning more about OpenMP offloading we have +{ref}`a beginner tutorial on the topic here`. 
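For context, the flags listed in the tabs below are meant to compile loops like the following minimal sketch (not taken from any NRIS example; array and variable names are illustrative):

```cpp
// Minimal sketch of an OpenMP offloaded loop; the compiler flags listed
// below select the GPU target it is compiled for. Names are illustrative.
#include <cstdio>
#include <vector>

int main() {
  const int n = 1 << 20;
  std::vector<double> x(n, 1.0), y(n, 2.0);
  double *px = x.data(), *py = y.data();

  // Offload the loop to the default device, mapping the arrays explicitly.
  #pragma omp target teams distribute parallel for \
      map(to: px[0:n]) map(tofrom: py[0:n])
  for (int i = 0; i < n; ++i)
    py[i] += 2.0 * px[i];

  std::printf("y[0] = %f\n", py[0]);
  return 0;
}
```

The OpenACC flags further down apply to the analogous `#pragma acc parallel loop` form of the same kind of loop.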
+ +```{warning} +NVHPC does not support OpenMP offloading on {ref}`saga` as the generation of +GPUs on {ref}`saga` is older than what NVHPC supports. Thus, NVHPC _only_ +supports OpenMP offloading on {ref}`betzy`. +``` + +```````{tabs} + +``````{group-tab} Clang + +`````{tabs} + +````{group-tab} Nvidia + +```bash +-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target=nvptx64-nvidia-cuda -march=sm_ +``` +```` +````{group-tab} AMD + +```bash +-fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx +``` +```` +````` +`````` +``````{group-tab} GCC + +`````{tabs} + +````{group-tab} Nvidia + +```bash +-fopenmp -foffload=nvptx-none="-misa=sm_35" +``` +```` +````{group-tab} AMD + +```bash +-fopenmp -foffload=amdgcn-amdhsa="-march=gfx +``` +```` +````` +`````` +``````{group-tab} NVHPC + +`````{tabs} + +````{group-tab} Nvidia + +```bash +-mp=gpu -Minfo=mp,accel -gpu=cc +``` +```` +````` +`````` +``````` + +## OpenACC + +OpenACC is another open standard for supporting offloading to accelerators. +Since OpenACC was initially developed by Nvidia the best support for OpenACC is +found using Nvidia's compilers. However, several other compilers also support +OpenACC to some extent. + +If you are interested in learning more about OpenACC offloading we have +{ref}`a beginner tutorial on the topic here`. + +```````{tabs} + +``````{group-tab} GCC + +`````{tabs} + +````{group-tab} Nvidia + +```bash +-fopenacc -foffload=nvptx-none="-misa=sm_35" +``` +```` +````{group-tab} AMD + +```bash +-fopenacc -foffload=amdgcn-amdhsa="-march=gfx +``` +```` +````` +`````` +``````{group-tab} NVHPC + +`````{tabs} + +````{group-tab} Nvidia + +```bash +-acc -Minfo=accel -gpu=cc +``` +```` +````` +`````` +``````` + +## Standard Parallelism + +Nvidia additionally supports offloading based on "Standard Parallelism" which +is capable of accelerating C++ `std::algorithms` and Fortran's `do concurrent` +loops. + +You can read more about accelerating Fortran using `do concurrent` +{ref}`in our guide`. + +`````{tabs} +````{group-tab} NVHPC + +```bash +-stdpar=gpu -Minfo=stdpar +``` +```` +````` diff --git a/_sources/code_development/guides/ompoffload.md.txt b/_sources/code_development/guides/ompoffload.md.txt new file mode 100644 index 000000000..c7629de0e --- /dev/null +++ b/_sources/code_development/guides/ompoffload.md.txt @@ -0,0 +1,329 @@ +--- +orphan: true +--- + +(ompoffload)= + +```{index} GPU; Introduction to OpenMP-Offload;OpenMP +``` + +Introduction +============ +This tutorial provides insights into the GPU programming using OpenMP (OMP) offload. The goal of this document is to quickly enable readers to run and test some OpenMP offload codes on {ref}`betzy` platform. We assume that readers have previous knowledge of C/C++ programming, and are also aware of some of the commonly used OpenMP directives. + +After reading this tutorial one should be able to answer: + +* What is offloading in OpenMP? +* How to compile OpenMP code with offloading support on {ref}`betzy` platform? +* How to invoke GPUs from the OpenMP code? +* How to optimize the performance by using advanced features of the OpenMP library? + +OpenMP - 4.0 +============ +Let's start with a short introduction to OpenMP-4.0. Heterogeneous systems, including Supercomputers, are optimized for low latency as well as for high throughput. The high throughput comes from the specialized co-processor, which we know by the name GPUs. 
In general, throughput is about the amount of data processed or transferred by a device in unit-time, whereas latency is about how fast the data can be transferred or loaded. Programmable GPUs offer a low Energy/Performance ratio, which is a good thing, but GPUs also expose issues related to `programmability`, `performance`, and `portability`. OpenMP 4.0 is thus an approach that enables resolving these three issues in one place. + +OpenMP is a popular shared-memory parallel programming specification which pioneered the unification of proprietary languages into an industry standard. OpenMP 4.0 is the next step to expand its scope from traditional multicore CPUs to advanced programmable Accelerators. When OpenMP 4.0 was rolled out, it took a departure from the traditional `openmp` work-sharing constructs and added support for offloading tasks to other devices. Apart from several other key features in OpenMP 4.0, the major shift that distinguishes it from its predecessor is the ability to `offload`. + +Now, what is offloading in the first place? `Offloading` often refers to taking the computation from the `Host` to the `Target`. The OMP program starts executing on the host, which is generally the CPU, and the computing task can be offloaded to the target device. The target device could be an accelerator or even the CPU itself. In this tutorial, we will focus on the GPUs as the target device; however, it is possible in OpenMP-4.0 to offload computation on other types of coprocessors, like FPGAs. + +Clang/LLVM compiler with OpenMP 4.0 support +=========================================== +Many compilers support OpenMP 4.0, 4.5, and 5.0 directives and are targeted at NVIDIA GPUs; one of the most prevalent hardware GPUs. `Clang` compiler is among the ones that are really moving quickly in accommodating OpenMP offload features. It is a collaborative effort of multiple vendors, like IBM, TI, Intel to bring OpenMP to Clang. Other compilers such as GCC have also some support for OpenMP 4.0; however, the performance might be limited depending on the compiler's version. + +(clang11)= + +Clang on Betzy +============= +Building the Clang compiler with OpenMP-offload and Nvidia support is a chore at this point. It takes a lot of work to get it to work with NVIDIA-GPUs. Luckily, now Clang with OpenMP-offload is available on {ref}`betzy` for its users. +The Clang compiler can be made available on {ref}`betzy` using the following command. + +```bash +$ module load Clang/11.0.1-gcccuda-2020b +``` + +Case study. +=========== +For the sake of consistency, we will use the same [Mandelbrot](https://en.wikipedia.org/wiki/Mandelbrot_set) example that has been used in the {ref}`OpenACC` tutorial. We will go through the code and incrementally add OpenMP directives so that the user can write an OpenMP-offload code without any apparent effort. + +Now, we focus our attention on the `mandelbrot.c` code. For the convenience of our readers, we have copied the `mandelbrot.c` code down below. + +(mandelbrot_C)= + +```{note} +The complete code is provided at the bottom of this page, under resources section. +``` + +```{eval-rst} +:download:`mandelbrot_serial.c ` + +``` + +```{eval-rst} +.. literalinclude:: ompoffload/mandelbrot_serial.c + :language: c++ + +``` +The Mandelbrot set is a well-known function that refers to fractal sets. 
It has the form: +```{math} f_{c}(z)=z^{2}+c +``` + In short, the {ref}`mandelbrot.c` file produces a fractal picture using the mandelbrot function, in which the function generates the Mandelbrot set of complex numbers at a given pixel. Further reading can be found at [Mandelbrot](https://en.wikipedia.org/wiki/Mandelbrot_set). + + +Here is how the actual code looks like, and the highlighted lines show how the pixel value is updated with the mandelbrot value. + +```{eval-rst} +.. literalinclude:: ompoffload/mandelbrot_serial.c + :language: c++ + :lines: 121-135 + :emphasize-lines: 8-9 + +``` + +OpenMP on CPU. +============== +Let's look at our code again with a special focus on the highlighted region. Here, it can be seen that there is a nested for loop. The first 'y' loop scans the pixels of the image vertically, and the inner loop 'x' scans the image horizontally. In this way, the whole image is scanned pixel by pixel. For each pixel, the mandelbrot function is called and the old pixel value is swapped with the new value. + +As we are using serial code, the first step is to see how much time a serial implementation would take on the CPU without any parallelization. + +```{note} +We assume that `Clang` compiler is already loaded in your work environment; as it is mentioned {ref}`above`. +``` + +The serial version of the program can be compiled and run by using the following commands on {ref}`betzy`. + +```bash +$ make serial +$ srun --ntasks=1 --account= --time=00:10:00 --mem-per-cpu=125M --partition=accel ./serial +``` + +We found that executing a `1280x720` pixels image with `10,000` iterations takes `8` seconds. + + +Let’s build upon this and start applying OpenMP directives to transform our serial code into a parallel code on the CPU. As we know, the potential parallelizable region is the code with the nested ‘for’ loop, as shown in the figure. + +```{eval-rst} +.. literalinclude:: ompoffload/mandelbrot_serial.c + :language: c++ + :lines: 121-135 + :emphasize-lines: 6-11 + +``` + +Let's start off by parallelizing the code on the CPU using regular OMP directives. We will also use this implementation to benchmark our OMP(GPU) code. The OMP(CPU) compilers are very well tested and optimized for high performance, in addition, it is very easy to use: we simply need to type `omp parallel` before the first 'for' loop. Introducing the `parallel` directive allows the compiler to spawn a team of threads; it won’t parallelize anything but executes the same code on multiple threads. + +On {ref}`betzy`, the execution time with the `parallel` directive takes 15 seconds, which is higher than the value obtained using the `serial` version of our code. + +We proceed by adding another directive: `parallel for`. This is the work-sharing construct that divides the work across the number of threads in a group. + +Here we go, with the use of the `parallel for` clause, the code takes 0.54 seconds to execute. The performance is improved by a factor of 20 compared to the serial case. + +[//]:# (It is also possible to fine-tune the `parallel for` by using the `omp schedule` clause, which can be used to determine how the iterations are spread out between the threads.) + +```{eval-rst} +.. literalinclude:: ompoffload/omptarget.c + :language: c++ + :lines: 2-14 + :emphasize-lines: 1,5 + +``` + +```{note} +Thread creation is an expensive operation, thread synchronization is also a point of concern which may deteriorate the overall performance. 
So, don’t let the compiler do all the optimizations for you; use OpenMP wisely. +``` + +Let’s move ahead with `omp offload`. + +OpenMP on GPU. +============== +In leveraging the high throughput capability of the GPUs, OpenMP 4.0 offers special constructs which take the compute-intensive tasks from the CPU, perform computation on the GPU and bring the computed result back to the CPU. The main motivation behind these heterogeneous architecture-specific constructs is to provide performance and portability in one place. A user doesn’t need to know the low-level accelerator-specific language before writing their device-specific code. Also, general-purpose and special-purpose code can be maintained in one place, thus the 'offloading' offloads some of the programmer’s work too. + +Accelerator-specific code starts with `#pragma omp target` directive. The region within the `target` scope is also called the target region. As soon as the host thread reaches the target region, a new target thread gets created. Before doing any computation, the required data must also be mapped to the target device. OpenMP uses the `map` keyword to transfer data `to` and `from` the GPUs. Devices have their own memory space where they can store variables and data. It is important to note that the host thread can’t access the device thread or the device data. Also, OpenMP executes the target region in a blocking step, which means the CPU threads wait for the execution of the target region to finish before continuing the execution on the CPU. However, it is possible to override the default blocking mechanism into a nonblocking one. It is also interesting that we can use regular OpenMP directives, like `parallel for`, within the target region. + +Now we have enough theory to get our feet wet with the `Offloading` code. + +For the OMP(GPU), we are going to use the `target` directive. The GPU/Accelerator-specific code is encompassed between the target region. The target region guarantees that the `target` directive takes your thread of execution to the target device. The target device could be another CPU, GPU, DSP, or FPGA; in a broader sense, the `target` directive takes the thread of execution and moves it elsewhere. Here one important thing to note is that the program starts executing on the host device, and when the `target` construct is hit by the main thread, a new thread is spawned, we call this thread `new initial thread`. This initial thread executes on the target device, and when this new initial thread hits a parallel construct within the target region, it becomes the master thread. It is something different from the regular OpenMP where there was only one initial thread. It is also important to note that the new initial thread may or may not run on the GPU, depending on the condition of whether the GPU support is available on the system or not. If there is no GPU available on the system then the `new initial thread` will run on the host device. To be sure if the thread is running on the GPU or CPU, one could use `omp_is_initial_device()`. The function returns false if the thread is running on the GPU. + +The minimal example with OMP(offload) is shown below. + +```{eval-rst} +.. literalinclude:: ompoffload/omptarget.c + :language: c++ + :lines: 17-27 + :emphasize-lines: 1 + +``` +```{note} +`Clang` compiler should be available in your {ref}`betzy` work-environment to compile/run the code, as it is mentioned {ref}`here`. Or simply copy/paste the following command on {ref}`betzy` terminal. 
+``` +```bash +$ module load Clang/11.0.1-gcccuda-2020b +``` + +In our case-study, we create a Makefile, which helps building the code in a simplified way using the `Make` command. To build the code with OMP(offload) support, one must type `make offload` on the terminal. The users have to make sure that all the required modules are loaded into the environment before building the code. + +The bare minimum offload-code takes 148 seconds, which is almost 14 times slower than the `serial` version of the code. This is because the code is running on only one thread on the GPU and is not exploiting the entire GPU-accelerator. Moreover, the time cost of the data transfer between CPU and GPU has also been added up to the computation cost. + +Let’s try to improve our code by adding `teams` construct. The `teams` construct starts a group of threads on the device, each group is called a "team" in the OpenMP terminology. This is somewhat similar to the `parallel` construct, but the main difference is that in the `teams` construct, each thread forms its team, that is a team of only one thread. Threads within different teams are restricted to a synchronisation with each other, that is, inter-team synchronisation is not possible; however, threads can synchronise within a team. + +(targetteams)= + +Now let’s try to run our code with `teams` construct. + +```{eval-rst} +.. literalinclude:: ompoffload/omptarget.c + :language: c++ + :lines: 29-39 + :emphasize-lines: 1 + +``` +Remember to rebuild the code before running. + +It turns out that introducing the concept of `teams` construct has increased the computation cost to 154 seconds compared to 148 seconds in the previous version. What could be the reason behind this degradation? Well, spawning threads involves some cost. Here, all threads are computing only the same thing, which causes the observed low performance. + +In regular OpenMP, when the main thread hits the parallel construct it becomes the master thread, and thus only one master thread gets created. As mentioned in the previous discussion, the `teams` construct forms a group of initial threads where each thread forms its team. When each initial thread hits the ‘parallel construct’ then it becomes the master thread of the team, and in this way multiple master threads get formed. Under the cooperation of the individual master team thread, each thread executes the parallel region simultaneously. + +Let’s put our multiple threads to work by applying the `parallel for` construct. + +Code example is shown below: + +```{eval-rst} +.. literalinclude:: ompoffload/omptarget.c + :language: c++ + :lines: 41-52 + :emphasize-lines: 1-2 + +``` + + Let’s rebuild and run the code. + + We see a significant improvement when using the `parallel for` construct. In this case, the code takes 2.3 seconds to run, which is 67 times faster than our previous version of the code. + + One more thing we can do to optimize the `parallel for` is to use the `collapse` clause. The `collapse` clause unrolls the nested loops into a single iteration space. We now try to test how this clause improves the performance of the code. + +```{eval-rst} +.. literalinclude:: ompoffload/omptarget.c + :language: c++ + :lines: 54-65 + :emphasize-lines: 1-2 + +``` + +It is found that the use of the `collapse` clause affects slightly the performance, and now the computation cost is reduced to 2.2 seconds. + +Can we get any better at improving our current code? Let’s try the `Distribute` construct. 
The `Distribute` construct is a work-sharing construct that distributes the iterations of the loop across the teams of threads; remember, {ref}`here` we started the teams of threads by using `target teams` construct ? + +It is also possible to schedule the loop iterations into chunks using the `Schedule` clause. + +Let’s try the final version of our code where we apply all improvements in one place. + +```{eval-rst} +.. literalinclude:: ompoffload/omptarget.c + :language: c++ + :lines: 67-77 + :emphasize-lines: 1 + +``` +The final version of our code takes 0.14 seconds for the computation, which is almost 16 times faster than our previous improvement. + +In our experiment, the first optimized OMP(CPU) code takes 0.54 seconds. We thus use it as a benchmark study to evaluate the performance of the OMP(GPU) code. So, the conclusion is that OMP(GPU) is almost 4 times faster than the OMP(CPU) version. + +We are still missing one major part of the OpenMP-Offload, and that is offloading the data to the device. To do the computation somewhere else, we would also require the data on which the computation is going to happen. Since OpenMP supports both ‘distributed’ and ‘shared’ memory architecture, implicit as well as explicit mapping of the variables is possible. In the ‘implicit’ mapping, the compiler decides which variables are going to be sent `to` or `from` the device, whereas in the ‘explicit mapping’, user must use `map` clause within the target region to explicitly map list variables ‘to’, ‘from’ device data environment. A mapped variable may either be in the shared or the distributed memory, and in some cases a copy is required which is determined by OpenMP implementation. +Note that once the data is moved to the device, the device owns it. And it is not possible to reference the data directly from the CPU. To access the device data one needs to bring the data back to the CPU from the device. + +After incorporating the `map` clause, our code looks like this : + +```{eval-rst} +.. literalinclude:: ompoffload/omptarget.c + :language: c++ + :lines: 79-92 + :emphasize-lines: 1 + +``` + +At this point, we conclude that GPUs are optimized for the `throughput` whereas CPUs are optimized for the `latency`. Therefore, to benefit from using GPUs, we must give enough tasks to process per unit time on the GPU. In our code example, for instance, we care more about pixels per second than the latency of any particular pixel. + +To highlight the benefit of using GPUs, we consider an example, in which the size of our input image is increased. As previously, we rerun the code on the GPU as well as on the CPU. + + +```bash +$ make omp +$ srun --account= --cpus-per-task=32 -c 32 --time=10:00 --mem-per-cpu=1G --qos=devel --partition=accel ./omp 8k 10000 +``` +The processing time on the CPU is 19.5 seconds. + +```bash +$ make offload +$ srun --ntasks=1 --time=10:00 --account= --mem-per-cpu=1G --partition=accel --gpus=1 ./offload 8k 10000 +``` +Processing time on the GPU is 0.27 seconds. + +Our numerical experiment shows that running the code on the GPU is 72 times faster than on the multi-core CPU. + +Summary of the execution times +========================== +```{note} +The benchmarking was performed on {ref}`saga` and not on {ref}`betzy`, and you may find a slight difference in the execution times on {ref}`betzy`. +``` + +Image Size | Iterations |OMP-Directive | CPU time in ms. | GPU time in ms. 
+-- | -- | -- | -- | -- +1280x720 | 10,000 | -- | 10869.028 | -- +1280x720 | 10,000 | `parallel` | 15025.200 | -- +1280x720 | 10,000 | `parallel for` | 542.429 | -- +1280x720 | 10,000 | `target`| -- | 147998.497 +1280x720 | 10,000 | `target teams` | -- | 153735.213 +1280x720 | 10,000 | `target teams parallel for` | -- | 2305.166 +1280x720 | 10,000 | `target teams parallel for collapse` | -- | 2296.626 +1280x720 | 10,000 | `target teams distribute parallel for collapse schedule` | -- | 143.434 +8K | 10,000 | `parallel for` | 19591.378 | -- +8k | 10,000 | `target teams distribute parallel for collapse schedule` | -- | 268.179 + + +Resources +========= + +The complete code is available in compressed format and can be downloaded from the given link. + +```{eval-rst} +:download:`mandelbrot_gpu.tar.gz ` + +``` + +One can download the given `tarball` file on his/her computer and copy it to {ref}`betzy` using `scp` command, as shown below. + +```bash +$ scp username@betzy.sigma2.no:/cluster/home/ +``` +`source directory` should be the absolute path of the downloaded `tarball` on your computer, and the target directory should be the directory where you want to keep and uncompress the `tarball`. + +To uncompress the `tarball` file, execute the following command on the terminal. + +```bash +$ tar -zxvf mandelbrot_gpu.tar.gz +``` + + + +Makefile +======== +For our sample code, we used `Makefile` to build. `Makefile` contains all the code that is needed to automate the boring task of transforming the source code into an executable. One could argue; why not `batch` script? The advantage of `make` over the script is that one can specify the relationships between the elements of the program to `make`, and through this relationship together with timestamps it can figure out exactly what steps need to be repeated to produce the desired program each time. In short, it saves time by optimizing the build process. + +A brief version of the `Makefile` is listed here. + +```{eval-rst} +.. literalinclude:: ompoffload/Makefile.txt + :language: make + :lines: 1-15 + +``` + +Compilation process +=================== + +We briefly describe the syntax of the compilation process with the Clang compiler to implement the OpenMP offload targeting NVIDIA-GPUs on {ref}`betzy` platform. The syntax is given below: + +```console +clang -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target=nvptx64-nvidia-cuda -march=sm_80 gpu_code.c +``` + +Here the flag `-fopenmp` activates the OpenMP directives (i.e. #pragma omp). The option `-fopenmp-targets` is used to enable target `offloading` to `NVIDIA-GPUs` and the `-Xopenmp-target` flag enables options to be passed to the target offloading toolchain. Last, the flag `-march` specifies the name of the `NVIDIA GPU` architecture. diff --git a/_sources/code_development/guides/openacc.md.txt b/_sources/code_development/guides/openacc.md.txt new file mode 100644 index 000000000..25c860fa2 --- /dev/null +++ b/_sources/code_development/guides/openacc.md.txt @@ -0,0 +1,495 @@ +--- +orphan: true +--- + +```{index} GPU; Getting started with OpenACC and Nvidia Nsight, OpenACC; Getting started with OpenACC and Nvidia Nsight, Nvidia Nsight; Getting started with OpenACC and Nvidia Nsight +``` + +(openacc)= + +# Getting started with OpenACC and Nvidia Nsight +[//]: # (pandoc -s -o out.html openacc.md) +> OpenACC is a user-driven directive-based performance-portable parallel +> programming model. +From the [OpenACC homepage](https://www.openacc.org). 
+ +In many ways OpenACC is similar to OpenMP, but with a focus on running the code +on accelerators (such as GPUs). OpenACC defines a set of directives (for both +`C/C++` and `Fortran`) that can be included in existing code to transition the +runtime to accelerators. + +Accelerators, like the Nvidia GPUs on Saga, are great for numerical calculations +and applications that work on the "SIMD" - **S**ingle **I**nstruction +**M**ultiple **D**ata principle, (where one or more operations are applied to a +large number of datapoints independently of each other). Examples include +operations like +[`gemm`](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms#Level_3) +which can be [**6 times** faster than on the +CPU](http://developer.download.nvidia.com/compute/cuda/6_5/rel/docs/CUDA_6.5_Performance_Report.pdf), +or generating random numbers which can be [**70 times** +faster!](http://developer.download.nvidia.com/compute/cuda/6_5/rel/docs/CUDA_6.5_Performance_Report.pdf) + +```{note} +If you know some OpenACC or want to see tips for larger applications take a look +at {ref}`the tip section ` at the bottom. +``` + +```{note} +We have also included a Fortran example at +{ref}`the end of this document `. +``` + +```{tip} +For a summary of available directives we have used [this reference +guide.](https://www.openacc.org/sites/default/files/inline-files/API%20Guide%202.7.pdf) +``` + +## Introduction +This guide will introduce the concept of OpenACC directives in `C/C++` code, how +to compile and run such programs on {ref}`saga` and how to +use [Nvidia Nsight](https://developer.nvidia.com/nsight-systems) to profile and +optimize code. + +After reading this guide you should: +- Know what OpenACC is +- Know how to compile `C/C++` OpenACC programs on Saga +- Know how to run OpenACC programs on GPUs on Saga +- Know how to run OpenACC programs with a profiler (`nsys`) on Saga +- Know how to understand the basic Nsight user interface +- Know how to optimize OpenACC programs based on profiler results + +## OpenACC +To begin we will need an example program that does some calculations that we +would like to speed up. + +We have selected an example based on heat dissipation utilizing Jacobi +iterations. The initial source can be found in `jacobi_serial.c`, shown below: + +```{eval-rst} +.. literalinclude:: openacc/jacobi_serial.c + :language: c +``` +```{eval-rst} +:download:`jacobi_serial.c <./openacc/jacobi_serial.c>` +``` + +### Compiling and running on Saga +To compile this initial version on Saga we will need to load the [`Nvidia HPC +SDK`](https://developer.nvidia.com/hpc-sdk). 
This can be done with the following +command: + +```bash +$ module load NVHPC/20.7 +``` + +```{note} +You can check if a newer version of `NVHPC` is available by issuing the command +`module avail NVHPC` +``` + +Then to compile or serial version we will invoke the `nvc` compiler with the +following command: + +```bash +$ nvc -g -fast -o jacobi jacobi_serial.c +``` + +We can run this program on a compute node by issuing the following: + +```bash +# Run on compute node with 512MB of memory for a maximum of 2 minutes +$ srun --account= --time=02:00 --mem-per-cpu=512M time ./jacobi +# The first number outputted should be the number of seconds it took to run the +# program: +# 40.79user 0.01system 0:40.91elapsed 99%CPU (0avgtext+0avgdata 35212maxresident)k +# 5144inputs+0outputs (18major+1174minor)pagefaults 0swaps +``` + +### Initial transition +To begin transitioning the code to run on a GPU we will insert the `kernels` +directive into the code. The `kernels` directive tells OpenACC that we would +like everything inside the directive to be run on the GPU, but it is up to the +compiler to decide how to best do this. + +It is always a good idea to begin with the `kernels` directive as that is the +easiest and it gives the compiler a lot of flexibility when translating the +code. `kernels` is also a good way to understand if the compiler is not able to +optimize something and if we need to rewrite some code to better run on the GPU. + +The code is available in `jacobi_kernels.c` and the changes applied are shown +below. + +```{eval-rst} +.. literalinclude:: openacc/jacobi_kernels.c + :language: c + :lines: 36-58 + :emphasize-lines: 3,4,21 +``` +```{eval-rst} +:download:`jacobi_kernels.c <./openacc/jacobi_kernels.c>` +``` + +As can be seen in the code above we have added the `kernels` directive around +the main computation that we would like to accelerate. + +To compile the above we need to tell `nvc` that we would like to accelerate it +on GPUs. This can be done with the `-acc` flag. We will also add the +`-Minfo=accel` flag which informs the compiler that we would like it to inform +us of what it is doing with accelerated regions. The full command is as follows. + +```bash +$ nvc -g -fast -acc -Minfo=accel -o jacobi jacobi_kernels.c +``` + +When running this command pay special attention to the information it is telling +us about the accelerated region. + +```bash +main: + 40, Generating implicit copyin(array[:][:]) [if not already present] + Generating implicit copyout(array[1:1998][1:1998]) [if not already present] + Generating implicit copy(error) [if not already present] + Generating implicit copyout(arr_new[1:1998][1:1998]) [if not already present] + 42, Loop is parallelizable + 43, Loop is parallelizable + Generating Tesla code + 42, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */ + Generating implicit reduction(max:error) + 43, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */ + 52, Loop is parallelizable + 53, Loop is parallelizable + Generating Tesla code + 52, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */ + 53, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */ +``` + +In the above output the numbers corresponds to line numbers in our +`jacobi_kernels.c` source file and the comments show what `nvc` intends to do +with each line. + +Before we start profiling to see what we can optimize, lets run the program to +learn the additional `Slurm` parameters needed for running with GPU on Saga. 
The +following is the new command needed (notice the added `--partition=accel` and +`--gpus=1` flags) + +```bash +$ srun --account= --time=02:00 --mem-per-cpu=512M --partition=accel --gpus=1 time ./jacobi +``` + +`--partition=accel` is needed to tell `Slurm` to only run on nodes on Saga with +GPUs and the `--gpus=N` line tells `Slurm` that we would like to have access +to `N` GPUs (`accel` nodes on Saga have `4` separate GPUs, above we are asking +for only one GPU). + +### Profiling +To profile the `kernels` version of our program we will here transition to +{ref}`job-scripts`. This will make it a bit easier to +make changes to how the program is run and also makes it a bit more +reproducible. + +The `Slurm` script is available as `kernels.job` and is show below. + +```{eval-rst} +.. literalinclude:: openacc/kernels.job + :language: bash +``` +```{eval-rst} +:download:`kernels.job <./openacc/kernels.job>` +``` + +Run this script by issuing + +```bash +$ sbatch kernels.job +``` + +The end result should be a file called `kernels.qdrep` which contains the +profiling information. Download this file to your local computer to continue +with this guide. + +```{eval-rst} +:download:`kernels.qdrep <./openacc/kernels.qdrep>` +``` + +## Nsight +We will continue this guide `kernels.qdrep` as the profiling result to view. + +```{note} +To view images in a larger format, right click and select `View Image` +``` + +To begin, start [Nsight Systems](https://developer.nvidia.com/nsight-systems) on +your own machine, giving the following view. + +![Empty window of Nsight Systems](openacc/nsight_blank.png) + +To open our profiling result click `File`, then `Open` and navigate to the +folder where you stored `kernels.qdrep`. Loading this file should give you the +following view. + +![Nsight showing timeline view of profile](openacc/nsight_timeline.png) + +### User interface +The user interface of Nsight is comprised of three main areas and two drop down +menus that control what is shown in the different areas. + +![Nsight annotated to describe user interface](openacc/nsight_annotated.png) + +1. On the left we find the project area, this list shows your project and + profiles that you have loaded. +2. The left topmost dropdown menu selects which view to show +3. In the middle of the user interface we find the main view, currently showing + the timeline of our profile. This view changes depending on the choice made + in the dropdown menu marked with a `2`. +4. The second dropdown, in the middle of the screen, selects different views for + the bottommost area. +5. The area at the bottom shows additional information about the profile + together with the timeline view. + +### Views +Using the topmost dropdown menu, marked with `2` in the picture above, we can +select different views for the current profile. + +When first opening a new profile it can be informative to start with the +`Diagnostics Summary`. This view shows a summary of the profile and can give +great hints about what went wrong if the profile is not as expected. + +![Nsight diagnostics summary view](openacc/nsight_diagnostics.png) + +After that the `Analysis Summary` give an overview of the profile. This view +contains a lot of information which can be nice to review to ensure that the +profile was configured correctly. 
Instances of good places to review are the +`CLI command used` which shows how the profile was generated, `GPU info` which +shows the accelerator in use and the `Analysis options` which show how `nsys` +interpreted the command line arguments. + +![Nsight analysis summary view](openacc/nsight_analysis.png) + +The last view that we will detail here (because the two remaining are not that +informative for understanding the profile information) is the `Timeline View`, +which is the default view that we saw when we opened Nsight. + +A good place to start with this view is the second dropdown, marked with `4`. In +this dropdown we can select additional information to display about our profile +results. By selecting one of the different `... View` options the profiler can +show us which functions used what amount of the runtime in different ways. In +the image below we have selected `Bottom-Up View` which sorts functions by +placing the most time consuming ones at the top. + +![Nsight Bottom-Up view](openacc/nsight_bottom_up.png) + +In the timeline main view, we can see the usage of different APIs and the amount +of CPU and GPU usage. A quick first thing to do is to click the arrow next to +our GPU name so that it shows the percentage of `Kernels` usage and the +percentage of `Memory` usage. In our current profile we can see that we are only +using about `6%` of the `Kernels` resource which means that our GPU is spending +only `6%` of its time actually doing useful compute. + +![Nsight focused on the timeline view](openacc/nsight_timeline2.png) + +To better understand what we are seeing in the timeline it is useful to zoom +into specific areas to see what is going on. Use the mouse cursor to select a +small column of the timeline area, right click and select `Zoom into selection`. +Depending on how long the profile ran for it can be necessary doing this several +times. Below we have tried to illustrate how far we would usually zoom in. + +![Nsight initial zoom](openacc/nsight_zoom1.png) +![Nsight final zoom](openacc/nsight_zoom2.png) +![Nsight final view zoomed in](openacc/nsight_zoom3.png) + +In the last picture above we have zoomed in on what appears to be a cycle of two +kernel launches. Remembering our code, that is most likely two iterations of the +`while` loop where we placed our `kernels` directive inside. + +## Profile Guided Optimization +Even though we have translated our program to run on the GPU it has not yet +given us the results that we were after. Running on GPU resulted in a +computation that is about `1.5` times slower than just running on CPU, but we +can do better. + +Looking at the zoomed in view of the timeline, in the image below, we can see +that most of the time is taken up with transferring data between the CPU and the +GPU. + +![Nsight final view zoomed in, repost from above](openacc/nsight_zoom3.png) + +Optimizing data transfer is a crucial part of translating code to the GPU and +accounts for most of the time spent optimizing a program for the GPU. + +Looking at our `while` loop we can see that we are only interested in the final +result after the loop exits which means that we should try to keep the data on +the GPU and only transfer in and out at the beginning and end of the loop. To do +this we will introduce the `#pragma acc data` clause which tells the compiler +that we only want to do data movement for a given scope. The changes needed +center around the `while` loop shown below. + +```{eval-rst} +.. 
literalinclude:: openacc/jacobi_data.c + :language: c + :lines: 36-59 + :emphasize-lines: 1 +``` +```{eval-rst} +:download:`jacobi_data.c <./openacc/jacobi_data.c>` +``` + +Let us compile this on Saga and see if this results in better performance. +Compile and run with the following commands. + +```bash +# Remember 'module load NVHPC/20.7' when logging in and out +$ nvc -g -fast -acc -Minfo=accel -o jacobi jacobi_data.c +$ sbatch kernels.job +``` + +Below we have included the timeline view of the updated +profile. + +```{eval-rst} +:download:`data.qdrep <./openacc/data.qdrep>` +``` + +![Nsight timeline after adding data movement +directives](openacc/nsight_after_data.png) + +Although this doesn't look all that different from the previous profiles, notice +that the timeline only goes to about `3.6` seconds, the previous profile went to +above `70` seconds. Almost a `20x` speedup! Compared to our runs on the CPU this +translation to the GPU has given us about a `10x` speedup. This shows the +importance of data movement and is a good illustration of the optimization +process, initially the code ran much slower on the GPU than on the CPU before +becoming better than the CPU. + +--- + +Doing better than this will be difficult, however, to introduce a few more +concept that can be nice - we will perform a few more iterations on the code. +However, do not expect great improvements. + +The first improvement that we can do is to realize that `arr_new` will never be +needed and is simply a scratch array for our computation, we can thus change our +data directive to `#pragma acc data copy(array) create(arr_new)`. This tells the +compiler that it should copy `array` from the CPU to the GPU when entering the +loop and copy the data back from the GPU to CPU when exiting the scope. The +`create(arr_new)` tells the compiler to only create the data on the GPU, it will +not copy anything in or out, which is ok for us since we will overwrite it on +first loop anyway and never use it after the loop. + +The above optimization will net us very little so lets do some more. Instead of +using the `kernels` directive we can take more control of the translation and +tell the compiler that we would like to parallelize both loops. This is done +with the `#pragma acc parallel loop` directive. Since we also want to do a +reduction across all loops we can also add a reduction by writing `#pragma acc +parallel loop reduction(max:error)` to the first loop. Lastly, we will apply the +`collapse(n)` clause to both loop directives so that the compiler can combine +the two loops into one large one, with the effect of exposing more parallelism +for the GPU. The new code is show below. + +```{eval-rst} +.. literalinclude:: openacc/jacobi_optimized.c + :language: c + :lines: 36-56 + :emphasize-lines: 1, 4, 14 +``` + +```{eval-rst} +:download:`jacobi_optimized.c <./openacc/jacobi_optimized.c>` +``` + +Looking at the generated profile, `optimized.qdrep` shown below, we can see that +we managed to eek out slightly more performance, but not that much. + +```{eval-rst} +:download:`optimized.qdrep <./openacc/optimized.qdrep>` +``` + +![Nsight timeline of final optimized +profile](openacc/nsight_final_optimized.png) + +Compared to the initial translation we can see now that the ratio of `Kernels` +to `Memory` on the GPU is much better, `98%` spent actually doing useful +compute. + +If we zoom in, as in the image below, we can see that there is not much wasted +time between useful compute. 
Going further with OpenACC is most likely not that +useful and getting this to run even quicker will likely require a rewrite to +`CUDA` which is outside the scope and intention of this guide. + +![Nsight timeline zoomed in on the optimized +profile](openacc/nsight_optimized_zoom.png) + +One way to see how well we have optimized the code is to look at the white space +between compute regions. In our initial translation these white spaces lasted +for around `4` milliseconds. In the optimized profile the whitespace between +kernels amount to around `32` **microseconds**. + +## Summary +In this guide we have shown how to use OpenACC to transition a simple `C` +example from running on the CPU to running the main calculations on GPU. We have +detailed how such code can be used, compiled and run on Saga. And, we have +introduced Nsight and how it can be used to profile and guide OpenACC +transitions. + +(tips)= +## Tips +- **Do not expect miracles!** Translating a large code base to run on GPU is a + large undertaking and should not be taken lightly. Just getting a large code + base to run on GPU and having almost the same performance as the CPU code is + extremely good! Optimizing for GPUs require time and patience. +- Always start with the `kernels` directive and study the compiler output. This + should guide your next steps. The information outputted by the compile will + usually tell you if the scope of the directive can be run effectively on GPU + and if you should take some steps to rewrite parts of the code. + + Compiler output like `Loop carried dependence of prevents + parallelization` and `Loop carried backward dependence of prevents + vectorization` are clear indications that the compiler is not able to + automatically translate the code and a rewrite might be necessary. +- Data movement is paramount. If you know some data is only needed to read from + use `copyin`, `copyout` if it is only written to, `present` can be nice if + you know the data should already be present on the GPU and `copy` ensures + that the program functions as expected. OpenACC has several directive that + can be used to perform data management and some are even scoped for the + entire program. +- Be structured in your approach. Only translate one scope at a time. This + ensures that you can focus on a small area and get less compiler output to + study. Profiling between each round may not be necessary, but it can be + valuable to know what is happening. + +(fortran)= +## Fortran +As mentioned in the beginning of this document, OpenACC also supports `Fortran`. +Directives in Fortran can be added in a similar fashion to OpenMP directives, +with `!$acc` instead of `!$OMP`. Below is an example of matrix multiplication +with the `!$acc kernels` directive. + +```{eval-rst} +.. literalinclude:: openacc/mxm.f90 + :language: fortran + :emphasize-lines: 12, 20 +``` +```{eval-rst} +:download:`mxm.f90 <./openacc/mxm.f90>` +``` + +On Saga, load the `NVHPC/20.7` module and compile with `nvfortran` as follows: +```bash +$ module load NVHPC/20.7 +$ nvfortran -o mxm -fast -acc -gpu=cc60 -Minfo=accel mxm.f90 +``` + +To run the program on Saga with GPUs use: +```bash +$ srun --account= --time=02:00 --mem-per-cpu=512M --partition=accel --gpus=1 ./mxm +``` + +This program is as close to the best case scenario possible for accelerators +and, on Saga, gives a speedup of `24x` compared to a single CPU core. 
+ +| Flags | Run time | Speedup | +|:------------------------:|:-------------:|:--------:| +| `-fast` (single CPU core)| 48.9 seconds | 1 x | +| `-fast -acc -gpu=cc60` | 2.0 seconds | 24 x | + +You can profile `Fortran` programs in the same way you would for `C/C++`, using +`nsys profile` and the flag `-t cuda,openacc`. diff --git a/_sources/code_development/guides/openacc_mpi.md.txt b/_sources/code_development/guides/openacc_mpi.md.txt new file mode 100644 index 000000000..43c436bd0 --- /dev/null +++ b/_sources/code_development/guides/openacc_mpi.md.txt @@ -0,0 +1,327 @@ +--- +orphan: true +--- + +```{index} MPI; Combining MPI and OpenACC, OpenACC; Combining MPI and OpenACC, GPU; Combining MPI and OpenACC, Nvidia Nsight; Combining MPI and OpenACC, Multi-GPU; Combining MPI and OpenACC +``` +# Combining MPI and OpenACC +A lot of existing HPC code is already set up to take advantage of multi-core and +cluster compute resources through `MPI`. When translating a codebase to OpenACC +we can take advantage of this existing infrastructure by giving each `rank` its +own `GPU` to achieve multi-GPU compute. This technique is most likely the easiest +path to utilizing multi-GPU for existing and new projects working with OpenACC. + +```{note} +The alternative to combining `MPI` and OpenACC is to divide the work into +blocks, as we show in the [asynchronous and multi-GPU +guide](./async_openacc.md), however, with out combining such a technique with +`MPI`, sharing is limited to a single node. +``` + +```{tip} +For a summary of available directives we have used [this reference +guide.](https://www.openacc.org/sites/default/files/inline-files/API%20Guide%202.7.pdf) +``` + +## Introduction +This guide will assume some familiarity with `MPI` and an [introductory +level](./openacc.md) of knowledge about OpenACC. + +After reading this guide you should be familiar with the following concepts + - How to take an existing `MPI` application and add OpenACC + - How to go from an initial `CPU`-only implementation to gradually adding + OpenACC directives + - How to share data between `CPU`, `GPU` and other `rank`s + - How to assign the correct GPU based on `rank` and nodes + - How to profile a combined `MPI` and OpenACC application + +--- + +For this guide we will solve the 1 dimensional wave equation, shown below with +`MPI` task sharing. + +```{eval-rst} +.. literalinclude:: openacc_mpi/wave_mpi.c + :language: c +``` + +```{eval-rst} +:download:`wave_mpi.c <./openacc_mpi/wave_mpi.c>` +``` + +To compile this on Saga we will load `OpenMPI` and compile with the built-in `MPI` +compiler. + +```bash +$ module load OpenMPI/4.0.3-PGI-20.4-GCC-9.3.0 +$ mpicc -g -fast -o mpi wave_mpi.c +``` + +To run this with multiple `rank`s, e.g. split the work over `4` processes, use +the following command +```bash +$ srun --ntasks=4 --account= --time=02:00 --mem-per-cpu=512M time ./mpi 1000000 +``` + +## Introducing OpenACC +When starting the transition of an `MPI` enabled application, like the above, it +is imperative to reduce complexity so as to not get overwhelmed by the +transition. We will therefore introduce OpenACC to the program by running it as +one process and once we are done with the OpenACC translation, add the necessary +setup for multi-GPU utilization. + +```{tip} +Try to follow the next sections by implementing them yourself before you see our +solution. This will increase your confidence with OpenACC. 
+``` + +### Adding `parallel loop` directives +There are several places where we could put directives in this code, however, to +keep this manageable we will focus on the main computational area of the code. +Lets therefore start with the following three loops. + +```{eval-rst} +.. literalinclude:: openacc_mpi/wave_mpi.c + :language: c + :lines: 179-222 + :emphasize-lines: 6-11, 14-18, 40-43 +``` + +Looking at the three loops we can see that every iteration is independent of +every other iteration and it is thus safe to add `#pragma parallel loop` before +each loop. + +```{eval-rst} +.. literalinclude:: openacc_mpi/wave_loop.c + :language: c + :lines: 179-225 + :emphasize-lines: 6, 15, 42 +``` + +```{eval-rst} +:download:`wave_loop.c <./openacc_mpi/wave_loop.c>` +``` + +To compile this on Saga we use the same command as above, adding `-acc` and +`-Minfo=accel` + +```bash +$ mpicc -g -fast -acc -Minfo=accel -o acc wave_loop.c +``` + +To test the above code use `srun` as above, but do not ask for multiple tasks. +We also need to request GPU resources with `--partition=accel` and +`--gpus=1`. + +```bash +$ srun --ntasks=1 --partition=accel --gpus=1 --account= --time=02:00 --mem-per-cpu=512M time ./acc 1000000 +``` + +The code runs on the GPU, but it is not particularly fast. The reason for this +is that we are now continually copying memory in and out of the GPU. If we look +at the main computation we can see that, apart from sharing two elements of the +array with the other `rank`s, we don't need to work on the data on the `CPU`. + +#### Checking with Nsight +To see the problem visually, we can use [Nvidia +Nsight](https://developer.nvidia.com/nsight-systems) to profile the application. +We will simply change the invocation of `time` with `nsys` as follows. + +```bash +$ srun --ntasks=1 --partition=accel --gpus=1 --account= --time=02:00 --mem-per-cpu=512M nsys profile -t cuda,openacc,osrt -o openacc_mpi_tutorial ./acc 1000000 +``` + +This will create an `openacc_mpi_tutorial.qdrep` profile that we can download to +our local machine and view in Nsight Systems. + +![Screenshot of Nvidia Nsight showing kernel and memory usage of our +`wave_loop.c` program](./openacc_mpi/wave_loop_profile.png) + +```{note} +Right click on the image and select `View Image` to see larger version. +``` + +```{tip} +If you have not used Nsight before we have an [introductory tutorial +available](./openacc.md). +``` + +As we can see, in the image above, the `Kernels` to `Memory` ratio is quite one +sided, confirming our suspicion that we are spending too much time transferring +data, and not enough time computing. + + +### Improving data locality +To improve data locality we need to know which pieces of information, e.g. which +arrays, are important to have on `GPU` and those we don't need to transfer. +Taking a step back and thinking about the code we can see that `wave0` and +`wave2` are only used for scratch space while the end result ends up in `wave1`. +In addition we can see that this holds true, except for variable sharing with +`MPI` - which we will come back to below, for the whole `steps` loop. + +Lets add a `#pragma acc data` directive above the `steps` loop so that data is +contained on the `GPU` for the whole computation. Since we have some data in +`wave0` we will mark it as `copyin`, we need the data in `wave1` after the loop +as well so we mark it as `copy` and `wave2` we can just create on the `GPU` +since it is only used as scratch in the loop. + +```{eval-rst} +.. 
literalinclude:: openacc_mpi/wave_data.c + :language: c + :lines: 178-200 + :emphasize-lines: 2-3 +``` + +```{eval-rst} +:download:`wave_data.c <./openacc_mpi/wave_data.c>` +``` + +```{note} +We had to remove the `check_mpi` calls in the region covered by the `#pragma acc +data` since it is not allowed to exit out of a compute region with `return`. +``` + +Compile and run to see if we get any improvements. + +```bash +$ mpicc -g -fast -acc -Minfo=accel -o acc wave_data.c +$ srun --ntasks=1 --partition=accel --gpus=1 --account= --time=02:00 --mem-per-cpu=512M time ./acc 1000000 +``` + +The above runs quite a bit faster, but we have one problem now, the output is +wrong. This is due to the fact that we are now not sharing any data between the +`GPU` and the `CPU`. To improve this we will introduce the `#pragma acc update` +directive. + +````{tip} +Use `nsys` and Nvidia Nsight to convince +yourself that the above stated improvement actually takes place. +```{eval-rst} +:download:`Nsight updated screenshot <./openacc_mpi/wave_data_profile.png>` +``` +```` + +### There and back again - `GPU <=> CPU` +As we just saw, we are missing some data on the `CPU` to initiate the `MPI` +transfer with. To remedy this we will add the `#pragma acc update` directive. +This directive tells the compiler to transfer data to or from `GPU` without an +associated block. + +First we will copy data from the `GPU` back to the `CPU` so that our `MPI` +transfer can proceed. In the code below notice that we have added `acc update +self(...)`. `self` in this context means that we want to transfer from `GPU` to +`CPU`. + +```{eval-rst} +.. literalinclude:: openacc_mpi/wave_acc.c + :language: c + :lines: 197-222 + :emphasize-lines: 10-11 +``` + +The `MPI` transfer will transmit the correct data, which is good, but we still +have a problem in our code. After the `MPI` transfer the points we received from +the other `rank`s are not updated on the `GPU`. To fix this we can add the same +`acc update` directive, but change the direction of the transfer. To do this we +change `self` with `device` as follows. + +```{eval-rst} +.. literalinclude:: openacc_mpi/wave_acc.c + :language: c + :lines: 220-232 + :emphasize-lines: 5-6 +``` + +We have made a few more improvements to the overall code to more fairly compare +with the pure `MPI` solution. See the `wave_acc.c` file below for additional +improvements. + +```{eval-rst} +:download:`wave_acc.c <./openacc_mpi/wave_acc.c>` +``` + +Compile and run with the following, as usual. + +```bash +$ mpicc -g -fast -acc -Minfo=accel -o acc wave_acc.c +$ srun --ntasks=1 --partition=accel --gpus=1 --account= --time=02:00 --mem-per-cpu=512M time ./acc 1000000 +``` + +## Splitting the work over multiple `GPU`s +We are almost done with our transition to OpenACC, however, what happens if we +launch the above `wave_acc.c` with two `rank`s on the same node. From the +perspective of the batch system we will be allocated two `GPU`s, two `CPU` cores +and a total of `512M+512M` memory. However, our two `MPI` processes do not +specify the `GPU` to use and will utilize the default `GPU`. Since they are +running on the same node, that will likely be the same `GPU`. + +To fix this we will read in our local `rank`, which is exported under OpenMPI as +`OMPI_COMM_WORLD_LOCAL_RANK`, then we can use this to get the correct index to +the `GPU` to use. 
We need to add `#include ` so that we can access +the Nvidia runtime and have access to `acc_set_device_num()` which we can use to +assign a unique `GPU` to each `MPI` process. + +```{eval-rst} +.. literalinclude:: openacc_mpi/wave_acc_mpi.c + :language: c + :lines: 91-121 + :emphasize-lines: 21-31 +``` + +```{eval-rst} +:download:`wave_acc_mpi.c <./openacc_mpi/wave_acc_mpi.c>` +``` + +We will compile this as before, but now we can run with arbitrary number of +processes! + +```{note} +When using multiple tasks ensure that each task gets a dedicated GPU with the +`--gpus-per-task=N` flag. +``` + +```bash +$ mpicc -g -fast -acc -Minfo=accel -o acc wave_acc_mpi.c +$ srun --ntasks=2 --partition=accel --gpus-per-task=1 --account= --time=02:00 --mem-per-cpu=512M time ./acc 1000000 +``` + +## Summary +We have shown how to take an existing `MPI` application and add OpenACC to +utilize multi-GPU resources. To accomplish this, we added directives to move +compute from `CPU` to `GPU`. To enable synchronization we also added directives +to move small amounts of data back and fourth between the `GPU` and `CPU` so +that we could continue to exchange data with neighboring `MPI` `rank`s. + +### Speedup +Below is the runtime, as measured with `time -p ./executable` (extracting +`real`), of each version. The code was run with `1200000` points to solve. + +| Version | Time in seconds | Speedup | +| ------- | --------------- | ------- | +| `MPI` `--ntasks=1` | `14.29`| N/A | +| `MPI` `--ntasks=12`\* | `2.16` | `6.61x` | +| `MPI` `--ntasks=2` + OpenMP `--cpus-per-task=6`\* | `2.11` | `1.02x` | +| `MPI` `--ntasks=2` + OpenACC | `2.79` | `0.76x` | +| OpenACC\*\* | `2.17` | `1.28x` | +**\*** To keep the comparison as fair as possible we compare the `CPU` resources +that would be the equivalent to [the billing resources of 2 `GPU`s on +Saga](../../jobs/projects_accounting.md). + +**\*\*** OpenACC implementation on a single `GPU` further optimized when no +`MPI` sharing is necessary. + +### Scaling +To illustrate the benefit of combining OpenACC with `MPI` we have, in the image +below, compared three different versions of the solver on increasingly larger +input. + +![Scaling of different inputs](./openacc_mpi/wave_scaling.svg) + +From the figure we can see that on the example input, `1200000`, `MPI` combined +with OpenMP is the quickest. However, as we scale the input size this `CPU` +version becomes slower compared to the `GPU`. The figure also illustrates the +advantage of a single `GPU`. We recommended, if possible, to use one `GPU` when +starting the transition to OpenACC and if the data size is larger than a single +`GPU` continue with `MPI`. If the application already utilizes `MPI` then we +recommend that, when using OpenACC, each `rank` is given more work than in the +pure `CPU` implementation. diff --git a/_sources/code_development/guides/pytorch_profiler.md.txt b/_sources/code_development/guides/pytorch_profiler.md.txt new file mode 100644 index 000000000..631a861b3 --- /dev/null +++ b/_sources/code_development/guides/pytorch_profiler.md.txt @@ -0,0 +1,377 @@ +--- +orphan: true +--- + +(pytochprofiler)= +# Profiling GPU-accelerated Deep Learning + +We present an introduction to profiling GPU-accelerated Deep Learning (DL) models using [PyTorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html). Profiling is a necessary step in code development, as it permits identifying bottlenecks in an application. This in turn helps optimize applications, thus improving performance. 
+ +This introduction is limited to profiling DL-application that runs on a single GPU. By the end of this guide, readers are expected to learn about: + +- Defining the concept and the architecture of PyTorch Profiler. +- Setting up PyTorch profiler on an HPC system. +- Profiling a PyTorch-based application. +- Visualizing the output data on a web browser with the Tensorboard plugin, in particular, the metrics: + - GPU usage + - GPU Kernel view + - Memory view + - Trace view + - Module view + +```{contents} Table of Contents +``` + +(profiler)= +## What is PyTorch Profiler +In general, the concept of profiling is based on statistical sampling, by collecting data at a regular time interval. Here, a profiler tool offers an overview of the execution time attributed to the instructions of a program. In particular, it provides the execution time for each function; in addition to how many times each function has been called. Profiling analysis helps to understand the structure of a code, and more importantly, it helps to identify bottlenecks in an application. Examples of bottlenecks might be related to memory usage and/or identifying functions/libraries that use the majority of the computing time. + +PyTorch Profiler is a profiling tool for analyzing Deep Learning models, which is based on collecting performance metrics during training and inference. The profiler is built inside the PyTorch API (cf. {ref}`Fig 1`), and thus there is no need for installing additional packages. It is a dynamic tool as it is based on gathering statistical data during the running procedure of a training model. + +```{eval-rst} + +.. _fig-arch-profiler: + +.. figure:: pytorch_profiler/Figs/fig00.png + :width: 600px + :align: center + + Fig 1: A simplified version of the architecture of PyTorch Profiler. A complete picture of the architecture can be found [here](#https://www.youtube.com/watch?v=m6ouC0XMYnc&ab_channel=PyTorch) (see the slide at 23:00 min). + +``` + +As shown in the figure, the PyTorch API contains a Python API and a C++ API. For simplicity we highlight only the necessary components for understanding the functionality of PyTorch profiler, which integrates the following: (i) aTen operators, which are libraries of tensor operators for PyTorch and are GPU-accelerated with CUDA; (ii) Kineto library designed specifically for profiling and tracing PyTorch models; and (iii) LibCUPTI (CUDA Profiling Tool Interface), which is a library that provides an interface for profiling and tracing CUDA-based application (low-level profiling). The last two libraries provide an interface for collecting and analyzing the performance data at the level of GPU. + +Here we list the performance metrics provided by the profiler, which we shall describe in {ref}`Section`: +- GPU usage +- Tensor cores usage (if it is enabled) +- GPU Kernel view +- Memory view +- Trace view +- Module view + +Further details are provided in these [slides](https://github.com/HichamAgueny/Profiling-GPU-accelerated-DL). + +(setup-pytorch-profiler-in-an-hpc-system)= +## Setup Pytorch profiler in an HPC system +In this section, we describe how to set up PyTorch using a singularity container. + +- **Step 1**: Pull and convert a docker image to a singularity image format: +e.g. from the [NVIDIA NGC container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) + +Note that when pulling docker containers using singularity, the conversion can be quite heavy and the singularity cache directory in `$HOME` space becomes full of temporary files. 
To speed up the conversion and avoid storing temporary files, one can first run these lines: + +```console +$ mkdir -p /tmp/$USER +$ export SINGULARITY_TMPDIR=/tmp/$USER +$ export SINGULARITY_CACHEDIR=/tmp/$USER +``` +and then pull the container + +```console +$singularity pull docker://nvcr.io/nvidia/pytorch:22.12-py3 +``` + +- **Step 2**: Launch the singularity container + +```console +$singularity exec --nv -B ${MyEx} pytorch_22.12-py3.sif python ${MyEx}/resnet18_api.py +``` +Here the container is mounted to the path `${MyEx}`, where the Python application is located. An example of a Slurm script that launches a singularity container is provided in the {ref}`Section`. + +(case-example-profiling-a-resnet-18-model)= +## Case example: Profiling a Resnet 18 model +We consider the Resnet 18 model as an example to illustarte profiling with [PyTorch profiler](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html). Here we list the lines of code required to enable profiling with [PyTorch Profiler](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html) + +``` +with torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA], + schedule=torch.profiler.schedule( + wait=1, + warmup=1, + active=2), + on_trace_ready=torch.profiler.tensorboard_trace_handler(‘./out', worker_name=‘profiler'), + record_shapes=True, + profile_memory=True, + with_stack=True +) as prof: +``` +To be incorporated just above the training loop +```python +#training step for each batch of input data + for step, data in enumerate(trainloader): + . + . + . + . + if step +1>= 10: + break + prof.step() +``` + +Here is a code example of the Resnet18 model, in which profiling is enabled. The code is adapted from the [PyTorch tutorial](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html). + +```{eval-rst} +.. literalinclude:: pytorch_profiler/resnet18_with_profiler_api.py + :language: python + :lines: 1-63 + :emphasize-lines: 34-59 + :linenos: + +``` + +For reference, we provide here the same application but without enabling profiling. The code is adapted from the [PyTorch tutorial](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html). + +```{eval-rst} +.. literalinclude:: pytorch_profiler/resnet18_without_profiler_api.py + :language: python + :lines: 1-44 + :emphasize-lines: 34-40 + :linenos: + +``` + +In the lines of code defined above, one needs to specify the [setting for profiling](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html). The latter can be split into three main parts: +- Import `torch.profiler` +- Specify the profiler context: i.e. which kind of **activities** one can profile. e.g. CPU activities (i.e. `torch.profiler.ProfilerActivity.CPU`), GPU activities (i.e. `torch.profiler.ProfilerActivity.CUDA`) or both activities. +- Define the **schedule**; in particular, the following options can be specified: + —*wait=l*: Profiling is disabled for the first `l` steps. This is relevant if the training takes a longer time, and that profiling the entire training loop is not desired. Here, one can wait for `l` steps before the profiling gets started. + + —*warmup=N*: The profiler collects data after N steps for tracing. + + —*active=M*: Events will be recorded for tracing during the active steps. This is useful to avoid tracing a lot of events, which might cause issues with loading the data. 
+ +- Additional options: Trace, record shape, profile memory, with stack, could be enabled. + +Note that, in the `for loop` (i.e. *the training loop*), one needs to call the profile step (`prof.step()`), in order to collect all the necessary inputs, which in turn will generate data that can be viewed with the Tensorboard plugin. In the end, the output of profiling will be saved in the `/out` directory. + +Note that a good practice of profiling should be based on the following: first one can start profiling for a large training loop, and once we identify the bottleneck, then we can select a few iterations for re-profiling and tuning the application. This should be followed by optimising the application and eventually re-profiling to check the impact of the optimisation. + +(visualisation-on-a-web-browser)= +### Visualization on a web browser +To view the output data generated from the profiling process, one needs to install TensorBord. This can be done for instance in a virtual environment. Here we desccribe a step-by-step guide of the installation: + +- **Step 1**: Load a Python model, create and activate a virtual environment. +Load a Python module. e.g.: `module` load python/3.9.6-GCCcore-11.2.0` +- `mkdir Myenv` +- `python –m venv Myenv` +- `source Myenv/bin/activate` + +- **Step 2**: Install TensorBoard Plugin via pip wheel packages using the following command (see also [here](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html)): +- `python –m pip install torch_tb_profiler` + +- **Step 3**: Run Tensorboard using the command: + +```console +tensorboard --logdir=./out --bind_all +``` +This will generate a local address having a specific registered or private port, as shown in {ref}`Figure`. Note that in HPC systems, direct navigation to the generated address is blocked by firewalls. Therefore, connecting to an internal network from outside can be done via a mechanism called [local port forwarding](https://www.ssh.com/academy/ssh/tunneling-example#local-forwarding). As stated in the [SSH documentation](https://www.ssh.com/academy/ssh/tunneling-example#local-forwarding) “Local forwarding is used to forward a port from the client machine to the server machine”. + +The syntax for local forwarding, which is configured using the option `–L`, can be written as, e.g.: + +```console +ssh -L 6009:localhost:6006 username@server.address.com +``` +This syntax enables opening a connection to the jump server `username@server.address.com`, and forwarding any connection from port 6009 on the local machine to port 6006 on the server `username@server.address.com`. + +Lastly, the local address `http://localhost:6009/` can be viewed in a Chrome or Firefox browser. + + +### On Saga cluster + +Here is an example about viewing data using `TensorBoard` on Saga. We assume that TensorBoard plugin is installed in a virtual environment, which we name `Myenv` as described above. Here are main steps: + +**Step 1**: Source the virtual environment +```console +$source Myenv/bin/activate +``` + +**Step 2**: Run the tensorboard command +```console +$tensorboard --logdir=./out --bind_all` +``` +Note that the profiled data are stored in the `out` folder. Running the command prints out a message that includes +```console +... +... +$TensorBoard 2.13.0 at http://login-3.saga.sigma2.no:6006/ +... +``` +The output message contains the address of the current login node, which is in *our case* `login-3.saga.sigma2.no`. This address will be used as a jump server as expressed in the next step. 
+ +**Step 3**: In a new terminal, run this command + +```console +ssh -L 6009:localhost:6006 username@login-3.saga.sigma2.no +``` +Note that the port number `6006` is taken form the address `login-3.saga.sigma2.no:6006`. + +**Step 4**: View the profiled data in a Chrome or Firefox browser +```console +http://localhost:6009/ +``` + +```{eval-rst} + +.. _fig-tensorboard: + +.. figure:: pytorch_profiler/Figs/fig0.png + :width: 600px + :align: center + + Fig 2: Output of running the tensorboar command `tensorboard --logdir=./out --bind_all`. + +``` + +(performance-metrics)= +### Performance metrics + +In this section, we provide screenshots of different views of performance metrics stemming from PyTorch Profiler. The metrics include: + +- GPU usage (cf. {ref}`Figure 3`) +- GPU Kernel view (cf. {ref}`Figure 4`) +- Trace view (cf. {ref}`Figure 5` and {ref}`Figure 6`) +- Memory view (cf. {ref}`Figure 7`) +- Module view (cf. {ref}`Figure 8`) + + +```{eval-rst} + +.. _fig-overview: + +.. figure:: pytorch_profiler/Figs/fig1.png + :width: 600px + :align: center + + Fig 3: Overview of GPU activities. + +``` + +```{eval-rst} +.. _fig-kernel: + +.. figure:: pytorch_profiler/Figs/fig2.png + :width: 600px + :align: center + + Fig 4: View of GPU Kernels. + +``` + +```{eval-rst} + +.. _fig-trace1: + +.. figure:: pytorch_profiler/Figs/fig3.png + :width: 600px + :align: center + + Fig 5: View of Trace. + +``` + +```{eval-rst} + +.. _fig-trace2: + +.. figure:: pytorch_profiler/Figs/fig4.png + :width: 600px + :align: center + + Fig 6: View of Trace. + +``` + +```{eval-rst} + +.. _fig-memory: + +.. figure:: pytorch_profiler/Figs/fig5.png + :width: 600px + :align: center + + Fig 7: View of Memory usage. + +``` + +```{eval-rst} + +.. _fig-module: + +.. figure:: pytorch_profiler/Figs/fig6.png + :width: 600px + :align: center + + Fig 8: View of Modules. + +``` + + +(launching-a-pytorch-based-application)= +## Launching a PyTorch-based application + +For completeness, we provide an example of a job script that incorporates a PyTorch singularity container. The script can be adapted according to requested computing resources. + +```bash +#!/bin/bash -l +#SBATCH --job-name=PyTprofiler +#SBATCH --account= +#SBATCH --time=00:10:00 #wall-time +#SBATCH --partition=accel #partition +#SBATCH --nodes=1 #nbr of nodes +#SBATCH --ntasks=1 #nbr of tasks +#SBATCH --ntasks-per-node=1 #nbr of tasks per nodes (nbr of cpu-cores, MPI-processes) +#SBATCH --cpus-per-task=1 #nbr of threads +#SBATCH --gpus=1 #total nbr of gpus +#SBATCH --gpus-per-node=1 #nbr of gpus per node +#SBATCH --mem=4G #main memory +#SBATCH -o PyTprofiler.out #slurm output + +# Set up job environment +set -o errexit # exit on any error +set -o nounset # treat unset variables as error + +#define paths +Mydir= +MyContainer=${Mydir}/Container/pytorch_22.12-py3.sif +MyExp=${Mydir}/examples + +#specify bind paths by setting the environment variable +#export SINGULARITY_BIND="${MyExp},$PWD" + +#TF32 is enabled by default in the NVIDIA NGC TensorFlow and PyTorch containers +#To disable TF32 set the environment variable to 0 +#export NVIDIA_TF32_OVERRIDE=0 + +#to run singularity container +singularity exec --nv -B ${MyExp},$PWD ${MyContainer} python ${MyExp}/resnet18_with_profiler_api.py + +echo +echo "--Job ID:" $SLURM_JOB_ID +echo "--total nbr of gpus" $SLURM_GPUS +echo "--nbr of gpus_per_node" $SLURM_GPUS_PER_NODE +``` + +More details about how to write a job script can be found [here](https://documentation.sigma2.no/jobs/job_scripts.html). 
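To submit the job script, save it to a file (here we use the arbitrary name `PyTprofiler.job`) and hand it over to Slurm with `sbatch`. A minimal usage sketch:

```bash
# Submit the job script (saved here under the arbitrary name PyTprofiler.job);
# Slurm replies with the assigned job ID
sbatch PyTprofiler.job

# Follow the Slurm output file declared with '#SBATCH -o' in the script
tail -f PyTprofiler.out
```

When the job has finished, the profiling data are written to the directory passed to `tensorboard_trace_handler` (`./out` in the example above) and can be inspected with TensorBoard as described in the *Visualization on a web browser* section above.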
+ +(pytorch-conclusion)= +# Conclusion +In conclusion, we have provided a guide on how to perform code profiling of GPU-accelerated Deep Learning models using the PyTorch Profiler. The particularity of the profiler relies on its simplicity and ease of use without installing additional packages and with a few lines of code to be added. These lines of code constitute the setting of the profiler, which can be customized according to the desired performance metrics. The profiler provides an overview of metrics; this includes a summary of GPU usage and Tensor cores usage (if it is enabled), this is in addition to an advanced analysis based on the view of GPU kernel, memory usage in time, trace and modules. These features are key elements for identifying bottlenecks in an application. Identifying these bottlenecks has the benefit of optimizing the application to run efficiently and reliably on HPC systems. + + +# Relevant links + +[PyTorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html) + +[NVIDIA NGC container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) + +[Local port forwarding](https://www.ssh.com/academy/ssh/tunneling-example#local-forwarding) + +[Slides](https://github.com/HichamAgueny/Profiling-GPU-accelerated-DL) + +[PyTorch Profiler video](https://www.youtube.com/watch?v=m6ouC0XMYnc&ab_channel=PyTorch) + + + diff --git a/_sources/code_development/guides/rocm_smi_lumi.md.txt b/_sources/code_development/guides/rocm_smi_lumi.md.txt new file mode 100644 index 000000000..12113b4cf --- /dev/null +++ b/_sources/code_development/guides/rocm_smi_lumi.md.txt @@ -0,0 +1,173 @@ +--- +orphan: true +--- + +(monitoring-gpus-on-lumi-g-with-rocm-smi)= +# Monitoring GPUs on LUMI-G with `rocm-smi` + +To monitor GPUs on LUMI-G during the execution of your SLURM job, you can employ the `rocm-smi` command. This can be done by using `srun` with the `--overlap` option, which allows you to execute commands on the nodes allocated to your running job. Detailed information about using `--overlap` on LUMI-G is available [here](https://docs.lumi-supercomputer.eu/runjobs/scheduled-jobs/interactive/#using-srun-to-check-running-jobs). + +## Steps to Monitor GPUs on jobs with a Single Node + +1. Run the following command with your job-id on your single node job-id: + ```bash + srun --overlap --pty --jobid= rocm-smi --showuse # replace with your desired option + ``` + +## Steps to Monitor GPUs with multiple nodes + +To monitor the GPU usage on jobs with multiple nodes, you can use the following steps: + +1. **Identify the Allocated Nodes:** + First, find out which nodes are allocated to your job by using the following command, replacing `` with the ID of your SLURM job: + + ```bash + sacct --noheader -X -P -oNodeList --jobs= + ``` + +2. **Execute `rocm-smi`:** + Once you have the node names (e.g., nid00XXXX), execute `rocm-smi` to monitor the GPU usage: + + ```bash + srun --overlap --pty --jobid= -w rocm-smi --showuse # replace with your desired option + ``` + + Replace `` with the actual node identifier. + +> **Note:** For this to work, use the `#SBATCH --gpus-per-node` directive instead of the `#SBATCH --gpus` directive in your job-script for multiple node jobs. + +## Adding GPU Monitoring to a Job Script on LUMI-G + +Monitoring GPU usage on the LUMI-G cluster can provide you with valuable insights into the performance and efficiency of your GPU-accelerated applications. 
By integrating ROCm-SMI (Radeon Open Compute System Management Interface) into your SLURM job script, you can collect GPU utilization statistics throughout the runtime of your job. Follow these instructions to modify your existing job script to include GPU monitoring with `rocm-smi`. + +### Script for Expanding Node Ranges + +To monitor specific GPUs, we must first resolve the node range into individual node names. The following script, named `expand_nodes.sh`, will be used in the job script to accomplish this: + +```{code-block} bash +--- +linenos: +emphasize-lines: 4, 6, 7-8, 16-18 +--- +#!/bin/bash + +# Function to expand the node range like "nid[005252-005254]" into individual nodes +expand_node_range() { + local node_range=$1 + if [[ "$node_range" == *"["* ]]; then + local prefix=${node_range%%[*]} # Extract the prefix ending at the first '[' + local range_numbers=${node_range#*[} # Extract the range numbers + range_numbers=${range_numbers%]*} # Remove the trailing ']' + + local IFS='-' + read -r start end <<< "$range_numbers" # Read the start and end numbers of the range + + # Use printf to generate the sequence with zero padding based on the width of the numbers + local width=${#start} + for (( i=10#$start; i <= 10#$end; i++ )); do + echo $(printf "nid%0${width}d" $i) + done + else + echo "$node_range" + fi +} +# Check if an argument was provided +if [ $# -eq 1 ]; then + # Call the function with the provided argument + expand_node_range "$1" +else + echo "Usage: $0 " + exit 1 +fi +``` +Key elements of the expand_nodes.sh script: + +1. The `expand_node_range` function (line 4) takes a string representing a range of nodes and expands it to individual nodes. +2. Checks for the presence of "[" to determine if it's a range (line 6). +3. Extracts the prefix and range numbers (lines 7-8). +4. Uses a for loop (lines 16-18) to iterate through the range and generate node names with proper zero padding. + +Be sure to make the script executable before attempting to use it in your job script: + +```{code-block} bash +chmod +x expand_nodes.sh +``` + + +### Modified Job Script with GPU Monitoring + +The following job script, `monitored_job_script.sh`, has been enhanced to include GPU monitoring capabilities. The GPU monitoring is encapsulated within a function and is designed to run concurrently with the main job. + +```{code-block} bash +--- +linenos: +emphasize-lines: 16, 21, 23-26, 32-35 +--- +#!/bin/bash -e +#SBATCH --job-name= +#SBATCH --account=project_4650000XX +#SBATCH --time=XX:XX:XX +#SBATCH --partition=standard-g # or dev-g +#SBATCH --nodes= +#SBATCH --ntasks-per-node= +#SBATCH --gpus= +#SBATCH --gpus-per-node=8 # Rocm-smi only works on full nodes +#SBATCH -o %x-%j.out + +# Load necessary modules +# ... 
+ +# Define the GPU monitoring function +gpu_monitoring() { + local node_name=$(hostname) + local monitoring_file="gpu_monitoring_${SLURM_JOBID}_node_${node_name}.csv" + + echo "Monitoring GPUs on $node_name" + rocm-smi --csv --showuse --showmemuse | head -n 1 > "$monitoring_file" + + while squeue -j ${SLURM_JOBID} &>/dev/null; do + rocm-smi --csv --showuse --showmemuse | sed '1d;/^$/d' >> "$monitoring_file" + sleep 30 # Change this value to adjust the monitoring frequency + done +} + +export -f gpu_monitoring + +nodes_compressed="$(sacct --noheader -X -P -o NodeList --jobs=${SLURM_JOBID})" +nodes="$(./expand_nodes.sh $nodes_compressed)" +for node in $nodes; do + srun --overlap --jobid="${SLURM_JOBID}" -w "$node" bash -c 'gpu_monitoring' & +done + +# Run the main job task +srun your_program + +``` + +Key elements of the `monitored_job_script.sh` script: + +1. We define a `gpu_monitoring` function (line 16) to capture GPU usage data. +2. The `--csv` flag in the `rocm-smi` command (line 21) is used to format the output as comma-separated values, making it easier to parse and analyze later. +3. The loop on lines 23-26 ensures that GPU data is captured at regular intervals until the job completes. +4. The function is exported (line 29) so that it can be called across different nodes within the job. +5. In lines 32-35 we expand the node range into individual nodes using the `expand_nodes.sh` script. Then we initiate the monitoring on each node in a loop using `srun`. + +Note on ROCm-SMI flags: + +- The `--showuse` and `--showmemuse` flags included with `rocm-smi` show GPU utilization and memory usage, respectively. These flags can be substituted or extended with other flags that are relevant to the specific monitoring requirements of your job. Using the `--csv` format ensures that the output is easily readable and can be processed with standard data analysis tools after the job has concluded. + +### Submitting the Modified Job Script + +To submit the job script with GPU monitoring enabled, use the following SLURM command: + +```bash +sbatch monitored_job_script.sh +``` + +### Reviewing the Monitoring Data + +Upon completion of your job, you can review the collected GPU usage and performance data. For each job, you will find a consolidated CSV file with a naming pattern of `gpu_monitoring__node_.csv`. This file contains time-stamped metrics that will allow you to assess the GPU usage over the duration of the job. + +Analyze the CSV data files using your preferred data processing tool to gain insights into the GPU resource utilization and identify potential bottlenecks or inefficiencies in your application's performance. + +Note to Users: The provided scripts for GPU monitoring serve as an adaptable framework. Depending on the specific requirements of your computation workload, you may need to modify the scripts to fit your needs. Adjustments may include changing the frequency of data capture, modifying the captured metrics, or altering how the node expansion is handled. Use the scripts as a starting point and tailor them to surmount the individual challenges associated with monitoring in a HPC environment like LUMI-G. 
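As an example of such tailoring, here is a minimal post-processing sketch using standard command-line tools. It assumes the `gpu_monitoring_<jobid>_node_<node_name>.csv` naming pattern from the script above and, importantly, that the second CSV column holds the `GPU use (%)` value; the exact column layout of `rocm-smi --csv` can differ between ROCm releases, so check the header line of your own files and adjust the field index accordingly.

```bash
#!/bin/bash
# Post-processing sketch: report the number of samples and the mean of the
# assumed "GPU use (%)" column (field 2) for every monitoring file found here.
for file in gpu_monitoring_*_node_*.csv; do
    [ -e "${file}" ] || continue  # skip if no monitoring files are present
    echo "== ${file} =="
    awk -F',' 'NR > 1 && $2 != "" { sum += $2; n++ }
               END { if (n > 0) printf "samples: %d, mean GPU use: %.1f%%\n", n, sum / n }' "${file}"
done
```

For longer jobs it is usually more informative to plot the values over time with your preferred analysis tool rather than looking at averages alone.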
diff --git a/_sources/code_development/guides/stencil.md.txt b/_sources/code_development/guides/stencil.md.txt new file mode 100644 index 000000000..05b4e0d15 --- /dev/null +++ b/_sources/code_development/guides/stencil.md.txt @@ -0,0 +1,856 @@ +--- +orphan: true +--- + +(stencil)= + +```{index} GPU; Introduction to Stencil communication pattern on GPU; +``` + +Introduction +============ + +`Divide-n-conquer` strategy is the foundation of parallel programming in which a bigger problem is divided into a set of smaller problems and solved efficiently. To design a generalized parallel programming model, which can fit a variety of problems, several methodologies were proposed around divide-n-conquer, and among them, one is Foster's methodology. [PCAM](https://www.mcs.anl.gov/~itf/dbpp/text/node15.html) is the building block of Foster's methodology, which stands for `Partitioning`, `Communication`, `Agglomeration`, and `Mapping`. Since the design and paradigm of parallel programming is a broad topic and beyond the scope of this tutorial, we will primarily focus on the Communication part of PCAM in this tutorial, and we shall see: + +* What is a communication pattern in parallel computing +* A brief overview of Map and Gather communication patterns +* What are Stencil operation and its importance in numerical analysis +* Solving 2D heat equation using Stencil communication pattern in CUDA +* CUDA thread hierarchy +* Profiling our 2D heat equation code example +* How to Optimize our code example +* How to Debug our code example + + +Communication Patterns +====================== +What we know so far is that a parallel computation is divided into tasks, where each task is a unit of work. In CUDA, these tasks can be represented by CUDA threads. These threads need to work together and require inter-thread communication. In CUDA, communication happens through memory. For example, threads may need to read from an input memory location and write to the same memory location. Sometimes these threads exchange partial results with each other to compute the global result. + +The communication between threads depends on the nature of the problem we wish to solve; For example, suppose the salary of ‘n’ employees in a company is stored in an array. Let us call this array ‘salary-array’. Now, we want to add a gift amount of 100 NOK to each employee’s salary. This task can be solved serially by iterating through the array, from the first to the last element in the array, and adding 100 NOK to each employee; clearly, this task will take ‘n’ steps to finish. The same task could have been solved parallelly in a constant time through the ‘MAP’ operation. {ref}`MAP` is a communication pattern where each thread reads and writes to a specific memory location, or we can say that there is a one-to-one correspondence between input and output. GPUs are very efficient in solving such problems, but Map is not very flexible in solving all types of computation problems; for example, Map cannot compute and store the average of 3 subsequent salaries of the employees in the array. However, another pattern called {ref}`GATHER` could solve the problem efficiently. In the case of `Gather` operation, each thread would read the values from 3 different locations in the memory and write them into a single place in the memory, as depicted in the {ref}`figure`. + + + + + +```{eval-rst} + +.. _fig-MAP: + +.. figure:: stencil/MAP.svg + :width: 500px + :align: center + + Fig 1: MAP + +``` + +```{eval-rst} + +.. _fig-Gather: +.. 
figure:: stencil/Gather.svg + :width: 500px + :align: center + + Fig 2: GATHER + +``` + +So far, we have seen that there are predefined communication patterns that appear now and again to solve a bigger problem, these patterns describe the basic solution to a problem and can be combined to solve a complex computational problem efficiently. + + +Stencil +======= +Stencil operation computes the value of a single element by applying a function to a collection of neighboring elements. +A very simple 9 elements stencil operation is shown in {ref}`Fig 1`. In one dimension, a nine-point stencil around a point at position {math}`x` would apply some function to the values at these positions: +```{math} + {x − 4h, x − 3h, x − 2h, x − h, x, x + h, x + 2h, x + 3h, x + 4h} +``` + + +```{eval-rst} + +.. _fig-coordsys-rect: + +.. figure:: stencil/stencil.svg + :width: 600px + :align: center + + Fig 3: Nine elements stencil. + +``` + +As it can be seen from {ref}`Fig 3` that 9 inputs are used to produce a single output. And if you look at our 9-point stencil operation again, then you will find that it is the [finite-difference-method(FDM)](https://en.wikipedia.org/wiki/Finite_difference_method) of order 8 to calculate the first derivative of a function {math}`{\displaystyle f(x)}` at a point {math}`x`, and that is the reason why Stencil operation is at the core of many algorithms that solve partial differential equations. + + +2D Heat Equation +================ +Heat dissipates into its surrounding by conduction, convection, and radiation. The process of transferring heat from the hotter part to the colder part of a material/body is called conduction. The heat equation models the flow of heat from the hotter part to the colder part of a body. + +The heat equation is a fundamental differential equation because it is the building block for other differential equations and has applications across the sciences. {ref}`Equation 1 ` is called the ideal heat equation because it models the heat flow in an ideal condition. For example, it does not consider the shape and type of the body. To apply it to real-world engineering problems, one should consider other physical constraints too. + + +```{eval-rst} +.. math:: + :name: eq:heat + + \begin{array}{cc}\frac{\delta u}{\delta t} = + \alpha \left (\frac{\delta^2 u}{\delta y^2} + + \frac{\delta^2 u}{\delta x^2} + \right) + \end{array} + +``` +If we try to solve {ref}`Equation 1 `. We get: + +```{eval-rst} +.. math:: + :name: eq:heat_sol + + \begin{array}{cc} + u_{ij}^{t+1} = u_{ij}^{t} + \Delta t\times\alpha + \left( + \frac{u_{i+1}^{t} + u_{i-1}^{t} + u_{j+1}^{t} +u_{j-1}^{t} -4u_{ij}^{t}}{h^2} + \right) + \end{array} + +``` +From {ref}`Equation 2 `, we can see that the change in temperature after time {math}`\Delta t` at a particular cell {math}`u_{ij}` on the 2D surface, depends on its non-diagonal neighboring cells, as shown in {ref}`Fig 4`. You can also notice from {ref}`Fig 4` that it is a 5-point stencil operation. + +```{eval-rst} + +.. _fig_grid-discrete: + +.. figure:: stencil/grid.svg + :width: 200px + :align: center + + Fig 4: Discrete grid visualization. + +``` +Now, it is easy to translate {ref}`Equation 2 ` into pseudocode. + +```{eval-rst} + +.. 
code-block:: none + :caption: Pseudocode for the heat flow on a 2D grid + + for time 1 -> n : + for cell(1,1) -> cel(grid_size, grid_size) : + Temp_Next(i,j) = Temp_Cur(i,j) + + ( + Temp_Cur(i+1,j) + Temp_Cur(i-1,j) + + Temp_Cur(i,j+1) + Temp_Cur(i,j-1) - + 4.0 * Temp_Cur(i,j) + ) / Cell_Surface_Area + +``` + +```{eval-rst} + +.. _sec-NBC: + +Neumann Boundary Condition +~~~~~~~~~~~~~~~~~~~~~~~~~~ +``` +As we can see from {ref}`Fig 4` that each cell needs 4 neighbor cells to calculate 'Temp_Next'. But, what will happen at the corners and the edges of our grid? We will fall short of 2 cells at each corner-cell, and 1 cell at each edge-cell. To fix this problem, we will use `Neumann Boundary Conditions` which say that the temperature change is {math}`0` or {math}`\left( \frac{\delta u}{\delta t} = 0 \right )` at the boundary. To satisfy the boundary condition, we create a Halo around our grid and copy the temperature from the adjacent edge-cell to the adjacent halo-cell, as shown in {ref}`Fig 6`. + +```{eval-rst} + +.. _fig_halo: + +.. figure:: stencil/halo.svg + :width: 200px + :align: center + + Fig 5: Halo around the grid. + +``` + +Now, we have everything in place to draw a {ref}`flow chart` for the heat simulation in 2D. + +```{eval-rst} + +.. _fig_flowchart: + +.. figure:: stencil/flowchart.svg + :width: 300px + :align: center + + Fig 6: 2D heat simulation. + +``` + +```{eval-rst} +.. _sec-2DHEq_Serial: + +A sequential version of 2D heat simulation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +``` +Here is how the 2D heat equation is implemented in C, and the highlighted lines show the main stencil operation. The complete code can be downloaded from the {ref}`Resources` section. + +```{eval-rst} +.. literalinclude:: stencil/serial.c + :language: c++ + :lines: 124-137 + :emphasize-lines: 8-13 + :linenos: + +``` + +```{eval-rst} +.. _sec-2DHEq_Execution: + +Compilation and execution on Betzy +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +``` + +```{eval-rst} + +Follow these steps to compile and run the :ref:`2DHeatEquation` project on Betzy. + +#. :ref:`Download` tarball to your local client. +#. :ref:`Upload` it to your Betzy login +#. :ref:`Uncompress` it +#. :ref:`Build` it +#. :ref:`Execute` it +``` + +The output of our serial version of code should look something similar to this. + +```bash +srun: job 371234 queued and waiting for resources +srun: job 371234 has been allocated resources +Solving heat equation for grid 500 x 500 with 1000 iterations +Used 0.509 seconds to evolve field +Average time per field update: 0.509 ms + +``` +It is also possible to {ref}`visualize` the output, as shown below. + +```{eval-rst} + +.. _fig_2dheateq: + +.. figure:: stencil/anim_out.gif + :width: 250px + :align: center + + Fig 7: Heat diffusion in 2-dimension (Animation). + +``` + + +```{eval-rst} +.. _sec-2DHEq_Cuda: + +CUDA version of 2D heat simulation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +``` +Here is how the 2D heat equation is implemented in CUDA, and the highlighted lines show the main stencil operation. The complete code can be downloaded from the {ref}`Resources` section. + +```{eval-rst} +.. literalinclude:: stencil/cuda.c + :language: c++ + :lines: 1-20 + :emphasize-lines: 13-17 + :linenos: + +``` + +Follow the instructions to {ref}`Build` and {ref}`Run` the CUDA code on Betzy. + +The output of our CUDA code should look something similar to this. 
+ +``` +Solving heat equation for grid 500 x 500 with 1000 iterations +Launching (32, 32) grids with (16, 16) blocks +Used 0.017 seconds to evolve field +Average time per field update: 0.017 ms + +``` + +The code explanation is straightforward and very similar to the serial version. However, few new concepts have been introduced here, like `Grid`, `Blocks`, and `Threads`. We try to explain each of them briefly; however, an in-depth explanation is given on the [Nvidia CUDA documentation page](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#thread-hierarchy). + +```{eval-rst} +.. _sec-CudaThreadHierarchy: + +CUDA thread hierarchy +~~~~~~~~~~~~~~~~~~~~~~ +``` + +In a typical CUDA program, first, the CPU allocates storage on the GPU and copies the input data from the CPU to the GPU. The function which runs on a GPU is called a `Kernel` function or simply a Kernel. The CPU launches the Kernel, and the execution transfers from the CPU to the GPU. Input data get processed on the GPU and the results transfer back to the CPU. + + +During the execution of the Kernel, CUDA launches a large number of threads. To organize these threads, CUDA maintains a thread hierarchy. According to this 2-tier thread hierarchy, threads are divided into `Blocks` of threads, and blocks are divided into `Grids` of blocks, as shown in {ref}`figure 8`. + + +```{eval-rst} + +.. _fig_grid_n_blocks: + +.. figure:: stencil/grid_block.svg + :width: 500px + :align: center + + Fig 8: CUDA Grid, Blocks, and Threads. + +``` +A user has full control over organizing the threads into blocks and grids, and this can be done during the Kernel call; on the host side. An example of this is shown below. + +```{eval-rst} + +.. code-block:: none + :caption: Example of thread organization in Blocks and Grids. + + const dim3 blockSize(32,32); + const dim3 gridSize(24,19); + + my_kernel<<>>() + +``` + +The above example shows that my_kernel will spawn 466,944 threads in total. To organize these many threads, the threads are organized into blocks of 32x32 threads in X and Y dimensions. So, each block has 32 threads in the X dimension, and 32 threads in the Y dimension; in total, each block has 1024 threads. Now the blocks are arranged in a grid of 24 blocks in the X dimension, and 19 blocks in the Y dimension; in total, 456 blocks in a grid. + +Please note that dim3 is a derived data type wrapped around the intrinsic integer data type. It has three unsigned integers to store X, Y, and Z dimensions respectively. + +The main purpose of this 2-tier hierarchy is to uniquely identify a thread in a pool of threads. Since thread blocks are spread across a 2-dimensional grid, it is easy to identify the block number, at run time, using variables supplied by CUDA-Runtime. Let us try to understand this with an example. Suppose, at a particular moment in time, we want to know the offset of our thread, then what should be our approach to find the global index of our thread? + + +```{eval-rst} + +.. code-block:: none + :caption: Uniquely identify a thread within a pool of threads launched by a Kernel. + + /* + CUDA-Runtime can provide these variables at runtime: + ---------------------------------------------------- + 1. gridDim.x + 2. gridDim.y + 3. blockIdx.x + 4. blockIdx.y + 5. threadIdx.x + 6. threadIdx.y + */ + + // Calculate the global thread index using these steps: + + // 1. 
Find the block number in which the current thread resides + int global_block_index = gridDim.x*blockIdx.y + blockIdx.x; + + // If 1 block contains m*n threads then p blocks contain how many threads? p*m*n + int total_threads = gloabal_block_index * blockDim.x*blockDim.y; + + // 3. Find the index of the current thread within the current block + int local_thread_index = threadIdx.y*blockDim.x + threadIdx.x; + + // 4. Global thread index + int global_thread_index = total_threads + local_thread_index + + // One liner + int offset = (gridDim.x*blockIdx.y + blockIdx.x)*(blockDim.x*blockDim.y) + + threadIdx.y*blockDim.x + threadIdx.x; + + // Calculate global row and column of a thread + int row = blockIdx.x * blockDim.x + threadIdx.x; + int col = blockIdx.y * blockDim.y + threadIdx.y; + +``` + +```{eval-rst} +.. _sec-CodeDesign: + +Designing the Cuda code +~~~~~~~~~~~~~~~~~~~~~~~ +``` +In this section, we will try to explain the sample Cuda code with the knowledge we got in the previous {ref}`section `. + +In our Cuda code example, we used the [Unified Memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-unified-memory-programming-hd) feature of CUDA-6.0, and that is the reason why we did not allocate the memory using `cudaMalloc`, and also the data movement from host-to-device and device-to-host was not performed using `cudaMemcpy`; however, these two operations were taken care of by a unified function called `cudaMallocManaged`. This function allocates a unified pool of memory which is accessible both from the host and the device. Let us try to figure out where these operations were performed in our Cuda code. + +In the following lines of code, you may see that after including some libraries and header files, we declared a few variables in `line:77,78`, like `dim_block`, and `dim_grid`. Here `dim3` is a data type, provided by the CUDA-Runtime environment. The main use of this data type is to define the dimensions of a block and a grid. We kept the block size fixed with 256 threads in each block. The number of blocks in a grid is calculated in `line:84`. Please note that this number of blocks can only accommodate `grid+2` elements, but our 2D grid has `grid+2 x grid+2` elements, and that is the reason why we specified `dim3(grids, grids)` in `line:85`. + +```{eval-rst} +.. literalinclude:: stencil/cuda.cu + :linenos: + :language: c++ + :lines: 18-87 + :lineno-start: 18 + :emphasize-lines: 60-61, 67, 68 + +``` + +In `lines:90,91` we declared two pointers to deference the memory location, however, at this point in time no memory was allocated and therefore they are only null-pointers. `lines:92,94` used the `cudaMallocManaged` function to allocate the unified memory space and map it to the pointers, which were declared in `lines:90,91`. Henceforth, all the modifications, in the allocated memory space, will be carried out using these pointers. In `line:97` a device Kernel was launched which would calculate the initial and boundary conditions of the grid. +```{eval-rst} +.. literalinclude:: stencil/cuda.cu + :linenos: + :language: c++ + :lines: 88-100 + :lineno-start: 88 + :emphasize-lines: 3-8, 10 + +``` + +The main stencil operation was performed in the `evolve` device kernel, which was run for the required number of iterations and timed using a time function. The logic behind the evolve function is similar to the serial version of the code, but in the Cuda version, `evolve` function is performed parallelly by Cuda threads. 
In `line:122`, system-level synchronization is used to make sure that the GPU and the CPU are synced; before any computed result is made available on the CPU. And lastly, the allocated memory is freed up in `line:138,139`. + +```{eval-rst} +.. literalinclude:: stencil/cuda.cu + :linenos: + :language: c++ + :lines: 102-140 + :lineno-start: 102 + :emphasize-lines: 19, 21, 37, 38 + +``` + + +```{eval-rst} +.. _sec-2DHEq_Profiling: + +Profiling +~~~~~~~~~~ +``` +We will check our kernel performances using Nvidia Nsight Systems, which is a profiler and can be downloaded from Nvidia`s [official website](https://developer.nvidia.com/nsight-systems). + +Profiling your Cuda code on `Betzy` or `Saga` is not difficult, but involves a few steps to follow. +We will use `nsys` CLI to generate a view of our Cuda executable on the GPU cluster, and later analyze it on our local computer. + +The basic `qdrep` file can be generated by following the command. + +```bash + +[@login-2.BETZY ~]$ module load CUDA/11.4.1 +[@login-2.BETZY ~]$ cd HeatEq2D_Stencil/ +[@login-2.BETZY ~]$ make clean +[@login-2.BETZY ~]$ make all +[@login-2.BETZY ~]$ srun --account=[USR-ACC] --time=05:00 --partition=accel --gpus=1 --mem-per-cpu=512M --job-name=nsys_stencil nsys profile -t cuda -f true -o cuda ./cuda + +``` +Detailed information about different nsys flags and options is provided [here](https://docs.nvidia.com/nsight-systems/UserGuide/index.html). But we use `-t` to profile `cuda` code, other options could be: openmp, mpi, openacc, nvtx, et cetera. Also, we used `-f true` to overwrite previously generated output, and `-o` to generate output in the folder 'nvprofiler' with the name 'cuda'. Finally, we provided the executable name of our Cuda code. After executing the above command, we got output in the form of `cuda.qdrep`. + +Now, download the 'cuda.qdrep' file to your local system using `scp` command, like this: + +```{note} +Run this command on your local computer, and replace username with your user id and with the path to the file on the cluster. `pwd` or `readlink -f filename` would help to know the absolute path of the file. +``` + +```bash +$ scp -r username@betzy.sigma2.no: . +``` +Now, launch the Nvidia Nsight Systems (I assume it has already been [downloaded](https://developer.nvidia.com/nsight-systems) and installed on your local system). This should open a window similar to {ref}`figure 9`. + +```{eval-rst} + +.. _fig_open_nnsys: + +.. figure:: stencil/nns/MainWindowNNS1.png + :width: 700px + :align: center + + Fig 9: Nvidia Nsight Systems main window. + +``` + +From the main menu, click 'File' and browse to the downloaded 'cuda.qdrep' file. This should open a 'view' similar to {ref}`figure 10`. +```{eval-rst} + +.. _fig_main_view: + +.. figure:: stencil/nns/NNSprojectexplorer2.png + :width: 700px + :align: center + + Fig 10: Main view window. + +``` + +From the dropdown menu, one can choose Timeline, Analysis, or Diagnostic Summary. We are interested in the Timeline view, but some important information, like Target summary, Thread summary, and GPU-CPU info can be found in the 'Analysis' tab of this menu. As we can see from {ref}`figure 10` that there are a few other collapsable tabs, like CPU(48), CUDA HW, etcetera. These tabs give information about the underlying resources our kernel used on the Target machine. If we click 'CUDA HW', then we see information about our kernels, as shown in {ref}`figure 11`. + +```{eval-rst} + +.. _fig_GPU-HW: + +.. 
figure:: stencil/nns/nnskernels_4.png + :width: 500px + :align: center + + Fig 11: Executed kernels on the GPU. + +``` + +This information is quite handy in knowing that our 'evolve' kernel has consumed 97% of the total kernel execution time on the GPU; apparently because the kernel has been called 1000 times, and init\_field kernel called only 2 times for initializing the current and the next grid. + +Now, if we look to the right of this tab, then we see the actual timeline, as shown in {ref}`figure 12`. +```{eval-rst} + +.. _fig_GPU_timeline: + +.. figure:: stencil/nns/nnsslct_kernel_5.png + :width: 700px + :align: center + + Fig 12: Actual timeline. + +``` + +Drag the mouse cursor over the timeline during 'evolve' kernel execution time and press `shift+z` simultaneously to zoom in the timeline, as shown in {ref}`figure 13`. The figure shows that the kernel starts executing at around .389528s and ends at .389550s and took 22,688 microseconds to finish. This is the time utilization of a single kernel call, but we have called it 1000 times. + +```{eval-rst} + +.. _fig_zoom_in: + +.. figure:: stencil/nns/nnshoverzoom_6.png + :width: 700px + :align: center + + Fig 13: Zoomed timeline. + +``` + +To analyze the complete summary, we must go to the bottom left pane of the main window in Nsight System, and select `Stats System View` from the dropdown list. This should show the relevant summary of our kernel call, as shown in {ref}`figure 14`. From the summary, we can see that most of the time was consumed by memory operations in moving data from host to device, and our 'evolve' kernel took around 22,516 microseconds on average. + +```{eval-rst} + +.. _fig_cuda_summary: + +.. figure:: stencil/nns/nnscuda7.png + :width: 700px + :align: center + + Fig 13: Summary of CUDA operations. + +``` + + +In the next section, we will try to reduce the total execution time by doing some memory optimization. + + +```{eval-rst} +.. _sec-2DHEq_Optimization: + +Optimization +~~~~~~~~~~~~ +``` +In our Optimization section, we'll try to incorporate onboard `constant` memory. Constant memory is only used for the reading purpose on the device and can be written or read from the host. The constant memory is accessible to all threads in a wrap, and each thread can access it uniformly. + +Constant memory is located on the device and has its own on-chip cache, or we can say that each {ref}`Streaming Multiprocessor` has a constant cache. Because of this "per-SM" constant memory, reading takes less time as compared to reading directly from the constant memory. + +The lifetime of a variable, which is declared and initialized on the constant memory, is equal to the lifetime of the program. A variable on the constant memory is accessible from any thread as long as the thread is from the same grid. The host can also read and write to the constant memory, but only through CUDA-Runtime functions. Variables on the constant memory must be pre-initialized before using them. Since the device code cannot write to the constant memory, the variables must be initialized from the host code using `cudaMemcpyToSymbol`, which is a CUDA-Runtime function. + +Since our 2d-heat equation performs stencil operation which is a data-parallel operation and maps well to the GPU. Each thread calculates the change in temperature after a discrete time interval. During this calculation, each thread reads some constants, like the diffusion constant, time derivative etcetera. 
These constants tend to be the same throughout the execution of the program and would be a good candidate to use with constant memory. + +To use the constant memory, we must setup it up from the host code, as shown in the code snippet. The highlighted line shows the copying of the data from the source to the constant memory on the device. + + + +```{eval-rst} +.. literalinclude:: stencil/cuda_const.c + :language: c++ + :lines: 34-46 + :emphasize-lines: 11 + :linenos: + +``` + +Now, we must call the 'setup_constants' function before our 'evolve' kernel call, as shown here. + + +```{eval-rst} +.. literalinclude:: stencil/cuda_const.c + :language: c++ + :lines: 116-122 + :emphasize-lines: 2 + :linenos: + +``` + +Since we setup up our coefficients through constant memory, we do not need to send them as the function arguments during the kernel call. + +The final thing remaining is to fetch the coefficients, on device, while doing computation. This can be done as shown below. + + +```{eval-rst} +.. literalinclude:: stencil/cuda_const.c + :language: c++ + :lines: 197-231 + :emphasize-lines: 14-15 + :linenos: + +``` + +The complete source file can be downloaded from the given link. + +```{eval-rst} +:download:`cuda_const.cu` + +``` + +Let us re-run our code in the Nsight profiler. + + +```{eval-rst} + +.. _fig_cuda_const: + +.. figure:: stencil/nns/nnsconst8.png + :width: 700px + :align: center + + Fig 14: CUDA constant memory. + +``` + +If we compare results from {ref}`figure 13` and {ref}`figure 14`, we see that some improvement was achieved during data transfer, and little time reduction has been noticed in the evolve kernel execution. + +```{eval-rst} +.. _sec-2DHEq_Debugging: + +Debugging +~~~~~~~~~ +``` + +In this section, we will show some of the basic techniques to use CUDA gdb. CUDA gdb is an extension of the gdb debugger, which adds support for CUDA debugging. It supports breakpoints, single stepping, and everything else you would expect from a debugger. CUDA-gdb can be used to debug both the device code, as-well-as the host code. + + +To start the debugging session, we must provide the `-G` flag to our nvcc compiler. The -G flag enables debug symbols just like it would in the GNU-C compiler. + +To make it a little fast and easy, I have included a CUDA-gdb enabled version of our 2DHeatequation code. You just need to compile and run it like this: + +```bash +$ make cuda-gdb +cuda gdb code is built + +$ srun --account= --nodes=1 --ntasks-per-node=1 --time=05:00 --qos=devel --partition=preproc ./cuda-gdb +Submitted batch job 6990542 + +$ cat output.out +Waiting for evolve before pointer swap: + Error(cudaErrorIllegalAddress): an illegal memory access was encountered + +``` + +So, from the output, we know that illegal memory access has occurred somewhere in the code, with no other information. Luckily, we have CUDA-gdb to rescue us from this situation. + +Since the code was compiled using proper flags to load debug symbols, we just need to invoke the CUDA-gdb, like this: + +```bash +$ cuda-gdb ./cuda_gdb +``` + +The above command should start the gdb with command-line-interface, something similar to this: + +```bash +NVIDIA (R) CUDA Debugger +11.1 release +Portions Copyright (C) 2007-2020 NVIDIA Corporation +GNU gdb (GDB) 8.3.1 +Copyright (C) 2019 Free Software Foundation, Inc. +License GPLv3+: GNU GPL version 3 or later +This is free software: you are free to change and redistribute it. +There is NO WARRANTY, to the extent permitted by law. 
+Type "show copying" and "show warranty" for details. +This GDB was configured as "x86_64-pc-linux-gnu". +Type "show configuration" for configuration details. +For bug reporting instructions, please see: +. +Find the GDB manual and other documentation resources online at: + . + +For help, type "help". +Type "apropos word" to search for commands related to "word"... +Reading symbols from ./cuda_gdb... +(cuda-gdb) + +``` +Now, type `run`, this time we get a lot of vital information, and the execution would stop where it encountered the illegal memory access. + +The information should look something similar to this: + +``` +(cuda-gdb) run +[Thread debugging using libthread_db enabled] +Using host libthread_db library "/lib64/libthread_db.so.1". +warning: File "/cluster/apps/eb/software/GCCcore/10.2.0/lib64/libstdc++.so.6.0.28-gdb.py" auto-loading has been declined by your `auto-load safe-path' set to "$debugdir:$datadir/auto-load". +To enable execution of this file add + add-auto-load-safe-path /cluster/apps/eb/software/GCCcore/10.2.0/lib64/libstdc++.so.6.0.28-gdb.py +line to your configuration file "/cluster/home/user/.cuda-gdbinit". +To completely disable this security protection add + set auto-load safe-path / +line to your configuration file "/cluster/home/user/.cuda-gdbinit". +For more information about this security protection see the +"Auto-loading safe path" section in the GDB manual. E.g., run from the shell: + info "(gdb)Auto-loading safe path" +Solving heat equation for grid 500 x 500 with 1000 iterations +Launching (32, 32) grids with (16, 16) blocks +[Detaching after fork from child process 1561068] +[New Thread 0x7fffef646000 (LWP 1561074)] +[New Thread 0x7fffeee45000 (LWP 1561075)] + +CUDA Exception: Warp Illegal Address +The exception was triggered at PC 0x906110 (cuda_gdb.cu:209) + +Thread 1 "cuda_gdb" received signal CUDA_EXCEPTION_14, Warp Illegal Address. +[Switching focus to CUDA kernel 0, grid 3, block (0,0,0), thread (1,1,0), device 0, sm 0, warp 1, lane 17] +0x0000000000906118 in evolve<<<(32,32,1),(16,16,1)>>> (curr=0x7fffbc000000, next=0x7fffbc0f6200, size=500, + cell_size=0.00999999978, alpha=0.100000001, dt=0.000249999983) at src/cuda_gdb.cu:209 + + +``` + +Now, we know that the kernel 'evolve' is doing some illegal memory accesses. List the code around line number 209. We chose 209 because it is pointed out by the debugger. Since our `listsize` is set to 30, we would list the code from line number 200. + +```bash +(cuda-gdb) list 200 +185 +186 #define CURR(i,j) curr[((i)-1)*(size)+(j)-1] +187 #define NEXT(i,j) next[((i))*(size+2)+(j)] +188 +189 +190 // Additional variables +191 const float cell = cell_size * cell_size; +192 const float r = alpha * dt; +193 // When launching this kernel we don't take into account that we don't want +194 // it run for the boundary, we solve this by the following if guard, this +195 // means that we launch 4 threads more than we actually need, but this is a +196 // very low overhead +197 /* +198 if (0 < row && row < size + 1 && 0 < col && col < size + 1) { +199 const int ip1 = (row + 1) * (size + 2) + col; +200 const int im1 = (row - 1) * (size + 2) + col; +201 const int jp1 = row * (size + 2) + (col + 1); +202 const int jm1 = row * (size + 2) + (col - 1); +203 next[index] = curr[index] + r * +204 ((curr[ip1] - 2. * curr[index] + curr[im1]) / cell +205 + (curr[jp1] - 2. 
* curr[index] + curr[jm1]) / cell) ; +206 }*/ +207 +208 if (0 < i && i < size + 1 && 0 < j && j < size + 1) { +209 NEXT(i,j) = CURR(i,j) + r * ( +210 (CURR(i-1,j)+CURR(i+1,j)+ +211 CURR(i,j-1)+CURR(i,j+1)- +212 4.0*CURR(i,j)) / (cell_size*cell_size) +213 ); +214 } + + +``` + +We can see that there are some reads and writes in line number 209. We can also see that some macro functions perform read operations. Let us test which memory address they are reading from. + +```bash +(cuda-gdb) p i +$1 = 1 +(cuda-gdb) p j +$2 = 1 +(cuda-gdb) p curr[((i-1)-1)*(size)+(j)-1] +Error: Failed to read generic memory at address 0x7fffbbfff830 on device 0 sm 0 warp 1 lane 17, error=CUDBG_ERROR_INVALID_MEMORY_SEGMENT(0x7). +``` +We are trying to read the memory past the lower bound of our allocated memory space on the device and thus get the error. Fix this by replacing 1 with 2. + +```bash +1. #define CURR(i,j) curr[((i)-1)*(size)+(j)-1] + +2. #define CURR(i,j) curr[((i))*(size)+(j)] +``` + +```{eval-rst} +.. _sec-2DHEq_Resources: + +Resources +========= +``` + +The complete code is available in compressed format and can be downloaded from the given link. + +```{eval-rst} +:download:`HeatEq2D_Stencil` + +``` + +```{eval-rst} +.. _sec-copy: + +Upload it to Betzy +~~~~~~~~~~~~~~~~~~ + +``` + +```bash +$ scp username@betzy.sigma2.no:/cluster/home/ +``` + +```{eval-rst} +.. _sec-uncompress: + +Uncompress it on Betzy +~~~~~~~~~~~~~~~~~~~~~~ + +``` +```bash +$ tar -zxvf HeatEq2D_Stencil.tar.gz +``` + +```{eval-rst} +.. _sec-build: + +Build project on Betzy +~~~~~~~~~~~~~~~~~~~~~~ + +``` +Build `Serial` version. +```bash +$ make serial +``` + +Build `Parallel` version. +```bash +$ make parallel +``` + +```{note} +Module `CUDA/11.4.1` is required on Betzy to build GPU version. +``` +Build `CUDA` version. + +```bash +$ make cuda +``` + +Build complete project. +```bash +$ make all +``` + +```{eval-rst} +.. _sec-execute: + +Execute code on Betzy +~~~~~~~~~~~~~~~~~~~~~~~~ + +``` + +Run `Serial` version. +```bash +$ srun --account= --nodes=1 --ntasks-per-node=1 --time=05:00 --qos=devel --partition=preproc ./serial +``` + +Run `Parallel` version. +```bash +$ srun --account= --nodes=1 --ntasks-per-node=1 --cpus-per-task=32 -c 32 --time=05:00 --mem-per-cpu=512M --qos=devel --partition=preproc ./parallel +``` + +Run `CUDA` version. +```bash +$ srun --account= --partition=accel --gpus-per-task=1 --ntasks=1 --time=05:00 --mem-per-cpu=512M ./cuda +``` + +```{eval-rst} +.. _sec-visualization: + +Visualization on Betzy +~~~~~~~~~~~~~~~~~~~~~~ + +``` +```bash +$ srun --account= --cpus-per-task=1 -c 1 --time=10:00 --mem-per-cpu=1G --qos=devel --partition=preproc ./serial 500 1000 2 +``` +The above command will generate one `png` file at every other iteration, and then you can use `ffmpeg` to create animation. + diff --git a/_sources/code_development/guides/sycl_academy.md.txt b/_sources/code_development/guides/sycl_academy.md.txt new file mode 100644 index 000000000..ec28b096b --- /dev/null +++ b/_sources/code_development/guides/sycl_academy.md.txt @@ -0,0 +1,128 @@ +--- +orphan: true +--- + +# SYCL Academy tutorial + +Codeplay provides a nice introductory tutorial of the basic features of SYCL in their +[SYCL Academy](https://github.com/codeplaysoftware/syclacademy) +repository. This is the course material for the standard SYCL tutorial given by Codeplay on relevant +conferences and workshops throughout the year. In the following we will demonstrate how to compile +and run the example code on Saga. 
+ +## Step 1: Load necessary modules + +In order to compile the examples we will need CMake as well as a SYCL implementation that is compatible +with the Nvidia GPUs that we have available. On Saga we have hipSYCL and CMake installed globally, and +we will choose a CMake version that is compatible with the GCC toolchain that the hipSYCL module is +based upon, in this case `GCCcore/11.2.0`: + +```console +[me@login-1.SAGA ~]$ module load hipSYCL/0.9.2-GCC-11.2.0-CUDA-11.4.1 +[me@login-1.SAGA ~]$ module load CMake/3.21.1-GCCcore-11.2.0 +``` + +```{note} +Some of the examples in this tutorial does not compile with the 0.9.1 version of `hipSYCL`, +so make sure to use at least version number 0.9.2. +``` + +## Step 2: Download course material + +The course material can be downloaded from Github with the following command (remember the `--recursive` option): + +```console +[me@login-1.SAGA ~]$ git clone --recursive https://github.com/codeplaysoftware/syclacademy.git +[me@login-1.SAGA ~]$ cd syclacademy +[me@login-1.SAGA ~/syclacademy]$ ls +CMakeLists.txt CODE_OF_CONDUCT.md External LICENSE.md sycl_academy.png +Code_Exercises CONTRIBUTING Lesson_Materials README.md Utilities +``` + +You will here find `Lesson_Materials` in the form of html slides which you can view in your browser +(best done on your local machine), as well as corresponding `Code_Exercises` for each of the lectures. +You can also follow the exercises by browsing the +[Github page](https://github.com/codeplaysoftware/syclacademy/tree/main/Code_Exercises), +where you will find explanation of the exercises in the `doc.md` file of each folder. + +## Step 3: Configure with CMake + +When we configure the build we need to tell CMake which SYCL implementation we are going to use (hipSYCL) +and which target architecture we want to compile for; `omp` for CPU targets and `cuda:sm_60` for the +Nvidia P100 GPUs we have on Saga (`cuda:sm_80` for the A100 cards on Betzy): + +```console +[me@login-1.SAGA ~/syclacademy]$ mkdir build +[me@login-1.SAGA ~/syclacademy]$ cd build +[me@login-1.SAGA ~/syclacademy/build]$ cmake -DSYCL_ACADEMY_USE_HIPSYCL=ON -DHIPSYCL_TARGETS="omp;cuda:sm_60" .. +``` + +Hopefully no error occurred on this step. + +```{tip} +If you got a `syclacademy/External/Catch2 does not contain a CMakeLists.txt file` error you may +have forgotten to download the submodules of the git repo (`--recursive` option in the clone). +``` + + +## Step 4: Compile and run exercises + +The tutorial is organized such that you are expected to write your solution code in the `source.cpp` file +of each exercise in the `Code_Exercises` folder based on the text given in the corresponding `doc.md` file. 
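+
+If you are unsure what to aim for, here is a minimal sketch of the kind of code a `source.cpp` solution could contain for the very first exercise: constructing a queue and submitting a trivial kernel. This is only an illustration, not the official solution; the kernel name `hello_kernel` is arbitrary, and the real exercise templates wrap code like this in a Catch2 test case, so follow the structure of the provided `source.cpp` rather than copying this verbatim.
+
+```cpp
+// Minimal SYCL program: create a queue and submit a trivial kernel
+#include <CL/sycl.hpp>
+#include <iostream>
+
+int main() {
+  namespace sycl = cl::sycl;
+
+  // A default-constructed queue is bound to a device chosen by the runtime
+  sycl::queue q;
+  std::cout << "Running on: "
+            << q.get_device().get_info<sycl::info::device::name>() << std::endl;
+
+  // Submit a no-op kernel and wait for it to complete
+  q.submit([&](sycl::handler &cgh) {
+    cgh.single_task<class hello_kernel>([]() { /* nothing to do yet */ });
+  }).wait();
+
+  return 0;
+}
+```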
+You can then compile your source file for exercise 1 with the following command: + +```console +[me@login-1.SAGA ~/syclacademy/build]$ make exercise_01_compiling_with_sycl_source +``` + +after which the resulting executable (with the same long and cumbersome name as the build target) can be found under: + +```console +[me@login-1.SAGA ~/syclacademy/build]$ ls Code_Exercises/Exercise_01_Compiling_with_SYCL/ +CMakeFiles CTestTestfile.cmake Makefile +cmake_install.cmake exercise_01_compiling_with_sycl_source +``` + +and you can execute you program with + +```console +[me@login-1.SAGA ~/syclacademy/build]$ Code_Exercises/Exercise_01_Compiling_with_SYCL/exercise_01_compiling_with_sycl_source +=============================================================================== +All tests passed (1 assertion in 1 test case) +``` + +If it shows that the test passes it means that your code did not crash, which is good news. + +## Step 5: Compile and run solutions + +If you get stuck at some point there is also a suggested solution to each of the exercises, called `solution.cpp` which you +can compile using the same long and cumbersome exercise name as before, but with `_solution` instead of `_source` in the end + +```console +[me@login-1.SAGA ~/syclacademy/build]$ make exercise_01_compiling_with_sycl_solution +[me@login-1.SAGA ~/syclacademy/build]$ Code_Exercises/Exercise_01_Compiling_with_SYCL/exercise_01_compiling_with_sycl_solution +=============================================================================== +All tests passed (1 assertion in 1 test case) +``` + +## Step 6: Run exercises on GPU nodes + +The main point of all these exercises is of course to run the code on accelerators, and in order to do that +you need to ask for GPU resources through Slurm, here as an interactive job on the `accel` partition: + +```console +[me@login-1.SAGA ~/syclacademy/build]$ salloc --account= --time=1:00:00 --ntasks=1 --gpus=1 --partition=accel --mem=1G +salloc: Pending job allocation 5353133 +salloc: job 5353133 queued and waiting for resources +salloc: job 5353133 has been allocated resources +salloc: Granted job allocation 5353133 +salloc: Waiting for resource configuration +salloc: Nodes c7-8 are ready for job +[me@c7-8.SAGA ~/syclacademy/build]$ Code_Exercises/Exercise_01_Compiling_with_SYCL/exercise_01_compiling_with_sycl_solution +=============================================================================== +All tests passed (1 assertion in 1 test case) +``` + +The test still passed on the GPU node, yay! + +## Step 7: Play with the examples! diff --git a/_sources/code_development/guides/sycl_enccs.md.txt b/_sources/code_development/guides/sycl_enccs.md.txt new file mode 100644 index 000000000..100d5fa51 --- /dev/null +++ b/_sources/code_development/guides/sycl_enccs.md.txt @@ -0,0 +1,137 @@ +--- +orphan: true +--- + +# ENCCS SYCL workshop + +The EuroCC National Competence Centre Sweden (ENCCS) has prepared course material for an +introductory workshop on SYCL spanning three half days. The course page can be found +[here](https://enccs.github.io/sycl-workshop), with the course material publicly available +[here](https://github.com/ENCCS/sycl-workshop). In the following we will demonstrate how +to compile and run the example code on Saga. + +## Step 1: Load necessary modules + +In order to compile the examples we will need CMake as well as a SYCL implementation that is compatible +with the Nvidia GPUs that we have available. 
On Saga we have hipSYCL and CMake installed globally, and +we will choose a CMake version that is compatible with the GCC toolchain that the hipSYCL module is +based upon, in this case `GCCcore/11.2.0`: + +```console +[me@login-1.SAGA ~]$ module load hipSYCL/0.9.2-GCC-11.2.0-CUDA-11.4.1 +[me@login-1.SAGA ~]$ module load CMake/3.22.1-GCCcore-11.2.0 +``` + +## Step 2: Download course material + +The course material can be downloaded from Github with the following command + +```console +[me@login-1.SAGA ~]$ git clone https://github.com/ENCCS/sycl-workshop.git +[me@login-1.SAGA ~]$ cd sycl-workshop +[me@login-1.SAGA ~/sycl-workshop]$ ls +content LICENSE LICENSE.code make.bat Makefile README.md requirements.txt +``` + +You will here find the lesson material under `content/` in the form of rst files +(best viewed through the official [web page](https://enccs.github.io/sycl-workshop). +The code exercises are located under `content/code/`, where you will find separate +folders for each day of the course, as well as a folder with useful code snippets. + +## Step 3: Configure with CMake + +When we configure the build we need to tell CMake which SYCL implementation we are +going to use (hipSYCL) and which target architecture we want to compile for; `omp` +for CPU targets and `cuda:sm_60` for the Nvidia P100 GPUs we have on Saga (`cuda:sm_80` +for Betzy's A100 GPUs). We need to create separate `build` directories for each of the +examples, here for the very first `00_hello`: + +```console +[me@login-1.SAGA ~/sycl-workshop]$ cd content/code/day-1/00_hello +[me@login-1.SAGA ~/sycl-workshop/content/code/day-1/00_hello]$ cmake -S . -B build -DHIPSYCL_TARGETS="omp;cuda:sm_60" +-- The CXX compiler identification is GNU 11.2.0 +-- Detecting CXX compiler ABI info +-- Detecting CXX compiler ABI info - done +-- Check for working CXX compiler: /cluster/software/GCCcore/11.2.0/bin/c++ - skipped +-- Detecting CXX compile features +-- Detecting CXX compile features - done +-- Looking for C++ include pthread.h +-- Looking for C++ include pthread.h - found +-- Performing Test CMAKE_HAVE_LIBC_PTHREAD +-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed +-- Looking for pthread_create in pthreads +-- Looking for pthread_create in pthreads - not found +-- Looking for pthread_create in pthread +-- Looking for pthread_create in pthread - found +-- Found Threads: TRUE +-- Configuring done +-- Generating done +-- Build files have been written to: $HOME/sycl-workshop/content/code/day-1/00_hello/build +``` + +### Step 4: Compile and run exercises + +The tutorial is organized such that you are expected to write your solution in the +`.cpp` template file of each exercise. Once you have configured a `build` +directory for a particular exercise, simply run `make` in the `build` directory: + +```console +[me@login-1.SAGA ~/sycl-workshop/content/code/day-1/00_hello]$ cd build +[me@login-1.SAGA ~/sycl-workshop/content/code/day-1/00_hello/build]$ make +[ 50%] Building CXX object CMakeFiles/hello.dir/hello.cpp.o +clang-13: warning: Unknown CUDA version. cuda.h: CUDA_VERSION=11040. Assuming the latest supported version 10.1 [-Wunknown-cuda-version] +[100%] Linking CXX executable hello +[100%] Built target hello +``` + +Please ignore the CUDA version warning form `clang-13`, it does not seem to make a +difference. 
This will build an executable with the same name as the exercise (``) +which can be launched with: + +```console +[me@login-1.SAGA ~/sycl-workshop/content/code/day-1/00_hello/build]$ ./hello +[hipSYCL Warning] backend_loader: Could not load backend plugin: /cluster/software/hipSYCL/0.9.2-GCC-11.2.0-CUDA-11.4.1/bin/../lib/hipSYCL/librt-backend-cuda.so +[hipSYCL Warning] libcuda.so.1: cannot open shared object file: No such file or directory +Running on: hipSYCL OpenMP host device +Hello, world! I'm sorry, Dave. I'm afraid I can't do that. - HAL +``` + +Don't mind the `[hipSYCL Warning]`, they appear since we are launching a GPU application +on the login node, which does not have the appropriate hardware drivers. The code is +still able to run, though, as we can see from the last two lines of output. This is +because we have compiled a fallback option that runs on CPU (`OpenMP host device`). + +## Step 5: Run exercises on GPU nodes + +In order to run the code on accelerators we to be granted GPU resources through Slurm. +We will here use an interactive session, where we get a login prompt on the GPU node, +which we can launch our applications + +```console +[me@login-1.SAGA ~]$ salloc --account= --time=1:00:00 --ntasks=1 --gpus=1 --partition=accel --mem=1G +salloc: Pending job allocation 5353133 +salloc: job 5353133 queued and waiting for resources +salloc: job 5353133 has been allocated resources +salloc: Granted job allocation 5353133 +salloc: Waiting for resource configuration +salloc: Nodes c7-2 are ready for job +[me@c7-2.SAGA ~]$ cd sycl-workshop/content/code/day-1/00_hello/build +[me@c7-2.SAGA ~/sycl-workshop/content/code/day-1/00_hello/build]$ ./hello +Running on: Tesla P100-PCIE-16GB +Hello, world! I'm sorry, Dave. I'm afraid I can't do that. - HAL +``` + +We see that the `[hipSYCL Warning]` is gone, since we now have the GPU drivers and +libraries available. We also see that the application is able to pick up the correct +hardware, which on Saga is Tesla P100 cards, and the program output is still the same, +indicating that the code was executed correctly also on the GPU. + +## Step 6: Compile and run solutions + +If you get stuck at some point there is also a suggested solution to each of the +exercises, located in the `solution/` folder within each exercise. There are now two +ways to build the solution code: either copy the solution file directly to replace +the exercise template file and compile as before, or create another `build` directory +under `solution/` following the above steps to configure, build and run. + +## Step 7: Play with the examples! diff --git a/_sources/code_development/guides/sycl_usm.md.txt b/_sources/code_development/guides/sycl_usm.md.txt new file mode 100644 index 000000000..a84ec4897 --- /dev/null +++ b/_sources/code_development/guides/sycl_usm.md.txt @@ -0,0 +1,380 @@ +--- +orphan: true +--- + +(sycl_usm)= + +# Unified Shared Memory with SYCL + +This example demonstrates: + +1. how to allocate USM pointers in SYCL +2. how to submit work tasks to a SYCL device queue +3. how to write a parallel kernel function in SYCL +4. how to perform `memcpy` operations locally in device memory +5. how to perform a reduction operation in a SYCL kernel function + +In this tutorial we will SYCL-ify a somewhat more realistic example, which is taken from the +{ref}`OpenACC tutorial `. The serial version of the Jacobi iteration program has here been +slightly modified for C++ (and in anticipation of what is to come): + +```{eval-rst} +.. 
literalinclude:: hipsycl/jacobi_serial.cpp + :language: cpp +``` + +```{eval-rst} +:download:`jacobi_serial.cpp <./hipsycl/jacobi_serial.cpp>` +``` + +## Compile and run reference serial code + +We can compile and run the reference serial version of the code on Saga. First we load a recent version +of the GNU C++ compiler, and compile a `jacobi_serial` target with `-Ofast` optimization: + +```console +[me@login-1.SAGA ~]$ module load GCC/10.2.0 +[me@login-1.SAGA ~]$ g++ -Ofast -o jacobi_serial jacobi_serial.cpp +``` + +Hopefully no errors occurred on this step, and we are ready to run a reference benchmark on a compute node: + +```console +[me@login-1.SAGA ~]$ srun --account= --time=0:10:00 --ntasks=1 --cpus-per-task=1 --mem=1G time ./jacobi_serial +srun: job 3661704 queued and waiting for resources +srun: job 3661704 has been allocated resources +Iterations : 7214 | Error : 0.00999874 +37.91user 0.00system 0:38.08elapsed 99%CPU (0avgtext+0avgdata 32880maxresident)k +3844inputs+0outputs (17major+1097minor)pagefaults 0swaps +``` + +The execution should take around 40 seconds on a single core (38.08s in this case, the `elapsed` value in +the output). We notice also the printed output from our program, which states that it ran a total of 7214 +iterations before reaching an error below 0.01, so this is the number of times we enter the main `while` loop. + +## Introducing SYCL and Unified Shared Memory + +In contrast to the directives based approaches to GPU programming like OpenMP and OpenACC, which can often be achieved +by strategically placed compiler directives into the existing code, porting to SYCL might require a bit more changes to the +structure and algorithms of the program. SYCL supports fully asynchronous execution of tasks using C++ concepts like futures +and events, and provides two main approaches for data management: using Unified Shared Memory (USM) or Buffers. +USM uses familiar C/C++-like memory pointers in a _unified virtual address space_, which basically means that you +can use the same pointer address on the host and on the device. This approach will likely be more familiar to the +traditional C/C++ programmer, but it requires explicit management of all data dependences and synchronization, which can +be achieved by adding `wait` statements or by capturing an `event` from one task and passing it on as an explicit dependency +for other tasks. Buffers, on the other hand, can only be accessed through special `accessor` objects, which are used by +the runtime to automatically construct a dependecy graph for all the tasks, and thus make sure that they are executed in +the correct order. + +In this tutorial we will limit ourselves to the USM approach, and we will for simplicity attach explicit `wait` statements +to all the tasks, which effectively deactivates any asynchronous execution. + +### Step 1: Create a SYCL queue + +Back to the Jacobi source code, we start by creating a `sycl::queue`. This object is "attached" +to a particular device and is used to submit tasks for execution, in general asynchronously +(out-of-order). As in the {ref}`Hello World ` SYCL example, we will print out the name +of the device to make sure that we pick up the correct hardware: + +```{eval-rst} +.. literalinclude:: hipsycl/jacobi_shared.cpp + :language: cpp + :lines: 5-29 + :emphasize-lines: 4, 19-22 +``` + +### Step 2: Allocate USM memory + +USM pointers can be allocated in three ways: `malloc_host`, `malloc_device` or `malloc_shared`. 
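+
+To make the distinction concrete, the following sketch (not taken from the Jacobi example; the size `N` and the pointer names are just illustrative) shows how each flavour is allocated against a `sycl::queue` and later released with `sycl::free`:
+
+```cpp
+#include <CL/sycl.hpp>
+#include <cstddef>
+
+int main() {
+  namespace sycl = cl::sycl;
+  sycl::queue Q;
+  constexpr std::size_t N = 4096;
+
+  // Host USM: stays in host memory, device accesses travel over the bus
+  float *h_ptr = sycl::malloc_host<float>(N, Q);
+  // Device USM: lives in device memory, only dereferenceable inside kernels
+  float *d_ptr = sycl::malloc_device<float>(N, Q);
+  // Shared USM: migrated automatically between host and device on access
+  float *s_ptr = sycl::malloc_shared<float>(N, Q);
+
+  // ... submit kernels to Q that read from or write to these pointers ...
+
+  // All three flavours are released the same way
+  sycl::free(h_ptr, Q);
+  sycl::free(d_ptr, Q);
+  sycl::free(s_ptr, Q);
+  return 0;
+}
+```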
+In this example we will use _shared_ memory pointers only, which are pointer addresses that can be +accessed by both the host and the device. Furthermore, the _physical_ location of such shared +data can actually change during program execution, and the runtime will move the data back +and forth as the access pattern changes from host to device and vice versa. This will make +sure that the data can become accessible from _local_ memory on both the host and the device and it +_allows_ for fast access on the device as long as the data is _allowed_ to reside on the +device local memory throughout the execution of a kernel function, i.e. no data accesses from +the host should occur in the mean time, which would result in costly data migration. + +The changes we need to make to our example code in order to use shared memory is to replace +the stack allocated arrays (`arr` and `tmp`) with `sycl::malloc_shared`: + +```{eval-rst} +.. literalinclude:: hipsycl/jacobi_shared.cpp + :language: cpp + :lines: 28-45 + :emphasize-lines: 5,14 +``` + +We have added a `_s` suffix to the variable name just to indicate it's a shared pointer. +Note that we pass our `sycl::queue` (`Q`) to this memory allocation as it carries the +information of which device this memory should be shared (there could be several queues +with different devices). We see also that the shared data arrays can be filled and +`std::memcpy`'d in exactly the same way as before by the host, so there's no change to +how the host interacts with this data. + +```{note} +Memory allocated with `sycl::malloc_host` will also be "accessible" from the device, but it +will always be fetched from host memory and passed to the device through a memory bus, which +is _always_ going to be _much_ slower than fetching directly from local memory on the device. +The fast alternative to shared memory is to use `sycl::malloc_host` and `sycl::malloc_device` +and then _manually_ transfer the data between the host and the device. This is a bit less +convenient, but it gives more fine-grained control to the programmer. +``` + +### Step 3: Implement the parallel kernel + +We now come to the main work sharing construct in our example (beware, this is a mouthful): + +```{eval-rst} +.. literalinclude:: hipsycl/jacobi_shared.cpp + :language: cpp + :lines: 46-80 + :emphasize-lines: 6-23 +``` + +We will not discuss in detail everything that is going on here, please refer to standard SYCL +literature for more in-depth explanations, e.g. the free e-book on +[Data Parallel C++](https://www.apress.com/gp/book/9781484255735). The take-home message is that +we `submit` to the queue a kernel function which represents a single iteration of a `parallel_for` +loop for execution on the device. Some (probably unnecessary) logic is added to extract the +two array indices `i,j` from the single loop iteration index, but otherwise the body of the kernel +is the same as the nested loop we had in the serial version, except that we need to extract the +computation of the maximum error from this main loop. The reason for this is that the kernel +code will be executed in arbitrary order by many different threads on the device, and no single +thread will be able to compute the true maximum locally. + +Since the memory was allocated as `malloc_shared` between the host and the device, the reduction +operation to find the maximum error, as well as the `std::memcpy` operation between `tmp_s` and +`arr_s`, can be performed by the host. 
Keep in mind, though, that this will require a _migration_ +of the shared data back and forth between the device and the host at every iteration of the +`while` loop (more than 7000 iterations), and we will see the effect of this in the timings below. + +A critical point in the code snippet above is the `wait()` statement on the tail of the `Q.submit()` +call. This will tell the host to wait for further execution until all the work in the parallel +kernel has been completed. This effectively deactivates asynchronous execution of the device tasks. + +```{tip} +`Q.submit(...).wait();` is a concatenation of the slightly more expressive `Q.submit(...); Q.wait();`, +which emphasizes that it's the entire queue that is drained by the `wait`, not just the task loop +that was just submitted. This means that you can submit several independent tasks to the queue for +asynchronous execution, and then drain them all in `Q.wait()` at a later stage. +``` + + +### Step 4: Free USM memory + +Finally, as always when allocating raw pointers in C++, one has to manually free the memory: + +```{eval-rst} +.. literalinclude:: hipsycl/jacobi_shared.cpp + :language: cpp + :lines: 82-89 + :emphasize-lines: 4-5 +``` + +## Compiling for CPU + +With the adjustments discussed above we end up with the following source code: + +```{eval-rst} +.. literalinclude:: hipsycl/jacobi_shared.cpp + :language: cpp +``` + +```{eval-rst} +:download:`jacobi_shared.cpp <./hipsycl/jacobi_shared.cpp>` +``` + +We can compile an `omp` target of this code on Saga using the `syclcc` compiler wrapper from +the `hipSYCL` module (feel free to ignore the warning): + +```console +[me@login-1.SAGA ~]$ module load hipSYCL/0.9.1-gcccuda-2020b +[me@login-1.SAGA ~]$ syclcc --hipsycl-targets=omp -Ofast -o jacobi_shared_cpu jacobi_shared.cpp +clang-11: warning: Unknown CUDA version. cuda.h: CUDA_VERSION=11010. Assuming the latest supported version 10.1 [-Wunknown-cuda-version] +``` + +And we can run it on a single compute core (please ignore also the hipSYCL warning, which comes +when you run on compute nodes without GPU resources): + +```console +[me@login-1.SAGA ~]$ srun --account= --time=0:10:00 --ntasks=1 --cpus-per-task=1 --mem=1G time ./jacobi_shared_cpu +srun: job 3671849 queued and waiting for resources +srun: job 3671849 has been allocated resources +[hipSYCL Warning] backend_loader: Could not load backend plugin: /cluster/software/hipSYCL/0.9.1-gcccuda-2020b/bin/../lib/hipSYCL/librt-backend-cuda.so +[hipSYCL Warning] libcuda.so.1: cannot open shared object file: No such file or directory +Chosen device: hipSYCL OpenMP host device +Iterations : 7229 | Error : 0.00999993 +65.29user 0.37system 1:05.89elapsed 99%CPU (0avgtext+0avgdata 34300maxresident)k +10337inputs+0outputs (47major+2099minor)pagefaults 0swaps +``` + +We see from the "Chosen device" output of our program that the `sycl::queue` was bound to the +"hipSYCL OpenMP host device", which means that it is using the host CPU as a "device". +So this took about a minute to run, which is some 50% _slower_ than the reference serial run +we did above. However, one of the benefits of SYCL is that it can use the available CPU threads +of the host as "device" for offloading. 
Let's try to run the same code on 20 CPU cores: + +```console +[me@login-1.SAGA ~]$ srun --account= --time=0:10:00 --ntasks=1 --cpus-per-task=20 --mem=1G time ./jacobi_shared_cpu +srun: job 3671925 queued and waiting for resources +srun: job 3671925 has been allocated resources +[hipSYCL Warning] backend_loader: Could not load backend plugin: /cluster/software/hipSYCL/0.9.1-gcccuda-2020b/bin/../lib/hipSYCL/librt-backend-cuda.so +[hipSYCL Warning] libcuda.so.1: cannot open shared object file: No such file or directory +Chosen device: hipSYCL OpenMP host device +Iterations : 7229 | Error : 0.00999993 +594.42user 16.34system 0:30.84elapsed 1980%CPU (0avgtext+0avgdata 45092maxresident)k +10337inputs+0outputs (47major+2267minor)pagefaults 0swaps +``` + +Alright, we're down to ~30s, which is somewhat faster than the serial reference (still not overly +impressive given that we spend 20 times more resources). Let's see if we can do better on the GPU. + + +## Compiling for Nvidia GPUs + +When compiling for the P100 Nvidia GPUs on Saga we simply have to change the `hipsycl-targets` +from `omp` to `cuda:sm_60`, and then submit a job with GPU resources: + +```console +[me@login-1.SAGA ~]$ syclcc --hipsycl-targets=cuda:sm_60 -Ofast -o jacobi_shared_gpu jacobi_shared.cpp +[me@login-1.SAGA ~]$ srun --account= --time=0:10:00 --ntasks=1 --gpus-per-task=1 --mem=1G --partition=accel time ./jacobi_shared_gpu +srun: job 3672238 queued and waiting for resources +srun: job 3672238 has been allocated resources +Chosen device: Tesla P100-PCIE-16GB +Iterations : 7230 | Error : 0.00999916 +77.14user 54.72system 2:12.42elapsed 99%CPU (0avgtext+0avgdata 156600maxresident)k +11393inputs+0outputs (694130major+7440minor)pagefaults 0swaps +``` + +Good news first: the chosen device is now Tesla P100-PCIE-16GB, which is the name of the graphics +card on the Saga GPU nodes. Our application was actually able to pick up the correct device. +The bad news is of course the elapsed time of 2m12s, which is _significantly_ slower than both +the serial and OpenMP versions above. We already hinted at the reason for this poor performance, +so let's see if we can fix it. + +## Optimizing for GPU performance + + +### Step 5: Move data between USM pointers on the device + +In this example we have two `std::memcpy` performed by the host on the USM shared pointer. The first one +is a single operation before we enter the main `while` loop, while the other is performed at the end of +every loop iteration. Since this operation is performed by the host CPU, it will implicitly invoke a +data migration in case the data happens to be located in device memory when the function is called. +Since we are copying data _between_ two USM pointers, we can actually perform this `memcpy` directly +on the device, and thus avoid the costly data migration. + +The `memcpy` that we do _before_ the main work loop in our example could be left unchanged. +This single function call should have no noticeable impact on the performance since the data is already +located on the host after the initialization. We will still submit also this `memcpy` operation to the +`sycl::queue` for execution on the device since it will serve as a preporatory step of migrating the +data to device memory _in advance_ of the upcoming kernel execution. + +```{eval-rst} +.. 
literalinclude:: hipsycl/jacobi_memcpy.cpp + :language: cpp + :lines: 44-80 + :emphasize-lines: 2, 34 +``` + +```{eval-rst} +:download:`jacobi_memcpy.cpp <./hipsycl/jacobi_memcpy.cpp>` +``` + +As we can see from the code snippet above, there are two changes to the `memcpy` function calls: +(1) `std::` is replaced by `Q.` and (2) we have put a `.wait()` on the tail of the function call. +(1) will offload the the work to be performed by the device rather than the host, while (2) will +hold back the host from further execution until the `Q` is empty (for now the queue holds only a +single `memcpy` task). + +In contrast to the first `memcpy`, the one in the loop is critical for performance. +If this operation is performed as `std::memcpy` by the host, it will require an implicit data +migration from device to host (and back) _in every iteration_ of the `while` loop. Making this +a `Q.memcpy` instead will allow the copy to be executed locally in device memory without ever +involving the host. + +```{tip} +The `Q.memcpy(...)` syntax is actually a shorthand for something a bit more cumbersome +`Q.submit([&](sycl::handler &h) { h.memcpy(...); })`, which is more in line with the syntax of the +kernel submission above. +``` + +### Step 6: Add reduction object to compute maximum error + +There's still one more operation inside the `while` loop that needs to be considered, and that is +the computation of the maximum error in each iteration. This could not be straightforwardly included +in the kernel function, so we left it as a separate loop to be executed by the host after the kernel +has completed. However, just as for the `memcpy` that we discussed above, this will also imply a costly +data migration back to the host at every iteration. The way around this problem is to attach a +`sycl::reduction` operation to this error variable, which will allow us to include the maximum reduction +back into the main kernel function. The syntax to achieve this is somewhat involved: + +```{eval-rst} +.. literalinclude:: hipsycl/jacobi_reduction.cpp + :language: cpp + :lines: 40-90 + :emphasize-lines: 3-4, 11-12, 17, 21-22, 31, 43 +``` + +```{eval-rst} +:download:`jacobi_reduction.cpp <./hipsycl/jacobi_reduction.cpp>` +``` + +First of all, we need to allocate the variable that is collecting the error as a USM pointer so that it +is accessible on the device. We do this by `sycl::malloc_shared` of a single `float`. Then we need to wrap this USM +pointer into a `sycl::reduction` operation, and pass it as an extra argument to the `parallel_for` kernel. +Notice that the `max_err` object is passed into the kernel as the `max` argument to the lambda function. +Then we call the `combine()` function of this `sycl::reduction` object, which will perform the +`sycl::maximum` operation on the data, and thus compute the _true_ maximum among all the entries +in a thread safe manner. Finally, since the `err_s` pointer is shared between device and host, the host +will still have access to the final error and can print it out in the end. 
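+
+Stripped of the Jacobi-specific details, the pattern looks roughly like the self-contained sketch below. This is not the code from `jacobi_reduction.cpp`; the variable names are illustrative and it assumes the SYCL 2020 reduction interface as provided by recent hipSYCL versions:
+
+```cpp
+#include <CL/sycl.hpp>
+#include <cstddef>
+#include <iostream>
+
+int main() {
+  namespace sycl = cl::sycl;
+  sycl::queue Q;
+  constexpr std::size_t N = 1000;
+
+  // USM allocations: one scalar for the result, one array to reduce over
+  float *err_s  = sycl::malloc_shared<float>(1, Q);
+  float *data_s = sycl::malloc_shared<float>(N, Q);
+  for (std::size_t i = 0; i < N; ++i) data_s[i] = static_cast<float>(i);
+  *err_s = 0.0f;
+
+  Q.submit([&](sycl::handler &cgh) {
+    // Wrap the USM pointer in a reduction object with a maximum combiner
+    auto max_err = sycl::reduction(err_s, sycl::maximum<float>());
+    cgh.parallel_for(sycl::range<1>(N), max_err,
+                     [=](sycl::id<1> idx, auto &max) {
+                       // combine() folds this work-item's value into the result
+                       max.combine(data_s[idx[0]]);
+                     });
+  }).wait();
+
+  // err_s is shared, so the host can read the final maximum directly
+  std::cout << "max = " << *err_s << std::endl;
+
+  sycl::free(err_s, Q);
+  sycl::free(data_s, Q);
+  return 0;
+}
+```
+
+The essential point is that the reduction variable itself is a USM pointer visible to both host and device, while the thread-safe combination of all partial maxima is handled by the runtime instead of by the kernel body.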
+ +## Compiling and running optimized code + +We now compile a `sm_60` target of the final version, and run on a GPU node: + +```console +[me@login-1.SAGA ~]$ syclcc --hipsycl-targets=cuda:sm_60 -Ofast -o jacobi_reduction_gpu jacobi_reduction.cpp +[me@login-1.SAGA ~]$ srun --account= --time=0:10:00 --ntasks=1 --gpus-per-task=1 --mem=1G --partition=accel time ./jacobi_reduction_gpu +srun: job 3808343 queued and waiting for resources +srun: job 3808343 has been allocated resources +Chosen device: Tesla P100-PCIE-16GB +Iterations : 7230 | Error : 0.00999916 +2.03user 3.83system 0:06.49elapsed 90%CPU (0avgtext+0avgdata 156604maxresident)k +11457inputs+0outputs (1030major+6413minor)pagefaults 0swaps +``` + +We see that by making sure that the data _remains_ in device local memory throughout the execution of the +kernel, we have reduced the overall run time to about six seconds. Notice also that most of this time is +spent in `system` calls setting up the program, and only two seconds is spent by actually running the program. +This system overhead should (hopefully) remain at a few seconds also for larger application when the total runtime +is much longer. + +## Summary + +In this guide we have transitioned a serial C++ code into a small GPU application using the SYCL framework. +We have taken several steps from the initial serial implementation to the final accelerated version, using +concepts like Unified Shared Memory and a SYCL reduction operation. We have seen that the path to actual +_accelerated_ code is not necessarily straightforward, as several of the intermediate steps shows execution +times significantly _slower_ than the original serial code. The steps can be summarized as follows: + +| Version | CPUs | GPUs | Run time | Relative | +|:------------------------:|:--------:|:------:|:-------------:|:---------:| +| `jacobi_serial` | 1 | 0 | 38.1 sec | 100% | +| `jacobi_shared` | 1 | 0 | 65.9 sec | 173% | +| `jacobi_shared` | 20 | 0 | 30.8 sec | 81% | +| `jacobi_shared` | 1 | 1 | 132.4 sec | 348% | +| `jacobi_memcpy` | 1 | 0 | 110.2 sec | 289% | +| `jacobi_memcpy` | 20 | 0 | 33.9 sec | 89% | +| `jacobi_memcpy` | 1 | 1 | 93.8 sec | 246% | +| `jacobi_reduction` | 1 | 0 | 115.1 sec | 302% | +| `jacobi_reduction` | 20 | 0 | 21.6 sec | 56% | +| `jacobi_reduction` | 1 | 1 | 6.5 sec | 17% | + +We have with this example shown in some detail how to compile and run a SYCL code on Saga, and how to make use of +the available GPU resources there. We have highlighted some basic SYCL _syntax_, but we have not gone into much +detail on what goes on under the hood, or how to write _good_ and _efficient_ SYCL code. This simple example only +scratches the surface of what's possible within the framework, and we encourage the reader to check out other more +complete resources, like the [Data Parallel C++](https://www.apress.com/gp/book/9781484255735) +e-book, before venturing into a real-world porting project using SYCL. diff --git a/_sources/code_development/guides/tensorflow_gpu.md.txt b/_sources/code_development/guides/tensorflow_gpu.md.txt new file mode 100644 index 000000000..ba703c42e --- /dev/null +++ b/_sources/code_development/guides/tensorflow_gpu.md.txt @@ -0,0 +1,680 @@ +--- +orphan: true +--- + +```{index} GPU; TensorFlow on GPU, TensorFlow; TensorFlow on GPU +``` + +(tensorflow)= + +# TensorFlow on GPU +The intention of this guide is to teach students and researchers with access to +[Sigma2][sigma2] how to use machine learning libraries on CPU and GPU. 
The guide +is optimized for [`TensorFlow 2`][tensorflow], however, we hope that if you +utilize other libraries this guide still holds some value. Do not hesitate to +{ref}`contact us ` +for additional assistance. + +For the rest of this guide many of the examples ask for Sigma2 resources with +GPU. This is achieved with the `--partition=accel --gpus=1` +({ref}`job_scripts_saga_accel`), +however, `TensorFlow` does not require the use of a GPU so +for testing it is recommended to not ask for GPU resources (to be scheduled +quicker) and then, once your experiments are ready to run for longer, add back +inn the request for GPU. + +A complete example, both `python` and Slurm file, can be found at +{download}`files/mnist.py` +and +{download}`files/run_mnist.sh`. + +# Installing `python` libraries +The preferred way to "install" the necessary machine learning libraries is to +load one of the pre-built {ref}`modules ` below. By using the built-in modules +any required third-party module is automatically loaded and ready for use, +minimizing the amount of packages to load. + +- `TensorFlow/2.2.0-fosscuda-2019b-Python-3.7.4` +- `PyTorch/1.4.0-fosscuda-2019b-Python-3.7.4` +- `Theano/1.0.4-fosscuda-2019b-Python-3.7.4` +- `Keras/2.3.1-fosscuda-2019b-Python-3.7.4` + +```{note} +When loading modules pressing `` gives you +autocomplete options. +``` + +```{note} +It can be useful to do an initial `module purge` to +ensure nothing from previous experiments is loaded before +loading modules for the first time. +``` + +```{note} +Modules are regularly updated so if you would like a newer +version, than what is listed above, use `module avail | +less` to browse all available packages. +``` + +If you need additional python libraries it is recommended to create a +[`virtualenv`][virtualenv] environment and install packages there. This +increases reproducibility and makes it easy to test different packages without +needing to install libraries globally. + +Once the desired module is loaded, create a new `virtualenv` environment as +follows. + +```bash +# The following will create a new folder called 'tensor_env' which will hold +# our 'virtualenv' and installed packages +$ virtualenv -p python3 tensor_env +# Next we need to activate the new environment +# NOTE: The 'Python' module loaded above must be loaded for activation to +# function, this is important when logging in and out or doing a 'module purge' +$ source tensor_env/bin/activate +# If you need to do other python related stuff outside the virtualenv you will +# need to 'deactivate' the environment with the following +$ deactivate +``` + +Once the environment is activated, new packages can be installed by using `pip +install `. If you end up using additional packages make sure that the +`virtualenv` is activated in your {ref}`job-scripts`. + +```sh +# Often useful to purge modules before running experiments +module purge + +# Load machine learning library along with python +module load TensorFlow/2.2.0-fosscuda-2019b-Python-3.7.4 +# Activate virtualenv +source $SLURM_SUBMIT_DIR/tensor_env/bin/activate +``` + +## Manual route +If you still would like to install packages through `pip` the following will +guide you through how to install the latest version of `TensorFlow` and load the +necessary packages for GPU compute. + +To start, load the desired `python` version - here we will use the newest as of +writing. 
+ +```bash +$ module load Python/3.8.2-GCCcore-9.3.0 +``` + +```{warning} +This has to be done on the login node so that we have access to +the internet and can download `pip` packages. +``` + +Then create a [virtual environment][virtualenv] which we will install packages +into. + +```bash +# The following will create a new folder called 'tensor_env' which will hold +# our 'virtualenv' and installed packages +$ virtualenv -p python3 tensor_env +# Next we need to activate the new environment +# NOTE: The 'Python' module loaded above must be loaded for activation to +# function, this is important when logging in and out or doing a 'module purge' +$ source tensor_env/bin/activate +# If you need to do other python related stuff outside the virtualenv you will +# need to 'deactivate' the environment with the following +$ deactivate +``` + +Next we will install the latest version of [`TensorFlow 2`][tf_install] which +fortunately should support GPU compute directly without any other prerequisites. + +```bash +$ pip install tensorflow +``` + +To ensure that the above install worked, start an interactive session with +`python` and run the following: + +```python +>>> import tensorflow as tf +>>> tf.test.is_built_with_cuda() +# Should respond with 'True' if it worked +``` + +The import might show some error messages related to loading CUDA, however, for +now we just wanted to see that `TensorFlow` was installed and pre-compiled with +`CUDA` (i.e. GPU) support. + +This should be it for installing necessary libraries. Below we have listed the +modules which will have to be loaded for GPU compute to work. The next code +snippet should be in your {ref}`job-scripts` so that the correct modules +are loaded on worker nodes and the virtual environment is activated. + +```sh +# Often useful to purge modules before running experiments +module purge + +# Load desired modules (replace with the exact modules you used above) +module load Python/3.8.2-GCCcore-9.3.0 +module load CUDA/10.1.243 +module load cuDNN/7.6.4.38 +# Activate virtualenv +source $SLURM_SUBMIT_DIR/tensor_env/bin/activate +``` + +# Loading data +For data that you intend to work on it is simplest to upload to your home area +and if the dataset is small enough simply load from there onto the worker node. + +To upload data to use [`rsync`][rsync] to transfer data: + +```bash +# On your own machine, upload the dataset to your home folder +$ rsync -zh --info=progress2 -r /path/to/dataset/folder @saga.sigma2.no:~/. +``` + +For large amounts of data it is recommended to load into your {ref}`project-area` +to avoid filling your home area. + +To retrieving the path to the dataset, we can utilize python's [`os`][python_os] +module to access the variable, like so: + +```python +# ...Somewhere in your experiment python file... +# Load 'os' module to get access to environment and path functions +import os + +# Path to dataset +dataset_path = os.path.join(os.environ['SLURM_SUBMIT_DIR'], 'dataset') +``` + + +## Loading built-in datasets +First we will need to download the dataset on the login node. Ensure that the +correct modules are loaded. Next open up an interactive python session with +`python`, then: + +```python +>>> tf.keras.datasets.mnist.load_data() +``` + +This will download and cache the MNIST dataset which we can use for training +models. 
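+
+By default Keras caches the downloaded data under `~/.keras/datasets/` in your
+home directory (the exact location may vary between versions), which the worker
+nodes can also read. If the download worked you should find a file such as
+`mnist.npz` in that folder:
+
+```bash
+$ ls -lh ~/.keras/datasets/
+```
+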
Load the data in your training file like so: + +```python +(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data() +``` + +# Saving model data +For saving model data and weights we suggest the `TensorFlow` [built-in +checkpointing and save functions][tensorflow_ckpt]. + +The following code snippet is a more or less complete example of how to load +built-in data and save weights. + +```python +#!/usr/bin/env python + +# Assumed to be 'mnist.py' + +import tensorflow as tf +import os + +# Access storage path for '$SLURM_SUBMIT_DIR' +storage_path = os.path.join(os.environ['SLURM_SUBMIT_DIR'], + os.environ['SLURM_JOB_ID']) + +# Load dataset +mnist = tf.keras.datasets.mnist +(x_train, y_train), (x_test, y_test) = mnist.load_data() +x_train, x_test = x_train / 255., x_test / 255. + +def create_model(): + model = tf.keras.models.Sequential([ + tf.keras.layers.Flatten(input_shape=(28, 28)), + tf.keras.layers.Dense(512, activation='relu'), + tf.keras.layers.Dropout(0.2), + tf.keras.layers.Dense(10, activation='softmax') + ]) + model.compile(optimizer='adam', + loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=['accuracy']) + return model + +# Create and display summary of model +model = create_model() +# Output, such as from the following command, is outputted into the '.out' file +# produced by 'sbatch' +model.summary() + +# Save model in TensorFlow format +model.save(os.path.join(storage_path, "model")) + +# Create checkpointing of weights +ckpt_path = os.path.join(storage_path, "checkpoints", "mnist-{epoch:04d}.ckpt") +ckpt_callback = tf.keras.callbacks.ModelCheckpoint( + filepath=ckpt_path, + save_weights_only=True, + verbose=1) + +# Save initial weights +model.save_weights(ckpt_path.format(epoch=0)) + +# Train model with checkpointing +model.fit(x_train[:1000], y_train[:1000], + epochs=50, + callbacks=[ckpt_callback], + validation_data=(x_test[:1000], y_test[:1000]), + verbose=0) +``` + +The above file can be run with the following {ref}`job-scripts` which will +ensure that correct modules are loaded and results are copied back into your +home directory. + +```sh +#!/usr/bin/bash + +# Assumed to be 'mnist_test.sh' + +#SBATCH --account= +#SBATCH --job-name= +#SBATCH --ntasks=1 +#SBATCH --mem-per-cpu=8G +## The following line can be omitted to run on CPU alone +#SBATCH --partition=accel --gpus=1 +#SBATCH --time=00:30:00 + +# Purge modules and load tensorflow +module purge +module load TensorFlow/2.2.0-fosscuda-2019b-Python-3.7.4 +# List loaded modules for reproducibility +module list + +# Run python script +python $SLURM_SUBMIT_DIR/mnist.py +``` + +Once these two files are located on a Sigma2 resource we can run it with: + +```bash +$ sbatch mnist_test.sh +``` + +And remember, in your code it is important to load the latest checkpoint if +available, which can be retrieved with: + +```python +# Load weights from a previous run +ckpt_dir = os.path.join(os.environ['SLURM_SUBMIT_DIR'], + "", + "checkpoints") +latest = tf.train.latest_checkpoint(ckpt_dir) + +# Create a new model instance +model = create_model() + +# Load the previously saved weights if they exist +if latest: + model.load_weights(latest) +``` + +# Using `TensorBoard` +[`TensorBoard`][tensorboard] is a nice utility for comparing different runs and +viewing progress during optimization. To enable this on Sigma2 resources we will +need to write data into our home area and some steps are necessary for +connecting and viewing the board. 
+ +We will continue to use the MNIST example from above. The following changes are +needed to enable `TensorBoard`. + +```python +# In the 'mnist.py' script +import datetime + +# We will store the 'TensorBoard' logs in the folder where the 'mnist_test.sh' +# file was launched and create a folder like 'logs/fit'. In your own code we +# recommended that you give these folders names that you will recognize, +# the last folder uses the time when the program was started to separate related +# runs +log_dir = os.path.join(os.environ['SLURM_SUBMIT_DIR'], + "logs", + "fit", + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")) +tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1) + +# Change the last line, where we fit our data in the example above, to also +# include the TensorBoard callback +model.fit(x_train[:1000], y_train[:1000], + epochs=50, + # Change here: + callbacks=[ckpt_callback, tensorboard_callback], + validation_data=(x_test[:1000], y_test[:1000]), + verbose=0) +``` + +Once you have started a job with the above code embedded, or have a previous run +which created a `TensorBoard` log, it can be viewed as follows. + +1. Open up a terminal and connect to `Saga` as usual. +2. In the new terminal on `Saga`, load `TensorFlow` and run `tensorboard + --logdir=/path/to/logs/fit --port=0`. +3. In the output from the above command note which port `TensorBoard` has + started on, the last line should look something like: `TensorBoard 2.1.0 at + http://localhost:44124/ (Press CTRL+C to quit)`. +4. Open up another terminal and this time connect to `Saga` using the following: + `ssh -L 6006:localhost: @saga.sigma2.no` where `` is + the port reported from step `3` (e.g. `44124` in our case). +5. Open your browser and go to `localhost:6006`. + +# Advance topics +## Using multiple GPUs +Since all of the GPU machines on `Saga` have four GPUs it can be beneficial for +some workloads to distribute the work over more than one device at a time. This +can be accomplished with the [`tf.distribute.MirroredStrategy`][mirrored]. + +```{warning} +As of writing, only the `MirroredStrategy` is fully +supported by `TensorFlow` which is limited to one +node at a time. +``` + +We will, again, continue to use the MNIST example from above. However, as we +need some larger changes to the example we will recreate the whole example and +try to highlight changes. + +```python +#!/usr/bin/env python + +# Assumed to be 'mnist.py' + +import datetime +import os +import tensorflow as tf + +# Access storage path for '$SLURM_SUBMIT_DIR' +storage_path = os.path.join(os.environ['SLURM_SUBMIT_DIR'], + os.environ['SLURM_JOB_ID']) + +## --- NEW --- +strategy = tf.distribute.MirroredStrategy() +print(f"Number of devices: {strategy.num_replicas_in_sync}") + +# Calculate batch size +# For your own experiments you will likely need to adjust this based on testing +# on GPUs to find the 'optimal' size +BATCH_SIZE_PER_REPLICA = 64 +BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync + +# Load dataset +mnist = tf.keras.datasets.mnist +(x_train, y_train), _ = mnist.load_data() +x_train = x_train / 255. 
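+
+# NOTE: the 'BATCH_SIZE' used below is the *global* batch size;
+# 'MirroredStrategy' splits every batch across the replicas, so each GPU
+# processes 'BATCH_SIZE_PER_REPLICA' samples per step.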
+## --- NEW --- +# NOTE: We need to create a 'Dataset' so that we can process the data in +# batches +train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(60000).repeat().batch(BATCH_SIZE) + +def create_model(): + model = tf.keras.models.Sequential([ + tf.keras.layers.Flatten(input_shape=(28, 28)), + tf.keras.layers.Dense(512, activation='relu'), + tf.keras.layers.Dropout(0.2), + tf.keras.layers.Dense(10, activation='softmax') + ]) + model.compile(optimizer='adam', + loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=['accuracy']) + return model + +# Create and display summary of model +## --- NEW --- +with strategy.scope(): + model = create_model() +# Output, such as from the following command, is outputted into the '.out' file +# produced by 'sbatch' +model.summary() +log_dir = os.path.join(os.environ['SLURM_SUBMIT_DIR'], + "logs", + "fit", + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")) +tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1) + +# Save model in TensorFlow format +model.save(os.path.join(storage_path, "model")) + +# Create checkpointing of weights +ckpt_path = os.path.join(storage_path, "checkpoints", "mnist-{epoch:04d}.ckpt") +ckpt_callback = tf.keras.callbacks.ModelCheckpoint( + filepath=ckpt_path, + save_weights_only=True, + verbose=1) + +# Save initial weights +model.save_weights(ckpt_path.format(epoch=0)) + +# Train model with checkpointing +model.fit(train_dataset, + epochs=50, + steps_per_epoch=70, + callbacks=[ckpt_callback, tensorboard_callback]) +``` + +Next we will use a slightly altered job script to ask for two GPUs to see if the +above works. + +```sh +#!/usr/bin/bash + +# Assumed to be 'mnist_test.sh' + +#SBATCH --account= +#SBATCH --job-name= +#SBATCH --ntasks=1 +#SBATCH --mem-per-cpu=8G +#SBATCH --partition=accel --gpus=2 +#SBATCH --time=00:30:00 + +# Purge modules and load tensorflow +module purge +module load TensorFlow/2.2.0-fosscuda-2019b-Python-3.7.4 +# List loaded modules for reproducibility +module list + +# Run python script +python $SLURM_SUBMIT_DIR/mnist.py +``` + +## Distributed training on multiple nodes +To utilize more than four GPUs we will turn to the [`Horovod`][hvd] project +which supports several different machine learning libraries and is capable of +utilizing `MPI`. `Horovod` is responsible for communicating between different +nodes and perform gradient computation, averaged over the different nodes. + +Utilizing this library together with `TensorFlow 2` requires minimal changes, +however, there are a few things to be aware of in regards to scheduling with +`Slurm`. The following example is based on the [official `TensorFlow` +example][hvd_tf_ex]. + +To install `Horovod` you will need to create a `virtualenv` as described above. +Then once activated install the `Horovod` package with support for +[`NCCL`][nccl]. + +```sh +# This assumes that you have activated a 'virtualenv' +# $ source tensor_env/bin/activate +$ HOROVOD_GPU_OPERATIONS=NCCL pip install horovod +``` + +Then we can run our training using just a few modifications: + +```python +#!/usr/bin/env python + +# Assumed to be 'mnist_hvd.py' + +import datetime +import os +import tensorflow as tf +import horovod.tensorflow.keras as hvd + +# Initialize Horovod. 
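+# hvd.init() must be called before any other Horovod function; after this call
+# 'hvd.rank()' gives the global rank across all nodes and 'hvd.local_rank()'
+# the rank within this node, which we use below to pin each process to a GPU.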
+hvd.init()
+
+# Extract number of visible GPUs in order to pin them to MPI process
+gpus = tf.config.experimental.list_physical_devices('GPU')
+if hvd.rank() == 0:
+    print(f"Found the following GPUs: '{gpus}'")
+# Allow memory growth on GPU, required by Horovod
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+# Since multiple GPUs might be visible to multiple ranks it is important to
+# bind the rank to a given GPU
+if gpus:
+    print(f"Rank '{hvd.local_rank()}/{hvd.rank()}' using GPU: '{gpus[hvd.local_rank()]}'")
+    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
+else:
+    print(f"No GPU(s) configured for ({hvd.local_rank()}/{hvd.rank()})!")
+
+# Access storage path for '$SLURM_SUBMIT_DIR'
+storage_path = os.path.join(os.environ['SLURM_SUBMIT_DIR'],
+                            os.environ['SLURM_JOB_ID'])
+
+# Load dataset
+mnist = tf.keras.datasets.mnist
+(x_train, y_train), _ = mnist.load_data()
+x_train = x_train / 255.
+
+# Create dataset for batching
+dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
+dataset = dataset.repeat().shuffle(10000).batch(128)
+
+# Define learning rate as a function of number of GPUs
+scaled_lr = 0.001 * hvd.size()
+
+
+def create_model():
+    model = tf.keras.models.Sequential([
+        tf.keras.layers.Flatten(input_shape=(28, 28)),
+        tf.keras.layers.Dense(512, activation='relu'),
+        tf.keras.layers.Dropout(0.2),
+        tf.keras.layers.Dense(10, activation='softmax')
+    ])
+    # Horovod: adjust learning rate based on number of GPUs.
+    opt = tf.optimizers.Adam(scaled_lr)
+    model.compile(optimizer=opt,
+                  loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
+                  metrics=['accuracy'],
+                  experimental_run_tf_function=False)
+    return model
+
+
+# Create and display summary of model
+model = create_model()
+# Output, such as from the following command, is outputted into the '.out' file
+# produced by 'sbatch'
+if hvd.rank() == 0:
+    model.summary()
+
+# Create list of callbacks so we can separate callbacks based on rank
+callbacks = [
+    # Horovod: broadcast initial variable states from rank 0 to all other
+    # processes. This is necessary to ensure consistent initialization of all
+    # workers when training is started with random weights or restored from a
+    # checkpoint.
+    hvd.callbacks.BroadcastGlobalVariablesCallback(0),
+
+    # Horovod: average metrics among workers at the end of every epoch.
+    #
+    # Note: This callback must be in the list before the ReduceLROnPlateau,
+    # TensorBoard or other metrics-based callbacks.
+    hvd.callbacks.MetricAverageCallback(),
+
+    # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to
+    # worse final accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 *
+    # hvd.size()` during the first three epochs. See
+    # https://arxiv.org/abs/1706.02677 for details.
+ hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=3, + initial_lr=scaled_lr, + verbose=1), +] + +# Only perform the following actions on rank 0 to avoid all workers clash +if hvd.rank() == 0: + # Tensorboard support + log_dir = os.path.join(os.environ['SLURM_SUBMIT_DIR'], + "logs", + "fit", + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")) + tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, + histogram_freq=1) + # Save model in TensorFlow format + model.save(os.path.join(storage_path, "model")) + # Create checkpointing of weights + ckpt_path = os.path.join(storage_path, + "checkpoints", + "mnist-{epoch:04d}.ckpt") + ckpt_callback = tf.keras.callbacks.ModelCheckpoint( + filepath=ckpt_path, + save_weights_only=True, + verbose=0) + # Save initial weights + model.save_weights(ckpt_path.format(epoch=0)) + callbacks.extend([tensorboard_callback, ckpt_callback]) + +verbose = 1 if hvd.rank() == 0 else 0 +# Train model with checkpointing +model.fit(x_train, y_train, + steps_per_epoch=500 // hvd.size(), + epochs=100, + callbacks=callbacks, + verbose=verbose) +``` + +```{warning} +When printing information it can be useful to use the `if +hvd.rank() == 0` idiom to avoid the same thing being +printed from every process. +``` + +This can then be scheduled with the following: + +```sh +#!/usr/bin/bash + +#SBATCH --account= +#SBATCH --job-name= +#SBATCH --partition=accel --gpus-per-task=1 +#SBATCH --ntasks=8 +#SBATCH --mem-per-cpu=8G +#SBATCH --time=00:30:00 + +# Purge modules and load tensorflow +module purge +module load TensorFlow/2.2.0-fosscuda-2019b-Python-3.7.4 +source $SLURM_SUBMIT_DIR/tensor_env/bin/activate +# List loaded modules for reproducibility +module list + +# Export settings expected by Horovod and mpirun +export OMPI_MCA_pml="ob1" +export HOROVOD_MPI_THREADS_DISABLE=1 + +# Run python script +srun python $SLURM_SUBMIT_DIR/mnist_hvd.py +``` + +Note especially the use of `--gpus-per-task=1` which means that each task will +get a dedicated GPU. The above job will thus take up two whole nodes on Saga. + +[sigma2]: https://www.sigma2.no/ +[tensorflow]: https://www.tensorflow.org/ +[virtualenv]: https://virtualenv.pypa.io/en/stable/ +[tf_install]: https://www.tensorflow.org/install/pip +[rsync]: https://en.wikipedia.org/wiki/Rsync +[python_os]: https://docs.python.org/3/library/os.html +[tensorflow_ckpt]: https://www.tensorflow.org/tutorials/keras/save_and_load +[tensorboard]: https://www.tensorflow.org/tensorboard +[mirrored]: https://www.tensorflow.org/guide/distributed_training#mirroredstrategy +[hvd]: https://github.com/horovod/horovod +[hvd_tf_ex]: https://github.com/horovod/horovod/blob/master/examples/tensorflow2/tensorflow2_keras_synthetic_benchmark.py +[nccl]: https://developer.nvidia.com/nccl diff --git a/_sources/code_development/guides/vs_code/connect_to_server.md.txt b/_sources/code_development/guides/vs_code/connect_to_server.md.txt new file mode 100644 index 000000000..c1def638a --- /dev/null +++ b/_sources/code_development/guides/vs_code/connect_to_server.md.txt @@ -0,0 +1,49 @@ +# Connecting to a system with Visual Studio Code +Visual studio Code has the ability to connect to our systems through ssh/sftp. This guide will show you how to do this. + +## What will this do for me? +Allowing VS code to connect to our systems allows you to edit and manage your files as if they were stored locally on your own device. 
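+
+Under the hood the connection is plain SSH, so the "Remote - SSH" extension also
+picks up any hosts you have defined in your `~/.ssh/config`. As a small sketch
+(the host alias and `myusername` are placeholders), an entry like the following
+lets you select the system by name in the steps below:
+
+```
+Host saga
+    HostName saga.sigma2.no
+    User myusername
+```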
+ +It also somewhat allows you to have a graphical interface to our systems, by showing you the files and their content, instead of having to view everything through a terminal + +## How +### Install ssh plugin +1. Go to plugins +2. Search for "ssh" +3. Select the plugin called "Remote - SSH" +4. Install it +5. Reload window + +![Install ssh plugin](ssh-plugin-highlighted.png) + +### Open Connection + +1. Click the green button with the connection symbol in the lower left-hand corner. +2. Select "Connect to Host..." + +![Connect to host](open-connection-highlighted.png) + +### Connect to system +* Fram: username@fram.sigma2.no +* Saga: username@saga.sigma2.no +* Betzy: username@betzy.sigma2.no + +When prompted, type in your password. +If you get a question about what system you are connecting to, select "Linux" + +![Connect to host](open-connection2.png) + +### Open folder +Select the folder you want to open. +* For home folder, either select "~" or "/cluster/home/[username]/" +* For project folder, select /cluster/projects/nnXXXXk/ +* Click OK + +![Open folder](open-folder.png) + +### Done! +You are now connected to a system and have access to everything you had access to from a terminal in the IDE. + +You also have access to terminal through the built in terminal in VS code. + +![Terminal](terminal.png) \ No newline at end of file diff --git a/_sources/code_development/guides_containers_gpu.md.txt b/_sources/code_development/guides_containers_gpu.md.txt new file mode 100644 index 000000000..5684c9d64 --- /dev/null +++ b/_sources/code_development/guides_containers_gpu.md.txt @@ -0,0 +1,16 @@ +(dev-guides_containers)= + +# Containers with GPU support +- Building containers with Singularity: + - [Beginner] + [Containers on NRIS HPC systems](guides/containers.md) + - [Beginner] + [BigDFT with MPI and CUDA](guides/containers/bigdft.md) + - [Beginner] + [Container with build environment](guides/container_env.md) + - [Beginner] + [Container with MPI support](guides/container_mpi.md) + - [Beginner] + [Container with GPU support (OpenACC)](guides/container_openacc.md) + - [Beginner] + [CUDA Container](guides/gpu/cuda-container.md) diff --git a/_sources/code_development/guides_gpu.md.txt b/_sources/code_development/guides_gpu.md.txt new file mode 100644 index 000000000..4db25629f --- /dev/null +++ b/_sources/code_development/guides_gpu.md.txt @@ -0,0 +1,51 @@ +(dev-guides_gpu)= + +# GPU programming models +- Introduction to GPU: + - [Beginner] + [Introduction to using GPU partition](guides/gpu.md) + - [Beginner] + [Offloading to GPU](guides/offloading.md) +- Calling GPU accelerated libraries: + - [Beginner] + {ref}`Calling cuBLAS from OpenACC` + - [Beginner] + {ref}`Calling cuBLAS from OpenMP` + - [Beginner] + {ref}`Calling cuFFT from OpenACC` +- GPU programming with OpenACC: + - [Beginner] + [Getting started with OpenACC and Nvidia Nsight](guides/openacc.md) + - [Intermediate] + [Async and Multi-GPU OpenACC](guides/async_openacc.md) +- GPU programming with OpenMP: + - [Beginner] + [Introduction to OpenMP offloading](guides/ompoffload.md) +- GPU programming with SYCL: + - [Beginner] + [Getting started with hipSYCL](guides/hipsycl.md) + - [Beginner] + [SYCL Academy tutorial](guides/sycl_academy.md) + - [Beginner] + [SYCL ENCCS tutorial](guides/sycl_enccs.md) + - [Intermediate] + [Unified Shared Memory with SYCL](guides/sycl_usm.md) +- Porting applications: + - [Beginner] + [Porting OpenACC to OpenMP offloading](guides/converting_acc2omp/openacc2openmp.md) + - [Beginner] + {ref}`Translating 
GPU-accelerated applications` + - [Beginner] + {ref}`Translating CUDA to HIP with Hipify` + - [Beginner] + {ref}`Translating CUDA to SYCL with Syclomatic` + - [Beginner] + {ref}`Translating OpenACC to OpenMP with Clacc` +- Hybrid programming + - [Beginner] + [MPI and OpenACC](guides/openacc_mpi.md) + - [Intermediate] + [GPU-aware MPI with OpenACC and OpenMP](guides/gpuaware_mpi.md) +- Offloading to GPU using Fortran 2008: + - [Beginner] + [Offloading to GPU using Fortran 2008](guides/offloading-using-fortran.md) diff --git a/_sources/code_development/guides_ml.md.txt b/_sources/code_development/guides_ml.md.txt new file mode 100644 index 000000000..71f70e3b1 --- /dev/null +++ b/_sources/code_development/guides_ml.md.txt @@ -0,0 +1,8 @@ +(dev-guides_ml)= + +# Machine Learning +- TensorFlow on GPU + - [Beginner] + [Introduction to TensorFlow: part I](guides/tensorflow_gpu.md) + - [Beginner] + [Introduction to TensorFlow: part II](guides/gpu/tensorflow.md) \ No newline at end of file diff --git a/_sources/code_development/guides_monitor_gpu.md.txt b/_sources/code_development/guides_monitor_gpu.md.txt new file mode 100644 index 000000000..04ef01897 --- /dev/null +++ b/_sources/code_development/guides_monitor_gpu.md.txt @@ -0,0 +1,14 @@ +(dev-guides_monitor)= + +# Monitoring GPU accelerated applications +- Profiling and debugging CUDA applications + - [Beginner] + [Stencil Communication Pattern with CUDA](guides/stencil.md) +- Basic commands for GPU usage + - [Beginner] + [Command-lines `nvidia-smi` and `rocm-smi`](gpuusage) + - [Beginner] + [Monitoring using `rocm-smi` on LUMI-G](monitoring-gpus-on-lumi-g-with-rocm-smi) +- Profiling GPU-accelerated Deep Learning + - [Beginner] + [PyTorch Profiler](pytochprofiler) diff --git a/_sources/code_development/guides_python.md.txt b/_sources/code_development/guides_python.md.txt new file mode 100644 index 000000000..b80ff5766 --- /dev/null +++ b/_sources/code_development/guides_python.md.txt @@ -0,0 +1,6 @@ +(dev-guides_python)= + +# Python libraries +- Parallel and distributed computing libraries + - [Beginner] + [Using Dask to scale your Python program](guides/dask.md) \ No newline at end of file diff --git a/_sources/code_development/overview.rst.txt b/_sources/code_development/overview.rst.txt new file mode 100644 index 000000000..fa5ae99c3 --- /dev/null +++ b/_sources/code_development/overview.rst.txt @@ -0,0 +1,36 @@ +.. _code_development: + +:id: code_development + +============================== +Code development and tutorials +============================== +---------------- +Code development +---------------- + +.. toctree:: + :maxdepth: 1 + + building.md + building_gpu.md + betzy.md + compilers.md + debugging.md + performance.md + Calling-fortran-from-Python.md + + +--------- +Tutorials +--------- +In this section we present a list of tutorials covering different topics in heterogenous computing involving GPU (Graphics processing unit) accelerators: it goes from GPU programming models, Machine Learning, containers with GPU support to monitoring GPU-accelerated applications. + +.. 
toctree:: + :maxdepth: 1 + + guides_gpu.md + guides_ml.md + guides_containers_gpu.md + guides_monitor_gpu.md + guides_python.md diff --git a/_sources/code_development/performance.md.txt b/_sources/code_development/performance.md.txt new file mode 100644 index 000000000..58cc68a42 --- /dev/null +++ b/_sources/code_development/performance.md.txt @@ -0,0 +1,31 @@ +# Performance Analysis and Tuning + +Understanding application performance on modern HPC architectures +is a very complex task. There are a number of factors that can limit +performance: IO speed, CPU speed, memory latency and bandwidth, thread +binding and correct memory allocation on NUMA architectures, +communication cost in both threaded shared-memory applications, and +in MPI-based codes. + +Sometimes the performance can be improved without recompiling the code, +e.g., by arranging the working threads or MPI ranks in a more +efficient way, or by using more / less CPU cores. In other cases it +might be required to perform an in-depth investigation into +the hardware performance counters and re-writing (parts of) the +code. Either way, identifying the bottlenecks and deciding on what +needs to be done can be made simpler by using specialized tools. Here +we describe some of the tools available on Fram. + +* {ref}`arm-performance-reports`. An in-depth analysis of + three synthetic benchmarks. We demonstrate some pitfalls of + profiling, and show how one can use profiling to reason about the + performance of real-world codes. + +* [VTune Amplifier](performance/vtune.md). Performance analysis of the + original, and the optimized software package ART - a 3D radiative + transfer solver developed within the [SolarALMA + project](https://www.mn.uio.no/astro/english/research/projects/solaralma/). + +* [Other (Intel) tools](performance/intel_tuning.md). An overview of the tools and + a quick guide to which type of tuning, and to what kind of programming + model they are applicable. diff --git a/_sources/code_development/performance/intel_tuning.md.txt b/_sources/code_development/performance/intel_tuning.md.txt new file mode 100644 index 000000000..52fa45277 --- /dev/null +++ b/_sources/code_development/performance/intel_tuning.md.txt @@ -0,0 +1,171 @@ +--- +orphan: true +--- + +# Intel tuning tools + +Intel provide a set of tuning tools that can be quite useful. The +following sections will give an overview of the tools and provide a +quick guide of the scope of the different tools and for which type of +tuning and what kind of programming model that are applicable for, +vectorization, threading, MPI. + +Below is a list of tools that can be used to analyses the +application, starting from text analysis of the source code to massive +parallel MPI runs. + +* Intel compiler, analysis of source code. +* Intel XE-Advisor, analysis of Vectorization. +* Intel XE-Inspector, analysis of threads and memory. +* Intel VTune-Amplifier, analysis and profiling of complete program performance. +* Intel MPI, profile the MPI calls. +* Intel Trace Analyzer, analysis of MPI communication. + + +The PRACE Best Practice Guide for the Intel processor Knights Landing: +[http://www.prace-ri.eu/best-practice-guide-knights-landing-january-2017/#tuning.section] +contain a nice review of the Intel tools and how to use them. Chapter 7 is about +tuning. Only chapters 1 and 2 are specific to the Knights Landing processor. + +A short overview is given here with links to the relevant tools. 
+The Best Practice Guide provide an overview of the tools with a focus on +sampling. The actual usage of the different tools is only covered in depth +by the Intel documentation. Even with the documentation it can be hard to +start using the tools in an effective way. Attending Intel training is advised. + + +## Intel compiler - optimization report + +Compiler flags provide a mean of controlling the optimization done by +the compiler. There are a rich set of compiler flags and directives +that will guide the compiler's optimization process. The details of +all these switches and flags can be found in the documentation, in +this guide we'll provide a set of flags that normally gives acceptable +performance. It must be said that the defaults are set to request a +quite high level of optimization, and the default might not always be +the optimal set. Not all the aggressive optimizations are numerically +accurate, computer evaluation of an expression is as we all know quite +different from paper and pencil evaluation. + +Please consult the BPG guide section about the Intel Compiler at: +[http://www.prace-ri.eu/best-practice-guide-knights-landing-january-2017/#id-1.8.3.5] + + +To use the Intel compiler one of the Intel compiler modules must be loaded, +an example is : +``` +module load intel/2018b +``` + + +## Intel MPI library + +The MPI Perf Snapshot is a built in lightweight tool that will provide +some helpful information with little effort from the user. Link your +program with -profile=vt and simply issue the -mps flag to the mpirun +command (some environment need to be set up first, but this is quite +simple). + +``` +mpiifort -o ./photo_tr.x -profile=vt *.o +mpirun -mps -np 16 ./photo_tr.x +``` + +More information at the Best Practice Guide : +[http://www.prace-ri.eu/best-practice-guide-knights-landing-january-2017/#id-1.8.3.6] + + + +## Intel XE-Advisor + +Vectorization Advisor is an analysis tool that lets you identify if +loops utilize modern SIMD instructions or not, what prevents +vectorization, what is performance efficiency and how to increase +it. Vectorization Advisor shows compiler optimization reports in +user-friendly way, and extends them with multiple other metrics, like +loop trip counts, CPU time, memory access patterns and recommendations +for optimization. + +More information at the Best Practice Guide : +[http://www.prace-ri.eu/best-practice-guide-knights-landing-january-2017/#XE-Advisor] + +To have access to the XE-Advisor load the designated module, an example is: +``` +module load Advisor/2018_update3 +``` +Then launch the advisor in GUI or text version. +``` + advixe-gui & +``` +Using the GUI X11 version over long distances might be somewhat slow. + + +## Intel XE-Inspector + +Intel Inspector is a dynamic memory and threading error checking tool +for users developing serial and multithreaded applications. The +tuning tool XE-advisor as tool tailored for threaded shared memory +applications (it can also collect performance data for hybrid MPI +jobs using a command line interface). It provide analysis of memory +and threading that might prove useful for tuning of any application. 
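+
+Inspector can also be driven from the command line inside a batch job, which is
+convenient on the compute nodes. As a rough sketch (run after loading the
+Inspector module shown below; check `inspxe-cl -help collect` for the exact
+analysis names available in your version, here the first memory-error level):
+
+```
+inspxe-cl -collect mi1 -result-dir inspector_results -- ./your-program your-arguments
+```
+
+The result directory can afterwards be opened in the `inspxe-gui` front end.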
+ +More information at the Best Practice Guide : +[http://www.prace-ri.eu/best-practice-guide-knights-landing-january-2017/#id-1.8.3.8] + +To use the XE-Inspector us the Inspector module, like : +``` +module load Inspector/2018_update3 +``` +To launch the GUI: +``` +inspxe-gui & +``` + +## Intel VTune Amplifier + +Intel VTune Amplifier provides a rich set of performance insight into +CPU performance, threading performance and scaleability, bandwidth, +caching and much more. Originally a tool for processor development, +hence the strong focus on hardware counters. + +Analysis is fast and easy because VTune Amplifier understands common +threading models and presents information at a higher level that is +easier to interpret. Use its powerful analysis to sort, filter and +visualize results on the timeline and on your source. However, the +sheer amount of information is sometimes overwhelming and in some +cases intimidating. To really start using VTune Amplifier some basic +training is suggested. + +More information at the Best Practice Guide : +[http://www.prace-ri.eu/best-practice-guide-knights-landing-january-2017/#id-1.8.3.9] + +To get access to the Vtune Amplifier load the corresponding module, like: + +``` +ml load VTune/2018_update3 +``` + +For a run-through example see [a VTune case study](vtune.md). + + +## Intel Trace Analyzer +Intel Trace Analyzer and Collector is a graphical tool for +understanding MPI application behavior, quickly finding bottlenecks, +improving correctness, and achieving high performance for parallel +cluster applications based on Intel architecture. Improve weak and +strong scaling for small and large applications with Intel Trace +Analyzer and Collector. The collector tool is closely linked to the +MPI library and the profiler library must be compiled and linked with +the application. There is an option to run using a preloaded library, +but the optimal way is to link in the collector libraries at build +time. + +More information at the Best Practice Guide : +[http://www.prace-ri.eu/best-practice-guide-knights-landing-january-2017/#id-1.8.3.10] + + To use Intel Trace Analyzer load the corresponding module, e.g. + + ``` + ml load itac/2019.4.036 + ``` + diff --git a/_sources/code_development/performance/vtune.md.txt b/_sources/code_development/performance/vtune.md.txt new file mode 100644 index 000000000..a13a6f20a --- /dev/null +++ b/_sources/code_development/performance/vtune.md.txt @@ -0,0 +1,219 @@ +--- +orphan: true +--- + +# Example VTune Analysis + +As an example we use VTune Amplifier to analyze the performance of the +original, and the optimized software package ART - a 3D radiative +transfer solver developed within the [SolarALMA + project](https://www.mn.uio.no/astro/english/research/projects/solaralma/). The + code is written in C++ and consists of two major computational + parts: + +* An equation of state (EOS) solver, which - based on various types of +input data - computes electron density, gas pressure, and +density. + +* A nonlinear solver for radiative transfer (RT). This code is based on +a FORTRAN code from 1970 by Robert Kurucz. + +Input data is read from HDF5 files and composed into a set of 3D +Cartesian grids. The two kernels described above are executed +independently for each grid point, with no communication required by +the neighbor cells. In this sense the code is trivially +parallelizable and to find opportunities for optimization we look at +the per-core (call it "sequential") performance. 
The optimization +effort has been done within the PRACE Preparatory Access project type +D. For more details about the optimizatoin techniques [consult the +white paper.](https://doi.org/10.5281/zenodo.2633704) + + +## Using VTune on Fram + +First, to use VTune on Fram you need to load the corresponding +software module `VTune`. To list the available versions: + +``` +$ ml avail VTune + + VTune/2017_update1 VTune/2018_update1 VTune/2018_update3 +``` + +Then load the desired (newest) version + +``` +$ ml load VTune/2018_update3 +``` + +To gather information about a code's performance one needs to execute +the code using the [`amplxe-cl` +command](https://software.intel.com/en-us/vtune-amplifier-help-amplxe-cl-command-syntax). Depending +on the needs, `amplxe-cl` can gather all sorts of performance statistics: FPU +utilization, usage of vector (SIMD) AVX insturctions, instructions per +clock, memory bandwidth, cache utilization, threading level, etc. For +example, to collect general information about the most time-consuming +parts of the code: + +``` +$ amplxe-cl -collect hotspots ./your-program your-arguments +``` + +For a complete list of analysis modes please consult the [VTune +documentation](https://software.intel.com/content/www/us/en/develop/tools/vtune-profiler.html). A +useful set of performance metrics is gathered by the +`hрc-performance` analysis, which can help to identify opportunities +to optimize CPU, memory, and vectorization level: + +``` +$ amplxe-cl -collect hpc-performance ./your-program your-arguments +``` + +A detailed description and the available options for each profiling +mode can be obtained as follows + +``` +$ amplxe-cl -help collect hpc-performance + +[...] + + To modify the analysis type, use the configuration options (knobs) as + follows: + -collect hpc-performance -knob = + Multiple -knob options are allowed and can be followed by additional collect + action options, as well as global options, if needed. + +sampling-interval +[...] + +enable-stack-collection +[...] + +collect-memory-bandwidth +[...] + +dram-bandwidth-limits +[...] + +analyze-openmp +[...] +``` + +The `enable-stack-collection` knob, disabled by default, provides detailed +caller / callee information for each profiled function. It can be very +useful, but might introduce some overhead. We will use it in the +following example. + +Collected performance statistics are saved in a subdirectory, by +default in the directory you are running from. For the above example +the results are stored in `r000hpc/`. They can then be compressed and +moved to, e.g., a desktop computer, or they can be analyzed on one of +the Fram login nodes using the VTune Amplifier GUI: + +``` +$ ssh -Y fram.sigma2.no +$ ml load VTune/2018_update3 +$ amplxe-gui +``` + +Note that running the GUI directly on Fram migh feel sluggish depending +on your network connection. + +## VTune analysis + +The performance characteristics of the original code are obtained as +follows + +``` +amplxe-cl -collect hpc-performance -knob enable-stack-collection=true -knob collect-memory-bandwidth=false mpirun -np 1 ./ART.x +``` + +Since we are interested in the "sequential" (per-core) performance, we +only analyze a single MPI rank. The stack collection is enabled. Once +the sampling results are opened in the VTune Amplifier GUI, the +performance summary is presented: + +![VTune Summary](vtune/vtune_summary.png "VTune Summary") + +The CPU Utilization section shows the total percentage of all +available cores used. 
Since a single ART process was executed, this +metric is very low (1 out of 64 logical cores are used), but that is +expected. The memory statistics show that ART is compute bound: there +is almost no references to the global memory (DRAM bound 0%). Also the +caches are not very busy. It is clear that most of the run time goes +into CPU instructions. + +The FPU utilization report is hence the most interesting one in this +case. It reveals that 90% of the floating point instructions are scalar, i.e., +the vector units (AVX) are mostly unused. Looking at the top most busy +functions it becomes clear that the time is spent in calls to `libm` +`exp` and `log`. + +A closer look at the Bottom-up section of the performance report +reveals the heaviest parts of the code. + +![VTune Bottom-up](vtune/vtune_bottomup.png "VTune Bottom-up") + +This confirms the previous finding (and adds `pow` to the list of +computationally heavy functions). From the above reports we can +roughly sketch the optimization directions: + +* Re-write the code such that the vectorized math library is used for + `exp, log, pow` calls +* Concentrate on the heaviest functions from the Bottom-up list + +## The optimized code + +Both GCC and ICC provide an interface to a vectorized math library with +platform-optimized implementations of amongst others +`exp,pow,log`. Intel compiler uses +its own Short Vector Math Library (SVML). The library comes together +with the compiler installation, and there is nothing system-specific +that needs to be done to use it. GCC on the other hand relies on +`libmvec`, which is part of Glibc version 2.22 and higher. This means +that on systems with an older version of Glibc the vectorized math +library is not readily available, regardless of the GCC version. This +is a practical problem, because OS vendors often lag a few years when +it comes to Glibc (e.g., Centos 7.5 comes with version 2.17 released +in the end of 2012). However, it is possible to install a custom Glibc +as a module and use it for user's code. + +As noted in the [documentation of +SVML](https://software.intel.com/en-us/node/524289) the vectorized +math library differs from the scalar functions in accuracy. Scalar +implementations are not the same as the vectorized ones with vector +width of 1. Instead, they follow strict floating-point arithmetic and +are more computationally demanding. GCC is by default conservative +wrt. the floating-point optimizations: vectorized math library is only +enabled with the `-ffast-math` compiletime option. ICC by default uses +relaxed settings (`-fp-model fast=1`), which allow the compiler to +make calls to low accuracy SVML functions. In addition, SVML also +provides higher accuracy vectorized functions (`-fp-model precise`). +Vectorization of libm calls can be prohibited with `-fp-model +strict`. With ART the high accuracy is not required, hence we compile +the code with the most relaxed settings. + +Vectorization of the code has been performed using `#pragma simd` +defined by the OpenMP standard. All the heaviest ART functions, and +all floating-point intensive loops have been vectorized using this +method, and VTune was used throughout the process to find new +bottlenecks once the existing ones have been optimized. 
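+
+As a simplified illustration (this is not the actual ART source), a typical
+change looks like the following: a floating-point loop dominated by `libm`
+calls is annotated so that the compiler is allowed to vectorize it and emit
+calls to the vectorized math library:
+
+```
+#include <cmath>
+#include <cstddef>
+
+// Compute Boltzmann-like factors exp(-beta * E). With e.g.
+// 'g++ -O3 -ffast-math -fopenmp-simd' or 'icpc -O3 -qopenmp-simd' the exp()
+// calls can be resolved to the vectorized math library (libmvec / SVML)
+// instead of the scalar libm routine.
+void exp_factors(const double *energy, double beta, double *out, std::size_t n)
+{
+    #pragma omp simd
+    for (std::size_t i = 0; i < n; ++i) {
+        out[i] = std::exp(-beta * energy[i]);
+    }
+}
+```
+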
Below is the +final VTune report of the optimized code compiled using the GCC 8 +compiler: + +![VTune Summary - Optimized GCC](vtune/vtune_opt_gcc.png "VTune Summary - Optimized GCC") + +and using the Intel 18 compiler: + +![VTune Summary - Optimized ICC](vtune/vtune_opt_intel.png "VTune Summary - Optimized ICC") + +Notably, the optimized code utilizes the vector (AVX) units for 87% of +the total FP instructions with GCC, and for 94% of the FP instructions +with the Intel compiler. The capacity of the vector units is used in +85%-90%, which demonstrates that the code is almost fully vectorized. + +Compared to the original code, the performance tests have shown that +on a Broadwell-based architecture the optimized code works from 2.5 +times faster (RT solver) to 13 times faster (EOS solver) on a single +core. All optimizatoin techniques employed have been described in +detail in [the white paper](https://doi.org/10.5281/zenodo.2633704). diff --git a/_sources/computing/responsible-use.md.txt b/_sources/computing/responsible-use.md.txt new file mode 100644 index 000000000..8dbf38754 --- /dev/null +++ b/_sources/computing/responsible-use.md.txt @@ -0,0 +1,77 @@ +# Using shared resources responsibly + +One of the major differences between using remote HPC resources and your own +system (e.g. your laptop) is that **HPC resources are shared**. Please use the +shared resources responsibly. Below we list few things to think about for a +more responsible resource use. + +Please also make sure that you have gone through the documentation about +{ref}`job-types`, {ref}`queue-system`, {ref}`hardware-overview`, +{ref}`choosing-memory-settings`, and {ref}`choosing-number-of-cores`, to verify +that you are submitting the right job to the right partition to the right +hardware and not wasting resources. + + +## Be kind to the login nodes and other users + +The login node is often busy managing all the logged-in users, creating and +editing files and compiling software. If the machine runs out of memory or +processing capacity, it will become very slow and unusable for everyone. + +**Always use the queue system for running jobs**. The login nodes are only for +file transfer, compilation, editing, job submission and short tests, etc. If +you run production jobs on the login nodes, we will need to stop them and email +you about it. More likely, another frustrated user might email us first and +complain about the too slow login node. + +**Don't run interactive calculations on the login nodes**. If you need to run a job +interactively (not scheduled), have a look at {ref}`interactive-jobs`. + + +## Adjust required memory, number of cores, and time to what your job really needs + +Do not ask for a lot more memory or number of cores or time than you need. +This may unnecessarily deplete your quota and may also delay the start of your +calculations. It may also delay the start of calculations for others and +deplete available resources for others. + +Please read these before asking for a lot "just to be on the safe side": +- {ref}`choosing-memory-settings` +- {ref}`choosing-number-of-cores` + +Don't use `--exclusive` in job scripts unless explicitly told by NRIS +staff to do so. This is especially important if you use `--mem-per-cpu`. + + +## Have a backup plan + +See the documentation about {ref}`storage-backup` to learn what folders are +backed up and how. + +However, **your data is your responsibility**. 
Make sure you understand what +the backup policy is on the file systems on the system you are using and what +implications this has for your work if you lose your data on the system. + +Make sure you have a robust system in place for taking copies of critical data +off the HPC system wherever possible to backed-up storage. Tools such as +`rsync` can be very useful for this. + +Your access to the shared HPC system will generally be time-limited, so you +should ensure you have a plan for transferring your data off the system before +your access finishes. The time required to transfer large amounts of data +should not be underestimated, and you should ensure you have planned for this +early enough (ideally, before you even start using the system for your +research). + + +## Transferring data + +Disk speed, meta-data performance, network speed, and firewall speed may limit +the transfer bandwidth. + +Here are tips to make your data transfer easier and faster: + +**Plan for it**: If you need to transfer large amount of data, don't start on +the last day of your project. Data transfer may take hours or even days. + +Please read our page about {ref}`file-transfer`. diff --git a/_sources/computing/tuning-applications.md.txt b/_sources/computing/tuning-applications.md.txt new file mode 100644 index 000000000..f0353ab6b --- /dev/null +++ b/_sources/computing/tuning-applications.md.txt @@ -0,0 +1,374 @@ +# Tuning applications + +## Introduction + +Running scientific application in order to maximise the efficiency of +the hardware is increasingly important as the core count increase. A +system like Betzy with 172k cores represent a substantial cost and +efficiency of the applications run should be as high a practically +possible. + +There are several steps with increasing complexity that can be taken +to learn more about the application's behaviour. Starting from +checking the scaling to produce a roof line model of the application. +Digging deeper into the matter is also possible with even more +advanced usage of the tools and even more advanced tools. + +A set of tools is presented, each with increasing complexity. + +In this case study we'll follow a real application called FVCOM which +is an ocean current simulation code. I typically run on 1024 cores, +not the application with the highest core count, but a typical example +of a seasoned well known application with it's special traits. + +In the following we will cover application tuning and benchmarking at a general level. +If you are interested in specific advice or guidance for your particular software, please +have a look at the {ref}`application specific ` pages, locate your application and see if it has a dedicated guide. If not, +and this is of interest to you, please consider contacting us by sending a mail to [support@nris](mailto:support@nris.no). We +greatly appreciate any assistance or help in improving this. + +## Scaling + +The scaling of an application is one of the key factors for running in +parallel. This simple metric is very simple to record. Just run the +application with an increasing number of cores and record the run time +(many ways exist to record run time, but `time -p mpirun ` +is one of the simplest. As the speedup in many cases also depend on +the input data used it is important to run the scaling test with a +relevant input data set. + +Plot the recorded run time as function of core used and look for trend in the run time, +alternatively plot the speedup from the lowest core count. 
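+
+As a minimal sketch, each data point in the table below can be collected with a
+separate Slurm job along these lines (here for the FVCOM case used throughout
+this study), changing only the number of cores requested:
+
+```
+time -p mpirun ./fvcom.bin --casename=$RUN
+```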
+ +| # cores | run time | speedup | +| ---- | ------- | ------ | +| 128 | 4071.89 | 1 | +| 256 | 2193.52 | 1.86 | +| 512 | 1173.19 | 3.47 | +| 1024 | 659.38 | 6.18 | +| 2048 | 395.53 | 10.29 | + +```{figure} tuning/Speedup.png +:alt: Scaling + +Fig. 1 - Scaling +``` + +The plot show the recorded speedup for FVCOM with the current input data. +The dashed line represent perfect speedup. + +(tuning-applications-apr)= + +## ARM Performance reports + +To learn a great deal of the application the tool +`Performance-reports` can be used. This tool profile the application +and provide a one page report with a huge array of important metrics. + + +### Running + +The commands used to run the ARM performance reports are: +``` +module load Arm-PerfReports/20.0.3 +perf-report mpirun ./fvcom.bin --casename=$RUN > $RUN_DIR/log-${SLURM_JOBID}.out +``` +When the Slurm job is finished two files containing performance reports are found as: +`fvcom_1024p_8n_1t_yyyy-mm-dd_hh-mm.txt` and `fvcom_1024p_8n_1t_yyyy-mm-dd_hh-mm.html`. + +See also {ref}`arm-performance-reports`. + + +### Header + +```{figure} tuning/Perf-reports-1.png +:alt: Performance report header + +Fig. 2 - Performance report header +``` + +The header give basic +information how the application was invoked and run. It also provide +total wall time. The figure to the right give a quick overview of how +the time is distributed over the categories of execution. This is +binned into 3 categories, compute time, time spent in MPI library and +time spent doing Input & Output. In this example we see that time is +spent on doing computation and communication using MPI. + + +### Summary + +```{figure} tuning/Perf-reports-2.png +:alt: Performance report summary + +Fig. 3 - Performance report summary +``` + +The summary report +section contain a lot of useful information in just a few lines. It +show us that only 69% of the time is spent doing computations. This +metric is important when we calculate the total efficiency of the +application. + +The next entry teach us that the application spent 31% of the total +time doing communication using MPI. This is time spent as a +consequence of MPI functions needed to make a parallel program. This +number should be as low as possible as the time spend doing +communications is wasted compute cycles. About 1/3 of all compute +cycles in this run is wasted do to waiting for MPI functions t +complete. + +The last entry show that FVCOM does very little Input & Output during +a run. Other applications might have a far larger number here. + + +### Details - CPU + +```{figure} tuning/Perf-reports-3.png +:alt: Performance report Details - CPU + +Fig. 4 - Performance report Details - CPU +``` + +The CPU details +give us some hints about the actual CPU usage and the application +instructions and memory access. This is a pure MPI application and +each rank is single core, hence 100% single core. + +36% of the compute time is spend executing scalar numeric operations, +only 0.3% is spend executing vector operations. This is not good, as +the AMD Rome processors have AVX2 vector instructions capable of 256 +bits, 4 double or 8 single precision of floating point operations in +one instruction. Using only scalar instructions mean that we are only +exploiting 1/4 or 1/8th of the total compute capacity of the +processor. + +62% of the compute time is spent waiting for data to be moved to and +from memory into the processor. 
This is normally due to a limitation +of the memory bandwidth as is normally the case or poor programming +with a lot of random memory accesses. Poor memory bandwidth is a +common performance limitation. While the AMD Rome processors in Betzy +show memory bandwidth in excess of 300 Gbytes per second this is far +from enough to keep up with a processor running at 2500 MHz. Example, +300 Gbytes/s is 37 G double precision floats per second. A single core +can do 4 double precision operations per vector unit per clock or 2.5 +GHz\*4 = 10 Gflops/s. Then there are two vector units per core and 128 +cores in total. The cache helps in data can be reused in several +calculations, but as the measurements show the bulk of time is spent +waiting for memory. + +The comments below clearly indicate this as the application is memory +bound and very little time is spent in vectorised code. + + +### Details - MPI + +```{figure} tuning/Perf-reports-4.png +:alt: Performance report Details - MPI + +Fig. 5 - Performance report Details - MPI +``` + +The MPI details +informs us that of the time spent in communication the majority of the +MPI (63%) time is spent on doing collective operations, while the rest +is spent in point-to-point communication. The relatively high fraction +of time in collectives can be due to imbalance between the ranks. We +see that the transfer rate is only in single digit Mbytes/s and only +two digits for point-to-point. Very far from the wire speed at about +10 Gbytes/s. + +```{figure} tuning/Perf-reports-5.png +:alt: Performance report Details - IO + +Fig. 6 - Performance report Details - IO +``` + +The IO details +show that FVCOM spent a very little time doing IO during a run. About equal +time for read and write. The transfer bandwidth is low as the file system is +capable of far higher bandwidth. + +```{figure} tuning/Perf-reports-6.png +:alt: Performance report Details - OpenMP + +Fig. 7 - Performance report Details - OpenMP +``` + +The OpenMP +(threads) detail report is of little use for FVCOM as it is a single +threaded put MPI application. The same division as for MPI is relevant +here computation and synchronisation. In OpenMP which used shared +memory there is no transfer of data as all is in the same memory, but +access control is very important. Hence synchronisation can make up a +fraction of the time. False sharing is thing to look out for here when +developing multi threaded programs. It's easy to forget that while +processor address individual bytes the cache is 64 bytes chunks of +memory that might overlap with another thread in another core and +cache line. + +```{figure} tuning/Perf-reports-7.png +:alt: Performance report Details - Memory + +Fig. 8 - Performance report Details - Memory +``` + +The Memory detail report show the amount of memory used, mean and peak. As for +FVCOM with the input model in question it uses only 22% of the nodes' memory. + +It does unfortunately not give any information about strided or random +access. Even scalar operation can exhibit nice strided access. Any +form of random access can kill any performance. Accessing bytes at 100 +nm access time (for normal DDR memory) effectively run the processing +speed at 10 MHz frequency. + + + +## Intel Advisor + +### Introduction +I order to get even more insight of your application with respect to +code generated (vector instructions, vectorised loops etc), +performance and memory access the tool Intel Advisor come in handy. 
+

It is very good for investigating how the compiler generated vector
code and for providing hints about vectorisation. It can also provide
information about memory access, like the striding type.

It can record memory bandwidth and flops, which are used to make a
*roof-line model*. Please read about the
[roof-line model](https://en.wikipedia.org/wiki/Roofline_model)
if you are unfamiliar with it.

This case study does not go into depth on how to understand the
Advisor tool; this is far better covered in the Intel tools
[documentation](https://software.intel.com/content/www/us/en/develop/documentation.html),
which also provides a nice selection of tutorials.


### Running and collecting

The GUI can be used to set up, launch, control and analyse the application. However,
it is simpler to collect data within a batch job.
When running MPI jobs it is only necessary to run one rank under the collector, as all ranks run the same executable.
A project directory needs to be created before command-line collection can be initiated.

The commands used to collect data for FVCOM in the batch job, when using OpenMPI, are:
```
module load Advisor/2019_update5

advdir=$(dirname $(which advixe-cl))
mpirun -np 1 $advdir/advixe-cl -project-dir /cluster/work/support/olews/FVCOM_build/MATNOC-iompi/PO3/fvcom3 --collect survey -- ./fvcom.bin --casename=$RUN : -np 1023 ./fvcom.bin --casename=$RUN > $RUN_DIR/log-${SLURM_JOBID}.out
```
An explicit path to `advixe-cl` is needed in the `mpirun` arguments, either full or relative.
The project directory is also given with its full path. This directory needs to exist before
the collection begins.

When the survey has been run, *tripcounts* and *flops* can be collected:
```
module load Advisor/2019_update5

advdir=$(dirname $(which advixe-cl))
mpirun -np 1 $advdir/advixe-cl -project-dir /cluster/work/support/olews/FVCOM_build/MATNOC-iompi/PO3/fvcom3 --collect tripcounts -flop -- ./fvcom.bin --casename=$RUN : -np 1023 ./fvcom.bin --casename=$RUN > $RUN_DIR/log-${SLURM_JOBID}.out
```

Memory access can be collected using the *map* keyword:
```
module load Advisor/2019_update5

advdir=$(dirname $(which advixe-cl))
mpirun -np 1 $advdir/advixe-cl -project-dir /cluster/work/support/olews/FVCOM_build/MATNOC-iompi/PO3/fvcom3 --collect map -flop -- ./fvcom.bin --casename=$RUN : -np 1023 ./fvcom.bin --casename=$RUN > $RUN_DIR/log-${SLURM_JOBID}.out
```

Both *tripcounts* and *map* increase the run time significantly;
remember to increase the Slurm run time accordingly.


### Display Survey

To display the collected data, start the Advisor GUI tool (remember to
log in using `ssh -Y`, so that the login nodes have a connection to your X11 display):
```
advixe-gui &
```
Open the project file and the report file for rank 0 (we ran the collection on rank 0).

```{figure} tuning/Advisor-1.png
```

The first screen shows an overview of the run: elapsed time, and time in vector and scalar code.
In addition, it shows Gflops/s for floating point & integer operations and some CPU cache data load figures.
The fact that only 37.5% of the CPU time is spent executing vector instructions tells
us that the compiler did not manage to vectorise the loops as often as we would hope for.


### Display Roof-line model

By clicking on the vertical stripe marked "roofline", the obtained
roof-line model of the application is displayed. It shows us
graphically how the application maps out on a memory bandwidth vs.
performance scale; see the link to the *roof-line model* above.
+

```{figure} tuning/Advisor-2.png
```

The circles are all located on the memory bandwidth line, to the left
of the region where the application would be compute bound. This is
another illustration that the application performance is limited by
the memory bandwidth. The two colours identify scalar (blue) and
vector (orange) performance.

The Advisor can also generate some recommendations that can be helpful.

```{figure} tuning/Advisor-6.png
```


### Display Memory access

```{figure} tuning/Advisor-7.png
```

The map collection records memory accesses and lumps them into
categories for unit, constant and variable stride. It is well known
that variable stride is very bad for performance. By providing
references to the source code, it is possible to review the code and
see whether it can be rewritten or changed in ways that will improve
performance.


### Display memory dependencies

```{figure} tuning/Advisor-8.png
```

The dependency collection (which takes a very long time, 20-100x the
normal run time) records variables that reuse the same memory
locations, as well as potential race conditions and sharing. The
compiler needs to take action and generate code that places locks and
semaphores on such variables. This can have a significant impact on
performance.


### Program efficiency

The Advisor can calculate the total performance of the complete run:

```{figure} tuning/Advisor-4.png
```

The number is discouraging, 0.46 Gflops/s. A single core with two
256-bit vector units can perform: 2.5 GHz * 4 doubles (256 bits / 64 bits) * 2 (two vector units) =
20 Gflops/s, or twice that, 40 Gflops/s, using fused multiply-add.
For single precision (32 bits) the numbers can be multiplied by two.

Using the most conservative numbers for double precision without fused
multiply-add (20 Gflops/s), we get 0.46 / 20 = 2.3% of the theoretical
performance.

This performance is not unusual for scientific code of this type. We
saw early on that only a fraction of the time (68.6%) was spent as CPU
time, and of that time only a fraction was spent computing (35.9 + 0.3 =
36.2%). Combining this we arrive at 0.686 \* 0.362 = 0.248, i.e. about
25% of the time was spent computing.

diff --git a/_sources/files_storage/backup.md.txt b/_sources/files_storage/backup.md.txt new file mode 100644 index 000000000..4fc3af111 --- /dev/null +++ b/_sources/files_storage/backup.md.txt @@ -0,0 +1,54 @@ +(storage-backup)=

# Backup on Betzy, Fram, Saga, and NIRD


## Backup of home folders on compute clusters

**Betzy, Fram and Saga**: Home folders are backed up daily to NIRD storage, and can be accessed via the following mount point on the login nodes of the corresponding cluster:
- `/cluster/backup/home/$username`

Please note that we keep a copy of all files, but the daily backup only copies the changes (done through an rsync process). If a file has not been modified, its timestamp will remain the same as on the file inside your home or project folders.

## Backup of project folders on compute clusters

Directories under `/cluster/projects` are backed up. All other areas are not backed up.

In addition to not being backed up, the work area `/cluster/work` also enforces
an automatic cleanup strategy, and is **not** meant for permanent storage.
Files in this area will be **deleted** after 42 or 21 days, depending on the storage capacity,
see [User work area](user-work-area) for details.
+ +**Betzy,Fram and Saga**: The project areas are backed up to NIRD storage which can be accessed via following mount point on all clusters login nodes: +- `/cluster/backup/hpc/betzy/nnXXXXk` +- `/cluster/backup/hpc/fram/nnXXXXk` +- `/cluster/backup/hpc/saga/nnXXXXk` + +## Snapshots + +In addition to the daily backup, we also have snapshots of all home and project files, copied fully: + +**Location**: `/cluster/backup/home/.snapshots/` +- Daily snapshots for the last 7 days +- Weekly snapshots for the last 6 weeks + +**Location**: `/cluster/backup/hpc/.snapshots/` +- Daily snapshots for the last 7 days +- Weekly snapshots for the last 6 weeks + + +## Backup on NIRD + +Protection against data corruption on NIRD is implemented by taking nightly snapshots. Even so, it is the responsibility of the PI/XO to regulate the usage and take steps to ensure that the data are adequately secured against human errors or inappropriate usage/access. + +The allocated storage quota on NIRD is meant for primary storage. Backup to a secondary location is a service on demand and can be ordered for selected datasets. + +Snapshots and backup service on NIRD are described in details on the dedicated pages linked below. + +```{eval-rst} +.. toctree:: + :maxdepth: 1 + + nird/snapshots_lmd.md + nird/backup_lmd.md +``` diff --git a/_sources/files_storage/clusters.md.txt b/_sources/files_storage/clusters.md.txt new file mode 100644 index 000000000..d0ed5d46b --- /dev/null +++ b/_sources/files_storage/clusters.md.txt @@ -0,0 +1,312 @@ +(storage-areas)= + +# Storage areas on HPC clusters + +Projects and users receive different areas to store files and other +data. Some areas are used for temporary files during job execution +while others are for storing project data. + +```{contents} Table of Contents +``` + + +(clusters-overview)= + +## Overview + +The following table summarizes the different storage options for **Betzy, Fram, and Saga**. +Below the table, we give recommendations and discuss the pros and cons of the various storage areas. + +| Directory | Purpose | {ref}`Default Quota ` | {ref}`Backup ` | +| :---------------------------------------------- | :------------------- | :--------------------------------- | :---------------------------------: | +| `/cluster/home/$USER` (`$HOME`) | User data | 20 GiB / 100 K files | Only if quota enforced | +| `/cluster/work/jobs/$SLURM_JOB_ID` (`$SCRATCH`) | Per-job data | N/A | No | +| (Fram/Saga) `/localscratch/$SLURM_JOB_ID` (`$LOCALSCRATCH`) | Per-job data | {ref}`Individual ` | No | +| `/cluster/work/users/$USER` (`$USERWORK`) | Staging and job data | N/A | No | +| `/cluster/projects/` | Project data | {ref}`1 TiB / 1 M files ` | Yes | +| `/cluster/shared/` | Shared data | {ref}`Individual ` | No | + +- **User areas and project areas are private**: Data handling and storage policy is documented [here](/files_storage/sharing_files.md). +- **`$LOCALSCRATCH` area is only implemented on Fram and Saga**. +- In addition to the areas in the tables above, **both clusters mount the + NIRD project areas** as `/nird/projects/NSxxxxK` for NIRD TS(Tiered Storage) projects and `/nird/datalake/NSxxxxK` for NIRD DL(DataLake) + projects on the login nodes (but not on the compute nodes). +- The `/cluster` file system is a high-performance parallel file + system. On Fram, it is a [Lustre](https://www.lustre.org/) system with + a total storage space of 2.3 PB, and on Saga it is a + [BeeGFS](https://www.beegfs.io/) system with a total storage space of + 6.5 PB. 
+
  For performance optimizations, consult {ref}`storage-performance`.


(clusters-homedirectory)=

## Home directory

The home directory is `/cluster/home/$USER`. The location is stored
in the environment variable `$HOME`. {ref}`storage-quota` is enabled on home
directories, with a default of 20 GiB and 100 000 files, so it
is not advisable to run jobs in `$HOME`. However, it is perfectly
fine to store `stderr` and `stdout` logs from your batch jobs in
`$HOME`, so that they are available for review in case of problems with a job.

The home directory should be used for storing tools, scripts, application
sources or other relevant data which must have a backup.

The home directory is only accessible to the user. Files that should be
accessible by other users in a project must be placed in the project
area.

{ref}`Backed up <storage-backup>`
with daily snapshots **only if {ref}`storage-quota` is enforced** for the last 7
days and weekly snapshots for the last 6 weeks.


## Job scratch area

Each job gets an area `/cluster/work/jobs/$SLURM_JOB_ID` that is
automatically created for the job, and automatically deleted when the
job finishes. The location is stored in the environment variable
`$SCRATCH` available in the job. `$SCRATCH` is only accessible by the
user running the job.

On Fram and Saga there are two scratch areas (see also below).

The area is meant as a temporary scratch area during job
execution.
**This area is not backed up** ([documentation about backup](backup.md)).

There are special commands (`savefile` and `cleanup`) one can use in
the job script to ensure that files are copied back to the submit
directory `$SLURM_SUBMIT_DIR` (where `sbatch` was run).

```{note}
**Pros of running jobs in the job scratch area**

- There is less risk of interference from other jobs because every job ID has
  its own scratch directory.
- Because the scratch directory is removed when the job finishes, the scripts
  do not need to clean up temporary files.
```

```{warning}
**Cons of running jobs in the job scratch area**

- Since the area is removed automatically, it can be hard to debug
  jobs that fail.
- One must use the special commands to copy files back in case the job
  script crashes before it has finished.
- If the main node of a job crashes (i.e., not the job script, but the
  node itself), the special commands might not be run, so files might
  be lost.
```

(job-scratch-area-on-local-disk)=

## Job scratch area on local disk

**This only exists on Fram and Saga**.

A job on **Fram/Saga** can request a scratch area on local disk on the node
it is running on. This is done by specifying
`--gres=localscratch:<size>`, where `<size>` is the size of the requested
area, for instance `--gres=localscratch:20G` for 20 GiB.

Normal compute nodes on Fram have 198 GiB of disk that can be handed out
to local scratch areas, and the bigmem nodes have 868 GiB.
On Saga most nodes have 330 GiB; a few of the
bigmem nodes have 7 TiB, the hugemem nodes have 13 TiB and the GPU
nodes have either 406 GiB or 8 TiB. If a job tries
to use more space on the area than it requested, it will get a "disk
quota exceeded" or "no space left on device" error (the exact message
depends on the program doing the writing).
Please do not ask for more than what you actually need, as other users might share
the local scratch space with you (Saga only).
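As a minimal sketch (the account name, input file, and program name below are placeholders), a job script requesting 20 GiB of local scratch could look like this; it uses the `$LOCALSCRATCH` variable described in the next paragraph and copies the results back before the job finishes:

```bash
#!/usr/bin/env bash
#SBATCH --account=nnXXXXk            # placeholder project account
#SBATCH --job-name=localscratch-demo
#SBATCH --time=01:00:00
#SBATCH --mem-per-cpu=2G
#SBATCH --gres=localscratch:20G      # request 20 GiB local scratch on the node

# run the I/O-intensive work inside the node-local scratch area
cd "$LOCALSCRATCH"
cp "$SLURM_SUBMIT_DIR"/input.dat .                        # hypothetical input file
"$SLURM_SUBMIT_DIR"/my_program input.dat > output.dat     # hypothetical program

# copy results back; the local scratch area is deleted when the job ends
cp output.dat "$SLURM_SUBMIT_DIR"/
```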
+ +Jobs that request a local scratch area, get an area `/localscratch/$SLURM_JOB_ID` +that is automatically created for the job, and automatically deleted +when the job finishes. The location is stored in the environment +variable `$LOCALSCRATCH` available in the job. `$LOCALSCRATCH` is +only accessible by the user running the job. + +Note that since this area is on *local disk* on the compute node, it +is probably not useful for jobs running on more than one node (the job +would get one independent area on each node). + +The area is meant to be used as a temporary scratch area during job +execution by jobs who do a lot of disk IO operations (either metadata +operations or read/write operations). Using it for such jobs will +speed up the jobs, and reduce the load on the `/cluster` file system. + +**This area is not backed up** ([documentation about backup](backup.md)). + +Currently, there are *no* special commands to ensure that files are +copied back automatically, so one has to do that with `cp` commands or +similar in the job script. + +```{note} +**Pros of running jobs in the local disk job scratch area** + +- Input/output operations are faster than on the `/cluster` file system. +- Great if you need to write/read a large number of files. +- It reduces the load on the `/cluster` file system. +- There is less risk of interference from other jobs because every job ID has + its own scratch directory. +- Because the scratch directory is removed when the job finishes, the scripts + do not need to clean up temporary files. +``` + +```{warning} +**Cons of running jobs in the local disk job scratch area** + +- Since the area is removed automatically, it can be hard to debug + jobs that fail. +- Not suitable for files larger than 198-300 GB. +- One must make sure to use `cp` commands or similar in the job + script to copy files back. +- If the main node of a job crashes (i.e., not the job script, but the + node itself), files might be lost. +``` + + +(user-work-area)= + +## User work area + +Each user has an area `/cluster/work/users/$USER`. The location is +stored in the environment variable `$USERWORK`. +**This area is not backed up** ([documentation about backup](backup.md)). +By default, `$USERWORK` is a private area and only accessible by +the user owning the area. However, it is possible to grant other +users access here, for e.g., debugging purposes. Note that write +access to your `$USERWORK` can not be granted to others. + +To allow others to read your work area, you may use the command: +`chmod o+rx $USERWORK` + +Note that by doing so you will allow everyone on the machine to +access your user work directory. If you want to share the results +in `$USERWORK` with other people in the project, the best way is to +move them to the project area. + +The `$USERWORK` directory is meant for files that are used by one +or more jobs. All result files must be moved out from this area +after the jobs finish, otherwise they will be automatically deleted +after a while (see notes below). We highly encourage users to keep +this area tidy, since both high disk usage and automatic deletion +process takes away disk performance. The best solution is to clean up +any unnecessary data after each job. + +File deletion depends on the newest of the *creation-*, *modification-* and +*access* time and the total usage of the file system. The oldest files will +be deleted first and a weekly scan removes files older than 42 days. 
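To get an idea of which of your files in `$USERWORK` are approaching deletion, you can, for example, list files that have not been accessed, modified, or had their metadata changed for more than 30 days (an arbitrary threshold chosen for illustration) with standard `find` options on a login node:

```console
$ find $USERWORK -type f -atime +30 -mtime +30 -ctime +30 -ls
```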
+ +When file system usage reaches 70%, files older than 21 days are subject to +automatic deletion. If usage is over 90%, files older than 17 days are subject to +automatic deletion. + +It is **not** allowed to try to circumvent the automatic deletion by +for instance running scripts that touch all files. + +```{note} +**Pros of running jobs in the user work area** + +- Since job files are not removed automatically directly when a job + finishes, it is easier to debug failing jobs. +- There is no need to use special commands to copy files back in case + the job script or node crashes before the job has finished. +``` + +```{warning} +**Cons of running jobs in the user work area** + +- There is a risk of interference from other jobs unless one makes + sure to run each job in a separate sub directory inside `$USERWORK`. +- Because job files are not removed when the job finishes, one has to + remember to clean up temporary files afterwards. +- One has to remember to move result files to the project area if one + wants to keep them. Otherwise they will eventually be deleted by + the automatic file deletion. +``` + + +(project-area)= + +## Project area + +All HPC projects have a dedicated local space to share data between project +members, located at `/cluster/projects/`. + +The project area is controlled by {ref}`storage-quota` and the default project quota for +HPC projects is 1 TiB, but projects can apply for more during the +application process with a maximum quota of 10 TiB on Fram and Saga, and 20 TiB on Betzy. + +Also after the project has been created, project members can request to increase +the quota to up to 10/20 TiB by documenting why this is needed. Such requests should be submitted by the project leader via e-mail to [contact@sigma2.no](mailto:contact@sigma2.no?subject=Storage%20Quota%20Request%20project%20X&body=1.%20How%20large%20are%20the%20input%20files%3F%20(Approximate%20or%20exact%20numbers%20are%20fine.)%0A%0A2.%20How%20many%20such%20input%20files%20will%20be%20used%20in%20a%20single%20job%3F%0A%0A3.%20At%20what%20rate%20do%20you%20intend%20to%20process%20your%20data%3F%20(Approximate%20GB%20per%20week%20or%20equivalent.)%0A%0A4.%20What%20size%20is%20your%20output%20files%20and%20will%20you%20use%20this%20data%20as%20input%20in%20further%20analysis%3F%0A%0A5.%20Please%20explain%20why%20you%20cannot%20benefit%20from%20the%20%2Fcluster%2Fwork%20area%0A%0A6.%20Based%20on%20your%20answers%20above%2C%20how%20much%20storage%20quota%20do%20you%20think%20you%20need%3F) +. Note that only files that are relevant for further computation jobs should be kept on the HPC machine. HPC is not intended for long-term storage. In your request, please include answers to the following questions: + +1. How large are the input files? (Approximate or exact numbers are fine.) +2. How many such input files will be used in a single job? +3. At what rate do you intend to process your data? (Approximate GB per week or equivalent.) +4. What size are your output files and will you use this data as input in further analysis? +5. Please explain why you cannot benefit from the /cluster/work area +NIRD is tightly connected with our HPC systems and data can be moved between the two both fast and easily. +6. Please explain why staging data from NIRD is not sufficient for your project +7. Based on your answers above, how much storage quota do you think you need? + + +Requests for more than 10/20 TiB require an application for a separate {ref}`nird` project area. 
On special occasions, storage above 10/20 TiB can be permitted. This requires an investigation of the workflow to ensure that the needs cannot be satisfied through an allocation on NIRD. Granted disk space above 10/20 TiB is charged according to the [Contribution model](https://www.sigma2.no/user-contribution-model), Storage category B.

Note that unused quota can also be withdrawn for technical reasons (too little
space) or organisational reasons (less need/less usage/fewer members of
the group/fewer compute hours).

Daily backup is taken to NIRD ([documentation about backup](backup.md)).

```{note}
**Pros of running jobs in the project area**

- Since job files are not removed automatically when a job
  finishes, it is easier to debug failing jobs.
- There is no need to use special commands to copy files back in case
  the job script or node crashes before the job has finished.
- There is no need to move result files to save them permanently or
  give the rest of the project access to them.
```

```{warning}
**Cons of running jobs in the project area**

- There is a risk of interference from other jobs unless one makes
  sure to run each job in a separate sub-directory inside the project
  area.
- Because job files are not removed when the job finishes, one has to
  remember to clean up temporary files afterwards, otherwise they can
  fill up the quota.
- There is a risk of using all of the disk quota if one runs many jobs
  and/or jobs needing a lot of storage at the same time.
```

(shared-project-area)=

## Shared project area

In special cases, there might be a need for sharing data between projects for
collaboration and possibly preventing data duplication.

If such a need is justified, a meta-group and its corresponding directory
in `/cluster/shared` are created. The area gets a disk quota based on
the needs of the data. The permissions of the areas vary. In some
cases, all but a few users in the meta-group have read-only access
to the area. In other cases, all users on the cluster have read-only
access.

## Decommissioning

Starting with the 2020.1 resource allocation period, storage decommissioning
procedures have been established for the HPC storage systems. This is to ensure
predictable storage for users and projects, and to make the provisioning more
sustainable for Sigma2.
For more details, please visit the
[data decommissioning policies](https://www.sigma2.no/data-decommissioning-policies)
page.

diff --git a/_sources/files_storage/file_transfer.md.txt b/_sources/files_storage/file_transfer.md.txt new file mode 100644 index 000000000..3672d39a9 --- /dev/null +++ b/_sources/files_storage/file_transfer.md.txt @@ -0,0 +1,205 @@ +(file-transfer)=

# File transfer

```{admonition} Summary: use rsync for file transfer

For file transfer to/from and between compute and storage systems (Betzy, Fram,
Saga, NIRD), **we recommend `rsync`**. This tool is often faster than `scp`
(especially for many small files, and it does not copy files that are already
present at the destination) and potentially also safer against accidental file
overwrites.
For more details, see {ref}`advantages-over-scp`.

When using `rsync`, there is **no need to zip/tar files first**.

On Windows, many other tools exist ([WinSCP](https://winscp.net/),
[FileZilla](https://filezilla-project.org/),
[MobaXterm](https://mobaxterm.mobatek.net/), and others), but we recommend
using `rsync` through [Windows Subsystem for Linux
(WSL)](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux).
+``` + + +## Transferring files between your computer and a compute cluster or storage resource + +This is a good starting point but below we will explain what these components +and options mean: +```console +$ rsync --info=progress2 -a file-name username@cluster:receiving-directory +$ rsync --info=progress2 -a directory-name/ username@cluster:receiving-directory/directory-name +``` + +- `--info=progress2`: This will show progress (how many percent, how much time + left). You can also leave it out if you don't need to know how far the + copying is. There is also a `--progress` option but that one will show + progress for each file individually and often you rather want to know the + overall progress. +- `file-name` or `directory-name`: These are on your computer and you want to + transfer them to the receiving server. +- `-a`: Preserves ownership and time stamp and includes the `-r` option which copies + folders recursively. +- `username`: Your username on the remote cluster. If your usernames on your + local computer and on the remote resource are the same, you can leave out the + `username@` part. +- `cluster`: The remote server. For example: `saga.sigma2.no`. +- `receiving-directory`: The directory on the remote server which will receive the file(s) and/or directories. + +If you want to make sure that `rsync` does not overwrite files that are newer +on the receiving end, add the `--update` option. + +If you want to `rsync` between two computers that both offer an SSH connection, note that then +you can use `rsync` both ways: from cluster A to cluster B, but also the reverse. + +````{admonition} rsync directory + +Please note that there is a trailing slash (`/`) at the end of the first argument in the +syntax of the second command, while rsync directories, ie: + +```console +rsync --info=progress2 -a directory-name/ username@cluster:receiving-directory/directory-name +``` +This trailing slash (`/`) signifies the contents of the directory `directory-name`. +The outcome would create a hierarchy like the following on your cluster: +```console +~/receiving-directory/directory-name/contents-of-the-dir +``` +Without the trailing slash,`directory-name`, including the directory, would be placed within your receiving directory. +The outcome without the trailing slash (`/`) at the first argument in the above command, ie: + +```console +rsync --info=progress2 -a directory-name username@cluster:receiving-directory/directory-name +``` +would create a hierarchy like the following on your cluster: +```console +~/receiving-directory/directory-name/directory-name/contents-of-the-dir +``` + +```` +## rsync using compression + +If you have a strong CPU at both ends of the line, and you’re on a slow +network, you can save bandwidth by compressing the data with the `-z` flag: + +```console +$ rsync --info=progress2 -az file-name username@cluster:receiving-directory +$ rsync --info=progress2 -az directory-name username@cluster:receiving-directory/directory-name +``` + + +## Problem with many small files + +Many small files are often not great for the transfer (although `rsync` does +not seem to mind but for `scp` this can make a big difference, see below). Many +tiny files are often also a problem for parallel file systems. If you develop +programs for high-performance computing, avoid using very many tiny files. + + +(advantages-over-scp)= + +## Advantages over scp and similar tools + +- `rsync` will not transfer files if they already exist and do not differ. 
+
- With `rsync --update` you can avoid accidentally overwriting newer files in the destination directory.
- You can use compression for file transfer.
- Resumes interrupted transfers.
- More flexibility and better cross-platform support.

Typically, people recommend `scp` for file transfer, and we have also done this
in the past. But let us compare `scp` with `rsync` here. In this example I
tried to transfer a 100 MB file from my home computer (not on the fast
university network) to a cluster, either as one large file or split into 5000
smaller files.

For one or a few files it does not matter:
```bash
$ scp file.txt username@cluster:directory
# 81 sec

$ rsync --info=progress2 -a file.txt username@cluster:directory
# 79 sec

$ rsync --info=progress2 -az file.txt username@cluster:directory
# 61 sec
```

However, **it can matter a lot if you want to transfer many small files**.
Notice how the transfer takes 10 times longer with `scp`:
```{code-block} bash
---
emphasize-lines: 2, 5
---
$ scp -r many-files username@cluster:directory
# 833 sec

$ rsync --info=progress2 -a many-files username@cluster:directory/many-files
# 81 sec

$ rsync --info=progress2 -az many-files username@cluster:directory/many-files
# 62 sec
```

In the above example, `scp` struggles with many small files but `rsync` does
not seem to mind. For `scp` we would have to first `tar`/`zip` the small files
into one large file, but for `rsync` we don't have to.

````{admonition} How was the test data created?
Just in case anybody wants to try the above example on their own, we used this
script to generate the example data:
```bash
#!/usr/bin/env bash

# create a file that is 100 MB large
base64 /dev/urandom | head -c 100000000 > file.txt

# split into 5000 smaller files
mkdir -p many-files
cd many-files
split -n 5000 ../file.txt
```
````


## Transferring files between Betzy/Fram/Saga and NIRD

Since NIRD is mounted on the login nodes of Betzy, Fram, and Saga,
one can use regular
`cp` or `mv` commands on the cluster login nodes to copy or
move files into or out of the NIRD project areas.

For more information, please check out the page about
{ref}`storage-areas`.


## What to do if rsync is not fast enough?

Disk speed, meta-data performance, network speed, and firewall speed may limit
the transfer bandwidth.

If you have access to a network with a large bandwidth and you are sure that
you are limited by the one `rsync` process and not by something else, you can
start multiple `rsync` processes by piping a list of paths to `xargs` or
`parallel`, which launch multiple `rsync` instances in parallel. But please
mind that this way you can saturate the network bandwidth for other users and
also saturate the login node with `rsync` processes or overwhelm the file
system. If you have to transfer large amounts of data and one `rsync` process is
not enough, we recommend that you talk to us first: {ref}`support-line`.

Please also **plan for it**: if you need to transfer large amounts of data,
don't start on the last day of your project. Data transfer may take hours or
even days.


## Troubleshooting: "Broken pipe" error during transfer

The organization which provides the network to the clusters may perform daily
housekeeping of their [DNS](https://en.wikipedia.org/wiki/Domain_Name_System),
and then the connection from outside to the NRIS services can drop. This can
cause a "broken pipe" error during file transfer from outside.
+ +One way to avoid this, especially while copying large datasets, is to use IP +addresses instead of domain names. + +One way to get the IP of one of the login nodes (example: Saga): +```console +$ nslookup saga.sigma2.no +``` diff --git a/_sources/files_storage/nird/access_lmd.md.txt b/_sources/files_storage/nird/access_lmd.md.txt new file mode 100644 index 000000000..c0b58e78e --- /dev/null +++ b/_sources/files_storage/nird/access_lmd.md.txt @@ -0,0 +1,38 @@ +# Access and login + + +## Getting access + +To gain access to the storage services, a formal application is required. The +process is explained at the +[How to apply for a user account](https://www.sigma2.no/how-apply-user-account) +page. + +Users must be registered and authorised by the project responsible +before getting access. + +To access or transfer data, we recommend to use `ssh` or +`rsync`. More details on our page about {ref}`file-transfer`. + +The NIRD can be accessed via the following address + +```console +login.nird.sigma2.no +``` + +You will be logged into your home area `/nird/home/$USERHOME` + + +## Software access + +Software can be accessed as module just like on HPCs. Please see the {ref}`module-scheme` for details. + +```{warning} +Module command is not yet stable. Please report any issue encountered to support line, +contact **[support@nris.no](mailto:support@nris.no)**. + +``` +In addition a number of software can be accessed via command line. + + + diff --git a/_sources/files_storage/nird/backup_lmd.md.txt b/_sources/files_storage/nird/backup_lmd.md.txt new file mode 100644 index 000000000..6a3f8fdeb --- /dev/null +++ b/_sources/files_storage/nird/backup_lmd.md.txt @@ -0,0 +1,76 @@ +(backup service)= + + + +# Backup as a Service on NIRD + +NIRD provides backup as a service. NIRD projects on Tiered Storage (NIRD TS) +can utilise the service for the dataset(s) that needs a higher level of security. +This will stay during the tenure of the project. The backup service can be requested by flagging this in the application form for storage resources during regular calls. + +The backup is from NIRD TS to NIRD DL. + +Should there be requirement for backup for certain dataset(s), those **must** be placed on NIRD TS. + +For example, if a project has an allocation on both NIRD TS and NIRD DL, the project can decide how to use the dedicated storage resources. However, should one require backup for a particular dataset, then: + + - that shall be flagged during the application process, + - and that dataset shall be placed on the NIRD TS. + + +```{note} +Notice, that there is no backup service for the data in the Data Lake. +``` + + +- Tiered Storage (NIRD TS) path on the system is `/nird/projects` +- Data Lake (NIRD DL) path on the system is `/nird/datalake` + +We advice projects to assess which of the dataset needs a higher level of +security and should be backed up. + +In general, one can consider which data can be easily reproduced, and which +are copies of files stored on other storage resources. These data normally +do not need backup service. + + +Restoring data from backup is done by NRIS. Should you need to restore data from backup, please contact NRIS support. + + +## Include, exclude rules +```{warning} +Projects will be able to control which file are included and which are excluded from backup. + +Notice, this system is currently only partially implemented. Please notify operations if you have adjusted the control file. 
+
```

The solution for including or excluding data from backup for project data
stored on NIRD TS is implemented using a control file.

The control file is named `.replication_exclude` and must be placed in the
root of the project directory,
e.g.: `/nird/projects/NS1234K/.replication_exclude`

To exclude specific files or directories, those shall be listed in the
`.replication_exclude` control file. Each file or directory which is to be
excluded from replication shall be added as a separate line.

Lines in the `.replication_exclude` control file starting with `#` or `;` are
ignored.

### Excluding a specific file

To exclude the `/nird/projects/NS1234K/datasets/experiment/tmp_file.nc` file,
add `/datasets/experiment/tmp_file.nc` to the `.replication_exclude` control
file as a line on its own.


### Excluding a directory

To exclude the `/nird/projects/NS1234K/datasets/non_important/` directory,
add `/datasets/non_important` to the `.replication_exclude` control file
as a line on its own.

Mentioning `/datasets` on its own would exclude everything in that directory.
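Putting the two examples above together, a `.replication_exclude` control file at the root of the (hypothetical) project `NS1234K` could look like this:

```
# temporary file excluded from backup
/datasets/experiment/tmp_file.nc

; whole directory excluded from backup
/datasets/non_important
```

Each entry is given relative to the project root, one per line; the lines starting with `#` and `;` are comments and are ignored.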
+ +7. How is the data distributed between flash and disk based storage? Automatically (e.g. by access patterns) or manually? + + - We have set of policies which will take care of distribution automatically,and it is also possible to manually pin the file/folder on different tier. Pinning of data to specific tier is based on operational requirements. (NB: even the slowest tier is more capable than the Betzy high-performance storage.) + +8. What are your recommendations to migrate our home directories from the old to the new NIRD? Should we copy the whole home directory with all hidden files or just our folders? Moreover, I have Miniconda installed in my home directory. Is it better to install it again on the new NIRD or is a copy of the entire old home directory fine? + + - (edit 02/2024) Acess to OLD NIRD is not available anymore. If any information is needed, contact us at the support email + - You don’t need to copy the whole home directory. You can choose what do you want to migrate. We suggest to not copy any conda files. + +9. Are snapshots counted towards quota? + + - No, snapshots are not counted on quota. Please remember that snapshots are temporary back up. Please see [here](https://documentation.sigma2.no/files_storage/nird/snapshots_lmd.html) + +10. How does rclone compare to using rsync? + + - We are not recommending rclone yet, but you can find documentation [here](https://www.clusterednetworks.com/blog/post/rsync-vs-rclone-what-are-differences) + +11. where is the ‘conda’ installed? + + - you can load the module `ml Miniconda3/4.7.10` + +12. We are currently using MinIO to provide an S3-compatible interface to our data on NIRD. Should we be considering the new NIRD object storage/Swift as an alternative? Any advantages or disadvantages? + + - It is an option you can use if you have allocation on datalake. We haven’t done the benchmark test yet to say the advantages or disadvantages between minio and S3. + +13. Is S3 actually going to replace the minio bucket service you were providing? + + - No it is not going to replace minio. It is an additional service on NIRD. + +15. As i understood all storage will be placed in a single location in the future, if yes, what will be the impact on high availability concept? + + - The two storage clusters, TS and DL, are physically separated with redundant power, cooling lines, automated fire extinguishers. + +16. Can I start a kernel at nird, which I can access from my local programming environment (e.g. spyder) to access and work with the data stored at my project area? Or how to acces the data from my local programming session? + + - You can mount your NIRD project on your local machine using sshfs although it has some performance limitations. + + - It is also possible to deploy a dedicated webdav service on the service platform. You can then mount it locally. + +17. How to use underlying k8s and kubectl integration with NIRD resources? + + - It is possible, you can contact us via support@nris.no + +18. Could you please share the slides (.pdf) of the nird training? + + - You can access the slides from [here](https://drive.google.com/drive/u/0/folders/1uevX2-bm9S7SePHQC6YUrWrO6J4lDfCA) + +19. Is the NIRD training videos recording available? + + - Yes, you can find it [here](https://www.youtube.com/watch?v=iBwhDsZtAzs&t=74s) + +20. What are the optimal storage structure in the shared project area and access by users. + + - Data can be shared between the project with an agreement with the PIs. 
+ - web service connected with projects can be used for external users + - minio and S3 are other options + +21. Is there a possibility of running calculations on NIRD? + + - High performance computing like climate simulations are for HPCs + + - You can run post processing on NIRD, also see the [NIRD Toolkit documentation](https://documentation.sigma2.no/nird_toolkit/overview.html). + +22. As I understood all storage will be placed in a single location in the future, if yes, what will be the impact on high availability concept? + + - The two storage clusters, TS and DL, are physically separated with redundant power, cooling lines, automated fire extinguishers. + + diff --git a/_sources/files_storage/nird/mounts_lmd.md.txt b/_sources/files_storage/nird/mounts_lmd.md.txt new file mode 100644 index 000000000..5d8dac935 --- /dev/null +++ b/_sources/files_storage/nird/mounts_lmd.md.txt @@ -0,0 +1,24 @@ +# NIRD mounts on clusters + + The NIRD Storage project areas,tiered storage(TS) +and datalake (DL), are mounted on the login nodes of Betzy, Fram, or Saga. +One can directly access the NIRD project area from the login nodes of Betzy, Fram, or Saga. + +The path for tiered storage(TS) project is + +`/nird/projects/NSxxxxK` + +and the path for datalake (DL) project is + +`/nird/datalake/NSxxxxK` + +where ` NSxxxxK` is the ID of the project. + + + +```{warning} + +To avoid performance impact and operational issues, NIRD $HOME and project +areas are _not_ mounted on any of the compute nodes of the HPC clusters. +``` + diff --git a/_sources/files_storage/nird/snapshots_lmd.md.txt b/_sources/files_storage/nird/snapshots_lmd.md.txt new file mode 100644 index 000000000..d86c7594c --- /dev/null +++ b/_sources/files_storage/nird/snapshots_lmd.md.txt @@ -0,0 +1,55 @@ +# Snapshots on NIRD + +Both home directories (`/nird/home/$USER`) and all project areas in +NIRD TS (`/nird/projects/NSxxxxK`) and NIRD DL (`/nird/datalake/NSxxxxK`) +have temporary backup in the form of snapshots. + +Snapshots are taken with the following frequencies: +* `/nird/home/$USER`: + - daily snapshots for the last 7 days + - weekly snapshots for the last 6 weeks + +* `/nird/projects/NSxxxxK`: + - daily snapshots for the last 7 days + - weekly snapshots for the last 6 weeks + +* `/nird/datalake/NSxxxxK`: + - daily snapshots for the last 7 days + - weekly snapshots for the last 6 weeks + +```{warning} + Kindly note that snapshots are temporary and if the datasets needs higher + level of security and permanent back up, project leaders must use {ref}`backup service`. +``` + +## Where the snapshots are located + +The NIRD `$HOME` and NS project snapshots are available under: +- `/nird/home/.snapshots` +- `/nird/projects/NSxxxxK/.snapshots` +- `/nird/datalake/NSxxxxK/.snapshots` + +A deleted/overwritten file in the home directory on NIRD can be recovered like this: + +```console +$ cp /nird/home/.snapshots/DATE/$USER/mydir/myfile /nird/home/$USER/mydir/ +``` +Note that snapshots are taken every night only. This means that deleted files +which did not exist yet yesterday cannot be recovered from snapshots. 
+
To recover a deleted or overwritten file in NIRD TS, e.g. `/nird/projects/NSxxxxK/dataset1/myfile`,
you can copy a snapshot back to the folder and restore the deleted/overwritten file like this:

```console
$ cp /nird/projects/NSxxxxK/.snapshots/DATE/dataset1/myfile /nird/projects/NSxxxxK/dataset1/
```

To recover a deleted or overwritten file in NIRD DL, e.g. `/nird/datalake/NSxxxxK/dataset1/myfile`,
you can copy a snapshot back to the folder and restore the deleted/overwritten file like this:

```console
$ cp /nird/datalake/NSxxxxK/.snapshots/DATE/dataset1/myfile /nird/datalake/NSxxxxK/dataset1/
```

Select the DATE according to your case.

diff --git a/_sources/files_storage/nird/storage-areas_lmd.md.txt b/_sources/files_storage/nird/storage-areas_lmd.md.txt new file mode 100644 index 000000000..80be0f567 --- /dev/null +++ b/_sources/files_storage/nird/storage-areas_lmd.md.txt @@ -0,0 +1,48 @@ +# Storage areas, quota, and backup


## Home directories

Each user has a home directory `/nird/home/<username>`, where
`<username>` is the username. The default quota for home directories
is 60 GiB and 300 000 files. To check the disk usage and quotas, type:

```console
$ dusage
```


## Scratch directories

The total storage space of `/scratch` is 30 TiB.
Each user has a scratch directory `/nird/scratch/<username>`.
The area is meant as a temporary scratch area. This area is not backed up.
There is no quota in the scratch area.

The `/scratch/` area enforces an automatic cleanup strategy: files in this
area will be deleted after 21 days.
If file system usage reaches 75%, files in `/scratch/` will be deleted even
before 21 days.


## Project area

Each NIRD Data Storage project gets a project area either on NIRD TS `/nird/projects/NSxxxxK`
or on NIRD DL `/nird/datalake/NSxxxxK` based on the project allocation,
where `NSxxxxK` is the ID of the project.

The project area has a quota on disk space and the number of files.

Quotas are allocated by the Resource Allocation Committee (RFK) on the NIRD resources for project storage, that is, on NIRD TS (`/nird/projects`) and NIRD DL (`/nird/datalake`). The two resources have separate quotas based on the project allocation.

You can see the quota and the current usage by running:

```console
$ dusage -p NSxxxxK
```

```{note}
Notice that quotas are shown in TiB (tebibyte), and not TB (terabyte).
```


diff --git a/_sources/files_storage/nird/ts_dl.md.txt b/_sources/files_storage/nird/ts_dl.md.txt new file mode 100644 index 000000000..381814c86 --- /dev/null +++ b/_sources/files_storage/nird/ts_dl.md.txt @@ -0,0 +1,50 @@ +# NIRD TS vs NIRD DL

NIRD consists of two separate storage systems, namely Tiered Storage (NIRD TS) and
Data Lake (NIRD DL).

NIRD TS has several tiers spanned by a single filesystem. It is designed for performance and is used mainly for active project data.

NIRD DL has a flat structure, designed mainly for less active data, sharing data across multiple projects, and interfacing with external storages.

Both are based on the IBM Elastic Storage System.


## Architecture comparison

| | NIRD TS | NIRD DL |
| :------------- | :------------- | :------------- |
| Tiers | Performance and capacity tiers <br> Automatic, transparent tiering <br> Dedicated pools for metadata | Flat architecture (no tiers) |
| Designed for | - active project data <br> - data processing <br> - AI workloads | - less active data <br> - data libraries <br> - sharing data across multiple projects <br> - interfacing with external storages |
| Data integrity secured by | - erasure coding <br> - snapshots <br> - backup[^1] | - erasure coding <br> - snapshots |

## Functionality comparison

| | NIRD TS | NIRD DL |
| :------------- | :------------- | :------------- |
| Protocols | POSIX, GPFS and NFS | POSIX, GPFS and S3[^2] |
| APIs | GPFS, Discover REST API[^3] | GPFS, S3, Discover REST API[^3] |
| Possibilities for | - file access logs <br> - data insight: metadata harvesting[^3] | - file access logs <br> - data insight: metadata harvesting[^3] <br> - encrypted projects |
| Access controls | - ACLs <br> - extended attributes | - ACLs <br> - extended attributes <br> - RBAC via S3[^2] |

## Filesystems

### NIRD TS
- Project storage `/nird/projects`
- User’s home `/nird/home`
- Scratch storage `/nird/scratch`[^4]
- Archive `/archive`[^5]

### NIRD DL
- Project storage `/nird/datalake`
- Backup `/backup`[^5]
- Archive `/archive`[^5]



--
[^1]: optional, see [backup page](backup_lmd.md)
[^2]: to be enabled Q4 2023
[^3]: available at the moment only for internal purposes, plans for testing with pilot projects
[^4]: available on NIRD login nodes only
[^5]: not accessible to users \ No newline at end of file diff --git a/_sources/files_storage/nird_lmd.md.txt b/_sources/files_storage/nird_lmd.md.txt new file mode 100644 index 000000000..e0ac196dd --- /dev/null +++ b/_sources/files_storage/nird_lmd.md.txt @@ -0,0 +1,75 @@ +(nird)=


# NIRD
## National Infrastructure for Research Data

**NIRD** is the **N**ational e-**I**nfrastructure for **R**esearch **D**ata. It
is owned by [Sigma2](https://www.sigma2.no) and operated by [NRIS](https://www.sigma2.no/nris).

```{note}
NIRD offers [storage services](https://www.sigma2.no/data-storage), [archiving services](https://www.sigma2.no/research-data-archive), [cloud services](https://www.sigma2.no/nird-service-platform) and processing capabilities on the stored data. It offers services
and capacities to any scientific discipline that requires access to
advanced, large scale, or high-end resources for storing, processing,
publishing research data or searching digital databases and collections.

NIRD is a high-performance storage system, capable of supporting AI and analytics workloads, offering simultaneous multi-protocol access to the same data.
```

The next generation NIRD storage system is installed in the [Lefdal Mine Datacenter](https://www.sigma2.no/data-centre-facility).
The new NIRD is redesigned for the evolving needs of Norwegian researchers and has
been procured through [the NIRD2020 project](https://www.sigma2.no/procurement-project-nird2020).


NIRD provides storage resources with yearly capacity upgrades, data security through backup services and adaptable application services,
multiple storage protocol support, migration to third-party cloud providers and much more. Alongside the national high-performance computing resources, NIRD forms the backbone of the national e-infrastructure for research and education in Norway, connecting data and computing resources for efficient provisioning of services.


### Technical Specifications


#### Hardware
**NIRD** consists of two separate storage systems, namely Tiered Storage (NIRD TS) and Data Lake (NIRD DL). The total capacity of the system is 49 PB (24 PB on NIRD TS and 25 PB on NIRD DL).

NIRD TS has several tiers spanned by a single filesystem. It is designed for performance and is used mainly for active project data.

NIRD DL has a flat structure, designed mainly for less active data. NIRD DL provides unified access, i.e., file- and object storage, for sharing data across multiple projects and interfacing with external storages.

NIRD is based on the IBM Elastic Storage System, built using ESS3200, ESS3500 and ESS5000 building blocks. I/O performance is ensured by IBM POWER servers for I/O operations, with dedicated data movers, protocol nodes and more.

| NIRD | | |
| :------------- | :------------- | :------------- |
| System | Building blocks | IBM ESS3200 <br> IBM ESS3500 <br> IBM ESS5000 <br> IBM POWER9 |
| Clusters | Two physically separated clusters | NIRD TS <br> NIRD DL |
| Storage media | NIRD TS <br> NIRD DL | NVMe SSD & NL-SAS <br> NL-SAS |
| Capacity | Total capacity: 49 PB | NIRD TS: 24 PB <br> NIRD DL: 25 PB |
| Performance | Aggregated I/O throughput | NIRD TS: 209 GB/s <br> NIRD DL: 66 GB/s |
| Interconnect | 100 Gbit/s Ethernet | NIRD TS: balanced 400 Gbit/s <br> NIRD DL: balanced 200 Gbit/s |
| Protocol nodes | NFS <br> S3 | 4 x 200 Gbit/s <br> 5 x 50 Gbit/s |


#### Software
IBM Storage Scale (GPFS) is deployed on NIRD, providing software-defined, high-performance file and object storage for AI and data-intensive workloads.

Insight into the data is ensured by IBM Storage Discover.

Backup services and data integrity are ensured with IBM Storage Protect.

## In-depth documentation for NIRD

```{eval-rst}
.. toctree::
   :maxdepth: 1

   nird/access_lmd.md
   nird/storage-areas_lmd.md
   nird/snapshots_lmd.md
   nird/backup_lmd.md
   nird/mounts_lmd.md
   nird/ts_dl.md
   nird/faq_nird.md
   sharing_files.md
```


diff --git a/_sources/files_storage/performance.md.txt b/_sources/files_storage/performance.md.txt new file mode 100644 index 000000000..0b6ef4410 --- /dev/null +++ b/_sources/files_storage/performance.md.txt @@ -0,0 +1,146 @@ +(storage-performance)=

# Optimizing storage performance


## What to avoid

- Avoid having a **large number of files in a single directory** and
  rather split files into multiple sub-directories.
- **Avoid repetitive `stat` operations** because they can create a significant
  load on the file system.
- **Do not use `ls -l`** on large directories, because it can be slow. Rather
  use `ls` and run `ls -l` only for the specific files you need
  extended information about.

(lustre-filesystem)=
## Lustre file system (Betzy and Fram)

To get the best throughput on the scratch file system (`/cluster/work`), you may
need to change the data striping. Striping shall be adjusted based on the
client access pattern to optimally load the object storage targets (OSTs).
On Lustre, the OSTs refer to the disks or storage volumes that make up the
whole file system.

The `stripe_count` indicates how many OSTs to use.
The `stripe_size` indicates how much data to write to one OST before moving to
the next OST.

* Striping takes effect only on new files, created or copied
  into the specified directory or file name.
* The default `stripe_count` on the `/cluster` file system on Fram is 1.
* Betzy implements Progressive File Layouts to dynamically set the file stripe
  size based on file size growth.

For more detailed information on striping, please consult the
[Lustre](https://www.lustre.org/) documentation.

```{note}
**Betzy: Progressive File Layouts**

PFL removes the need to explicitly specify striping for each file,
assigning different Lustre striping characteristics to contiguous
segments of a file as it grows.
Dynamic striping allows lower overhead for small files and assures
increased bandwidth for larger files.
However, note that for workloads with significant random read phases it is
best to manually assign stripe size and count.

**Betzy: Data on Metadata**

Lustre file system performance is optimized for large files. To balance
that, data on metadata (DoM) is enabled on Betzy to ensure higher
performance in case of frequently accessed small files.
Files accessed with a size of 2KB or smaller will be stored on a very
fast NVMe JBOD directly connected to the metadata servers.
```


### How to find out the current striping

To see the current stripe size (in bytes), use the `lfs getstripe [file_system, dir, file]`
command.
e.g.: +```console +$ lfs getstripe example.txt + +example.txt +lmm_stripe_count: 1 +lmm_stripe_size: 1048576 +lmm_pattern: raid0 +lmm_layout_gen: 0 +lmm_stripe_offset: 75 + obdidx objid objid group + 75 54697336 0x3429d78 0 +``` + + +### Rules of thumb to set stripe counts + +For best performance we urge you to always profile the I/O characteristics of +your HPC application and tune the I/O behavior. + +Here is a list of rules you may apply to set stripe count for +your files: +- files smaller than 1 GB: default striping +- files size between 1 GB - 10 GB: stripe count 2 +- files size between 10 GB - 1 TB: stripe count 4 +- files bigger than 1 TB: stripe count 8 + + +### Large files + +For large files it is advisable to increase stripe count and perhaps chunk size, +too. e.g.: +```bash +# stripe huge file across 8 OSTs +$ lfs setstripe --stripe-count 8 "my_file" + +# stripe across 4 OSTs using 8 MB chunks. +$ lfs setstripe --stripe-size 8M --stripe-count 4 "my_dir" +``` + +It is advisable to use higher stripe count for applications that +write to a single file from hundreds of nodes, or a binary executable that +is loaded by many nodes when an application starts. + +Choose a stripe size between 1 MB and 4 MB for sequential I/O. Larger than 4 MB +stripe size may result in performance loss in case of shared files. + +Set the stripe size a multiple of the write() size, if your application is +writing in a consistent and aligned way. + + +### Small files + +For many small files and one client accessing each file, change stripe count to 1. +Avoid having small files with large stripe counts. This negatively impacts the +performance due to the unnecessary communication to multiple OSTs. +```console +$ lfs setstripe --stripe-count 1 "my_dir" +``` + +(saga-filesystem)= +## BeeGFS filesystem (Saga) + +Striping in BeeGFS (`/cluster`) cannot be re-configured on Saga by users, it can currently +only be modified by system administrators. + +But one thing you can do is to check the current stripe size (here for an example file): +```console +$ beegfs-ctl --getentryinfo example.txt + +Entry type: file +EntryID: 55-628B3E1D-144 +Metadata node: mds3-p2-m4 [ID: 324] +Stripe pattern details: ++ Type: RAID0 ++ Chunksize: 512K ++ Number of storage targets: desired: 4; actual: 4 ++ Storage targets: + + 1203 @ oss-4-1-stor2 [ID: 12] + + 2101 @ oss-4-2-stor1 [ID: 21] + + 2102 @ oss-4-2-stor1 [ID: 21] + + 2103 @ oss-4-2-stor1 [ID: 21] +``` + +This shows that this particular file is striped over 4 object storage targets. diff --git a/_sources/files_storage/quota.md.txt b/_sources/files_storage/quota.md.txt new file mode 100644 index 000000000..cbac45a2e --- /dev/null +++ b/_sources/files_storage/quota.md.txt @@ -0,0 +1,236 @@ +(storage-quota)= + +# Storage quota + +```{contents} Table of Contents + +``` + +```{admonition} Frequently asked questions +- **I cannot copy files although we haven't used up all space**: + You have probably exceeded the quota on the number of files. + +- **I have moved files to the project folder but my home quota usage did not go down**: + Depending on the cluster, moving files does not change ownership of the files. + You need to also change the ownership of the files in the project folder from + you to the project (change the ownership from `username_g` to `username`; see + also below). +``` + +## What is quota and why is it needed? 
+ +**Storage is a shared and limited resource** and in a number of places we need to +enforce quota to avoid that some script accidentally fills up the disk and the +system becomes unusable for everybody. + +Storage quota is specified in: + +- **Number of files** (or "inodes"): limits how many files you or a group may own. + When this limit is reached, you or the group cannot create new files (but you + might still increase the size of existing files). "Inodes" are entries + in the index node table which store attributes and disk block locations + for each file and folder. +- **Space limit**: affects the aggregated + size of all your files or files of a group. When this limit is reached you + or the group cannot store more data (new data or increasing file sizes) on + the system. + +## Quota applies to specific folders + +Often it is intended that storage quota applies to a specific folder on the +file system. For example, the so-called HOME quota shall apply to your home +folder `/cluster/home/user`. A project may have dedicated quota for data +stored under their project folder which is found under +`/cluster/projects/nnABCDk` where `nnABCDk` is the account name of your +project. + +Because file systems have different +features, unfortunately it is not always guaranteed that what you observe on +the system matches this intention. Below, we will discuss how to detect and +troubleshoot such situations. + +## Getting information about your usage and quota + +We can get an overview with the `dusage` command. This is not a built-in +Unix command but rather a tool which [we have +developed](https://github.com/NordicHPC/dusage) for NRIS clusters to wrap +around lower-level commands and tools to get a quick overview. The actual +output might be different for every user: + +```console +$ dusage + +dusage v0.3.0 + + path space used quota (s) quota (h) files quota (s) quota (h) +------------------------- ------------ ----------- ----------- --------- ----------- ----------- + /cluster/ 14.5 GiB 235 068 + /cluster/home/**** 5.4 GiB 20.0 GiB 20.0 GiB 39 921 100 000 100 000 + /cluster/work/users/**** 7.6 GiB 195 083 +/cluster/projects/nn****k 1.7 TiB 2.0 TiB 2.0 TiB 1 036 089 2 000 000 2 000 000 +/cluster/projects/nn****k 3.0 TiB 10.0 TiB 10.0 TiB 2 805 458 10 000 000 10 000 000 +/cluster/projects/nn****k 134.2 MiB 1.0 TiB 1.0 TiB 168 1 000 000 1 000 000 + +Please report issues at: https://github.com/NordicHPC/dusage +``` + +The column "files" (number of files) actually lists inodes and we know that +these are not precisely the same thing but we have chosen the name "files" +since it is hopefully more intuitive to the users who may have never heard of +"inodes". + +````{admonition} What are inodes? +[Inodes](https://en.wikipedia.org/wiki/Inode) are entries in the index node +table which store attributes and disk block locations for each file and folder. +If you want to see the inode numbers for your files and folders, +try: +```console +$ ls -li +``` +```` + +## Troubleshooting: Disk quota is full + +- **This can be surprising for users and difficult to debug for staff**: + + - On Saga and Fram: Depending on the state of the file system there can be a + lag between going over quota and experiencing "Disk quota exceeded" errors. + - On Saga and Fram: If you moved files and kept wrong group permissions, this + can exceed quota but we have overnight scripts which fix group permissions + so it can look good again next morning. 
+ - `dusage` can indicate that you are above quota although `du` may show that + there are almost no files or data used: the reason is that moving files + does not change ownership and in this case `du` and `dusage` can give a different + information. Only `dusage` gives you reliable information about how your + quota is affected. + +- **Recovery on Fram and Saga**: + + - Moving files to project data or `$USERWORK` may not be enough since `mv` + preserves group permissions. Therefore you have the following options: + - Copy files and then carefully delete the files in `$HOME`. + - Move files and adjust group permission with `chown` or `chgrp`. + - Move files and wait overnight for our scripts to adjust them for you. + +- **Recovery on Betzy**: + + - Try to move data from `$HOME` to project data. + - Consider using `/cluster/work/users/$USER` (`$USERWORK`). But also mind + that files older than 21 days might get automatically deleted and + no recovery option exists then (auto-cleanup period is at least 21 days and + up to 42 days if sufficient storage is available). + - If the above are not enough or not suitable, contact support and discuss + whether it can make sense to increase project or user quota. + +- **Recommendations**: + - If you tend to fill up quota in your job scripts, add a `dusage` at the + beginning and at the end of the job script. Having the output will make + diagnostics easier. If you don't `dusage` right when you run the job, then + a job crash and a later `dusage` may tell different stories. + - `rsync` users: Please be careful adjusting the group ownership on Saga and + Fram. + +## Troubleshooting: Too many files/inodes on Fram + +Fram has a default 1 million inode quota for each user under `/cluster` filesystem regardless of project and group inode quota : + +```{code-block} +--- +emphasize-lines: 3 +--- + path space used quota (s) quota (h) files quota (s) quota (h) +------------------------- ------------ ----------- ----------- ------- ----------- ----------- + /cluster/ 247.4 GiB 70 225 1 000 000 3 000 000 + /cluster/home/**** 1.8 GiB 20.0 GiB 30.0 GiB 44 395 100 000 120 000 + /cluster/work/users/**** 243.8 GiB 3 763 +/cluster/projects/nn****k 927.4 GiB 1.0 TiB 1.1 TiB 434 697 1 048 576 1 150 976 +/cluster/projects/nn****k 2.3 TiB 10.0 TiB 11.0 TiB 665 879 10 000 000 11 000 000 +/cluster/projects/nn****k 4.0 KiB 1.0 TiB 1.0 TiB 1 1 000 000 1 000 000 +``` + +We can think of "inodes" as files or file chunks. + +This means that on Fram it is possible to fill the "files"/inode quota by +putting more than 1 M files in `/cluster/work/users/user` although the latter +is not size-quota controlled. + +To check the number of inodes in a directory and subsequent subdirectories, use the following command: + +```console +$ find . -maxdepth 1 -type d -exec sh -c 'echo -n "{}: "; find "{}" -type f | wc -l' \; | sort -n -k2 -r + +/cluster/home/user: 75719 +/cluster/home/user/.conda: 39222 +/cluster/home/user/.rustup: 20526 +/cluster/home/user/work: 11983 +/cluster/home/user/project: 1134 +/cluster/home/user/something: 602 +``` + +The above command counts the number of files in each directory and lists them +sorted with the most numerous directory on top. + +Please contact support if you are in this situation and we can then together evaluate +whether it makes sense to increase the inode quota for you. + +## Troubleshooting: Too many files in a Conda installation + +- A Conda installation can fill your storage quota because it can install + thousands of files. 
+- **Recommendation**: Do not install a Conda environment into `$HOME`. +- **Recovery** from a `$HOME`-installed Conda environment: + - Install a new environment into project data or `$USERWORK` and then delete + the `$HOME`-installed Conda environment. + But also mind + that files older than 21 days might get automatically deleted and + no recovery option exists then (auto-cleanup period is at least 21 days and + up to 42 days if sufficient storage is available). + - Advanced alternative: Use a Singularity container for the Conda environment. + +## Changing file ownership on Fram or Saga + +```{note} +This section is **not relevant for Betzy** as disk quotas on Betzy are based on +directories instead of groups. +``` + +Since file permissions are persistent across the file system, it might be +necessary to manually change the ownership of one or more files. This page +will show an example of how to change ownership on a file that was moved from +`$HOME` to `$USERWORK` in order to update the disk quotas. + +In this example we have a file in our `$HOME` called "myfile.txt" which is 1 +GiB in size that we're moving to `$USERWORK` for use in a job: + +```console +$ ls -l + +total 1048576 +-rw-rw-r-- 1 username username_g 1073741824 Nov 13 13:11 myfile.txt +``` + +```console +$ mv myfile.txt /cluster/work/users/username +``` + +By checking our disk usage with `dusage` we could confirm that the file is still +counted towards the `$HOME` quota. The reason for this is that the file is +still owned by the `username_g` group, which is used for the `$HOME` quota.: + +Files in `$USERWORK` should be owned by the default user group, in this - the +group named `username`. To change the file group ownership we can use the +command `chgrp`: + +```console +$ chgrp username myfile.txt +``` + +```console +$ ls -l + +total 1048576 +-rw-rw-r-- 1 username username 1073741824 Nov 13 13:11 myfile.txt +``` + +The file is now owned by the correct group and we can verify that the disk +quotas have been updated by running `dusage` again. diff --git a/_sources/files_storage/sharing_files.md.txt b/_sources/files_storage/sharing_files.md.txt new file mode 100644 index 000000000..853beae44 --- /dev/null +++ b/_sources/files_storage/sharing_files.md.txt @@ -0,0 +1,76 @@ +# Data handling and storage policy + +```{warning} +**User areas and project areas are private** + +You can share files with other project members using project areas. +``` + +All data accessed, stored, communicated, or transferred on any national HPC +system (Betzy, Fram and Saga) or the National e-Infrastructure for Research Data (NIRD), +must be handled in compliance to legal and regulatory requirements. + +In addition, all data has to be directly related to the work effectuated and/or +the research project(s) the user is participating. + + +## User areas + +User's private data (such as keys, sessions, e-mails, etc.) may reside in their +home directory (`$HOME`). +`$HOME` **is not a shared area** and all data stored there has to be treated as +being private, regardless of its content. + +To limit access to `$HOME` only to the user and designated system administrators, +the directory permissions are set to 0700 (meaning: only the user can read, write, and execute). +Permissions are regularly controlled, and in case of mismatch, reset. + +On the HPC clusters, users also have a *user work area*, +`/cluster/work/users/$USER` (`$USERWORK`). 
It is possible to grant other users access here, f.ex +for debugging purposes, but you may not grant *write* access to *others*. Be mindful of the permissions +you set here, as you may inadvertently allow others to delete or modify your files. + + +## Project areas + +Project data is private to the project and shared between the project members. +The project leader (PL) has sole discretion over project members, thus access +to the project area(s). + +Project local to a particular HPC system has its own directory, created with +permissions set to 2770 (meaning that only the group can read, write, and execute). + +Group ownership is regularly controlled for each project directory and reset in +case needed to the group ID. This is required for storage accounting purposes. + + +## Shared project areas + +In special cases there might be a need for sharing data between projects for +collaboration and possibly preventing data duplication. + +If such a need is justified, a meta-group and the corresponding directory can be +created. Access to the shared project area is at the project leader's (PL) sole discretion. +For example, if the PL of the project owning the file group `acme` wants +`/cluster/shared/acme/inventory_db` to be world readable, the project leader is allowed to do this +(e.g., by running, `chmod -R o+r /cluster/shared/acme/inventory_db`). + +Please note that: +- The shared project areas **must not** contain any private data. +- You **must never** set any directory or file to world writable. + +For accounting purposes, the group ownerships are regularly controlled, and +in case needed, reset. + +If you need to share with the outside world (outside of your HPC systems), +please refer to our [File transfer documentation](file_transfer.md). + +## Decommissioning + +Starting at the 2020.1 resource allocation period, storage decommissioning + procedures have been established for both HPC and NIRD project storages, + to make storage more predictable for the projects and the provisioning + more sustainable to Sigma2. + For more details, please visit the +[data decommissioning policies](https://www.sigma2.no/data-decommissioning-policies) + page. diff --git a/_sources/getting_help/course_resources.md.txt b/_sources/getting_help/course_resources.md.txt new file mode 100644 index 000000000..d01bbf9d4 --- /dev/null +++ b/_sources/getting_help/course_resources.md.txt @@ -0,0 +1,8 @@ +(course-resources)= + +# CRaaS - Course Resources as a Service + +CRaaS is a service for researchers who require e-infrastructure resources to be +used in a course or a workshop for research purposes. + +For further details, see [here](https://www.sigma2.no/course-resources-as-a-service) diff --git a/_sources/getting_help/extended_support.rst.txt b/_sources/getting_help/extended_support.rst.txt new file mode 100644 index 000000000..1fc28a6dc --- /dev/null +++ b/_sources/getting_help/extended_support.rst.txt @@ -0,0 +1,40 @@ +.. _extended-support: + +Extended support +================ + +NRIS provides resources, such as computers, networks, storage systems and a software stack that is adapted to the current research being performed in Norway. This provides the community with a valuable resource. + +However, what we believe is even more valuable is the fact that you as a user have easy access to a broad selection of competences that can be utilized to improve the quality, reproducibility and efficiency of your current flow of work. 
Our staff are at the forefront of digital competence and can assist you in reaching post state-of-the-art and address that competitive edge you want to reach or keep. We also possess domain competence in key fields and can not only provide digital competence, but also adapt this to the challenge you are having. Together we can thus ensure that this is addressed in the most relevant, efficient and result oriented way. + +Even though our digital competence is at the forefront, that does not mean we only provide assistance and appreciate complex state-of-the-art challenges. On the contrary, we would really like to help you and your activity, regardless where you are coming from. In fact, sometimes nothing feels better than seeing you and your flow or work evolve after getting a little help in the start. + +Whether that challenge is small, large, complex or trivial does not matter, please `reach out `_ and we will find a common ground for collaboration. Using that as an entry point we can figure out together how, who and what kind of competences we should involve to address the challenge. And for the most part this is free of charge. + +A few examples of what we can offer you: + +- Figure out what kind of competence you and your team might be lacking and put you in contact with sources of such competences and/or provide training and competence transfer. + +- Introduce you and your team to modern practices with respect to code development, maintenance and support. + +- Improving the digital and/or scientific quality of your code base. + +- Contribute to porting your code base to be ready for high-performance compute facilities. + +- Contribute to porting your code base to be ready for utilization of GPU resources. + +- Optimizing your code to run more efficient on high-performance compute facilities, including the utilization of GPUs, or the combinations of CPUs and GPUs. + +- Assist in developing and facilitating for more automation, reproducibility and high-throughput possibilities in your flow of work. For instance developing workflows that ensures data provenance. + +- Competence and insight that sits close to the domain topic of interest. Sometimes it is not enough to have deep digital competence to address a challenge, but one also needs overview of the role and character of this challenge for your flow of work. Typically, optimizing a specific solver might incur unwanted effects on other parts of the program which is hard to understand unless you also understand what the solvers are actually doing. + +If you are interested in reading more about our specific stimuli frameworks please take a look at :ref:`Extended User Support (EUS) `, :ref:`Advanced User Support (AUS) `, :ref:`GPU Support `, :ref:`National Competence Center (NCC) `). But we encourage you to contact us first to spare you the acronyms and letting us do the work of coordinating how to facilitate further progress on your challenge. + +.. toctree:: + :maxdepth: 1 + + extended_support/eus.md + extended_support/aus.md + extended_support/gpu.md + extended_support/ncc.md diff --git a/_sources/getting_help/extended_support/aus.md.txt b/_sources/getting_help/extended_support/aus.md.txt new file mode 100644 index 000000000..dadf981f1 --- /dev/null +++ b/_sources/getting_help/extended_support/aus.md.txt @@ -0,0 +1,9 @@ +(extended-support-aus)= + +# Advanced User Support (AUS) + +Advanced User Support offers services to provide specialised and more in-depth +competence to a research group or community. 
+ +For service description, applying for AUS and further details, follow the +documentation [here](https://www.sigma2.no/advanced-user-support). diff --git a/_sources/getting_help/extended_support/eus.md.txt b/_sources/getting_help/extended_support/eus.md.txt new file mode 100644 index 000000000..fb1ac5a62 --- /dev/null +++ b/_sources/getting_help/extended_support/eus.md.txt @@ -0,0 +1,8 @@ +(extended-support-eus)= + +# Extended User Support + +For a smaller adoptions, optimisations and changes which are exceeding ordinary support, you can ask support staff for extended user support (formerly called mini Advanced User Support). Extended user support can also be initiated by the support staff, typically if they see that this could be of a relevance, f.ex. in connection with in-effective jobs. In this case you will be contacted by the support staff and offered the Extended User Support. + +Extended User Support projects are not requiring any funding or in-kind from the project or user. The total amount of work effort must not exceed **5 working days**. A likely outcome is that Extended User Support could lead to formulation of a proper AUS project, which needs an application and evaluation, but in turn could assign a few months of dedicated work from one of our experts. Again, see the +documentation [here](https://www.sigma2.no/advanced-user-support). diff --git a/_sources/getting_help/extended_support/gpu.md.txt b/_sources/getting_help/extended_support/gpu.md.txt new file mode 100644 index 000000000..f42ea6ca1 --- /dev/null +++ b/_sources/getting_help/extended_support/gpu.md.txt @@ -0,0 +1,23 @@ +(extended-support-gpu)= + +# GPU Support +GPU (**G**raphics **P**rocessing **U**nit) are accelerators that can be +efficiently utilized in HPC to speed-up calculations that deal with lots of +data. GPUs have successfully been used to speed-up applications such as machine +learning, weather forecasting, molecular dynamics, among many other +applications. + +At NRIS we have a dedicated team to support researchers in utilizing the +different accelerators at your disposal. You can contact the team through the +{ref}`normal support channel `. + +Example of activities that NRIS supports: +- Helping to use Slurm and GPUs +- Ensure software is set up for GPUs +- Advice about how to get starting using GPUs +- Transition to using GPUs +- Optimizing existing GPU implementations + - With a focus on software developed at the universities in Norway +- Training + - Both from NRIS and in collaboration with vendors +- Documentation and [tutorials](code_development). diff --git a/_sources/getting_help/extended_support/ncc.md.txt b/_sources/getting_help/extended_support/ncc.md.txt new file mode 100644 index 000000000..72de0b338 --- /dev/null +++ b/_sources/getting_help/extended_support/ncc.md.txt @@ -0,0 +1,5 @@ +(extended-support-ncc)= + +# National Competence Center + +The National Competence Center (NCC) raises awareness and provides Norwegian industry with the expertise necessary to take advantage of the innovation possibilities created by HPC (High-Performance Computing) and associated technologies. This includes HPDA (High-Performance Data Analytics), ML (Machine learning) and AI (Artificial intelligence) and auxiliary technologies necessary to realize successful utilization of those. The NCC thus strive to increasing the competitiveness of entities in the industry. NORCE, SINTEF and Sigma2 are partners in the NCC and it is supported by the EU and Norwegian Research Council. 
The NCC is a resource for the industry and public sector where competence is available, not only in the digital domain, but also, importantly in the intersection between the respective domains. As such, the NCC can be used by the industry or public sector to realize a faster and more efficient digital shift, in addition to being a hub for on-demand competence and resource access for entities not willing to take on this investment on their own. The NCC can also be utilized if entities want to establish digital competence and/or facilities to provide compute or storage resources on their own, and need advice or consulting to realize it. diff --git a/_sources/getting_help/faq.md.txt b/_sources/getting_help/faq.md.txt new file mode 100644 index 000000000..6ffb898c6 --- /dev/null +++ b/_sources/getting_help/faq.md.txt @@ -0,0 +1,228 @@ +# Frequently asked questions + + +## Access and connections + +### How do I change my password? + +Please consult {ref}`lost-passwords`. + + +### I forgot my password - what should I do to recover it? + +Please consult {ref}`lost-passwords`. + + +### What is the ssh key fingerprint for our systems? + +Please consult {ref}`this page `. + + +### Connecting to the cluster + +Typically users connect to our clusters with an SSH client. Please consult {ref}`this page ` for additional details. + + +### How can I access a compute node from the login node? + +Log in to the login node, for instance Fram: +```console +$ ssh myusername@fram.sigma2.no +``` + +Then connect to the compute node (on Fram and Saga): +```console +$ ssh c3-5 +``` + +Or on Betzy: +```console +$ ssh b4296 +``` + +Notice that you typically can only log into a compute node where you have a running job. + + +### My ssh connections are freezing. How to fix it? + +If your ssh connections more or less randomly are freezing, try +to add the following to `~/.ssh/config` file on your computer/laptop: +```cfg +ServerAliveCountMax 3 +ServerAliveInterval 10 +``` + +The above configuration is for [OpenSSH](https://www.openssh.com), if you're +using +[PUTTY](https://www.chiark.greenend.org.uk/~sgtatham/putty/docs.html) +you can take a look at this page explaining +[keepalives](https://the.earth.li/~sgtatham/putty/0.60/htmldoc/Chapter4.html#config-keepalive) +for a similar solution. + +--- + +## Installing software + +### I need to use Python but I am not satisfied with system default + +You can choose different Python versions using either the {ref}`module-scheme` or +{ref}`Anaconda/Miniconda `. +In Anaconda, you +typically load first the Anaconda module you like and then from within that you +can chose and configure the Python version and environment. Please consult the +[Anaconda documentation](https://docs.anaconda.com/) for details. + +In cases where these routes still do not solve your problem or you would like +to install a package yourself, please consult this +page about {ref}`installing-software-as-user`. +If you are still stuck or would like +support, please {ref}`contact us `. + + +### Can I install software as a normal user without sudo rights or a root account? + +Yes. In fact, this is the recommended approach to install software that we do +not offer to all users. +Please consult this +page about {ref}`installing-software-as-user`. + + +--- + +## Compute and disk usage, in addition to allocated quota + +### How can I check my disk quota and usage? + +Please consult the page on {ref}`storage-quota`. + + +### How can I check my CPU hours quota and usage? 
+ +Please consult the page on {ref}`projects-accounting`. + + +--- + +## Graphical interfaces + +### How can I export the display from a compute node to my desktop? + +Please consult this note on {ref}`x11-forwarding`. + +This example assumes that you are running an X-server on your local +desktop, which should be available for most users running Linux, Unix +and Mac Os X. If you are using Windows you must install some X-server +on your local PC. + + +--- + +## Jobs, submission, and queue system + +### I am not able to submit jobs longer than the maximum set walltime + +For all {ref}`job-types` there is a maximum walltime. If you try to set a +walltime that is larger than this, the job will not be accepted when you submit it. We recommend you +to try to segment the job using {ref}`job-scripts`. If this does not suit your need, +please {ref}`contact us `. The main +intention to have a limit on the max walltime is to make sure the queue system works as best as possible and +as such would give a better experience for most users. + + +### Where can I find an example of job script? + +Here we have examples for {ref}`job-scripts-on-fram` and {ref}`job-scripts-on-saga`. + + +### When will my job start? + +To find out approximately when the job scheduler thinks your job will +start, use the command: +```console +$ squeue --start -j +``` + +where `` is the number of the job you want to check. +This command will give you information about how many CPUs your job requires, +for how long, as well as when approximately it will start and complete. It +must be emphasized that this is just a best guess, queued jobs may start +earlier because of running jobs that finishes before they hit the walltime +limit and jobs may start later than projected because new jobs are submitted +that get higher priority. + + +### How can I see the queue situation of my job(s)? + +How can I see how my jobs are doing in the queue, if my jobs are idle, blocked, running etc. by issuing: +```console +$ squeue -u +``` +where `` is your username. You can of course also check the queue by not adding a username. For additional +details on how to monitor job(s), please consult page about {ref}`monitoring-jobs`. + +### Why are my devel/short/preproc jobs put in the “normal” queue even though I specify `--qos` in my job script? + +The `--qos` specified jobs, like `devel`, `short` and `preproc`, by default run in the standard partition - i.e. `normal` but will have different properties. For detailed explanation see {ref}`queue-system`. +In order to see your jobs in the devel queue, use the following command, (you can replace `devel` with `short` or `preproc` to see the respective queues) +```console +$ squeue -q devel -u +``` + +### Why does my job not start or give me error feedback when submitting? + +Most often the reason a job is not starting is that the resources are busy. Typically there are many jobs waiting +in the queue. But sometimes there is an error in the job script and you are asking for a configuration (say a combination of +memory and cores) that is not possible. In such a cases you do not always get a message that the options are invalid on submission +and they might not be, but the combination will lead to a job that never starts. + +To find out how to monitor your jobs and check their status see {ref}`monitoring-jobs`. + +**Priority** means that resources are in principle available, but someone else has +higher priority in the queue. **Resources** means the at the moment the requested +resources are not available. 
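
If you want to check which of these currently applies to one of your own pending jobs, you can ask Slurm directly. A minimal sketch (replace `123456` with your actual job ID):

```console
# show the job ID, its state, and the scheduler's current reason for waiting
$ squeue -j 123456 -o "%.18i %.9T %.30r"

# more detail, including the requested resources and a "Reason=" field
$ scontrol show job 123456
```
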
+ + +### How can I run many short tasks? + +The overhead in the job start and cleanup makes it not practical to run +thousands of short tasks as individual jobs on our resources. + +The queueing setup, or rather, the accounting system generates +overhead in the start and finish of a job. About a few seconds at each end +of the job for instance. This overhead is insignificant when running large parallel +jobs, but creates scaling issues when running a massive amount of +shorter jobs. One can consider a collection of independent tasks as one +large parallel job and the aforementioned overhead becomes the serial or +unparallelizable part of the job. This is because the queuing system can +only start and account one job at a time. This scaling problem is +described by [Amdahl's Law](https://en.wikipedia.org/wiki/Amdahl%27s_law). + +If the tasks are extremely short, you can use the example below. If you want to +spawn many jobs without polluting the queueing system, please use {ref}`array-jobs`. + +By using some shell trickery one can spawn and load-balance multiple +independent task running in parallel within one node, just background +the tasks and poll to see when some task is finished until you spawn the +next: + +```{eval-rst} +.. literalinclude:: ./files/multiple.sh + :language: bash +``` + +And here is the `dowork.sh` script: + +```{eval-rst} +.. literalinclude:: ./files/dowork.sh + :language: bash +``` + +### Another user is clogging up the queue with lots of jobs! + +The job scheduler on NRIS systems is normally configured to use a "Priority" attribute to determine which jobs to start next. This attribute increases over time (up to 7 days max), and is applied to a maximum of 10 jobs per user. There is no limit on the number of jobs or resources one user/project may request. + +Superficially this may seem like a "first come first serve" system that allows a single user to 'block' others by submitting a large amount of jobs, but in reality it is a bit more complex since jobs may be of different sizes and lengths. + +If there is a pending job with a high priority ranking that requires many CPUs for a long time, the scheduler will try to create a slot for this job in the future. As already running jobs finish up at different points in time, freeing up resources, the scheduler will attempt to squeeze in other jobs into the now-idle resource in a manner that does not extend the waiting time before the slot for the larger job is freed up in order to utilize the cluster as much as possible. + +The "fairness" of this might be debatable, but in our experience this is the least unfair method that also ensures that the systems are idle as little as possible. diff --git a/_sources/getting_help/how_to_write_good_support_requests.md.txt b/_sources/getting_help/how_to_write_good_support_requests.md.txt new file mode 100644 index 000000000..4d1d42c11 --- /dev/null +++ b/_sources/getting_help/how_to_write_good_support_requests.md.txt @@ -0,0 +1,95 @@ +(good-support-requests)= + +# Writing good support requests + +Writing descriptive and specific support requests helps the support team +understand your request quicker. Below is a list of good practices. + + +## Create a ticket + +Send an email to [support@nris.no](mailto:support@nris.no) to create a support ticket. Tickets +are tracked and have higher visibility. Everyone in the support team can see +the tickets and respond appropriately. 
+ + +## Create a ticket for each issue + +Creating a ticket for separate issues ensures that each issue is given the +appropriate priority and can be tracked easily. Adding a new issue to a +resolved or unrelated issue diminishes its visibility. + + +## Give descriptive and specific subject line + +The subject line should be descriptive and specific to the issue. "Problem on +Fram" is not descriptive enough and does not differentiate itself from other +issues. + + +## Specify the environment and intent + +Describe the system environment such as which modules and build environment +were used. Details such as compilers and script commands are also important to +write in the support mail. The support team can then replicate the environment +and reproduce the issue. + + +## Tell us what has been done + +Tell us what actually worked so far and what was attempted to solve the issue. +Often we get requests of the type "I cannot get X to run on two nodes". The +request does not mention whether either or both has ever worked or if this was +the first attempt. + + +## Create an example which reproduces the problem + +Create an example that demonstrates the problem. Examples should be easy to set +up and run, otherwise, it is time consuming if the support team needs to +diagnose the issue with only a description. Make sure that we can run the +example. Note that the support team does not access read-protected files +without your permission. + +Try to reduce the example so that the support team encounters the issue +quickly. It is easier to schedule and debug a problem which crashes after few +seconds compared to problem that happens after a few hours. + + +## Please send us full paths to examples + +Instead of telling us that the example can be found in `~/myexample/` it is +much easier for us if you give us the full path, e.g. +`/home/myuser/myexample/`. +Use `pwd` to get the full path for your current folder. + +The reason is that we don't know where `~` points to in your case. We have +hundreds of users and we do not remember usernames. For the staff `~` will +point to a different place (their home folder) and we will have to look up your +username and it's an extra step that we would prefer to avoid. + + +## Describe the original problem and intent (The XY problem) + +Often we know the solution but we don't know the problem. Please read + which happens when a user's original issue is masked +by a different problem. + +In short (quoting from ): + +- User wants to do X. +- User doesn't know how to do X, but thinks they can fumble their way + to a solution if they can just manage to do Y. +- User doesn't know how to do Y either. +- User asks for help with Y. +- Others try to help user with Y, but are confused because Y seems + like a strange problem to want to solve. +- After much interaction and wasted time, it finally becomes clear + that the user really wants help with X, and that Y wasn't even a + suitable solution for X. + +To avoid the XY problem, if you struggle with Y but really what you are +after is X, please also tell us about X. Tell us what you really want to +achieve. Solving Y can take a long time. We have had cases where after +enormous effort on Y we realized that the user wanted X and that Y was +not the best way to achieve X. 
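
To make some of the points above concrete (environment details and full paths), the commands below collect information that is often useful to paste into a support request. This is only a sketch and the job script name is just an example; include whatever is relevant for your particular problem:

```console
$ hostname           # which login node you are working on
$ pwd                # full path to the folder containing your example
$ module list        # currently loaded modules
$ cat job_script.sh  # the exact job script you submitted (file name is an example)
```
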
diff --git a/_sources/getting_help/lost_forgotten_password.md.txt b/_sources/getting_help/lost_forgotten_password.md.txt new file mode 100644 index 000000000..a1eb2aab7 --- /dev/null +++ b/_sources/getting_help/lost_forgotten_password.md.txt @@ -0,0 +1,39 @@ +(lost-passwords)= + +# Lost, expiring or changing passwords + +## How do I change my password? + +The password can be changed on [this page](https://www.metacenter.no/user/password/). +Log in using your existing username. + +## I forgot my password. How can I reset it? + +Go to [this page](https://www.metacenter.no/user/reset/) and fill in your username. +An activation key will be sent to you to the mobile number associated with your username. +A mobile number is required to get access, but we do accept foreign numbers (just make sure the number registered contain the full + callable number including correct country code etc.). If you have no number registered, you have two options: (i) use your OpenIdP or Feide account to log into your account on [this page](https://www.metacenter.no/user/login/) and enter a valid mobile number, or (ii) contact [contact@sigma2.no](mailto:contact@sigma2.no). + +Note that it can take 15 minutes for the change to propagate to the login nodes. + +See also [this page](https://www.sigma2.no/how-reset-passwords) for a step by step guide, which also indicates +the times it takes for the change to take effect. + +## My password is expiring and I am asked to change it. + +You can change your password [here](https://www.metacenter.no/user/password/). + +Note that it can take 15 minutes for the change to propagate to the login nodes. + +See also [this page](https://www.sigma2.no/how-change-passwords) for a step by step guide, which also indicates +the times it takes for the change to take effect. + + +## I typed my password wrong several times. Now it seems I can not log in. Has my account been closed? + +Your account is most likely not closed. To prevent brute-force attacks, our +firewall has temporarily blocked your computer's IP address. Please try again +in 15 minutes. + +If you still can not connect, please contact +. diff --git a/_sources/getting_help/qa-sessions.md.txt b/_sources/getting_help/qa-sessions.md.txt new file mode 100644 index 000000000..0b0ec87d7 --- /dev/null +++ b/_sources/getting_help/qa-sessions.md.txt @@ -0,0 +1,76 @@ +# Open Question & Answer Sessions for All Users + +__Every last Thursday in a month from 13:00 till 14:30.__ + +## Learn new tricks and ask & discuss questions + +Meet the HPC staff, discuss problems and project ideas, give feedback or +suggestions on how to improve services, and get advice for your +projects. + +Join us on +[Zoom](https://uit.zoom.us/j/63238817048?pwd=Z0NsSkxJb0JqeEZ2NENVZE5LY0RkQT09), +get yourself a coffee or tea and have a chat. It doesn't matter from which +institute or university you are, you are **welcome to join at any time**. + +Sometimes we will have short 15 min presentation about various topics like +"Useful tools and services". But your questions can always be about any topic +you want. + +We can talk about: +- Questions regarding compute resources, data storage and management. +- Help with programming and software management. +- Help with parallelization, improving performance and/or scaling etc. +- Help with project organization and data management. +- Anything else. + +If you think you might have a challenging question or topics for us, +you can also send them to us before, so we can come prepared and +help you better. 
If you have general or specific questions about +the event, please write to `jorn.dietze@uit.no`. + + + +## Next events + +[Join us on Zoom](https://uit.zoom.us/j/63238817048?pwd=Z0NsSkxJb0JqeEZ2NENVZE5LY0RkQT09) + +- 2024-01-25, 13:00 - 14:30 +- 2024-02-29, 13:00 - 14:30 + + +## Past events + +[Questions and answers of previous session](https://hackmd.io/@hpc/q-a) + +- 2023-11-30, 13:00 - 14:30 +- 2023-10-26, 13:00 - 14:30, "Containers on NRIS HPC machines" +- 2023-09-28, 13:00 - 14:30, "Common problems and solutions with __Conda__ (Anaconda/Miniconda)" + +- 2023-08-31, 13:00 - 14:30 +- 2023-06-29, 13:00 - 14:30 +- 2023-05-25, 13:00 - 14:30 +- 2023-04-27, 13:00 - 14:30 +- 2023-03-30, 13:00 - 14:30 +- 2023-02-23, 13:00 - 14:30 +- 2023-01-26, 13:00 - 14:30 +- 2022-11-09, 13:00 - 14:30 +- 2022-09-29, 13:00 - 14:30, "Focus on bioinformatics software & databases" +- 2022-08-30, 13:00 - 15:00, "Running jobs on GPUs" +- 2022-06-14, 13:00 - 15:00, "NIRDToolkit for data analysis and visualisation" +- 2022-05-10, 13:00 - 15:00, ["Useful and good to know stuff for new users" Slides](https://docs.google.com/presentation/d/1pgueQ6w8sFW4-1y3iRwiWgkypUhrlLfhEPTFSY2_Lw8/edit?usp=sharing) +- 2022-03-08, 13:00 - 15:00, ["State of Gaussian and VASP" Slides](https://docs.google.com/presentation/d/13vm5-Yx_VTfg02SAgrzki9rgSlUTDW5cERVSIdKCrfc/edit?usp=sharing) +- 2022-01-26, 13:00 - 15:00, ["The new European HPC cluster LUMI" Slides](https://docs.google.com/presentation/d/1mSl6q6dvi12ouY0Rt5eephgFR-G_4WzB/edit?usp=sharing&ouid=109172959781988137007&rtpof=true&sd=true) +- 2021-12-08, 13:00 - 15:00, ["Common but cryptic errors" Slides](https://docs.google.com/presentation/d/1U-GaHeyLOFM0HUObrYQzJELpS4UL10hOS5AGFMeEzVU/edit?usp=sharing) +- 2021-10-13, 13:00 - 15:00 +- 2021-09-09, 13:00 - 15:00, ["Our More and Less Known Resources" Slides](https://docs.google.com/presentation/d/1kEmxUYJJa2b6jKgiJYvdzFopyB3owkQkK-uJTolXny0/edit?usp=sharing) +- 2021-02-03, 13:00 - 15:00, ["How To Installation" Slides](https://docs.google.com/presentation/d/1fOzq_ob19TFIZ0lERSPw7oXx6irVVDwB2vmHOrz2AKA/edit?usp=sharing) +- 2020-10-13, 13:00 - 15:00, ["Helpful tools & Services" Slides](https://docs.google.com/presentation/d/1HKC5-G41lwVxAMjHU_UOTWKsRO3nD7B8uh9IVQRNtsk/edit?usp=sharing) +- 2020-06-10, 13:00 - 15:00 +- 2020-04-23, 13:00 - 15:00 + + +## Similar events which serve as inspiration + +- +- diff --git a/_sources/getting_help/support_line.md.txt b/_sources/getting_help/support_line.md.txt new file mode 100644 index 000000000..363a55ecc --- /dev/null +++ b/_sources/getting_help/support_line.md.txt @@ -0,0 +1,73 @@ +(support-line)= + +# Getting help + +We very much appreciate that you check our [status system](https://opslog.sigma2.no) before contacting us about problems with the infrastructure. You can learn more about this system on the page {ref}`using-opslog`. 
+ +```{admonition} Administrative support +{octicon}`briefcase;2em;sd-text-info` +Contact **[contact@sigma2.no](mailto:contact@sigma2.no)** for support requests concerning: + +**NOTE: Use this e-mail if you need to add / delete users to a group or change allocation details such as quotas.** + +- resource applications +- projects +- compute and storage allocations +- user accounts +- group memberships +``` + +```{admonition} Technical support +{octicon}`tools;2em;sd-text-info` +Contact **[support@nris.no](mailto:support@nris.no)** for all other support requests: + +**NOTE: Use this e-mail if you need to extend your job's walltime.** + +- technical issues +- software installation requests +- questions about job scripts +- technical questions about storage and compute +- technical questions about the services and toolkits +``` + +Behind these addresses are issue trackers (ticket system) and you will get an +automatic reply first and can expect response from our staff within hours +during normal work-hours, or at the latest the following working day (except +holidays). + +When replying to email from the support line **please do not change the email subject**. + +However, for a new problem, **do not reply to emails with unrelated subjects** (and +ticket numbers) as this can be confusing for us. + + +## This information will help us to answer faster + +We get many requests for help which are too vague to give a useful response. +So, when sending us a question, please answer these questions and you’ll get +the fastest useful response: + +- **Has it ever worked?** (If so, what has changed?) +- **What are you trying to accomplish?** (Your ultimate goal, not current technical obstacle.) +- **What did you do?** (Be specific enough to be reproducible - copy and paste exact commands you run, scripts, inputs, output messages, etc.) + +If you don’t know something, it’s OK, just do your best and we’ll help from +there! You can also chat with us to brainstorm about issues in general. + +This summary is copied and adapted from the [Aalto Science-IT help +pages](https://scicomp.aalto.fi/triton/help/#give-enough-information). Please +also look at the page for a more detailed guide on +{ref}`good-support-requests`. + + +## What can you expect from the support line + +You can expect efficiency and a friendly and competent staff with decades +of combined experience on supercomputers. + +However, the load on the support staff can sometimes be significant and we may +need to prioritize issues. In busy times we may not be able to help you with +questions that would be better addressed to your local IT departments. + +Domain-specific questions may be beyond our expertise. However, we maintain a +few application liaisons for very heavility used programs and domains. diff --git a/_sources/getting_started/R.md.txt b/_sources/getting_started/R.md.txt new file mode 100644 index 000000000..e147a03dd --- /dev/null +++ b/_sources/getting_started/R.md.txt @@ -0,0 +1,196 @@ +(first-r-calculation)= + +# First R calculation + +Our goal on this page is to get an R calculation to run +on a compute node, both as serial and parallel calculation. + +```{contents} Table of Contents +``` + + +## Simple example to get started + +We will start with a very simple R script (`simple.R`): +```r +print("hello from the R script!") +``` + +We can launch it on {ref}`saga` with the following job script (`simple.sh`). 
+**Before submitting**, adjust at least the line with `--account` to match your +allocation: +```{code-block} bash +--- +emphasize-lines: 3 +--- +#!/bin/bash + +#SBATCH --account=nn9997k +#SBATCH --job-name=example +#SBATCH --partition=normal +#SBATCH --mem=1G +#SBATCH --ntasks=1 +#SBATCH --time=00:02:00 + +# it is good to have the following lines in any bash script +set -o errexit # make bash exit on any error +set -o nounset # treat unset variables as errors + +module restore +module load R/4.2.1-foss-2022a + +Rscript simple.R > simple.Rout +``` + +Submit the example job script with: +```console +$ sbatch simple.sh +``` + + +## Longer example + +Here is a longer example that takes ca. 25 seconds (`sequential.R`): +```r +library(foreach) + + +# this function approximates pi by throwing random points into a square +# it is used here to demonstrate a function that takes a bit of time +approximate_pi <- function() { + # number of points to use + n <- 2000000 + + # generate n random points in the square + x <- runif(n, -1.0, 1.0) + y <- runif(n, -1.0, 1.0) + + # count the number of points that are inside the circle + n_in <- sum(x^2 + y^2 < 1.0) + + 4 * n_in / n +} + + +foreach (i=1:100, .combine=c) %do% { + approximate_pi() +} +``` + +And the corresponding run script (`sequential.sh`). +**Before submitting**, adjust at least the line with `--account` to match your +allocation: +```{code-block} bash +--- +emphasize-lines: 3 +--- +#!/bin/bash + +#SBATCH --account=nn9997k +#SBATCH --job-name=example +#SBATCH --partition=normal +#SBATCH --mem=2G +#SBATCH --ntasks=1 +#SBATCH --time=00:02:00 + +# it is good to have the following lines in any bash script +set -o errexit # make bash exit on any error +set -o nounset # treat unset variables as errors + +module restore +module load R/4.2.1-foss-2022a + +Rscript sequential.R > sequential.Rout +``` + + +## Parallel job script example + +```{warning} +We have tested this example and it works but the scaling/speed-up is pretty +poor and not worth it in this example. If you know the reason, can you please +suggest a change? + +When running jobs in parallel, please always verify that it actually scales and +that the run time goes down as you use more cores. + +When testing this example on the desktop, the speed-up was much better. + +Often, a good alternative to run R code in parallel is to launch many +sequential R jobs at the same time, each doing its own thing. 
+``` + +Let's start with the run script (`parallel.sh`), where we ask for 20 cores: +```{code-block} bash +--- +emphasize-lines: 7 +--- +#!/bin/bash + +#SBATCH --account=nn9997k +#SBATCH --job-name=example +#SBATCH --partition=normal +#SBATCH --mem=2G +#SBATCH --ntasks=20 +#SBATCH --time=00:02:00 + +# it is good to have the following lines in any bash script +set -o errexit # make bash exit on any error +set -o nounset # treat unset variables as errors + +module restore +module load R/4.2.1-foss-2022a + +Rscript parallel.R > parallel.Rout +``` + +Notice how in the R script (`parallel.R`) we indicate to use these 20 cores +and how we changed `%do%` to `%dopar%`: +```{code-block} r +--- +emphasize-lines: 23, 25 +--- +library(parallel) +library(foreach) +library(doParallel) + + +# this function approximates pi by throwing random points into a square +# it is used here to demonstrate a function that takes a bit of time +approximate_pi <- function() { + # number of points to use + n <- 2000000 + + # generate n random points in the square + x <- runif(n, -1.0, 1.0) + y <- runif(n, -1.0, 1.0) + + # count the number of points that are inside the circle + n_in <- sum(x^2 + y^2 < 1.0) + + 4 * n_in / n +} + + +registerDoParallel(20) + +foreach (i=1:100, .combine=c) %dopar% { + approximate_pi() +} +``` + + +## Which of the many R modules to load? + +The short answer is to get an overview about available modules first: +```console +$ module spider R +$ module spider bioconductor +``` + +We have more information here: {ref}`installing-r-libraries-modules` + + +## Installing R libraries + +We have a separate page about {ref}`installing-r-libraries`. diff --git a/_sources/getting_started/applying_account.md.txt b/_sources/getting_started/applying_account.md.txt new file mode 100644 index 000000000..3db8064fe --- /dev/null +++ b/_sources/getting_started/applying_account.md.txt @@ -0,0 +1,13 @@ +(applying-account)= + +# How do I get an account? + +To apply for an account to be able to log in and use our compute and storage +resources, go to the +[user application form](https://www.metacenter.no/user/application). + +For more a step-by-step guide, follow the documentation from +[here](https://www.sigma2.no/how-apply-user-account). + +After you obtain an account, you may need to apply for or connect to an existing +{ref}`compute or storage allocation `. diff --git a/_sources/getting_started/applying_resources.md.txt b/_sources/getting_started/applying_resources.md.txt new file mode 100644 index 000000000..4b6195c4d --- /dev/null +++ b/_sources/getting_started/applying_resources.md.txt @@ -0,0 +1,28 @@ +(applying-computing-storage)= + +# Applying for computing and storage + +Applications for computing and storage resources should be submitted through NRIS Administration System (MAS). + + +```{warning} +Resources on the national e-infrastructure are granted on a period basis and +the **allocation periods have deadlines**. + +Please check +to find information about when the next allocation period starts and to not +miss the next application deadline. +``` + + +## If you do not have a compute (nn) or storage (ns) project + +Please refer to the information on [this page](https://www.sigma2.no/apply-e-infrastructure-resources) for more details +and follow the link that takes you to the application form. 
+ + +## If you already have a compute (nn) or storage (ns) project + +To apply for additional compute or storage resources to an **existing project +allocation**, we offer a dedicated and [simplified +procedure](https://www.sigma2.no/extra-allocation). diff --git a/_sources/getting_started/editing_files.md.txt b/_sources/getting_started/editing_files.md.txt new file mode 100644 index 000000000..1af00d070 --- /dev/null +++ b/_sources/getting_started/editing_files.md.txt @@ -0,0 +1,28 @@ +# Editing files + +We highly recommend using the text mode when using Emacs on Fram, Saga +or NIRD, e.g., + +```sh +$ emacs -nw +``` + +If you wish to use the graphical user interface, then we recommend to run +Emacs on your local computer and open the file remotely. For this you do not +need to copy the files from the cluster to your local computer, you simply open them +as you would open a remote web page in your browser. The procedure uses an +Emacs package called TRAMP (Transparent Remote (file) Access, Multiple +Protocol). See their web page https://www.gnu.org/software/tramp/ for more details. + +Procedure for Fram (Saga and NIRD will be similar): + * Open emacs on your laptop/machine + * C-x C-f (Ctrl+x, then Ctrl+f (or Mac equivalent), then you will get a “find file” prompt) + * ```/ssh:username@fram.sigma2.no:pathname``` (**note the leading slash**) + * You may get the following message “Offending key for IP in …. Are you sure you want to continue connecting (yes/no)?“ type yes and enter + * Depending on the network state you might see the message “Waiting for prompt from remote shell” for few seconds to a minute, before the connection opens. + +For example if your user name is “newuser” and if you want to open a file called “myfile.txt”, located in your home area on Fram, you would use the following: + + * ```/ssh:newuser@fram.sigma2.no:/cluster/home/newuser/myfile.txt``` + +If you specify a directory name, you can browse the remote file system until you have found the file you wish to open. diff --git a/_sources/getting_started/fingerprints.md.txt b/_sources/getting_started/fingerprints.md.txt new file mode 100644 index 000000000..3ef84c98b --- /dev/null +++ b/_sources/getting_started/fingerprints.md.txt @@ -0,0 +1,78 @@ +--- +orphan: true +--- + +(ssh-fingerprints)= + +# Key fingerprints of our systems + +The following overview displays the different keys (depending on which key type +you use to connect), both as `MD5` and `SHA256`, for all systems. + +```{warning} +If the fingerprints do not match what is presented to you upon first-time +login, please {ref}`contact us ` immediately. 
+``` + + +## fram.sigma2.no + +- ED25519: + - `MD5:5f:15:92:e1:22:74:69:68:6c:1c:27:f5:a5:b1:76:3f` + - `SHA256:hLb9eJdGcTT2PHoWamc/+06LlF+vgcnyfvFEqh60cT8` +- RSA: + - `MD5:05:d0:0e:fa:cb:72:c0:03:cb:8f:d0:b4:dc:09:04:4e` + - `SHA256:Cq5Vt82wQAAhMu4q05L3gmB4QeW1POpNNKgTIP8A2f4` + +## saga.sigma2.no + +- ED25519: + - `MD5:2b:c2:ce:c0:f1:b8:0a:95:ec:db:b4:f3:fb:ee:e9:70` + - `SHA256:YOkZ1uudXrFmaigdnpZ64z497ZccNhdZe/abFkDXOH8` +- ECDSA: + - `MD5:13:4e:ae:66:89:0d:24:27:b8:15:87:24:31:ed:32:af` + - `SHA256:qirKlTjO9QSXuCAiuDQeDPqq+jorMFarCW+0qhpaAEA` +- RSA: + - `MD5:61:e4:49:4b:4e:00:14:2d:9d:b9:ac:99:c2:16:e6:ab` + - `SHA256:mY+Po9LKAlZGzMRHUmq1abrSOohifdN7+5VUmRTW4tE` + + +## betzy.sigma2.no + +- ED25519: + - `MD5:de:75:8c:93:40:f6:32:94:b6:bd:47:43:62:a5:1a:58` + - `SHA256:7M0HDP163k9fOUeZq3KtzLdjISE9Kq/gVygCpyrZPDQ` +- ECDSA: + - `MD5:37:da:0d:cd:fe:66:47:71:3f:08:59:d7:bb:76:ec:cc` + - `SHA256:l0adSAGOHM4CNOqxvBNh5Laf+PlDSXQiargVoG/cue4` +- RSA: + - `MD5:f6:a9:4e:a7:f6:1e:10:5c:01:e7:44:ac:34:4d:4b:b4` + - `SHA256:wSirru+JTpcAZKQe/u6jLRj3kVCccNNUWU2PxzgbebM` + + +## login.nird.sigma2.no + +- ED25519: + - `MD5:c4:23:90:52:eb:d9:eb:e5:41:0d:ef:4d:ac:78:2c:db` + - `SHA256:A8gq7aiQHoK4QzRi1hMpLNbo40ZTZxlGfDCpDWZy/ZQ` +- ECDSA: + - `MD5:78:ea:cb:f7:f0:6b:02:55:17:0c:b1:5f:de:2a:3e:78` + - `SHA256:lawnWA5fHTX64XB8OU0WUrQu/dCtFgCfvMC+i/zBCrI` +- RSA: + - `MD5:54:c9:b7:71:22:9e:bd:e9:ad:5c:18:fe:7b:41:e7:01` + - `SHA256:xpJZ+XiY4oy3df/R5/LN8i30Z5/EiYo6YSCUQdKkQ/U` + + +## Display all fingerprints for a certain server + +To display all fingerprints for a certain server, you can use the following +command on your local machine (Linux or macOS): + +```console +$ ssh-keyscan login.nird.sigma2.no | ssh-keygen -l -f - -E md5 +$ ssh-keyscan login.nird.sigma2.no | ssh-keygen -l -f - -E sha256 +``` + +# Common SSH errors when keys have changed + +Please, take a look at this {ref}`page ` diff --git a/_sources/getting_started/getting_started.md.txt b/_sources/getting_started/getting_started.md.txt new file mode 100644 index 000000000..c1963cec2 --- /dev/null +++ b/_sources/getting_started/getting_started.md.txt @@ -0,0 +1,151 @@ +(getting-started)= + +# Getting started + +This page is meant to get you started on our resources and briefly list the +essentials. In the menu on the left you will then find more in-depth +documentation on these topics. + + +## Getting access + +To get access you need two things and possibly you have these already: +- {ref}`Apply for a user account ` +- {ref}`Compute/storage resource allocation ` + + +## Logging in + +Logging into the machines involves the use of {ref}`Secure Shell (SSH) ` protocol, +either in a terminal shell or through a graphical tool using this protocol +under the hood. SSH login is available natively to Linux or macOS. Also on +Windows a number of good tools for this exists. + +Replace `` with your registered username and `` with the +specific machine name: +```console +$ ssh @ +``` + +The machine names are: +- `betzy.sigma2.no` - {ref}`betzy` +- `fram.sigma2.no` - {ref}`fram` +- `saga.sigma2.no` - {ref}`saga` +- `login.nird.sigma2.no` - {ref}`nird` + +The Fram and Saga systems also provide a {ref}`remote-desktop` service. + + +## Learning about the Linux command line + +Learning the basics about the Linux command line (shell) navigation and shell +scripting will make your daily work easier and more efficient. + +If you are new to command line, go through some Linux tutorials on the subject +first. 
Here are some useful pages you can look into: [shell +novice](https://swcarpentry.github.io/shell-novice/) or [Effective +shell](https://effective-shell.com). O therwise consult your local IT resources +for help. + +However, do not be afraid to contact support if you are not an expert or have +knowledge of this. We will try our best to help you. + + +## Transferring files + +We recommend to use `rsync` to transfer files. It is often faster and safer (in +terms of overwriting files) than using `scp`. On Windows we recommend to use +`rsync` through [Windows Subsystem for Linux +(WSL)](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux). Read more +on our page about {ref}`file-transfer`. + + +## Running applications + +The HPC machines provide compute nodes for executing applications. To ensure +fair access to the resources, the HPC machines run applications as _jobs_ in a +_queue system_, which schedules the tasks and process to run on compute nodes. +All systems use the Slurm queue system. + +A job is described by a _batch script_, which is a shell script (a text file) +with `SBATCH` options to specify the needed resources and commands to perform +the calculations. All batch scripts must contain _at least_ the following +two `SBATCH` options (on {ref}`saga` you also need to indicate maximum memory): + +```bash +#!/bin/bash -l + +# account name +#SBATCH --account=nnXXXXk + +# max running time in d-hh:mm:ss +# this helps the scheduler to assess priorities and tasks +#SBATCH --time=0-00:05:00 +``` + +For more details please see {ref}`running-jobs`. + + +## Information on available CPU hours and disk space + +This will list your available projects and the remaining CPU hours +(see also {ref}`projects-accounting`): +```console +$ cost +``` + +This will give you information about your disk {ref}`storage-quota`: +```console +$ dusage +``` + + +## Modules + +To keep track of the large number of different pieces of software that is +typically available on a shared HPC cluster, we use something called a software +module system. This allows us to have many different versions of compilers, +libraries, and applications available for different users at the same time +without conflicting each other. + +By default when you log in to the cluster you will get a clean environment with +nothing but standard system compilers and libraries. In order to make your +favourite software application available to you, you need to load its module +into your environment, which is done using the `module` command + +```console +$ module +``` + +Some of the more common options include: + +* `avail` - list the available modules +* `list` - list the currently loaded modules +* `load ` - load the module called `modulename` +* `unload ` - unload the module called `modulename` +* `show ` - display configuration settings for `modulename` + +For more details please see {ref}`module-scheme`. + + +## How to get the most out of your allocation + +We want to support researchers in getting the most out of the +high-performance computing services. When supporting users, we see that +these problems are very frequent: + +- **Reusing outdated scripts** from colleagues without adapting them to + optimal parameters for the cluster at hand and thus leaving few cores + idle. Please check at least how many cores there are on a particular + cluster node. +- **Requesting too much memory** which leads to longer queuing and less + resource usage. Please check {ref}`choosing-memory-settings`. 
+- **Requesting more cores than the application can effectively use** without + studying the scaling of the application. You will get charged more than + needed and others cannot run jobs. If others do this, your own jobs queue. +- **Submitting jobs to the wrong queue** and then queuing longer than + needed. Please take some time to study the different {ref}`job-types`. + +If you are unsure about these, please contact us via +support@nris.no and we will help you to use your allocated +resources more efficiently so that you get your research results faster. diff --git a/_sources/getting_started/getting_started/shell.md.txt b/_sources/getting_started/getting_started/shell.md.txt new file mode 100644 index 000000000..45b9d5077 --- /dev/null +++ b/_sources/getting_started/getting_started/shell.md.txt @@ -0,0 +1,11 @@ +--- +orphan: true +--- + +# I would like to change my shell. What do I do? + +`bash` is the only supported shell by the NRIS. You may however +change to another shell of your preference as long as you deal on your own with +problems which might arise as a result of it. For temporary use, you can +simply call the new shell from bash. For persistent change, please contact +support. diff --git a/_sources/getting_started/opslog.md.txt b/_sources/getting_started/opslog.md.txt new file mode 100644 index 000000000..bea76535e --- /dev/null +++ b/_sources/getting_started/opslog.md.txt @@ -0,0 +1,71 @@ +(using-opslog)= + +# Status and maintenance of systems + +```{contents} Table of Contents +``` + +Opslog is our status system to keep you informed about events on our infrastructure. +There you will find posts and updates regarding maintenance, incidents and general changes you should know about. +We have integrated the system with our other services to push information more actively as well - more on this below. + +## Where to find it? + +[The system is available as a web frontend](https://opslog.sigma2.no). This is the best place to get a quick overview of the overall status. + +### Interacting with groups in the web frontend +Services are divided into groups. These are collapsed by default to save space and make the site cleaner, **unless** there are ongoing events. In such case the relevant group will be expanded. + +You can expand or collapse these groups yourself by clicking on the boxes to the left of group names. + +![Expand groups](img/opslog-expand-groups.png "Expand groups") + +### Details about events + +When we have an active event, you can see it below the groups as a card. It gives a brief overview of the details such as severity, system(s) affected, timestamp, status and last update. + +![Inident card](img/opslog-incident-card.png "Inident card") + +Clicking on this card opens the full details of this specific event with a timeline of updates. + +![Incident details](img/opslog-incident-details.png "Incident details") + +### Banner on our websites + +You may also see a pop up while browsing our websites (like this one!) when there is an ongoing incident or maintenance. +Incidents will always have priority in case there is ongoing maintenance at the same time. + +![Integrated banner](img/opslog-banner.png "Integrated banner") + +These banners follows you while browsing the same site. Click the X in the top right corner of the banner to close this specific notification. If something new is posted, it will appear again. +Clicking `"View latest updates"` will take you to the details of this event on the status page itself. 
+ +### Latest events in MOTD in the terminal + +We are integrating with Message Of The Day (MOTD) on our systems so that you can get a quick update on what is going on when logging in via the terminal. + +Upcoming maintenance on the specific system you are logging in to will also be displayed. + +![Message of the day integration](img/opslog-motd.png "Message of the day integration") + +## Subscribing to get active notifications + +You are in control when it comes to active notifications from this system. +We offer a broad variety of platforms where you can subscribe to receive notifications when something is posted or updated. + +Subscribing is quickly done by [going to the frontend ](https://opslog.sigma2.no) and clicking "Subscribe to updates". From here you can choose the platform(s) you want notifications on, and multiselect the applicable system(s) you want notifications for. + +In the example below we are subscribing to the HPC machine Saga only on e-mail. If you want notifications for _everything_, simply select "All" instead of "Select services". + +![Subscribe to notifications](img/opslog-subscribe.png "Subscribe to notifications") + +There will be a link in the footer of every e-mail to unsubscribe or change preferences. Other platforms (like Slack as an example) require you to remove the integration and set it up from scratch. + +### Types of notifications + +When subscribing, you will be notified about: +* New incidents +* Upcoming maintenance (14 days before the start date) +* Maintenance is started +* Maintenance is completed +* Any updates added to incidents and maintenances. diff --git a/_sources/getting_started/remote-desktop.md.txt b/_sources/getting_started/remote-desktop.md.txt new file mode 100644 index 000000000..3f1580a92 --- /dev/null +++ b/_sources/getting_started/remote-desktop.md.txt @@ -0,0 +1,78 @@ +(remote-desktop)= + +# Remote Desktop + +```{contents} Table of Contents +``` + +## Introduction + +The remote desktop service makes it possible to run graphical applications on all NRIS machines with reasonable performance over the network. Currently (Apr. 2023) the login-system has no hardware acceleration so running advanced 3D rendering applications will probably not have adequate performance. + + +## [X2Go](https://wiki.x2go.org/doku.php/start) + +The supported solution on NRIS machinery is X2Go - . X2Go is an application that enables you to do remote visualization on external machines. It works as a Remote Desktop Protocol on top of NX/X11. X2Go Client requires a local X11 server to display the remote sessions. +The server you use depends on your local operating system, as follows: + +* Windows systems: an X11 server is provided with X2Go client. +* Linux systems: the client component of X2Go uses the local Xorg server. +* Mac OS X systems: you must install the XQuartz X11 server as an extra component. + + +### How to use the service on NRIS machines + +X2Go requires both a server application to run on the remote server and a client application to run on the user machine. **X2Go Server** is installed on all login nodes on all NRIS machines, thus standard login procedure should work perfectly fine. + +In order to use X2Go, users have to install **X2Go Client** on their computer(s). The way to do this is to download the X2Go client valid for your operating platform from and follow the instructions for installing on your machine. 
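+
+On Linux, the client is often also available from the distribution's package
+repositories; for example (package names may vary between distributions, so
+treat this only as a hint):
+```console
+$ sudo apt install x2goclient    # Debian/Ubuntu
+$ sudo dnf install x2goclient    # Fedora
+```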
+ +### How to configure the X2Go client for NRIS machines + +#### Create a session: + +![First display of the X2Go client](X2Go_First.png) + +Give this session a suitable name, for instance FramDesktop. + +**In the ``Session`` tab:** + +![X2Go client session setup window](X2Go_SessionSetup.png) + +*Server section* + +* Host: + * For Fram: fram.sigma2.no + * For Saga: saga.sigma2.no + * For Betzy: betzy.sigma2.no +* Login: (aka username) - your NRIS system user name +* SSH port: 22 +* Use RSA/DSA key for ssh connection: + * For Mac and Linux: Leave blank if you want to use your ssh-keys from standard location. + * For windows: Create {ref}`SSH keys ` on your local machine and point to the public key. +* Try auto login: `Check` (see picture below). Especially advised for machines running Mac and Linux OS, which has "hidden path" standard for ssh-key folder. + +![X2Go client session setup window emphasizing Auto SSH log in](X2Go_SessionSetupSSH.png) + +*Proxy server section* + +* Leave unchecked + +*Session type section* + +* Choose ``XFCE``in the drop down menu of you want a desktop type setup. +* Choose ``Single application``and add ``usr/bin/xterm``in the command window if you just want an xterm application running. + +Then you are basically good to go. Log in to the machine in question by clicking on one of the predefined sessions you have just made, start a terminal window and start working. + +![Starting an X2Go session from a session icon](X2Go_SessionStart.png) + +To start an X2Go session, you double click on your session icon in the top right corner of the X2Go client window (see above). + + +### Troubleshooting + +In the past we have experienced that a manual kill of the `x2goagent` processes on the server side have created challenges for users who wants to restart sessions. This seems to be handled intuitively well inside the X2Go client now. However, if there are issues related to usage of X2Go, please notify firstline support about this so we are able to update documentation accordingly. + +> Acknowledgement: +> +> *Much of the content of and inspiration for this page is borrowed from the X2Go documentation provided by NTNU for the Hunt Cloud service: * diff --git a/_sources/getting_started/security-policy.md.txt b/_sources/getting_started/security-policy.md.txt new file mode 100644 index 000000000..a87055fd8 --- /dev/null +++ b/_sources/getting_started/security-policy.md.txt @@ -0,0 +1,26 @@ +--- +orphan: true +--- + +# Security policy for Sigma2 infrastructure + + +## Privileges + +- Users shall not be able to escalate their privilege by any means. Any patch + or mechanism to accomplish this shall be rolled out without undue delay. + + +## Data access and file permissions + +- Users data are personal and private +- Users shall not be able to access other users home directories or scratch areas +- Project data is private to the project and is controlled by the project lead +- The PL has sole discretion over access to the project, and thus to its project area. + + +## Network access for users + +- Users shall enter the system via approved login nodes +- Compute nodes shall not have direct access to the public internet +- In cases where compute nodes need internet access, e.g. 
license servers, this is to be documented and traceable diff --git a/_sources/getting_started/ssh.md.txt b/_sources/getting_started/ssh.md.txt new file mode 100644 index 000000000..717d020f8 --- /dev/null +++ b/_sources/getting_started/ssh.md.txt @@ -0,0 +1,401 @@ +(ssh)= + +# SSH + +This page assumes that the reader: +- is working on a Linux machine, a macOS or a Windows machine with OpenSSH + installed (default on recent Windows 10+ versions) +- check in the terminal with `ssh -V` that you indeed have OpenSSH available +- has an account on the server of interest + +This page is adapted from the very nice documentation written by our +colleagues at Aalto University (Finland): +. + + +## What is SSH + +SSH is an abbreviation for *secure shell protocol*. It is a protocol to +communicate data between two computers over an encrypted connection. When you +log into one of the clusters with `ssh` and read and edit files and type +commands or copy files using `rsync`, then data is transmitted via this +encrypted connection (see also our guide about {ref}`file-transfer`). + + +## Connecting to a server + +When you type `ssh myusername@saga.sigma2.no`, then `myusername` is your +username on the remote server and `saga.sigma2.no` is the server/cluster you +are connecting to. + +If `myusername` is the same on your computer and the remote server, you can +leave it out: +```console +$ ssh saga.sigma2.no +``` + +## Jumping through login nodes + +When already logged in, you can easily jump from one login node to another by typing `ssh login-X` (for Fram, Saga and Betzy) or `ssh loginX` (for NIRD). Please, replace "X" with the number of the login node you want to access. + +Also, the same is valid for when we want to access a specific compute node we are running our jobs. However, it is only possible to access compute nodes that you currently have jobs running. + + +{ref}`Further below ` we will show how we can configure SSH so that we +don't have to type the same lengthy command every time + + +## First-time login + +When you ssh to a remote server for the very first time, you will be prompted +to affirm that the remote server is indeed the one you expected to connect to: +``` +The authenticity of host 'saga.sigma2.no (2001:700:4a01:10::37)' can't be established. +ED25519 key fingerprint is SHA256:ryqAxpKDjNLLa5VeUPclQRaZBOIjd2HFgufUEnn4Jrw. +This key is not known by any other names. +Are you sure you want to continue connecting (yes/no/[fingerprint])? +``` + +This question is to prevent some other server impersonating the remote +resource and subsequently impersonating you to the real resource. This is not +very likely but it is possible, therefore it's a good idea to double check the +fingerprint and compare it with published {ref}`ssh-fingerprints`. + +If the fingerprint matches, you can confirm by typing `yes` and press `Enter`. +Note that the trailing "." is not part of the fingerprint. + +```{warning} +If the fingerprints do not match, please {ref}`contact us ` +immediately. +``` + +(ssh-config)= + +## Configuring SSH for less typing + +Remembering the full settings list for the server you are working on each time +you log in can be tedious: the username is the same every time, the server is +the same every time, ... **There is a better way!** + +A configuration file allows you to store your preferred settings and map them +to much simpler login commands. + +Create or edit (if it already exists) the file `~/.ssh/config`. 
+Here is an example entry for one of our clusters: +``` +Host saga + User myusername + Hostname saga.sigma2.no +``` + +Now instead of: +```console +$ ssh myusername@saga.sigma2.no +``` + +I can type: +```console +$ ssh saga +``` + +Also `rsync` and `scp` and any other tool that uses `ssh` under the hood will +understand these shortcuts. There is a lot more that can be configured. Search +the web for more examples if you are interested. + + +## Using SSH keys instead of passwords + +It's boring to type the password every time, especially if you regularly have +multiple sessions open simultaneously (there exist also other tools to help +with that). The tedium of typing it 20-50 times each day could motivate some +to make the password very short or very memorable, thus reducing security. +See also [the relevant XKCD comic](https://xkcd.com/936/). + +**There is a better way**: using SSH key pairs. This is not only less tedious +(you will only have to type a passphrase typically once per day), but also +more secure (we will explain why). + +An SSH key pair consists of a private key (which you never share with anybody) +and a public key (which you can share with others without problems). Others +can then encrypt messages to you using the public key, and you can decrypt them +using your private key. Others can only encrypt. Only you can decrypt. + +The private key is a file on your computer. Also the public key is a different +file on your computer. Anybody who has access to your private key can read +data between you and remote servers and impersonate you to the remote servers. + +One way to visualize this is to image the public key to be a box into which +somebody can put a secret message. Anybody can put something into a box and +close the box and send the box to you, but only you have the key to open it +(private key). + +To make sure that your private key (file) does not fall into the wrong hands, +it is custom and **recommended to encrypt it with a passphrase**. Having the +private key "encrypted" with an empty passphrase is possible, but it is the +equivalent of leaving your house key under the door mat or the equivalent of +having a bank card without any pin. + +**Why are SSH key pairs more secure than using a password?** There is still the +passphrase to unlock the private key, so why is this easier and better? We +will show later how it is easier, but it is more secure since the passphrase is +never communicated to the remote server: it stays on your computer. When the +remote server is authenticating you, it encrypts a large number and sends it +encrypted to you and asks you to decrypt it and send the decrypted number back +and then compares the two. If they match, the remote server knows that you are +you and from there on can trust you for the duration of the session. No +password or passphrase needs to leave your computer over the network. + + +### Generating a new SSH key pair + +While there are many options for the key generation program ``ssh-keygen``, here are the main ones: +- `-t`: The encryption type used to make the unique key pair. +- `-b`: The number of key bits. +- `-f`: Filename of key. +- `-C`: Comment on what the key is for. +- `-a`: Number of key derivation function rounds. Default is 16. The higher, + the longer it takes to verify the passphrase but also the better + protection against brute-force password cracking. 
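+
+For illustration only, the options above can be combined in a single command.
+The key file name and the comment here are placeholders (the same file name is
+used in the `ssh-copy-id` example below); note that `-b` is ignored for
+Ed25519 keys:
+```console
+$ ssh-keygen -t ed25519 -a 100 -C "my laptop, NRIS clusters" -f ~/.ssh/id_sigma2
+```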
+ +We recommend the following command to create a key pair: +```console +$ ssh-keygen -t ed25519 -a 100 +``` + +After running this command in the terminal, you will be prompted to enter a +passphrase. **Make sure to enter a passphrase to encrypt the key!** A private +key with an empty passphrase can be used by anybody who gets access to your +private key file. Never share it with anybody! + +Upon confirming the password, you will be presented with the key fingerprint +as both a SHA256 hex string and as randomart image. Your new key-pair +should be found in the hidden `~/.ssh` directory. If you ran the command +above, you will find there `id_ed25519` (private key, never share it) and +`id_ed25519.pub` (public key, no problem to share). + + +### Copy public key to server + +In order to use your key pair to log in to the remote server, you first need to +securely copy the desired *public key* to the machine with ``ssh-copy-id``. +The script will also add the key to the ``~/.ssh/authorized_keys`` file on the +server. You will be prompted to enter your *password* (not the *passphrase* +associated with the private key) to initiate the secure copy of the file. + +To copy and install the public key to the server, for example Saga, we use: +```console +$ ssh-copy-id -i ~/.ssh/id_sigma2 myusername@saga.sigma2.no +``` + +This command creates the directory `~/.ssh` on the target machine +(`saga.sigma2.no` in the example above) if it did not exist yet. When created +by OpenSSH (e.g. through `ssh-copy-id`), the directory gets the required +strict permissions `0700`, which may be different from the shell's +file-creation mask returned by `umask -S`. You can check the permissions by +running `ls -ld ~/.ssh` on Saga, and change the permissions to `0700` with the +command `chmod 0700 ~/.ssh`. + +Once the public key has been copied to the remote server, you can log in using +the SSH key pair. Try it. **It should now ask you for your passphrase and not +for the password.** + +This approach works not only for our clusters but also for services like +GitHub or GitLab. But let's focus here on clusters. + +````{admonition} Help! It still asks for a password! + +In this case, debug with: +```console +$ ssh -v myusername@saga.sigma2.no +``` + +Instead of `-v` you can also try `-vv` or `-vvv` for more verbose output. +Study the output and try to figure out what goes wrong. Does it try the key +pair you created? +```` + + +### How many key pairs should I create? + +We recommend creating key pair per hardware device. Not a key pair per +remote server. + +In other words, if you have a laptop and a desktop and want to authenticate to +4 different servers, create a key pair on the laptop and another one on the +desktop, and upload both public keys to all 4 remote servers. + +The motivation to have one key pair per hardware device is that if you lose +your hardware device (e.g. laptop) or it gets stolen, you know which key to +revoke access from. + + +### Using the OpenSSH authentication agent + +Further up we motivated that we don't want to type the password every time +many times a day. Now we instead need to type the private key passphrase every +time, so it feels like this was not a win. But again there is a better way: To +avoid having to type the decryption passphrase, the *private key* needs to be +added to the ``ssh-agent`` with the command: + +On Linux and Windows: +```console +$ ssh-add +``` + +On Windows, remember to have the service "OpenSSH Authentication Agent" enabled and starting automatically. 
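+
+One way to do this (a hint only, adapt to your Windows version) is from an
+elevated PowerShell prompt:
+```
+Set-Service -Name ssh-agent -StartupType Automatic
+Start-Service ssh-agent
+```
+With the agent service running, the `ssh-add` command above works the same way
+on Windows as on Linux.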
+ + +On macOS, use this instead: +```console +$ ssh-add --apple-use-keychain +``` + +If you are unsure whether the `ssh-agent` process is running on your machine, +`ps -C ssh-agent` will tell you if there is. To start a new agent, use: +```console +$ eval $(ssh-agent) +``` + +Once the password is added, you can ssh into the remote server as normal but +will immediately be connected without any further prompts. + +In other words, we use `ssh-add` typically once per day but then can `ssh` and +`rsync` as often as we like without re-authenticating. + + +## SSH client on Windows + +In Windows 10 and newer you can now get a fully functional Linux terminal by +[installing WSL](https://docs.microsoft.com/en-us/windows/wsl/install-win10). + +Yet another alternative is to use the [Windows SSH Client](https://learn.microsoft.com/en-us/windows/terminal/tutorials/ssh) directly. + + +(x11-forwarding)= + +## X11 forwarding + +X11 forwarding is a method to send the graphical screen output from the remote +server to your local computer. + +X11 forwarding should be used with caution due to security implications. +Please note that if someone can read your X authorization database, that +person would be able to access the local X11 display through the forwarded +connection. By default, your X authority database is stored in the +`~/.Xauthority` file. This file contains records with authorization +information used in connecting to the X server. + +We suggest switching it on *only* when needed, with the use of options (`-X` +or `-Y`) passed to the `ssh` command. Whenever possible, use `-X` option to +mark remote X11 clients untrusted. + +In some cases `-X` will fail to work and either the use of `-Y` option or +setting `ForwardX11Trusted` in your SSH configuration file to "yes" is required. In +this case remote X11 clients will have full access to the original X11 display. + +Alternatively, if X11 forwarding is always needed, you can configure it on a +per-host basis in your `.ssh/config` file: +``` +# global settings +ForwardX11 no # disable X11 forwarding +ForwardX11Trusted no # do not trust remote X11 clients + +# per-host based settings, example for Fram +Host fram # alias, you may run "ssh fram" only + HostName fram.sigma2.no # actual hostname for Fram + User my_username # replace with your username on Fram + IdentityFile ~/.ssh/id_rsa_fram # pointer to your private SSH key + ForwardX11 yes # enable X11 forwarding + ForwardX11Trusted no # do not trust remote X11 clients +``` + + +## SSHFS + +[SSHFS](https://github.com/libfuse/sshfs) allows you to mount a remote +file system using SFTP. + +If you wish to use SSHFS, please note that `fram.sigma2.no`, +`login.fram.sigma2.no`, and addresses for other clusters are round-robin +entries. This means that every time you log in, you might end up on a +different actual login node (e.g. `login-1.fram.sigma2.no` or +`login-2.fram.sigma2.no`). This is done to balance load between login nodes. + +When you use `sshfs`, you should always specify one of the actual login nodes, +not the "front-ends", otherwise you risk getting your IP address blacklisted, +since your session is authenticated against only one actual login node and not +the other login nodes. + + +## Compressing data for poor connections + +In case of poor connection to the server, likely from a very remote area and +usually noticeable with X11 forwarding enabled, you may request data +compression by using the `-C` option. + +Please note that the compression uses the CPU to compress and decompress all data. 
If you are on a fast network, then this option will have a negative impact on your bandwidth. + + +## SSH over breaking connections + +If you experience intermittent connectivity when on Wi-Fi, cellular, and +long-distance links and get frustrated with SSH losing connection and you +having to open a new terminal every time, have a look at [Mosh (mobile +shell)](https://mosh.org/). + +Mosh is in many instances a drop-in replacement for `ssh` (and actually +utilizes `ssh` under the hood for establishing a connection). It is +recommended to use Mosh if you connect from a laptop and want to keep the +connection when roaming on Wi-Fi or putting the laptop to sleep. + +(ssh_errors)= + +# Common SSH errors + +## WARNING: REMOTE HOST IDENTIFICATION HAS CHANGED! + +The SSH connection was working fine until one day the following message appears: + +``` +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +@ WARNING: REMOTE HOST IDENTIFICATION HAS CHANGED! @ +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +IT IS POSSIBLE THAT SOMEONE IS DOING SOMETHING NASTY! +Someone could be eavesdropping on you right now (man-in-the-middle attack)! +It is also possible that a host key has just been changed. +The fingerprint for the ED25519 key sent by the remote host is +SHA256:XXX. +Please contact your system administrator. +Add correct host key in /home/username/.ssh/known_hosts to get rid of this message. +Offending ECDSA key in /home/username/.ssh/known_hosts:13 + remove with: + ssh-keygen -f "/home/username/.ssh/known_hosts" -R fram.sigma2.no +ED25519 host key for fram.sigma2.no has changed and you have requested strict checking. +Host key verification failed. +``` + +It may be frightening at first but, generally, it just means the SSH Keys from the server have changed and this is common after a system upgrade (so, take a look at our OpsLog page to check if that was the case: https://opslog.sigma2.no/). + +The fix is already in the message itself and, in this example, we just have to locate the file `known_hosts` inside `/home/username/.ssh/` and delete line 13. + +**NOTE:**: +- The number at the end indicates where the problem lies. +- The path will be different according to the operating system you are running. Also, on Linux, having a folder starting with `.` means it is a hidden folder. + + +Also, if you are familiar with Linux terminal, running the suggested command also has the same effect: `ssh-keygen -f "/home/username/.ssh/known_hosts" -R fram.sigma2.no` + +After following the steps above, try to log in again and accept the new fingerprint (if you want to make sure it is the correct one, check this [page](https://documentation.sigma2.no/getting_started/fingerprints.html)). + +## References + +- - inspiration for this page +- - long-form guide +- - long-form guide +- +- +- +- +- +- +- - commercial site diff --git a/_sources/hpc_machines/betzy.md.txt b/_sources/hpc_machines/betzy.md.txt new file mode 100644 index 000000000..c564e3da7 --- /dev/null +++ b/_sources/hpc_machines/betzy.md.txt @@ -0,0 +1,39 @@ +--- +orphan: true +--- + +(betzy)= + +# Betzy + +Named after [Mary Ann Elizabeth (Betzy) Stephansen](https://en.wikipedia.org/wiki/Elizabeth_Stephansen), the first Norwegian woman to be awarded a doctorate degree. + + +## The most powerful supercomputer in Norway + +Betzy is a BullSequana XH2000, provided by Atos, and will give Norwegian researchers almost three times more capacity than previously, with a theoretical peak performance of 6.2 PetaFlops. 
The supercomputer is located at NTNU in Trondheim, and was set in production 24 November 2020. Betzy provides 1.5 billion CPU hours per year, and has a contracted operating time of four years (2024), with an optional one-year extension (2025). + +Betzy has also gotten a GPU partition and a preproc partition. The GPU partition consists of 4 nodes. Each node has 64 +CPU cores, 512 GiB memory and 4 x Nvidia A100 GPUs with 40GB memory connected by NVLink. The Preproc partition consists of 6 nodes +with 1 TiB RAM each. + +| Details | Betzy | +| :------------- | :------------- | +| System |BullSequana XH2000 | +| Max Floating point performance, double | 6.2 Petaflops | +| Number of compute nodes | 1338 | +| CPU type | AMD® Epyc™ 7742 2.25GHz | +| CPU cores in total | 172032 | +| CPU cores per node | 128 | +| Memory in total | 344 TiB | +| Memory per node | 256 GiB | +| GPU type | Nvidia A100 40 GB with NVLink | +| GPUs per node | 4 | +| GPUs in total | 16 | +| Operating System | Red Hat Enterprise Linux 7 | +| Total disc capacity | 7.7 PB | +| Interconnect | InfiniBand HDR 100, Dragonfly+ topology | +| Top500 June 2020 | 55th place \@ 1250 nodes, 76% efficiency| + +Almost all components are liquid cooled resulting in a very high cooling efficiency, 95% of heat being captured to water. + diff --git a/_sources/hpc_machines/fram.md.txt b/_sources/hpc_machines/fram.md.txt new file mode 100644 index 000000000..1b4a61841 --- /dev/null +++ b/_sources/hpc_machines/fram.md.txt @@ -0,0 +1,29 @@ +--- +orphan: true +--- + +(fram)= + +# Fram + +Named after the Norwegian arctic expedition ship [Fram](http://en.wikipedia.org/wiki/Fram), +the new Linux cluster hosted at [UiT Arctic University of Norway](https://uit.no/startsida) is a shared resource for research computing capable of 1.1 PFLOP/s +theoretical peak performance. + +Fram is a distributed memory system which consists of 1004 dual socket and 2 +quad socket nodes, interconnected with a high-bandwidth low-latency Infiniband +network. The interconnect network is organized in an island topology, with 9216 +cores in each island. Each standard compute node has two 16-core Intel Broadwell +chips (2.1 GHz) and 64 GiB memory. In addition, 8 larger memory nodes with 512 +GiB RAM and. +The total number of compute cores is 32256. + +| Details | Fram | +| :------------- | :------------- | +| System |Lenovo NeXtScale nx360 | +| Number of Cores | 32256 | +| Number of nodes | 1004 | +| CPU type | Intel E5-2683v4 2.1 GHz | +| Max Floating point performance, double | 1.1 Petaflop/s | +| Total memory | 72 TiB | +| Total disc capacity | 2.5 PB | diff --git a/_sources/hpc_machines/hardware_overview.md.txt b/_sources/hpc_machines/hardware_overview.md.txt new file mode 100644 index 000000000..7fa7b8873 --- /dev/null +++ b/_sources/hpc_machines/hardware_overview.md.txt @@ -0,0 +1,43 @@ +--- +orphan: true +--- + +(hardware-overview)= + +# Overview over our machines + +The current Norwegian academic HPC infrastructure consists of three systems, located in Tromsø ([Fram](/hpc_machines/fram.md)) and Trondheim ([Saga](/hpc_machines/saga.md) and [Betzy](/hpc_machines/betzy.md)). + +Each of the facilities consists of a compute resource (a number of compute nodes each with a number of processors and internal shared-memory, plus an interconnect that connects the nodes), a central storage resource that is accessible by all the nodes, and a secondary storage resource for back-up (and in few cases also for archiving). All facilities use variants of the UNIX operating system. 
+ +Additionally, {ref}`nird` provides a service platform, storage and archiving services for research data. Here you can for instance request your application to be deployed, build integrated solutions and make sure data is safe and accessible. + + +## Comparison between current hardware + +The table below compares the available systems with respect to the type of applications they are suited for. Second column indicates which job profile type is supported by the resource given in first column; **A** means GPU-accelerated jobs, **P** means parallel jobs, **S** means serial jobs, and **L** means large I/O jobs. +The **Memory** column specifies physical node memory in Gigabytes (GiB), with minimum and maximum numbers given. There are uneven distributions in the memory categories, please read specs for every given machine in order to find the exact numbers. The **Cores** column is the number of physical cores in each node. + +|Resource | Job types | Memory (min/max) | Cores/Node | +| :------------- | :------------- | :------------- | :------------- | +| [Betzy](/hpc_machines/betzy.md) | P L | 256/256 | 128 | +| [Fram](/hpc_machines/fram.md) | P L | 64/512 | 32 | +| [Saga](/hpc_machines/saga.md) | A P S L | 186/3066 | 24/64 | + + + +The resource allocation committee (RFK) manages a part of the total cores on the resources for national allocations. The national share is 9824 cores on Saga, approximately 31600 cores on Fram and 172032 cores on Betzy. + +The following considerations should be kept in mind when selecting a system to execute applications: + +* [Saga](/hpc_machines/saga.md) is a throughput system. It can be used for sequential (single-threaded) as well as parallel applications. + +* [Betzy](/hpc_machines/betzy.md) and [Fram](/hpc_machines/fram.md) are large scale parallel (distributed-memory) application systems. +Applications that use less than 128 cores (4 nodes) on Fram and less than 1024 cores (8 nodes) on Betzy for production are discouraged. +Requests for access to execute applications that use fewer cores are often rejected or moved to other systems. + +* [Saga](/hpc_machines/saga.md) and [Fram](/hpc_machines/fram.md) run CentOS Linux distributions. + +* [Betzy](/hpc_machines/betzy.md) runs Bull Super Computer Suite 5 (SCS5) based on RHEL 7™. + +* In case you need to install a specific software package, please make sure that you know for which environments the software is supported, before choosing a system. diff --git a/_sources/hpc_machines/lumi.md.txt b/_sources/hpc_machines/lumi.md.txt new file mode 100644 index 000000000..d87487d7a --- /dev/null +++ b/_sources/hpc_machines/lumi.md.txt @@ -0,0 +1,93 @@ +--- +orphan: true +--- + +(lumi)= + +# LUMI + +> The European High-Performance Computing Joint Undertaking (EuroHPC JU) is +> pooling European resources to develop top-of-the-range exascale supercomputers +> for processing big data, based on competitive European technology. +> +> At the time of installation, LUMI will be one of the world’s fastest computer +> systems, having theoretical computing power of more than 500 petaflops which +> means 500 quintillion calculations per second. LUMI’s performance will be more +> than tenfold compared to one of Europe’s fastest supercomputer today (Piz +> Daint, Switzerland). LUMI will also be one of the world’s leading platforms +> for artificial intelligence. + +```{note} +LUMI will be installed in Kajaani, Finland. Basic operations are done by the +vendor and CSC. 
Researchers will be supported by different teams: the Norwegian
+EuroHPC competence centre for porting and tuning applications on the LUMI
+hardware and programming environment, Sigma2 for handling allocation requests,
+and the LUMI User Support Team (LUST) for day-to-day issues.
+```
+
+## Overview
+LUMI is the first of a new class of pre-exascale supercomputers set up by [a
+consortium of countries in
+Europe](https://www.lumi-supercomputer.eu/lumi-consortium/). All partner
+countries will get access to an equal share of the resources, currently
+estimated to be `2%` for Norway. In addition, projects can apply for resources
+through EuroHPC JU, which controls the other `50%` of LUMI's capacity.
+
+LUMI is aimed at AI and HPC workloads that can take advantage of GPU
+accelerators.
+
+LUMI is divided into the following partitions, where the largest will be LUMI-G
+followed by LUMI-C:
+![LUMI partition
+overview](https://www.lumi-supercomputer.eu/content/uploads/2020/11/lumiSlide-1024x576.png)
+
+| Details | LUMI-G |
+|:--------|:-----|
+| Peak performance | 375 PetaFLOPs |
+| CPU type | AMD® Trento™ 64-Core |
+| GPU type | AMD® Instinct™ MI250X GPU |
+| GPU memory | 128 GB HBM2e per GPU |
+| Node configuration | 1 CPU and 4 x GPUs |
+| Number of nodes | 2560 |
+| Interconnect | Cray Slingshot 200 Gbit/s, GPUs directly connected to interconnect |
+| Storage capacity |
  • 117 PB total
    • 7 PB fast flash storage
    • 80 PB parallel filesystem
    • 30 PB object storage
| +| Expected Top500 | Top 3 | + +[Full system specification](https://www.lumi-supercomputer.eu/lumis-full-system-architecture-revealed/) + +## LUMI-G +LUMI-G is the main partition of LUMI and is based on AMD accelerators. The main +interactions with the accelerators is through +[`ROCm`](https://rocm.docs.amd.com/), `OpenMP` and +[`HIP`](https://rocm.docs.amd.com/projects/HIP/). +CUDA is *not* supported on LUMI and existing users should consider porting their +application to `HIP` through the tools offered. Starting early with the porting +effort is very important and will be supported by the EuroHPC CC team. + +Once LUMI-G is operational and pilot testing is completed, all interested users +will be able to request access. Applications that can take advantage of GPU +accelerators will see massive speed-ups on LUMI-G and NRIS will continue to aid +in porting applications to this architecture, see +{ref}`here for more information about our GPU support `. + +### Porting to accelerators +Since LUMI-G is based on AMD GPU Accelerators, not all applications will be able +to instantly take advantage of the additional compute power. AI researchers +using one of the larger frameworks, such as `TensorFlow` and `pyTorch`, will be +able to use LUMI-G directly. + +NRIS is still building documentation for taking advantage of +accelerator resources. Researchers that want to begin the transition should +evaluate `OpenACC` (see our {ref}`dev-guides_gpu`), +`OpenMP` +or directly using [accelerated +libraries](https://rocmdocs.amd.com/en/latest/ROCm_Libraries/ROCm_Libraries.html). + +If you are interested in porting your application to GPUs, or already have +ported your application and need assistance transitioning to AMD GPUs, +{ref}`please contact NRIS support `. + +## LUMI-C +LUMI-C is the compute partition of LUMI, dealing with CPU based HPC +applications. Users interested in this partition should also consider the other +clusters already in operation in Norway. diff --git a/_sources/hpc_machines/migration2metacenter.md.txt b/_sources/hpc_machines/migration2metacenter.md.txt new file mode 100644 index 000000000..71ea81001 --- /dev/null +++ b/_sources/hpc_machines/migration2metacenter.md.txt @@ -0,0 +1,27 @@ +# Migration to an NRIS HPC machine + +In general, the user environment on all NRIS machines should be as similar as possible. +Thus, for users moving internally between machines run by NRIS, they only need to focus on the following: + +* [Hardware differences](/hpc_machines/hardware_overview.md): number of CPU cores, memory size, GPU-availability, external access from compute nodes. +* Software differences: [Installed SW](/software/installed_software.md), type of file-system, access limitation rules to \$HOME, same or different file systems on \$TMP and \$HOME. +* Jobtype policy differences in Resource Management System (Slurm) on different machines. +* Storage options. + +For users either being novel to HPC in general, or having experience from other clusters - either local/private or foreign setup, basically the same rules apply - one must try to identify the critical differences in what one is used to and then adapt behaviour accordingly. + +## Major steps in migrating to an NRIS HPC machines + +* Read this documentation. +* Get an [account](/getting_started/applying_account.md) and [project](/getting_started/applying_resources.md) quota. +* Become aware of differences in disk quota, module system, job types, running jobs, how to get help, file system policies. 
+* Transfer data, scripts etc from other machines to the new machine. +* Modify scripts & routines to match differences on the new machine. +* **Verify that your jobs run efficiently and produce the same results as on other systems!** +* Be patient with user support [(support@nris.no)](mailto:support@nris.no), but don't hesitate to ask questions! + +## Read about the current machines operated by NRIS + +* [Fram](/hpc_machines/fram.md) +* [Saga](/hpc_machines/saga.md) +* [Betzy](/hpc_machines/betzy.md) diff --git a/_sources/hpc_machines/saga.md.txt b/_sources/hpc_machines/saga.md.txt new file mode 100644 index 000000000..44ae641d9 --- /dev/null +++ b/_sources/hpc_machines/saga.md.txt @@ -0,0 +1,55 @@ +--- +orphan: true +--- + +(saga)= + +# Saga + +The supercomputer is named after the goddess in norse mythology associated with wisdom. Saga is also a term for the Icelandic epic prose literature. The supercomputer, placed at NTNU in Trondheim is designed to run both sequential and parallel workloads. It was made available to users right before the start of the 2019.2 period (October 2019). + +Saga is provided by Hewlett Packard Enterprise and has a computational capacity of approximately 140 million CPU hours a year. + + +## Technical details + +### Main components + +* 200 standard compute nodes with 40 cores and 192 GiB memory each +* 120 standard compute nodes with 52 cores and 192 GiB memory each +* 28 medium memory compute nodes, with 40 cores and 384 GiB of memory each +* 8 big memory nodes, with 3 TiB and 64 cores each +* 2 huge memory nodes, with 6 TiB and 64 cores each +* 8 GPU nodes, with 4 NVIDIA P100 GPUs and 2 CPUs with 24 cores and 384 GiB memory each +* 8 GPU nodes, with 4 NVIDIA A100 GPUs and 1 CPU with 32 cores and 1 TiB memory each +* 8 login and service nodes with 256 cores in total +* 6.5 PB high metadata performance BeeGFS scratch file system + +| Details | Saga | +| :------------- | :------------- | +| System |Hewlett Packard Enterprise - Apollo 2000/6500 Gen10 | +| Number of Cores | 16064 | +| Number of nodes | 364 | +| Number of GPUs | 32 | +| CPU type | Intel Xeon-Gold 6138 2.0 GHz / 6230R 2.1 GHz (normal)
Intel Xeon-Gold 6130 2.1 GHz (bigmem)
Intel Xeon E7-4850 v4 2.1 GHz (hugemem)
Intel Xeon-Gold 6126 2.6 GHz (accel)
AMD EPYC 7542 32-Core (a100) | +| GPU type | NVIDIA P100, 16 GiB RAM (accel)
NVIDIA A100, 80 GiB RAM (a100) | +| Total max floating point performance, double | 645 Teraflop/s (CPUs) + 150 Teraflop/s (GPUs) | +| Total memory | 97.5 TiB | +| Total NVMe+SSD local disc | 89 TiB + 60 TiB | +| Total parallel filesystem capacity | 1 PB | + + +### Mapping of processors to memory specifications: + +- 6126 + - HPE 16GB (12 x 16GB per node) Dual Rank x8 DDR4-2666 CAS-19-19-19 Registered Smart Memory Kit + - Max number of memory channels: 6 +- 6130 + - HPE 32GB (48 x 32GB per node) Dual Rank x4 DDR4-2666 CAS-19-19-19 Registered Smart Memory Kit + - Max number of memory channels: 6 +- 6138 + - HPE 64GB (12 x 64GB per node) Quad Rank x4 DDR4-2666 CAS-19-19-19 Load Reduced Smart Memory Kit + - Max number of memory channels: 6 +- 6230R + - HPE 16GB (12 x 16GB per node) Dual Rank x8 DDR4-2933 CAS-21-21-21 Registered Smart Memory Kit + - Max number of memory channels: 6 diff --git a/_sources/index.md.txt b/_sources/index.md.txt new file mode 100644 index 000000000..9f1271a27 --- /dev/null +++ b/_sources/index.md.txt @@ -0,0 +1,217 @@ +**News**: +[Latest changes and events](https://opslog.sigma2.no) | +[Hardware live status](https://www.sigma2.no/hardware-status) | +{ref}`known-issues` + + +# Norwegian Research Infrastructure Services + +The Norwegian research infrastructure services (NRIS) is a collaboration +between Sigma2 and the universities of Bergen (UiB), Oslo (UiO), Tromsø (UiT +The Arctic University of Norway) and NTNU, to provide national supercomputing +and data storage services. These services are operated by NRIS and coordinated +and managed by Sigma2. + +NRIS provides valuable resources for the research communities: state of the art +{ref}`compute ` and {ref}`storage facilities `, backed +by {ref}`support ` and a guarantee that your data always stays in +Norway. Possibly more important is easy access to a {ref}`wide selection of +competences ` that can assist, realize or take your project +to the next level. {ref}`Read more ... ` + +**Compute resources**: +{ref}`Overview ` | {ref}`saga` | {ref}`fram` | {ref}`betzy` | {ref}`lumi` + +**Storage resources**: +{ref}`nird` | {ref}`research-data-archive` + +**Tools and other services**: +{ref}`nird-toolkit` | {ref}`EasyDMP ` | {ref}`Course resources ` + +````{grid} 2 +:gutter: 1 + +```{grid-item-card} Getting started + :shadow: none + :class-card: sd-border-1 + :link: getting-started + :link-type: any + + {octicon}`stopwatch;2em;sd-text-info` + + New to high-performance computing? Click here to learn how to setup an + account and run your first calculations. +``` + +```{grid-item-card} Getting help + :shadow: none + :class-card: sd-border-1 + :link: support-line + :link-type: any + + {octicon}`question;2em;sd-text-info` + + Need help using our compute or storage facilities? Check out our support + pages. +``` + +```{grid-item-card} Training + :shadow: none + :class-card: sd-border-1 + :link: training-events + :link-type: any + + {octicon}`mortar-board;2em;sd-text-info` + + Check out our information on training events so that + you get more done in less time. +``` + +```{grid-item-card} Files, storage, and quota + :shadow: none + :class-card: sd-border-1 + :link: storage-areas + :link-type: any + + {octicon}`database;2em;sd-text-info` + + For more information on our storage areas and to learn more + about quota. 
+``` + +```{grid-item-card} Job types and scripts + :shadow: none + :class-card: sd-border-1 + :link: running-jobs + :link-type: any + + {octicon}`codespaces;2em;sd-text-info` + + Here you can find example job scripts and guides on how to run + efficiently. +``` + +```{grid-item-card} Code development and tutorials + :shadow: none + :class-card: sd-border-1 + :link: code_development + :link-type: any + + {octicon}`rocket;2em;sd-text-info` + + Are you a developer of scientific software or running your own code on our + systems? Find useful resources here. +``` + +```` + +Projects are required to **acknowledge the use of the national e-infrastructure +resources** in their scientific publications. Papers, presentations and other +publications that feature work that relied on resources provided by Sigma2 +should include an +[acknowledgement following this template](https://www.sigma2.no/acknowledgements). + +```{toctree} +:hidden: +:maxdepth: 1 +:caption: Policies + +code-of-conduct.md +``` + +```{toctree} +:hidden: +:maxdepth: 1 +:caption: Getting help + +getting_help/support_line.md +getting_help/extended_support.rst +getting_help/faq.md +getting_help/how_to_write_good_support_requests.md +getting_help/qa-sessions.md +getting_help/lost_forgotten_password.md +Project Leader Support +``` + +```{toctree} +:hidden: +:maxdepth: 1 +:caption: Training + +training/events.md +training/notes_qa.md +training/videos.md +training/material.md +``` + +```{toctree} +:hidden: +:maxdepth: 1 +:caption: Getting started + +getting_started/getting_started.md +getting_started/opslog.md +getting_started/applying_account.md +getting_started/applying_resources.md +getting_started/editing_files.md +code_development/guides/vs_code/connect_to_server.md +getting_started/ssh.md +getting_started/remote-desktop.md +getting_started/R.md +``` + +```{toctree} +:hidden: +:maxdepth: 1 +:caption: Files, storage and backup + +files_storage/nird_lmd.md +files_storage/clusters.md +files_storage/quota.md +files_storage/backup.md +files_storage/file_transfer.md +files_storage/sharing_files.md +files_storage/performance.md +``` + +```{toctree} +:hidden: +:maxdepth: 1 +:caption: HPC usage + +hpc_machines/migration2metacenter.md +computing/responsible-use.md +jobs/overview.rst +computing/tuning-applications.md +``` + +```{toctree} +:hidden: +:maxdepth: 1 +:caption: Software + +software/modulescheme.md +software/installed_software.md +software/userinstallsw.rst +software/appguides.md +software/licenses.md +``` + +```{toctree} +:hidden: +:maxdepth: 1 +:caption: Services + +nird_archive/user-guide.md +nird_toolkit/overview.rst +services/easydmp-user-documentation.md +getting_help/course_resources.md +``` + +```{toctree} +:hidden: +:maxdepth: 1 +:caption: Code development and tutorials + +code_development/overview.rst +``` diff --git a/_sources/jobs/arm-perf/linpack.md.txt b/_sources/jobs/arm-perf/linpack.md.txt new file mode 100644 index 000000000..26790ffd3 --- /dev/null +++ b/_sources/jobs/arm-perf/linpack.md.txt @@ -0,0 +1,82 @@ +--- +orphan: true +--- + +# LINPACK benchmark {#linpack-benchmark} + +The [LINPACK benchmark](http://www.netlib.org/benchmark/hpl/), used by +the [TOP500 HPC list](https://www.top500.org/), solves a dense +system of linear equations and is used to measure the _floating-point +capabilities_ of CPUs. 
+ +The [LINPACK sourcecode](http://www.netlib.org/benchmark/hpl/) is +compiled using the Intel `icc` compiler as follows: + +``` +$ module load intel/2018b +$ cd hpl-2.3 +$ ./configure CC=mpicc CXX=mpicxx --prefix=/cluster/projects/nn9999k/marcink/hpl LDFLAGS=-lm +$ make +``` + +To run, the benchmark requires a configuration file +({download}`inputs/HPL.dat`) to reside in the same directory as the +`xhpl` binary. We run the benchmark on 32 cores of a +single compute node (all communication can be done through shared memory): + +``` +$ mpirun -np 32 ./xhpl +[...] +T/V N NB P Q Time Gflops +-------------------------------------------------------------------------------- +WR11C2R4 50000 192 4 8 86.82 9.5985e+02 +[...] +``` + +The program reports computing at 960 GFLOP/s. Looking at the nominal +CPU frequency on Fram (E5-2683 v4 @ 2.1GHz), the peak FLOP/s +performance is 2.1 GHz*clock/core * 16 FLOP/clock * 32 cores = 1075 +FLOP/s. During the run the cores were actually running at ~2.3-2.4GHz, +hence LINPACK achieves between 80% and 90% of the theoretical +peak. This is a very good result, not often achieved by real-world +codes. Clearly, the code is compute bound. + +The same run with profiling reports: + +``` +$ perf-report -n 32 ./xhpl +[...] +T/V N NB P Q Time Gflops +-------------------------------------------------------------------------------- +WR11C2R4 50000 192 4 8 89.73 9.2871e+02 +[...] +``` + +Hence, performance with profiling is roughly 3% lower than +without. While the profiling overhead is not significant, the entire +profiled run (together with setup and final data collection and +interpretation) took much longer than the LINPACK benchmark itself: 15 +minutes with profiling vs. 1.5 minute without profiling. This +extension, which might be different for individual codes and depends on the +number of MPI ranks, must be accounted for by the user when submiting +profiled jobs the queuing system. + +Below is the HTML performance summary produced by ARM +`perf-report`: + +![LINPACK perf-report](img/perf_report_linpack.png "LINPACK perf-report") + +As expected, the code is *Compute-bound*, but there is a visible +communication component. The report correctly suggests that there is +not much to do in terms of per-core optimizations, and that speed +improvements are achievable at scale. + +Previous estimates on floating point efficiency have shown that the +code runs at 80-90% of peak FLOP/s performance. However, in the above +report FP operations only account for ~60% of the CPU time, while +the rest is attributed to memory access. As in the case of STREAM, +this is an artifact of profiling: during runtime the memory and +floating point operations are as much as possible overlapped by the +CPU. Hence it is sometimes difficult to say, which class of +instructions is the bottleneck. That's why such a performance report +should be treated with a grain of salt. diff --git a/_sources/jobs/arm-perf/osu.md.txt b/_sources/jobs/arm-perf/osu.md.txt new file mode 100644 index 000000000..91dc2753f --- /dev/null +++ b/_sources/jobs/arm-perf/osu.md.txt @@ -0,0 +1,77 @@ +--- +orphan: true +--- + +# OSU benchmark + +The [OSU benchmark +suite](http://mvapich.cse.ohio-state.edu/benchmarks/) measures the +communication speed using a number of tests implemented using MPI, +OpenSHMEM, UCP, and UPCXX. To compile it on Fram you can use either +OpenMPI (recommended for best MPI performance), or Intel MPI. At the +time of writing Arm Performance Reports did not work with OpenMPI on +Fram. 
Results using Intel MPI are presented instead: + +``` +$ module load intel/2018b +$ cd osu-micro-benchmarks-5.4.4 +$ ./configure CC=mpicc CXX=mpicxx --prefix=$HOME/osu +$ make install +``` + +Below are results of the `osu_barrier` test (`MPI_Barrier` call) on 32 +compute nodes, using 512 MPI ranks (16 ranks per node): + +``` +$ mpirun ./osu_barrier -i 100000 + +# OSU MPI Barrier Latency Test v5.4.4 +# Avg Latency(us) + 9.25 +``` + +And the results of the same test with profiling: + +``` +$ perf-report mpirun ./osu_barrier -i 100000 + +# OSU MPI Barrier Latency Test v5.4.4 +# Avg Latency(us) + 238.15 +``` + +Here the profiling overhead is enormous: `MPI_Barrier` takes ~26 times +longer than in the non-profiled tests. The following is the generated +HTML peformance report: + +![MPI_Barrier perf-report](img/perf_report_barrier.png "MPI_Barrier perf-report") + +The report correctly identified the code as an MPI benchmark, and +attributed all the work to MPI collective calls. However, the +profiling overhead is very large and can have impact on actual performance +of real-world applications, and hence on the applicability of the +entire analysis. + +In practice, this overhead **cannot be estimated** by simply measuring +the total execution time of profiled and non-profiled runs to see how +much the profiling slowed down our application: + +``` +$ time mpirun -np N +[...] + +$ time perf-report -n N +[...] +``` +Remember that, in addition to the profiling overhead, there is the +profiling startup and data collection costs, which +can be by far larger than the application run time (see the discussion +in the [LINPACK benchmark](linpack.md) section). +To overcome this problem one needs to include some time measurement +facilities inside the profiled application using, e.g., the C `printf` +statements. Alternatively, `perf-report` should be started to profile +the `time` command itself: + +``` +$ perf-report /usr/bin/time mpirun +``` diff --git a/_sources/jobs/arm-perf/overhead.md.txt b/_sources/jobs/arm-perf/overhead.md.txt new file mode 100644 index 000000000..65749f2e5 --- /dev/null +++ b/_sources/jobs/arm-perf/overhead.md.txt @@ -0,0 +1,51 @@ +--- +orphan: true +--- + + + +# Quantifying the profiling overhead + +As most performance evaluation tools, Arm Performance Reports work by +sampling the running code to obtain statistical information about what +the code is doing. The sampling activity does introduce +overheads, which can affect the performance of the inspected +code. This may be an important aspect, which the users must be aware +of when using any performance evaluation tool. The overhead is problem +specific, as demonstrated by the example analyses: from little to no +overhead ([STREAM benchmark](stream.md), [LINPACK +benchmark](linpack.md)) to factor 26 slowdown +([OSU benchmark](osu.md)). + +To understand how ARM Performance Reports affects the MPI performance +we investigate the performance of `osu_barrier` with and without +profiling on up to 512 cores and 32 compute nodes (maximum 16 ranks +per compute node). The following figure shows the run time in +micro-seconds (us) of a single `MPI_Barrier` call for profiled and +non-profiled runs. + +![MPI_Barrier performance](img/barrier.png "MPI_Barrier performance") + +For up to 32 ranks each compute node runs only a single MPI rank. After +that, multiple MPI ranks are started on each compute node. 
The figure +demonstrates that the profiling overhead grows significantly with +increasing number of MPI ranks, while the cost of the barrier depends +mostly on the number of compute nodes and remains roughly constant +for more than 32 MPI ranks. The profiling overhead is smallest with +up to 16 MPI ranks, and grows significantly from that point on. + +While `MPI_Barrier` is a very important and often used collective, it +is latency limited (no user data is sent, nor received). The following +figure analyzes profiled and non-profiled performance of +`osu_alltoall`. + +![MPI_AllToAll performance](img/all2all.png "MPI_AllToAll performance") + +For smallest message sizes the overhead is significant (factor +5-6). However, for 8KB and larger messages the overhead is essentially +gone. This is because for small message sizes the time required to +transfer a message is comparable or lower than the data collection +time used by the profiler. As messages grow, the actual data transfer +becomes much more time consuming. Hence, depending on the application +and the specifics of the MPI communication, profiling will, or will not +influence the application runtime. diff --git a/_sources/jobs/arm-perf/stream.md.txt b/_sources/jobs/arm-perf/stream.md.txt new file mode 100644 index 000000000..3b5e6f5ca --- /dev/null +++ b/_sources/jobs/arm-perf/stream.md.txt @@ -0,0 +1,85 @@ +--- +orphan: true +--- + +# STREAM benchmark + +The purpose of the [STREAM benchmark](https://github.com/jeffhammond/STREAM) +is to measure the effective memory bandwidth of +modern CPU-based architectures. This is done by measuring the time +of four loops that operate on *large arrays* (vectors) that do not fit +into the CPU cache: + +* *Copy* : `b[:] = a[:]` +* *Scale*: `b[:] = const*a[:]` +* *Add* : `c[:] = a[:] + b[:]` +* *Triad*: `c[:] = a[:] + const*b[:]` + + +The [sourcecode +(`stream.c`)](https://github.com/jeffhammond/STREAM/blob/master/stream.c) is +compiled using the Intel `icc` compiler as follows: + +``` +$ module load intel/2018b +$ icc -shared-intel -mcmodel=medium -O3 -qopt-streaming-stores always -qopenmp -DSTREAM_ARRAY_SIZE=200000000 -o stream stream.c +``` +Executing it on Fram without profiling yields the following results: + +``` +$ OMP_NUM_THREADS=32 GOMP_CPU_AFFINITY=0-31 ./stream +[...] +------------------------------------------------------------- +Function Best Rate MB/s Avg time Min time Max time +Copy: 115552.5 0.027806 0.027693 0.028218 +Scale: 115161.9 0.027867 0.027787 0.028039 +Add: 123187.5 0.039026 0.038965 0.039094 +Triad: 123121.2 0.039070 0.038986 0.039528 +------------------------------------------------------------- +``` + +The maximum achieved bandwidth is roughly 123GB/s. The same test can +be executed with profiling: + +``` +$ OMP_NUM_THREADS=32 GOMP_CPU_AFFINITY=0-31 perf-report ./stream +------------------------------------------------------------- +Function Best Rate MB/s Avg time Min time Max time +Copy: 114720.9 0.028145 0.027894 0.028477 +Scale: 115345.0 0.027875 0.027743 0.028063 +Add: 122577.0 0.039353 0.039159 0.039696 +Triad: 122761.6 0.039490 0.039100 0.039679 +------------------------------------------------------------- +``` + +The results are essentially the same, hence we can be sure that in this +case there is no significant overhead due to profiling. + +Below is the HTML performance summary generated by `perf-report`: + +![STREAM perf-report](img/perf_report_stream.png "STREAM perf-report") + +The profiler identified the code as _Compute-bound_. 
Strictly speaking, +the code is _memory bandwidth bound_, which becomes clear when you look +at the CPU time breakdown: 80% of the time is reported as spent in the +memory access. Some time is reported as used by numeric +(floating-point) operations. While it is true that STREAM does use +FLOPs, when running on all CPU cores the bottleneck is the memory +access, and the time needed to execute the FP instructions is +fully overlapped by the slow memory instructions. This can be seen when +comparing the results of *Copy* and *Scale*, or *Add* and *Triad* tests: +those pairs differ by one floating point operation, but their +execution time is essentially the same. So in fact the __Memory +Access__ row should read close to 100%. + +This discrepancy is an artifact of profiling. Since during runtime the +memory and floating point operations are as much as possible +overlapped by the CPU, it is sometimes difficult to say, which class +of instructions is the bottleneck. That's why such a performance +report should be treated as a high level overview and a suggestion, +rather than a definite optimization guide. + +The code is parallelized using OpenMP. In the *Threads* section of +the report there is no mention of thread synchronization overhead, +which is correct: STREAM is trivially parallel, with no explicit data +exchange between the threads. diff --git a/_sources/jobs/checkpointing.md.txt b/_sources/jobs/checkpointing.md.txt new file mode 100644 index 000000000..a223289e7 --- /dev/null +++ b/_sources/jobs/checkpointing.md.txt @@ -0,0 +1,109 @@ +# Checkpointing Jobs + +Checkpointing is the action of saving the state of a running process to a check point image file. +Users can utilize checkpointing to pickup a job where it left off due to failing resources (e.g. hardware, +software, exceeded time and memory resources) and continue running. +Users are encouraged to use application level checkpointing, that means to investigate whether the software +tools they're using are capable of stopping and restarting where a job leaves off. If it is available, +it is recommended to use the software built in tools for checkpointing. + +## Checkpointing on our Clusters + +[DMTCP](http://dmtcp.sourceforge.net) (Distributed MultiThreaded Checkpointing)is a checkpointing package for applications. +DMTCP Checkpoint/Restart allows one to transparently checkpoint to disk a distributed computation. It works under Linux, +with no modifications to the Linux kernel nor to the application binaries. It can be used by users (no root privilege needed). +One can later restart from a checkpoint. DMTCP supports both sequential and multi-threaded applications and it provides support +for SLURM resource manager. +The DMTCP module is available in all our machines **(Saga, Fram, Betzy)** and it is enabled by typing + +```module load DMTCP/2.6.0-GCCcore-9.3.0``` + + There are two steps involved after loading the DMTCP module. + +- First is to launch your application using `dmptcp_launch` by running the following + +```[user1@login-1.SAGA ~]$ dmtcp_launch --new-coordinator --rm --interval ``` + +where `--rm` option enables SLURM support, `` is the time in seconds between automatic checkpoints, +and `` is the actual command you want to run and checkpoint + +`dmtcp_launch` creates few files that are used to resume the cancelled job, such as `ckpt_*.dmtcp` and `dmtcp_restart_script*.sh`. + Unless otherwise stated (using `--ckptdir option`), these files are stored in the current working directory. 
+ + More `dmtcp_launch` options can be found by using : + +```dmtcp_launch --help``` + +- The second step of DMTCP is to restart the cancelled job. This can be done by doing + +```./dmtcp_restart_script.sh``` + + **Sample example of how to use DMPTCP in your slurm script** + +- First submit your job with dmptcp `generic_job.sh' + +```bash +#!/bin/bash + +# Job name: +#SBATCH --job-name=YourJobname +# Project: +#SBATCH --account=nnXXXXk +# Wall time limit: +#SBATCH --time=DD-HH:MM:SS +# Other parameters: +#SBATCH ... +## Set up job environment: +set -o errexit # Exit the script on any error +set -o nounset # Treat any unset variables as an error +module --quiet purge # Reset the modules to the system default +### Load DMPTCP module +module load DMTCP/2.6.0-GCCcore-9.3.0 +### Load your software module +module load SomeProgram/SomeVersion +module list +## Do some work: Running under dmptcp control +dmtcp_launch --new-coordinator --rm --interval 3600 YourCommands +``` + +In this example, DMTCP takes checkpoints every hour '(--interval 3600)' + +- Second, restart the job: If the job is killed for various reasons, it can be restarted using the following submit file: `generic_job_dmptcp_restart.sh` +``` +#!/bin/bash + +# Job name: +#SBATCH --job-name=YourJobname +# Project: +#SBATCH --account=nnXXXXk +# Wall time limit: +#SBATCH --time=DD-HH:MM:SS +# Other parameters: +#SBATCH ... +## Set up job environment: +set -o errexit # Exit the script on any error +set -o nounset # Treat any unset variables as an error +module --quiet purge # Reset the modules to the system default +### Load DMPTCP module +module load DMTCP/2.6.0-GCCcore-9.3.0 +### Load your software module +module load SomeProgram/SomeVersion +module list +# Start DMTCP +dmtcp_coordinator --daemon --port 0 --port-file /tmp/port +export DMTCP_COORD_HOST=`hostname` +export DMTCP_COORD_PORT=$(` in order to help you and others using the HPC +machines utilize these resources more efficiently, and in turn get work done +more speedily. + +If you ask for too little memory, your job will be stopped and it might be +stopped late in the run. + + +## Run a test job before running many similar jobs + +We recommend users to run a test job before submitting many similar runs to the +queue system and find out how much memory is used (see below for examples on +how to do that). **Once you know, add perhaps 15-20% extra memory** (and runtime) +for the job compared to what your representative test case needed. + +Remember to check the [Slurm documentation](https://slurm.schedmd.com/squeue.html#lbAG), +[job types](choosing_job_types.md), +[queue system concepts](submitting/queue_system_concepts.md), +and [HPC machines](/hpc_machines/hardware_overview.md) +to verify that you are submitting the right job to the right partition and +right hardware. + + +## How to get more memory if you need it + +Speaking of right partition, one way to get more memory if a node is not enough +is to spread the job over several nodes by asking for more cores than needed. +But this comes at the price of paying for more resources, queuing longer, and +possibly blocking others. A good alternative for jobs that need a lot of memory +is often to get access to "highmem" nodes which are designed for jobs with high +memory demand. 
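As an illustration only (the partition name, requested memory, and time limit below are placeholders and differ between clusters and job types; see the overview in {ref}`job-types`), the header of such a high-memory job on Saga might look like:
```
#SBATCH --account=nn____k
#SBATCH --partition=bigmem       # high-memory job type on Saga; check the job types page for your cluster
#SBATCH --ntasks=1
#SBATCH --mem-per-cpu=30G        # example value only
#SBATCH --time=0-02:00:00
```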
+ + +## How to find out how much memory you need + + +### Example code which you can use for testing this + + +You can test some of the approaches with the following example code (`example.f90`) +which allocates 3210 MB (you can adapt that value if you want it to consume +more or less memory): + +```{code-block} fortran +--- +emphasize-lines: 5-5 +--- +program example + + implicit none + + integer(8), parameter :: size_mb = 3210 + integer(8) :: size_mw + real(8), allocatable :: a(:) + +! print *, 'will try to allocate', size_mb, 'MB' + + size_mw = int(1000000*size_mb/7.8125d0) + allocate(a(size_mw)) + + ! this is here so that the allocated memory gets actually used and the + ! compiler does not skip allocating the array + a = 1.0d0 + print *, 'first element:', a(1) + + ! wait for 35 seconds + ! because slurm only samples every 30 seconds + call sleep(35) + + deallocate(a) + print *, 'ok all went well' + +end program +``` + +Compile it like this and later we can examine `mybinary` and check whether we can find out that +it really allocated 3210 MB: +```console +$ gfortran example.f90 -o mybinary +``` + + +### Using top + +While the job is running, find out on which node(s) it runs using `squeue --me`, +then `ssh` into one of the listed compute nodes and run `top -u $USER`. + + +### By checking the Slurm output generated with your job + +We can use the following example script (adapt `--account=nn____k`; this is tested on Saga): +```{code-block} +--- +emphasize-lines: 3-3 +--- +#!/usr/bin/env bash + +#SBATCH --account=nn____k + +#SBATCH --job-name='mem-profiling' +#SBATCH --time=0-00:01:30 +#SBATCH --mem-per-cpu=3500M +#SBATCH --ntasks=1 + +# we could also compile it outside of the job script +gfortran example.f90 -o mybinary + +./mybinary +``` + +Slurm generates an output for each job you run. For instance job number `10404698` +generated output `slurm-10404698.out`. + +This output contains the following: +```{code-block} +--- +emphasize-lines: 14,17 +--- +Submitted 2024-02-04T13:11:03; waited 27.0 seconds in the queue after becoming eligible to run. + +Requested wallclock time: 2.0 minutes +Elapsed wallclock time: 44.0 seconds + +Task and CPU statistics: +ID CPUs Tasks CPU util Start Elapsed Exit status +10404698 1 0.0 % 2024-02-04T13:11:30 44.0 s 0 +10404698.batch 1 1 2.7 % 2024-02-04T13:11:30 44.0 s 0 + +Used CPU time: 1.2 CPU seconds +Unused CPU time: 42.8 CPU seconds + +Memory statistics, in GiB: +ID Alloc Usage +10404698 3.4 +10404698.batch 3.4 3.1 +``` + +From this (see below `Memory statistics`) we can find out that the job used 3.1 +GiB memory. + +Note that **Slurm samples the memory every 30 seconds**. This means that if your +job is shorter than 30 seconds, it will show that your calculation consumed +zero memory which is probably wrong. The sampling rate also means that if your +job contains short peaks of high memory consumption, the sampling may +miss these. + + +#### Slurm reports values for each job step + +If you call `srun` or `mpirun` multiple times in your job script they will get +one line each in the `sacct` output with separate entries for both `AvgRSS` and +`MaxRSS`. + +Also the job script itself (commands run in the jobscript without using `srun` +or `mpirun`) is counted as a *step*, called the `batch` step. + + +### By using sacct + +This creates a short version of the above. 
+ +As an example, I want to know this for my job which had the number `10404698`: +```console +$ sacct -j 10404698 --format=MaxRSS + + MaxRSS +---------- + + 3210588K + 0 +``` + +From this we see that the job needed `3210588K` memory, same as above. The +comment above about possibly multiple steps applies also here. + + +### Using jobstats + +`jobstats` is always run as part of your job output. But knowing that the +command exists also on its own can be useful if you still know/remember the job +number (here: 10404698) but you have lost the Slurm output. + +```console +$ jobstats -j 10404698 +Job 10404698 consumed 0.0 billing hours from project nn****k. + +Submitted 2024-02-04T13:11:03; waited 27.0 seconds in the queue after becoming eligible to run. + +Requested wallclock time: 2.0 minutes +Elapsed wallclock time: 44.0 seconds + +Task and CPU statistics: +ID CPUs Tasks CPU util Start Elapsed Exit status +10404698 1 0.0 % 2024-02-04T13:11:30 44.0 s 0 +10404698.batch 1 1 2.7 % 2024-02-04T13:11:30 44.0 s 0 + +Used CPU time: 1.2 CPU seconds +Unused CPU time: 42.8 CPU seconds + +Memory statistics, in GiB: +ID Alloc Usage +10404698 3.4 +10404698.batch 3.4 3.1 +``` + + +### Using seff + +`seff` is a nice tool which we can use on **completed jobs**. For example here we ask +for a summary for the job number 10404698: + +```{code-block} console +--- +emphasize-lines: 11-12 +--- +$ seff 10404698 + +Job ID: 10404698 +Cluster: saga +User/Group: someuser/someuser +State: COMPLETED (exit code 0) +Cores: 1 +CPU Utilized: 00:00:01 +CPU Efficiency: 2.27% of 00:00:44 core-walltime +Job Wall-clock time: 00:00:44 +Memory Utilized: 3.06 GB +Memory Efficiency: 89.58% of 3.42 GB +``` + + +### By prepending your binary with /usr/bin/time -v + +In your job script instead of running `./mybinary` directly, prepend it with `/usr/bin/time -v`: +```{code-block} +--- +emphasize-lines: 14 +--- +#!/usr/bin/env bash + +#SBATCH --account=nn____k + +#SBATCH --job-name='mem-profiling' +#SBATCH --time=0-00:01:30 +#SBATCH --mem-per-cpu=3500M +#SBATCH --ntasks=1 + +# instead of this: +# ./mybinary + +# we do this: +/usr/bin/time -v ./mybinary +``` + +Then in the Slurm output we find: +```{code-block} +--- +emphasize-lines: 10-10 +--- +Command being timed: "./mybinary" +User time (seconds): 0.51 +System time (seconds): 0.64 +Percent of CPU this job got: 3% +Elapsed (wall clock) time (h:mm:ss or m:ss): 0:36.16 +Average shared text size (kbytes): 0 +Average unshared data size (kbytes): 0 +Average stack size (kbytes): 0 +Average total size (kbytes): 0 +Maximum resident set size (kbytes): 3212160 +Average resident set size (kbytes): 0 +Major (requiring I/O) page faults: 1 +Minor (reclaiming a frame) page faults: 2394 +Voluntary context switches: 20 +Involuntary context switches: 41 +Swaps: 0 +File system inputs: 57 +File system outputs: 0 +Socket messages sent: 0 +Socket messages received: 0 +Signals delivered: 0 +Page size (bytes): 4096 +Exit status: 0 +``` + +The relevant information in this context is `Maximum resident set size +(kbytes)`, in this case 3210916 kB which is what we expected to find. Note +that it has to be `/usr/bin/time -v` and `time -v` alone will not do it. + + +### By using Arm Performance Reports + +You can profile your job using {ref}`Arm Performance Reports `. 
+ +Here is an example script (adapt `--account=nn____k`; this is tested on Saga): +```{code-block} +--- +emphasize-lines: 11-12, 14 +--- +#!/usr/env/bin bash + +#SBATCH --account=nn____k +#SBATCH --qos=devel + +#SBATCH --job-name='mem-profiling' +#SBATCH --time=0-00:01:00 +#SBATCH --mem-per-cpu=3500M +#SBATCH --ntasks=1 + +module purge +module load Arm-Forge/22.1.3 + +perf-report ./mybinary +``` + +This generates a HTML and text summary. These reports contain lots of +interesting information. Here showing the relevant part of the text report for +the memory: +```{code-block} +--- +emphasize-lines: 4-4 +--- +Memory: +Per-process memory usage may also affect scaling: +Mean process memory usage: 1.39 GiB +Peak process memory usage: 3.07 GiB +Peak node memory usage: 27.0% |==| +``` + + +### By reducing the memory parameter until a job fails + +This is not an elegant approach but can be an OK approach to calibrate one +script before submitting 300 similar jobs. + +What you can do is to start with a generous memory setting: +``` +#SBATCH --mem-per-cpu=3500M +``` + +And gradually reduce it until your job fails with `oom-kill` ("**oom**" or "**OOM**" is short for "out of memory"): +``` +slurmstepd: error: Detected 1 oom_kill event in StepId=10404708.batch. +Some of the step tasks have been OOM Killed. +``` + +Or you start with a very conservative estimate and you gradually increase until +the job is not stopped. + +Then you also know. But there are more elegant ways to figure this out (see +options above). diff --git a/_sources/jobs/choosing-number-of-cores.md.txt b/_sources/jobs/choosing-number-of-cores.md.txt new file mode 100644 index 000000000..ef60a5b04 --- /dev/null +++ b/_sources/jobs/choosing-number-of-cores.md.txt @@ -0,0 +1,395 @@ +(choosing-number-of-cores)= + +# How to choose the number of cores + +```{warning} +- Asking for too few cores can lead to underused nodes or longer run time +- Asking for too many cores can mean wasted CPU resources +- Asking for too much can mean a lot longer queuing +``` + +```{contents} Table of Contents +:depth: 3 +``` + + +## Why it matters + +We request resources from the scheduler (queuing system). But the scheduler +cannot tell how long the job will run and what resources it will really +consume. Just the fact that I am asking the scheduler for 40 cores does not +mean that the code will actually run in parallel and use all of them. + +Just because a website says that code X can run in parallel or "scales well" +does not mean that it will scale well for the particular feature/system/input +at hand. + +Therefore **it is important to verify and calibrate this setting** for your use +case before computing very many similar jobs. Below we will show few +strategies. + +Note that **you don't have to go through this for every single run**. This is +just to calibrate a job type. If many of your jobs have similar resource +demands, then the calibration will probably be meaningful for all of them. + + +## Using top + +While the job is running, find out on which node(s) it runs using `squeue --me`, +then `ssh` into one of the listed compute nodes and run `top -u $USER`. + +Some clusters also have `htop` available which produces similar output as `top` +but with colors and possibly clearer overview. 
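As a rough illustration (the job ID, node name, and other values below are invented), the sequence could look like this:
```console
$ squeue --me
   JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
 1234567    normal mybinary someuser  R       1:02      1 c5-12
$ ssh c5-12
$ top -u $USER
```
Press `q` to leave `top`, and `exit` to log out of the compute node again.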
+ + +## Timing a series of runs + +Here is an example C code (`example.c`) which we can compile and test a bit: +```c +#include +#include +#include +#include +#include + +double compute_something(int n, int m) +{ + double s = 0.0; + for(int i = 0; i < n; i++) + { + double f = rand(); + for(int j = 0; j < m; j++) + { + f = sqrt(f); + } + s += f; + } + return s; +} + +int main(int argc, char* argv[]) +{ + MPI_Init(&argc, &argv); + + int size; + MPI_Comm_size(MPI_COMM_WORLD, &size); + + int my_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + + const int k = 10000; + + double my_values[k]; + double buffer_recv[k]; + + for(int j = 0; j < 750; j++) + { + for(int i = 0; i < k; i++) + { + my_values[i] = compute_something(1000/size, 10); + } + for(int l = 0; l < 1000; l++) + { + MPI_Alltoall(&my_values, 10, MPI_DOUBLE, buffer_recv, 10, MPI_DOUBLE, MPI_COMM_WORLD); + } + } + + MPI_Finalize(); + + return EXIT_SUCCESS; +} +``` + +It does not matter so much what the code does. Here we wish to **find out how +this code scales** and what an optimum number of cores for it might be on our +system. + +We can build our example binary with this script (`compile.sh`): +``` +#!/usr/bin/env bash + +module purge +module load foss/2022b + +mpicc example.c -O3 -o mybinary -lm +``` + +Now take the following example script +(tested on Saga, please adapt the line containing `--account=nn____k` to reflect your project number): +```{code-block} +--- +emphasize-lines: 8-9 +--- +#!/usr/bin/env bash + +#SBATCH --account=nn____k + +#SBATCH --job-name='8-core' +#SBATCH --time=0-00:10:00 +#SBATCH --mem-per-cpu=1GB +#SBATCH --ntasks=8 +#SBATCH -o 8.out + +module purge +module load foss/2022b + +time srun ./mybinary +``` + +Run a series of calculations on 1, 2, 4, 8, 16, 32, 64, and 128 cores. + +You might get the following timings: + +| Number of cores | Time spent in mybinary | +|-----------------|------------------------| +| 1 | 7m21s | +| 2 | 2m21s | +| 4 | 1m15s | +| 8 | 0m41s | +| 16 | 0m27s | +| 32 | (technical problem) | +| 64 | 0m46s | +| 128 | 2m07s | + +Please try this. What can we conclude? And how can we explain it? + +Conclusions: +- For this particular example it does not make sense to go much beyond 16 cores +- Above 8 cores communication probably starts to dominate over computation + + +## Using seff + +`seff JOBID` is a nice tool which we can use on **completed jobs**. + +Here we can compare the output from `seff` when using 4 cores and when using +8 cores. + +Run with 4 cores: +```{code-block} +--- +emphasize-lines: 8-9 +--- +Job ID: 10404723 +Cluster: saga +User/Group: someuser/someuser +State: COMPLETED (exit code 0) +Nodes: 1 +Cores per node: 4 +CPU Utilized: 00:04:45 +CPU Efficiency: 91.35% of 00:05:12 core-walltime +Job Wall-clock time: 00:01:18 +Memory Utilized: 933.94 MB (estimated maximum) +Memory Efficiency: 22.80% of 4.00 GB (1.00 GB/core) +``` + +Run with 8 cores: +```{code-block} +--- +emphasize-lines: 8-9 +--- +Job ID: 10404725 +Cluster: saga +User/Group: someuser/someuser +State: COMPLETED (exit code 0) +Nodes: 1 +Cores per node: 8 +CPU Utilized: 00:05:06 +CPU Efficiency: 86.93% of 00:05:52 core-walltime +Job Wall-clock time: 00:00:44 +Memory Utilized: 1.84 GB (estimated maximum) +Memory Efficiency: 23.01% of 8.00 GB (1.00 GB/core) +``` + + +## Using jobstats + +Try it with one of your jobs: +```console +$ jobstats -j 10404723 + +Job 10404723 consumed 0.1 billing hours from project nn****k. + +Submitted 2024-02-04T13:52:44; waited 0.0 seconds in the queue after becoming eligible to run. 
+ +Requested wallclock time: 10.0 minutes +Elapsed wallclock time: 1.3 minutes + +Task and CPU statistics: +ID CPUs Tasks CPU util Start Elapsed Exit status +10404723 4 0.0 % 2024-02-04T13:52:44 78.0 s 0 +10404723.batch 4 1 0.7 % 2024-02-04T13:52:44 78.0 s 0 +10404723.mybinary 4 4 95.7 % 2024-02-04T13:52:48 74.0 s 0 + +Used CPU time: 4.8 CPU minutes +Unused CPU time: 26.7 CPU seconds + +Memory statistics, in GiB: +ID Alloc Usage +10404723 4.0 +10404723.batch 4.0 0.0 +10404723.mybinary 4.0 0.9 +``` + + +## If it does not scale, what can be possible reasons? + +Here are typical problems: +- At some point more time is spent communicating than computing +- Memory-bound jobs saturate the memory bandwidth +- At some point the non-parallelized code section dominates the compute time ([Amdahl's law](https://en.wikipedia.org/wiki/Amdahl%27s_law)) + + +## What is MPI and OpenMP and how can I tell? + +These two parallelization schemes are very common (but there exist other schemes): +- [Message passing interface](https://en.wikipedia.org/wiki/Message_Passing_Interface): + typically each task allocates its own memory, tasks communicate via messages. + It is no problem to go beyond one node. +- [OpenMP](https://www.openmp.org/): + threads share memory and communicate through memory. + We cannot go beyond one node. + + +### How to tell if the code is using one of the two? + +- If you wrote the software: then you probably know +- If it is written by somebody else: + - It can be difficult to tell + - Consult manual for the software or contact support (theirs or ours) + - **If you have access to the source code**, `grep -i mpi` and `grep -i "omp "` the source code + - Example: (uses both MPI and OpenMP) + + +### Python/R/Matlab + +- Small self-written scripts are often not parallelized +- Libraries that you include in your scripts can use parallelization (e.g. + `mpi4py` or `multiprocessing`) + + +### Code may call a library which is shared-memory parallelized + +- Examples: BLAS libraries, NumPy, SciPy + +Here is an example which you can try (`example.py`) where we compute a couple +of matrix-matrix multiplications using [NumPy](https://numpy.org/): +```python +import numpy as np + +n = 10000 + +# run it multiple times, just so that it runs longer and we have enough time to +# inspect it while it's running +for _ in range(5): + matrix_a = np.random.rand(n, n) + matrix_b = np.random.rand(n, n) + + matrix_c = np.matmul(matrix_a, matrix_b) + +print("calculation completed") +``` + +We will try two different job scripts and below we highlight where they differ. + +Job script A (adapt `--account=nn____k`; this is tested on Saga): +```{code-block} +--- +emphasize-lines: 10-11 +--- +#!/usr/bin/env bash + +#SBATCH --account=nn____k + +#SBATCH --job-name='example' +#SBATCH --time=0-00:05:00 +#SBATCH --mem-per-cpu=1500M + +#SBATCH --nodes=1 +#SBATCH --tasks-per-node=1 +#SBATCH --cpus-per-task=4 + +module load SciPy-bundle/2023.02-gfbf-2022b + +python example.py + +env | grep NUM_THREADS +``` + +Job script B: +```{code-block} +--- +emphasize-lines: 10 +--- +#!/usr/bin/env bash + +#SBATCH --account=nn____k + +#SBATCH --job-name='example' +#SBATCH --time=0-00:05:00 +#SBATCH --mem-per-cpu=1500M + +#SBATCH --nodes=1 +#SBATCH --tasks-per-node=4 + +module load SciPy-bundle/2023.02-gfbf-2022b + +python example.py + +env | grep NUM_THREADS +``` + +Run both examples and check the timing. +It can also be +interesting to log into the compute node while the job is running and using +`top -u $USER`. 
Can you explain what is happening here? + +This was the job script with `--cpus-per-task=4`: +```{code-block} console +--- +emphasize-lines: 11 +--- +$ seff 10404753 + +Job ID: 10404753 +Cluster: saga +User/Group: someuser/someuser +State: COMPLETED (exit code 0) +Nodes: 1 +Cores per node: 4 +CPU Utilized: 00:03:56 +CPU Efficiency: 75.64% of 00:05:12 core-walltime +Job Wall-clock time: 00:01:18 +Memory Utilized: 3.03 GB +Memory Efficiency: 51.75% of 5.86 GB +``` + +And this was the job script with the two export lines active: +This was the job script with `--tasks-per-node=4`: +```{code-block} console +--- +emphasize-lines: 11 +--- +$ seff 10404754 + +Job ID: 10404754 +Cluster: saga +User/Group: someuser/someuser +State: COMPLETED (exit code 0) +Nodes: 1 +Cores per node: 4 +CPU Utilized: 00:03:55 +CPU Efficiency: 24.79% of 00:15:48 core-walltime +Job Wall-clock time: 00:03:57 +Memory Utilized: 3.02 GB +Memory Efficiency: 51.56% of 5.86 GB +``` + +The explanation is that the former job script automatically sets `OMP_NUM_THREADS=4`, +whereas the latter sets `OMP_NUM_THREADS=1`. + +The morale of this story is that for Python and R it can be useful to verify +whether the script really uses all cores you give the job script. If it is +expected to use them but only runs on 1 core, check whether the required +environment variables are correctly set. Sometimes you might need to set them +yourself. diff --git a/_sources/jobs/choosing_job_types.md.txt b/_sources/jobs/choosing_job_types.md.txt new file mode 100644 index 000000000..7598cbacf --- /dev/null +++ b/_sources/jobs/choosing_job_types.md.txt @@ -0,0 +1,67 @@ +(job-types)= + +# Job Types + +The clusters are designed for different work loads, and each cluster +has several types of jobs. This page gives an overview of the +available job types on each cluster, and their main characteristics. +See the sub pages of each cluster for a more detailed description of +the job types and their purposes. + +You should always choose the job type that fulfils the resource +requirements of your job best without being excessive as this ensures +the highest possible priority and therefore the shortest queuing time. +If for example your job on Saga needs 20GB memory and 5 CPUs for 5 days, +you should choose a _normal_ instead of a _bigmem_ job. +If it instead needs 200GB memory you should use _bigmem_. + +## Betzy + +| Name | Description | Job limits | Max walltime | Priority | +|:---------------------------------------:|-------------------------------------------|:------------:|:------------:|:--------:| +| {ref}`normal ` | default job type | 4--512 nodes | 4 days | normal | +| {ref}`accel ` | jobs needing GPUs | | 7 days | normal | +| {ref}`preproc ` | pre-/postprocessing jobs | 1--16 units | 1 day | normal | +| {ref}`devel ` | development jobs (compiling, testing)[^1] | 1--4 nodes | 60 mins | high | + +For jobs that don't request GPUs or much memory, the "units" of *accel* or *preproc* jobs +are simply the number of cpus the job requests. +For other jobs, see {ref}`projects-accounting` for how the units are calculated. + +{ref}`Betzy Job Types `. 
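Exactly how a job type is requested (`--partition`, `--qos`, or both) is cluster specific and described on the cluster-specific pages linked from each section. As a small sketch only, based on the limits above, a Betzy _devel_ job could ask for something like:
```
#SBATCH --qos=devel
#SBATCH --nodes=1
#SBATCH --time=00:30:00
```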
+ +## Fram + +| Name | Description | Job limits | Max walltime | Priority | +|:----------------------------------------:|------------------------------------------|:-----------:|:-------------------------------------------:|:--------:| +| {ref}`normal ` | default job type | 1--32 nodes | 7 days | normal | +| {ref}`bigmem ` | jobs needing more memory | | 14 days | normal | +| {ref}`devel ` | development jobs (compiling, testing) | 1--8 nodes | 30 mins | high | +| {ref}`short ` | short jobs | 1--10 nodes | 2 hours | high | +| {ref}`optimist ` | jobs w/checkpointing, or very short jobs | 1--32 nodes | {ref}`see details ` | low | + +[Fram Job Types](job_types/fram_job_types.md). + + +## Saga + +| Name | Description | Job limits | Max walltime | Priority | +|:----------------------------------------:|-------------------------------------------|:------------:|:-------------------------------------------:|:--------:| +| {ref}`normal ` | default job type | 1--256 units | 7 days | normal | +| {ref}`bigmem ` | jobs needing more memory | 1--256 units | 14 days | normal | +| {ref}`hugemem ` | jobs needing even more memory | 1--256 units | 14 days | normal | +| {ref}`accel ` | jobs needing P100 GPUs | 1--256 units | 14 days | normal | +| {ref}`a100 ` | jobs needing A100 GPUs | 1--256 units | 14 days | normal | +| {ref}`devel ` | development jobs (compiling, testing)[^2] | 1--128 units | 2 hours | high | +| {ref}`optimist ` | jobs w/checkpointing, or very short jobs | 1--256 units | {ref}`see details ` | low | + +For jobs that don't request GPUs or much memory, the "units" on Saga are +simply the number of cpus the job requests. +For other jobs, see {ref}`projects-accounting` for how the units are calculated. + +[Saga Job Types](job_types/saga_job_types.md). + +[^1]: On Betzy it is possible to combine _devel_ with _accel_, {ref}`see details `. + +[^2]: On Saga it is possible to combine _devel_ with _accel_, _a100_, + _bigmem_ or _hugemem_, {ref}`see details `. diff --git a/_sources/jobs/common_job_failures.md.txt b/_sources/jobs/common_job_failures.md.txt new file mode 100644 index 000000000..31771bcaa --- /dev/null +++ b/_sources/jobs/common_job_failures.md.txt @@ -0,0 +1,61 @@ +# Common job failures + +Although users run very different types of jobs on the clusters, two errors are common and here we +describe how to detect these as well as steps to fix these common job failures. + + +## Running out of memory + +A job will stop if it tries to use more memory than requested from Slurm. + +This error is reported in the Slurm output: +``` +slurm_script: line 11: 33333 Killed ./mybinary +slurmstepd: error: Detected 1 oom-kill event(s) in step 997857.batch cgroup. +Some of your processes may have been killed by the cgroup out-of-memory +handler. +``` + +The cryptic but interesting information here is `oom-kill` ("oom" is short for "out of memory") and `out-of-memory`. + +You can fix this by requesting more memory in you job script: +``` +#SBATCH --mem-per-cpu=1G +``` + +**But don't ask for way too much memory either**, otherwise you can get billed +for a lot more than you use, and your jobs may queue for a lot longer than you +would like to. In addition this can also block resources for others. + +To find out how much memory your job really needs, please have a look at +{ref}`choosing-memory-settings`. 
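Before simply increasing the value, it can also help to check how much memory the failed job actually used, for example with `sacct` (use the job ID from your own Slurm output; the ID below is taken from the error message above):
```
$ sacct -j 997857 --format=JobID,ReqMem,MaxRSS,State
```
The `MaxRSS` column shows the peak memory used by each job step and gives a more realistic starting point for `--mem-per-cpu`.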
+ +## Disk quota exceeded + +Since the clusters are shared resources with hundreds of users, we have to have +quotas in place to prevent any user or group from +using too much disk space and making the system unusable for others. + +When a group or user reaches their disk quota, files cannot be created due to the cause `Disk Quota Exceeded`. +This will often stop jobs that need to write output or log files. + +There could be different quota settings for your home folder, project folders, +and other folders in addition to a file count or size quota. Please consult +[this page](/files_storage/clusters.md) for +more details to see how to inspect your disk quota. + +(Here we need to link to policies on how/when to ask for more) + + +## Job is rejected because of insufficient credit + +If you see an error message like this one after submitting your job script: +``` +sbatch: error: AssocGrpBillingMinutes +sbatch: error: Batch job submission failed: Job violates accounting/QOS policy + (job submit limit, user's size and/or time limits) +``` + +Then check with `cost` whether your compute account has enough credit for your +job. The error probably means that you asked for more resources in your job +script than you have available. Please consult [this page](/getting_started/applying_resources.md) on how to apply for more credits. diff --git a/_sources/jobs/guides.md.txt b/_sources/jobs/guides.md.txt new file mode 100644 index 000000000..79a728732 --- /dev/null +++ b/_sources/jobs/guides.md.txt @@ -0,0 +1,9 @@ +# Guides + +* [Running MPI Jobs](running-mpi-applications) +* [Porting From PBS/Torque](porting-from-pbs) +* [Job Array Howto](job-array-howto) +* [Job Dependencies](job-dependencies) +* [Running Job Steps in Parallel](running-job-steps-parallel) +* [Cleanup at Timeout](cleanup-timeout) +* [TensorFlow on GPU](tensorflow) diff --git a/_sources/jobs/guides/cleanup_timeout.md.txt b/_sources/jobs/guides/cleanup_timeout.md.txt new file mode 100644 index 000000000..432c8eece --- /dev/null +++ b/_sources/jobs/guides/cleanup_timeout.md.txt @@ -0,0 +1,29 @@ +--- +orphan: true +--- + +(cleanup-timeout)= + +# How to recover files before a job times out + +Possibly you would like to clean up the work directory or recover +files for restart in case a job times out. This is perhaps most +useful when using the `$SCRATCH` work directory (see {ref}`storage-areas`). + +In this example we ask Slurm to send a signal to our script 120 +seconds before it times out to give us a chance to perform clean-up +actions. + +```{eval-rst} +.. literalinclude:: files/timeout_cleanup.sh + :language: bash +``` + +Download the script: +```{eval-rst} +:download:`files/timeout_cleanup.sh` +``` + +Also note that jobs which use `$SCRATCH` as the work directory can use +the `savefile` and `cleanup` commands to copy files back to the submit +directory before the work directory is deleted (see info about {ref}`job-work-directory`). diff --git a/_sources/jobs/guides/job_array_howto.md.txt b/_sources/jobs/guides/job_array_howto.md.txt new file mode 100644 index 000000000..55bdbb68f --- /dev/null +++ b/_sources/jobs/guides/job_array_howto.md.txt @@ -0,0 +1,91 @@ +--- +orphan: true +--- + +(job-array-howto)= + +# Job array howto + +In this example we wish to run many similar sequential jobs in +parallel using job arrays. We take Python as an example but this does +not matter for the job arrays: + +```{eval-rst} +.. 
literalinclude:: files/array_test.py + :language: python +``` + +Download the script: +```{eval-rst} +:download:`files/array_test.py` +``` + +Try it out: + +```bash +$ python array_test.py +start at 15:23:48 +sleep for 10 seconds ... +stop at 15:23:58 +``` + +Good. Now we would like to run this script 16 times at (more or less) the same +time. For this we use the following + +```{eval-rst} +.. literalinclude:: files/array_howto.sh + :language: bash +``` + +Download the script: +```{eval-rst} +:download:`files/array_howto.sh` +``` + +This is a script for running a _normal_ array job on Saga. It can +easily be changed to run on Fram or Betzy, or use a different job type. + +Submit the script with `sbatch` and after a while you should see 16 +output files in your submit directory: + +```bash +$ ls -l output*txt +-rw------- 1 user user 60 Oct 14 14:44 output_1.txt +-rw------- 1 user user 60 Oct 14 14:44 output_10.txt +-rw------- 1 user user 60 Oct 14 14:44 output_11.txt +-rw------- 1 user user 60 Oct 14 14:44 output_12.txt +-rw------- 1 user user 60 Oct 14 14:44 output_13.txt +-rw------- 1 user user 60 Oct 14 14:44 output_14.txt +-rw------- 1 user user 60 Oct 14 14:44 output_15.txt +-rw------- 1 user user 60 Oct 14 14:44 output_16.txt +-rw------- 1 user user 60 Oct 14 14:44 output_2.txt +-rw------- 1 user user 60 Oct 14 14:44 output_3.txt +-rw------- 1 user user 60 Oct 14 14:44 output_4.txt +-rw------- 1 user user 60 Oct 14 14:44 output_5.txt +-rw------- 1 user user 60 Oct 14 14:44 output_6.txt +-rw------- 1 user user 60 Oct 14 14:44 output_7.txt +-rw------- 1 user user 60 Oct 14 14:44 output_8.txt +-rw------- 1 user user 60 Oct 14 14:44 output_9.txt +``` + +Observe that they all started (approximately) at the same time: + +```bash +$ grep start output*txt +output_1.txt:start at 14:43:58 +output_10.txt:start at 14:44:00 +output_11.txt:start at 14:43:59 +output_12.txt:start at 14:43:59 +output_13.txt:start at 14:44:00 +output_14.txt:start at 14:43:59 +output_15.txt:start at 14:43:59 +output_16.txt:start at 14:43:59 +output_2.txt:start at 14:44:00 +output_3.txt:start at 14:43:59 +output_4.txt:start at 14:43:59 +output_5.txt:start at 14:43:58 +output_6.txt:start at 14:43:59 +output_7.txt:start at 14:43:58 +output_8.txt:start at 14:44:00 +output_9.txt:start at 14:43:59 +``` diff --git a/_sources/jobs/guides/job_dependencies.md.txt b/_sources/jobs/guides/job_dependencies.md.txt new file mode 100644 index 000000000..1d94c7f89 --- /dev/null +++ b/_sources/jobs/guides/job_dependencies.md.txt @@ -0,0 +1,115 @@ +--- +orphan: true +--- +(job-dependencies)= + +# Job Dependencies + +In the following we demonstrate how to add dependecies between jobs using the Slurm option `--dependency`. +The full list of dependency types can be found in the [Slurm](https://slurm.schedmd.com/sbatch.html) +documentation, but we will show the most useful cases here: + +| Option | Explanation | +| :----------------------| :-------------------------------------------------------------------------------| +| `after:` | job can start after `` has *started* | +| `afterany:` | job can start after `` has *completed* (any exit code) | +| `afterok:` | job can start only if `` has *completed* with exit code 0 (success) | +| `afternotok:` | job can start only if `` has *completed* with exit code *not* 0 (failed) | + +Several ``s can be combined in a comma-separated list. 
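For example, to hold a post-processing job until two earlier jobs have both completed successfully (the job IDs and script name are placeholders):
```bash
$ sbatch --dependency=afterok:123123,123124 post.sh
```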
+ +```{note} +The `--dependency` option must be added to the `sbatch` command *before* the name of the +job script, if you put it *after* the script it will be treated as an argument to the script, not +to the `sbatch` command. If the dependency was added successfully, you should see a `(Dependency)` +in the `NODELIST(REASON)` column of the `squeue` output. +``` + +## Beware of exit status + +With some of the options it is important to keep in mind the *exit status* of +your job script, to indicate whether or not the job finished successfully. By default the +script will return the exit status of the *last command* executed in the script, which in +general does not necessarily reflect the overall success of the job. It is then highly +recommended adding the following to the script: + +```bash +set -o errexit # Exit the script on any error +set -o nounset # Treat any unset variables as an error +``` + +as well as capturing errors in critical commands along the way: + +```bash +mycommand || exit 1 +``` + +and finally *explicitly* return 0 in case the script finishes successfully: + +```bash +# Successful exit +exit 0 +``` + +Standard Slurm errors like out-of-memory or time limit will of course be captured automatically. + +## Examples + +- **Here `pre.sh` is a pre-processing step for `job-1.sh`, `job-2.sh`, etc:** +```bash +$ sbatch pre.sh +Submitted batch job 123123 +$ for i in 1 2 3 4 5; do sbatch --dependency=afterok:123123 job-${i}.sh; done +Submitted batch job 123124 +Submitted batch job 123125 +Submitted batch job 123126 +Submitted batch job 123127 +Submitted batch job 123128 +$ squeue -u $USER + JOBID PARTITION NAME ST USER TIME NODES NODELIST(REASON) + 123124 normal job-1 PD me 0:00 1 (Dependency) + 123125 normal job-2 PD me 0:00 1 (Dependency) + 123126 normal job-3 PD me 0:00 1 (Dependency) + 123127 normal job-4 PD me 0:00 1 (Dependency) + 123128 normal job-5 PD me 0:00 1 (Dependency) + 123123 normal pre R me 0:28 1 c1-1 +``` + + +- **Here `post.sh` is a post-processing step for `job-1.sh`, `job-2.sh`, etc:** +```bash +$ for i in 1 2 3 4 5; do sbatch job-${i}.sh; done +Submitted batch job 123123 +Submitted batch job 123124 +Submitted batch job 123125 +Submitted batch job 123126 +Submitted batch job 123127 +$ sbatch --dependency=afterok:123123,123124,123125,123126,123127 post.sh +Submitted batch job 123128 +``` + +- **Here `job-2.sh` is a fallback/retry in case `job-1.sh` fails:** +```bash +$ sbatch job-1.sh +Submitted batch job 123123 +$ sbatch --dependency=afternotok:123123 job-2.sh +Submitted batch job 123124 +``` + +- **If for some reason you want your jobs to run one after the other:** + +This is a bit cumbersome to do in a loop since the `sbatch` command returns the text string +"Submitted batch job" before showing the jobid, but we can extract it with a `awk '{ print $4 }'` +command (which returns the 4th entry in the string), and use it in a loop as follows (not that +the first job must be submitted individually, as it has no dependencies): + +```bash +$ lastid=`sbatch job-1.sh | awk '{ print $4 }'` +$ echo $lastid +123123 +$ for i in 2 3 4 5; do lastid=`sbatch --dependency=after:${lastid} job-${i}.sh | awk '{ print $4 }'`; echo ${lastid}; done +123124 +123125 +123126 +123127 +``` diff --git a/_sources/jobs/guides/porting_from_pbs.md.txt b/_sources/jobs/guides/porting_from_pbs.md.txt new file mode 100644 index 000000000..d81d89ad1 --- /dev/null +++ b/_sources/jobs/guides/porting_from_pbs.md.txt @@ -0,0 +1,55 @@ +--- +orphan: true +--- + +(porting-from-pbs)= + +# Porting Batch 
Scripts from PBS/TORQUE + +Converting a PBS/TORQUE script files to Slurm is simple because most of the commands +have direct equivalents in Slurm. The shell commands and variables +need to be changed but the application code such as compiling and copying of files +can remain the same. + +This page lists some ways to convert batch scripts from PBS/TORQUE to Slurm. + +## Shell Commands + +Many PBS/TORQUE commands directly translate to a Slurm command. Here are some +of the PBS/TORQUE commands with their Slurm counterparts. + +| Shell Commands | PBS/TORQUE | Slurm | +| :------------- | :------------- | :------------- | +| Job submission | qsub <*filename*> | sbatch <*filename*> | +| Job deletion | qdel <*job_id*> | scancel <*job_id*> | +| Job status (by job) | qstat <*job_id*> | squeue --job <*job_id*> | +| Full job status (by job) | qstat -f <*job_id*> | scontrol show job <*job_id*> | +| Job status (by user) | qstat -u <*username*> | squeue --user=<*username*> | + +## Environment variables + +| Environment variables | PBS/Torque | Slurm | +| :------------- | :------------- | :------------- | +| Job ID | $PBS_JOBID | $SLURM_JOB_ID | +| Submit Directory | $PBS_O_WORKDIR | $SLURM_SUBMIT_DIR | +| Node List | $PBS_NODEFILE | $SLURM_JOB_NODELIST | + +## Options and Settings + +These are options that may be placed in the batch script or passed as arguments +to *sbatch*. + +| Options | PBS/Torque | Slurm | +| :------------- | :------------- | :------------- | +| Script directive | #PBS | #SBATCH | +| Job Name | -N <*name*> | --job-name=<*name*> OR -J <*name*> | +| Node Count | -l nodes=<*count*> | --nodes=<*minnodes[-maxnodes]*> OR -N <*minnodes[-maxnodes]*> | +| CPU Count | -l ppn=<*count*> | --ntasks-per-node=<*count*> | +| CPUs Per Task | | --cpus-per-task=<*count*> | +| Memory Size | -l mem=<*MB*> | --mem=<*MB*> OR --mem-per-cpu=<*MB*> | +| Wall Clock Limit | -l walltime=<*hh:mm:ss*> | --time=<*min*> OR --time=<*days-hh:mm:ss*> | +| Standard Output File | -o <*file_name*> | --output=<*file_name*> OR -o <*file_name*> | +| Job Arrays | -t <*array_spec*> | --array=<*array_spec*> OR -a <*array_spec*> | +| Standard Error File | -e <*file_name*> | --error=<*file_name*> OR -e <*file_name*> | +| Combine stdout/stderr | -j oe (both to stdout) | (Default if you don’t specify --error) | +| Delay Job Start | -a <*time*> | --begin=<*time*> | diff --git a/_sources/jobs/guides/running_job_steps_parallel.md.txt b/_sources/jobs/guides/running_job_steps_parallel.md.txt new file mode 100644 index 000000000..916147c07 --- /dev/null +++ b/_sources/jobs/guides/running_job_steps_parallel.md.txt @@ -0,0 +1,82 @@ +--- +orphan: true +--- + +(running-job-steps-parallel)= + +# Packaging smaller parallel jobs into one large + +There are several ways to package smaller parallel jobs into one large +parallel job. *The preferred way is to use {ref}`array-jobs`.* +Here we want to present a more pedestrian alternative which can give a +lot of flexibility, but can also be a little more complicated to get right. + +Note that how to use this mechanism has changed since Slurm 19.05.x (and +might change again later). + +In this example we imagine that we wish to run a job with 5 MPI job steps +at the same time, each using 4 tasks, thus totalling to 20 tasks: + +```{eval-rst} +.. 
literalinclude:: files/parallel_steps_cpu.sh + :language: bash +``` + +Download the script: +```{eval-rst} +:download:`files/parallel_steps_cpu.sh` +``` + +Note that with the currently installed versions of Slurm (22.05.x and +newer), instead of +```bash +export SLURM_JOB_NUM_NODES=1-$SLURM_JOB_NUM_NODES +``` +one can use the slightly simpler +```bash +export SLURM_DISTRIBUTION=pack +``` +or add `-m=pack` / `--distribution=pack` to the `srun` command lines. + +This will work with any {ref}`job-types` that hands out _cpus +and memory_, so that one specifies `--mem-per-cpu`. For instance + + sbatch --partition=bigmem parallel_steps_cpu.sh + +For job types that hand out _whole nodes_, notably the _normal_ jobs +on Fram and Betzy, one has to do it slightly different. Here is an example to +run a `normal` job with 8 MPI job steps at the same time, each using +16 tasks, thus totalling 128 tasks: + +```{eval-rst} +.. literalinclude:: files/parallel_steps_node.sh + :language: bash +``` + +Download the script: +```{eval-rst} +:download:`files/parallel_steps_node.sh` +``` + +For instance (on Fram): + + sbatch parallel_steps_node.sh + +A couple of notes: + +- The `wait` command is important - the run script will only continue once + all commands started with `&` have completed. +- It is possible to use `mpirun` instead of `srun`, although `srun` is + recommended for OpenMPI. +- The `export SLURM_MEM_PER_CPU=1888` and `unset SLURM_MEM_PER_NODE` + lines prior to the `srun` lines are needed for jobs in the `normal` or + `optimist` partitions on Fram and Betzy, because it is not possible + to specify this to `sbatch` for such jobs. Alternatively, you can + add `--mem-per-cpu=1888` to the `srun` command lines (this only + works with `srun`). (1888 allows up to 32 tasks per node. If each + task needs more than 1888 MiB per cpu, the number must be increased + (and the number of tasks per node will be reduced). On *Betzy*, the + corresponding number is 1952, which will allow up to 128 tasks per + node. +- This technique does **not** work with IntelMPI, at least not when using + `mpirun`, which is currently the recommended way of running IntelMPI jobs. diff --git a/_sources/jobs/guides/running_mpi_jobs.md.txt b/_sources/jobs/guides/running_mpi_jobs.md.txt new file mode 100644 index 000000000..5f365b11a --- /dev/null +++ b/_sources/jobs/guides/running_mpi_jobs.md.txt @@ -0,0 +1,176 @@ +--- +orphan: true +--- + +(running-mpi-applications)= + +# Running MPI Applications + +On Betzy, Fram and Saga users have access to two MPI implementations: + +* OpenMPI is provided by the foss - and iomkl toolchains; and may also + be loaded directly. For available versions, type `module avail + OpenMPI/`**(note the slash)**. Normal way of loading is through the + `foss`-toolchain module, e.g. `module load foss/2018a` +* Intel MPI environment is provided by the intel-toolchain and may + also be loaded directly. For available versions, type `module avail + impi/`. Normal way of loading is through the `intel`-toolchain + module, e.g. `module load intel/2018a` + +**Also note that quite a few scientific packages is set up in such a +way that all necessary software are loaded as a part of the software +module in question. Do not load toolchains and/or mpi modules +explicitly unless absolutely sure of the need for it!!!** + +Slurm is used as the {ref}`queue-system`, and the native +way to start MPI applications with Slurm is to use the +[`srun`](https://slurm.schedmd.com/srun.html) command. 
On the other +hand, both MPI implementations provide their own mechanisms to start +application in the form of the `mpirun` command. + +One of the most important factors when running large MPI jobs is +mapping of the MPI ranks to compute nodes, and *binding* (or +*pinning*) them to CPU cores. Neglecting to do that, or doing that in +an suboptimal way can severely affect performance. In this regard +there are some differences when it comes to running applications +compiled against the two supported MPI environments. + +**Also note that the choice of MPI should be based on which MPI the +code is compiled with support for.** So if `module list` give you a +`OpenMPI/`-reading, you should focus on the OpenMPI part beneath, if +given a `impi/`-reading focus on the Intel MPI part + + +## OpenMPI + +On systems with Mellanox InfiniBand, OpenMPI is the implementation +recommended by Mellanox due to its support for the [HPCX +communication +libraries](https://docs.mellanox.com/category/hpcx). + + +### `srun` + +With OpenMPI, `srun` is the preferred way to start MPI programs due to +good integration with the Slurm scheduler environment: + +``` +srun /path/to/MySoftWare_exec +``` + +Executed as above, `srun` uses Slurm's default binding and mapping +algorithms (currently `--cpu-bind=cores`), [which can be +changed](https://slurm.schedmd.com/srun.html) using either +command-line parameters, or environment variables. Parameters specific +to OpenMPI can be set using [environment +variables](https://www.open-mpi.org/faq/?category=tuning#setting-mca-params). + +In the above scenario `srun` uses the PMI2 interface to launch the MPI +ranks on the compute nodes, and to exchange the InfiniBand address information between +the ranks. For large jobs the startup might be faster using OpenMPI's PMIx method: + +``` +srun --mpi=pmix /path/to/MySoftWare_exec +``` + +The startup time might be improved further using the OpenMPI MCA +`pmix_base_async_modex` argument (see below). With `srun` this needs to be +set using an environment variable. + + +### `mpirun` + +```{warning} +**On Saga use srun, not mpirun** + +mpirun can get the number of tasks wrong and also lead to wrong task +placement. We don't fully understand why this happens. When using srun +instead of mpirun or mpiexec, we observe correct task placement on Saga. +``` + +For those familiar with the OpenMPI tools, MPI applications can also +be started using the `mpirun` command: + +``` +mpirun /path/to/MySoftWare_exec +``` + +By default, `mpirun` binds ranks to cores, and maps them by +socket. Please refer to the +[documentation](https://www.open-mpi.org/doc/v2.1/man1/mpirun.1.php) +if you need to change those settings. Note that `-report-bindings` is +a very useful option if you want to inspect the individual MPI ranks +to see on which nodes, and on which CPU cores they run. + +When launching large jobs with sparse communication patterns +(neighbor to neighbor, local communication) the startup time will be improved +by using the following command line argument: + +``` +mpirun -mca pmix_base_async_modex 1 ... +``` +In the method above the address information will be exchanged between the ranks on a +need-to-know basis, i.e., at first data exchange between two ranks, instead of an all to all communication +step at program startup. Applications with dense communication patterns (peer to peer exchanges +with all ranks) will likely experience a slowdown. 
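As a sketch of the `srun` variant mentioned above: OpenMPI reads MCA parameters from environment variables with the `OMPI_MCA_` prefix, so the same setting could be applied like this (verify the exact parameter name for your OpenMPI version with `ompi_info`):
```
export OMPI_MCA_pmix_base_async_modex=1
srun --mpi=pmix /path/to/MySoftWare_exec
```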
+ + +## Intel MPI + +### `mpirun` + +```{warning} +**On Saga use srun, not mpirun** + +mpirun can get the number of tasks wrong and also lead to wrong task +placement. We don't fully understand why this happens. When using srun +instead of mpirun or mpiexec, we observe correct task placement on Saga. +``` + +At this moment, for performance reasons `mpirun` is the preferred way +to start applications that use Intel MPI: + +``` +mpirun /path/to/MySoftWare_exec +``` + +In the above, `MySoftWare_exec` is subject to `mpirun`'s internal +mapping and binding algorithms. Intel's `mpirun` uses it's own default +binding settings, which can be modified either by [command line +parameters](https://software.intel.com/en-us/node/589999), or by +[environment +variables](https://software.intel.com/content/www/us/en/develop/documentation/mpi-developer-reference-linux/top/environment-variable-reference/process-pinning/environment-variables-for-process-pinning.html). +Special care must be taken when running hybrid MPI-OpenMP cores. If +this is your case, please refer to the documentation regarding +[Interoperability between MPI and OpenMP](https://software.intel.com/content/www/us/en/develop/documentation/mpi-developer-reference-windows/top/environment-variable-reference/main-thread-pinning/interoperability-with-openmp-api.html). + +[comment]: # Original link (https://software.intel.com/en-us/mpi-developer-reference-windows-interoperability-with-openmp-api) + +### `srun` + +With `srun`, Intel MPI applications can be started as follows: + +``` +srun /path/to/MySoftWare_exec +``` + +We have observed that in the current setup some applications compiled +against Intel MPI and executed with `srun` achieve inferior +performance compared to the same code executed with `mpirun`. Until +this is resolved, we suggest using `mpirun` to start applications. + + +## Final remarks + +Note that when executing `mpirun` from within a Slurm allocation there +is no need to provide neither the number of MPI ranks (`-np`), nor the +host file (`-hostfile`): those are obtained automatically by +`mpirun`. This is also be the case with `srun`. + +Also note that in the current versions of Slurm (22.05.x and 23.02.x), +`srun` will **not** inherit `--cpus-per-task=n` from `sbatch`, so if +you specify `--cpus-per-task=n` when submitting a job, you must call +`srun` like this: `srun --cpus-per-task=n ...` or use `export +SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK` before the `srun` command in +the job script. This is new behaviour as of Slurm 22.05.x, and +hopefully the previous behaviour will be restored in a later version. diff --git a/_sources/jobs/interactive_jobs.md.txt b/_sources/jobs/interactive_jobs.md.txt new file mode 100644 index 000000000..f48f8e093 --- /dev/null +++ b/_sources/jobs/interactive_jobs.md.txt @@ -0,0 +1,144 @@ +(interactive-jobs)= + +# Interactive jobs + +Sometimes you might want to test or debug a calculation interactively, +but **running interactively on the login node is discouraged and not an +option**. + + +```{contents} Table of Contents +``` + + +## Requesting an interactive job + +Instead of running on a login node, you can ask the queue system to +allocate compute resources for you, and once assigned, you can run +commands interactively for as long as requested. The examples below +are for _devel_ jobs, but the procedure also holds for the [other job +types ](choosing_job_types.md) except _optimist_ jobs. 
+ +On **Saga**: +``` +$ salloc --ntasks=1 --mem-per-cpu=4G --time=00:30:00 --qos=devel --account=YourAccount +``` + +On **Fram** or **Betzy**: +``` +$ salloc --nodes=1 --time=00:30:00 --qos=devel --account=YourAccount +``` + +This will allocate resources, and start a shell on a compute node. +When you are done, simply exit the shell (`exit`, `logout` or `^D`) to +end the job. + +The arguments to `salloc` (or `srun`) could be any arguments you +would have given to `sbatch` when submitting a non-interactive +job. However, `--qos=devel` is probably a good idea to avoid waiting +too long in the queue. + +**Note that interactive jobs stop when you log out from the login +node**, so unless you have very long days in office (or elsewhere, for +that matter), specifying more than 6-8 hours runtime is not very +useful. An alternative is to start the job in a `tmux` session (see +below). + + +## Graphical user interface in interactive jobs + +It is possible to run X commands, i.e., programs with a graphical user +interface (GUI), in interactive jobs. This allows you to get graphical +output back from your job running on a login node. (Note that +currently, this has not been activated on Betzy.) + +First, you must make sure that you have turned on *X forwarding* when logging +in to the cluster. With `ssh` from a Linux or MacOS machine, you do this with +the `-Y` flag, e.g.: +``` +$ ssh -Y saga.sigma2.no +``` +or: +``` +$ ssh -Y fram.sigma2.no +``` + +Check that the X forwarding works by running a graphical command like `xeyes` +and verify that it sets up a window. (Note that due to network latency, it +can take a long time to set up a window.) + +To be able to run X commands in interactive jobs, add the argument `--x11` +(note the lowercase `x`) to `salloc`, like this: + +On **Saga**: +``` +$ salloc --ntasks=1 --mem-per-cpu=4G --time=00:30:00 --qos=devel --account=YourAccount --x11 +``` + +On **Fram**: +``` +$ salloc --nodes=1 --time=00:30:00 --qos=devel --account=YourAccount --x11 +``` + + +## Running the shell or a command on the login node + +For some applications (see for instance {ref}`totalview_debugging`), +it is preferrable to have the shell or a command running on the login +node instead of on the compute node(s). + +This can be achieved by just adding `bash` or the command to the end of +the `salloc` command line, i.e., +``` +$ salloc bash +``` +or +``` +$ salloc +``` + +Note that the shell **will be running on the login node**. That means +that you *must* start all calculations with `srun` or `mpirun` or +equivalent, to make sure they run on the allocated compute node(s). + + +## Keeping interactive jobs alive + +Interactive jobs stop when you disconnect from the login node either by +choice or by internet connection problems. To keep a job alive you can +use a terminal multiplexer like `tmux`. + +`tmux` allows you to run processes as usual in your standard bash shell + +You start `tmux` on the login node before you get a interactive Slurm +session with `srun` and then do all the work in it. In case of a +disconnect you simply reconnect to the login node and attach to the `tmux` +session again by typing: +``` +$ tmux attach +``` +Or in case you have multiple session running: +``` +$ tmux list-session +$ tmux attach -t SESSION_NUMBER +``` + +As long as the `tmux` session is not closed or terminated (e.g. by a +server restart) your session should continue. One problem with our +systems is that the `tmux` session is bound to the particular login server +you get connected to. 
So if you start a `tmux` session on login-1 on SAGA +and next time you get randomly connected to login-2 you first have to +connect to login-1 again by: +``` +$ ssh login-1 +``` + +To log out a `tmux` session without closing it you have to press Ctrl-B +(that the Ctrl key and simultaneously "b", which is the standard `tmux` +prefix) and then "d" (without the quotation marks). To close a session +just close the bash session with either Ctrl-D or type exit. You can get +a list of all `tmux` commands by Ctrl-B and the ? (question mark). See +also [this +page](https://www.hamvocke.com/blog/a-quick-and-easy-guide-to-tmux/) for +a short tutorial of `tmux`. Otherwise working inside of a `tmux` session is +almost the same as a normal bash session. diff --git a/_sources/jobs/job_scripts.md.txt b/_sources/jobs/job_scripts.md.txt new file mode 100644 index 000000000..334cd4754 --- /dev/null +++ b/_sources/jobs/job_scripts.md.txt @@ -0,0 +1,150 @@ +(job-scripts)= + +# Job Scripts + +This page documents the basics of how to write job scripts for the HPC clusters. +Cluster-specific details are kept in separate sub pages for each cluster: + +- [Slurm job script generator](https://open.pages.sigma2.no/job-script-generator/) +- [Fram job scripts](job-scripts-on-fram) +- [Saga job scripts](job-scripts-on-saga) +- {ref}`Betzy job scripts ` + +```{note} +Email notification from completed Slurm scripts is currently not supported **on all +machines**, sorry for the inconvenience. The reason is technical due to the +way the infrastructure is set up and it is non-trivial for us to support this in +a good, robust and secure way. +``` + + +## Job Script Basics + +To run a _job_ on the cluster involves creating a shell script called +a _job script_. The job script is a plain-text file containing any +number of commands, including your main computational task, i.e., it +may copy or rename files, cd into the proper directory, etc., all +before doing the "real" work. The lines in the script file are the +commands to be executed, in the given order. Lines starting with a +`#` are ignored as comments, except lines that start with `#SBATCH`, +which are not executed, but contain special instructions to the queue +system. + +If you are not familiar with shell scripts, they are simply a set of +commands that you could have typed at the command line. You can find +more information about shell scripts here: [Introduction to Bash shell +scripts](http://www.linuxconfig.org/Bash_scripting_Tutorial). + +A job script consists of a couple of parts, in this order: + +- The first line, which is typically `#!/bin/bash` + (the Slurm script does not have to be written in Bash, see below) +- Parameters to the queue system +- Commands to set up the execution environment +- The actual commands you want to be run + +Parameters to the queue system may be specified on the `sbatch` +command line and/or in `#SBATCH` lines in the job script. There can +be as many `#SBATCH` lines as you want, and you can combine several +parameters on the same line. If a parameter is specified both on the +command line and in the job script, the parameter specified on the +command line takes precedence. The `#SBATCH` lines must precede any +commands in the script. + +Which parameters are allowed or required depends the job type and +cluster, but two parameters must be present in (almost) any job: + +- `--account`, which specifies the *project* the job will run in. + Required by all jobs. +- `--time`, which specifies how long a job should be allowed to + run. 
If it has not finished within that time, it will be cancelled. + +The other parameters will be described in the sub pages for each cluster. + +It is recommended to start the commands to set up the environment with + +```bash +set -o errexit # Exit the script on any error +set -o nounset # Treat any unset variables as an error + +module --quiet purge # Reset the modules to the system default +``` + +and will most likely include one or more + +```bash +module load SomeProgram/SomeVersion +``` + +to set up environment variables like `$PATH` to get access to the +specified programs. It is recommended to specify the explicit version +in the `module load` command. We also recommend adding a + +```bash +module list # For easier debugging +``` + +after the `module load` commands. See also {ref}`module-scheme`. + +All in all, a generic job script might look like this: + +```{eval-rst} +.. literalinclude:: files/generic_job.sh + :language: bash +``` + +Download the script: +```{eval-rst} +:download:`files/generic_job.sh` +``` + +## Wall Time Limit +The wall time limit (`--time`) is required for all jobs. + +The most used formats for the time specification is `DD-HH:MM:SS` +and `HH:MM:SS`, where *DD* is days, *HH* hours, *MM* minutes and *SS* +seconds. For instance: + +- `3-12:00:00`: 3.5 days +- `7:30:00`: 7.5 hours + +We recommend you to be as precise as you can when specifying the wall +time limit as it will inflict on how fast your jobs will start to +run: It is easier for a short job to get started between two larger, +higher priority jobs (so-called *backfilling*). On the other hand, if +the job has not finished before the wall time limit, it will be +cancelled, so too long is better than too short due to lost work! + + +## The Slurm script does not have to be written in Bash + +The job script can be written in +any language that uses `#` as the comment sign. Bash is most common, but some +applications like NorESM use Python. Perl, Julia, and R are other options. Here +is a Python example: +```python +#!/usr/bin/env python + +#SBATCH --job-name=slurm +#SBATCH --account=nn9999k +#SBATCH --nodes=128 +#SBATCH --ntasks-per-node=1 +#SBATCH --time=0:0:5 + +import os + +os.system("srun hostname") +``` + +Using Python, Perl, Julia, or R can open up for more programming possibilities +within the run script than what would be possible using Bash. + + +## Further Topics + +- [Environment variables available in job scripts](job_scripts/environment_variables.md) +- [Job work directory](job_scripts/work_directory.md) +- [Array jobs](job_scripts/array_jobs.md) +- [Running Job Steps in Parallel](guides/running_job_steps_parallel.md) +- [Porting Job Scripts from PBS/Torque](guides/porting_from_pbs.md) +- [Running MPI Jobs](guides/running_mpi_jobs.md) diff --git a/_sources/jobs/job_scripts/array_jobs.md.txt b/_sources/jobs/job_scripts/array_jobs.md.txt new file mode 100644 index 000000000..8e1e4f7ef --- /dev/null +++ b/_sources/jobs/job_scripts/array_jobs.md.txt @@ -0,0 +1,166 @@ +--- +orphan: true +--- + +(array-jobs)= + +# Array Jobs + +To run many instances of the same job, use the `--array` switch to `sbatch`. +This is useful if you have a lot of data-sets which you want to process in the +same way: + +```console +$ sbatch --array=from-to [other sbatch switches] YourScript +``` + +You can also put the `--array` switch in an `#SBATCH` line inside the script. +_from_ and _to_ are the first and last task number. 
Each instance of +`YourScript` can use the environment variable `$SLURM_ARRAY_TASK_ID` for +selecting which data set to use, etc. (The queue system calls the instances +"array tasks".) For instance: + +```console +$ sbatch --array=1-100 MyScript +``` + +will run 100 instances of `MyScript`, setting the environment variable +`$SLURM_ARRAY_TASK_ID` to 1, 2, ..., 100 in turn. + +## Array job properties + +### Specifying task IDs + +It is possible to specify the task ids in other ways than `from-to`: it can be +a single number, a range (`from-to`), a range with a step size +(`from-to:step`), or a comma separated list of these. Finally, adding `%max` at +the end of the specification puts a limit on how many tasks will be allowed to +run at the same time. A couple of examples: + +| Specification (`--array=`) | Resulting `SLURM_ARRAY_TASK_ID`s | +| -------------------------- | ------------------------------------------------------- | +| `1,4,42` | 1, 4, 42 | +| `1-5` | 1, 2, 3, 4, 5 | +| `0-10:2` | 0, 2, 4, 6, 8, 10 | +| `32,56,100-200` | 32, 56, 100, 101, 102, ..., 200 | +| `1-200%10` | 1, 2, ..., 200, but maximum 10 running at the same time | + +```{note} +Spaces, decimal numbers or negative numbers are not allowed in the `--array` +specification. +``` + +### Array job resources + +The instances of an array job are independent, they have their own `$SCRATCH` +({ref}`read more about storage locations here`) and are treated +like separate jobs. Thus any resources request in the Slurm script is available +for each task. + +### Canceling array jobs + +To cancel all tasks of an array job, cancel the job ID that is returned by +`sbatch`. One can also cancel individual tasks with `scancel :`. + +### Dependencies between array jobs + +To handle dependencies between two or more array jobs one can use the +`--depend=aftercorr:` (regular dependencies can also be used, +but we wanted to highlight this particular way since it can be beneficial with +array jobs), this will start the dependent array tasks as soon as the previous +corresponding array task has completed. E.g. if we start an array job with +`--array=1-5` and then start a second array job with `--array=1-5 +--depend=aftercorr:`, once task `X` of the first job is complete +the second job will start its task `X`, independently of the other task in the +first or second job. + +## Example + +A small, but complete example (for a `normal` job on Saga): + +```{eval-rst} +.. literalinclude:: files/minimal_array_job.sh + :language: bash +``` + +```{eval-rst} +:download:`minimal_array_job.sh ` +``` + +Submit the script with `sbatch minimal_array_job.sh`. This job will process the +datasets `dataset.1`, `dataset.2`, ..., `dataset.200` and put the results in +`result.1`, `result.2`, ..., `result.200`. **Note that your dataset files has to be named `dataset.1`, `dataset.2`, etc. for this example to work.** Make sure that the names of your dataset files and the names in your script are the same. Each of the tasks will consist of +two processes (`--ntasks=2`) and get a total of `8GB` of memory (2 x +`--mem-per-cpu=4G`). + +If your files has inconsistent naming (for example "dataset_one", dataset_2", "my_dataset" etc.), you either have to rename your files or include code in your script to handle your files. Here is one way to handle inconsistent names: + +```{warning} +You need to have the same number of files in your dataset directory as the number of tasks you specify in the `--array` switch i.e. 
count the number of files in your dataset directory and use that number in the `--array` switch. For example, to check how many csv files are in the directory named data, use `ls data/*.csv | wc -l` in the terminal. + +``` + +```{code-block} bash +-------------- +emphasize-lines: 5, 6, 7 +-------------- +#!/bin/bash +#SBATCH --account=YourProject +#SBATCH --time=1:0:0 +#SBATCH --mem-per-cpu=4G --ntasks=2 +#SBATCH --array=0-199 # we start at 0 instead of 1 for this + # example, as the $SLURM_ARRAY_TASK_ID + # variable starts at 0 + +set -o errexit # exit on errors +set -o nounset # treat unset variables as errors +module --quiet purge # clear any inherited modules + +DATASETS=(data/*) # get all files in the directory named "data". Replace + # "data" with the path of your dataset directory. + +FILE=${DATASETS[$SLURM_ARRAY_TASK_ID]} +FILENAME=$(basename ${FILE%.*}) + +YourProgram $FILE > ${FILENAME}.out +``` + +`DATASETS=(data/*)` will get all files in the directory named "data" and store them in an array. The array is indexed from 0, so the first file will be stored in `DATASETS[0]`, the second in `DATASETS[1]` and so on. The `SLURM_ARRAY_TASK_ID` variable is set by the Slurm system and is the task ID of the current task, with counting starting with 0. + +```{tip} +If your datasets for example are csv files and the directory contains other file types, use DATASETS=(data/*.csv) instead. +``` + +Alternatively, you can save the names of you files in a text file and use the order of the filenames in the text file as an index. This is useful if you need the order of your files later or if you need to map the Slurm job output file to the correct dataset file. + +Run for example these commands in the command line to create a text file with the names of your files: + +```console +$ DATASETS=(data/*) +$ printf "%s\n" "${DATASETS[@]}" > map_files.txt +``` + +And use the following example as you run script: + +```{code-block} bash +#!/bin/bash +#SBATCH --account=YourProject +#SBATCH --time=1:0:0 +#SBATCH --mem-per-cpu=4G --ntasks=2 +#SBATCH --array=0-199 + +set -o errexit # exit on errors +set -o nounset # treat unset variables as errors +module --quiet purge # clear any inherited modules + +IDX=($SLURM_ARRAY_TASK_ID) +FILE=$(sed "${IDX}q;d" map_files.txt) +FILENAME=$(basename ${FILE%.*}) + +YourProgram $FILE > ${FILENAME}.out +``` + +```{tip} +You can find a more extensive example {ref}`here `. +``` diff --git a/_sources/jobs/job_scripts/betzy/betzy_sample_mpi_job.md.txt b/_sources/jobs/job_scripts/betzy/betzy_sample_mpi_job.md.txt new file mode 100644 index 000000000..678d856de --- /dev/null +++ b/_sources/jobs/job_scripts/betzy/betzy_sample_mpi_job.md.txt @@ -0,0 +1,23 @@ +--- +orphan: true +--- + +# Sample MPI Batch Script + +Here is a sample batch script that demonstrates usage of various +variables and processes for a **normal** job on Betzy. To run in +other job types, please read {ref}`job-scripts-on-betzy`. + +```{eval-rst} +.. literalinclude:: files/betzy_mpi_job.sh + :language: bash +``` + +Download the script: +```{eval-rst} +:download:`files/betzy_mpi_job.sh` +``` + +The actual startup of MPI application differs for different MPI +libraries. Since this part is crucial for application performance, +please read about {ref}`running-mpi-applications`. 
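+
+As a rough illustration only (the page linked above is the authoritative
+reference), the startup line inside the job script usually boils down to one
+of the following, where `./my_mpi_program` is a placeholder for your own
+executable:
+
+```
+# mpirun picks up the number of ranks and the host list from Slurm
+mpirun ./my_mpi_program
+
+# or start the ranks through Slurm directly
+srun ./my_mpi_program
+```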
diff --git a/_sources/jobs/job_scripts/betzy_job_scripts.md.txt b/_sources/jobs/job_scripts/betzy_job_scripts.md.txt new file mode 100644 index 000000000..1af95c1db --- /dev/null +++ b/_sources/jobs/job_scripts/betzy_job_scripts.md.txt @@ -0,0 +1,179 @@ +--- +orphan: true +--- + +(job-scripts-on-betzy)= + +# Job Scripts on Betzy + +This page documents how to specify the queue system parameters for the +different job types on Betzy. See {ref}`job-types-betzy` +for information about the different job types on Betzy. + +(job_scripts_betzy_normal)= + +## Normal + +The basic type of job on Betzy is the *normal* job. Most of the other +job types are "variants" of a *normal* job. + +*Normal* jobs must specify account (`--account`), walltime limit +(`--time`) and number of nodes (`--nodes`). The jobs can specify how +many tasks should run per node and how many CPUs should be used by +each task. + +A typical job specification for a normal job would be + + #SBATCH --account=MyProject + #SBATCH --job-name=MyJob + #SBATCH --time=1-0:0:0 + #SBATCH --nodes=10 --ntasks-per-node=128 + +This will start 128 tasks (processes) on each node, one for each cpu on the node. + +All normal jobs gets exclusive access to whole nodes (all CPUs and +memory). If a job tries to use more (resident) memory than is +configured on the nodes, it will be killed. Currently, this limit is +244 GiB, *but it can change*. If a job would require more memory per +task than the given 244 GiB split by 128 tasks, the trick is to limit the +number of tasks per node the following way: + + #SBATCH --account=MyProject + #SBATCH --job-name=MyJob + #SBATCH --time=1-0:0:0 + #SBATCH --nodes=10 --ntasks-per-node=16 + +This example above will use only 16 tasks per node, giving each task 15 +GiB. Note that is the _total_ memory usage on each node that counts, +so one of the tasks can use more than 15 GiB, as long as the total is +less than 244 GiB. + +To run multithreaded applications, use `--cpus-per-task` to allocate +the right number of cpus to each task. For instance: + + #SBATCH --account=MyProject + #SBATCH --job-name=MyJob + #SBATCH --time=1-0:0:0 + #SBATCH --nodes=4 --ntasks-per-node=4 --cpus-per-task=32 + +Note that setting `--cpus-per-task` does *not* bind the tasks to the +given number of cpus for _normal_ jobs; it merely sets +`$OMP_NUM_THREADS` so that OpenMP jobs by default will use the right +number of threads. (It is possible to override this number by setting +`$OMP_NUM_THREADS` in the job script.) + +The [Betzy Sample MPI Job](betzy/betzy_sample_mpi_job.md) page has an example +of a _normal_ MPI job. + +(job_scripts_betzy_preproc)= + +## Preproc + +_Preproc_ jobs must specify `--partition=preproc`. In addition, they +must specify wall time limit, the number of tasks and the amount of +memory memory per cpu. A _preproc_ job is assigned the requested cpus +and memory exclusively, but shares nodes with other jobs. (Currently, +there is only one node in the preproc partition.) If a +_preproc_ job tries to use more resident memory than requested, it gets +killed. The maximal wall time limit for preproc jobs is 1 day. 
+ +Here is an example that asks for 3 tasks per, 4 cpus per +task, and 2 GiB RAM per cpu: + + #SBATCH --account=MyProject --job-name=MyJob + #SBATCH --partition=preproc + #SBATCH --time=1-0:0:0 + #SBATCH --ntasks=3 --cpus-per-task=4 + #SBATCH --mem-per-cpu=2G + +Note that even though the memory specification is called `--mem-per-cpu`, the +memory limit the job gets on the node is for the total usage by all processes +on the node, so in the above example, it would get a limit of 3 * 4 * 2 GiB = +12 GiB. The queue system doesn't care how the memory usage is divided between +the processes or threads, as long as the total usage on the node is below the +limit. + +Also note that contrary to *normal* jobs, *preproc* jobs _will_ be +bound to the cpu cores they are allocated, so the above sample job +will have access to 12 cores. However, the three tasks are free to use +all cores the job has access to (12 in this example). + +(job_scripts_betzy_accel)= + +## Accel +_Accel_ jobs are those that require GPUs to perform calculations. To +ensure that your job is run on only machinces with GPUs the +`--partition=accel` option must be supplied. Also, to get access to +one or more GPUs one need to request a number of GPUs with the +`--gpus=N` specification (see below for more ways to specify the GPUs +for your job). In addition, the jobs must specify wall time limit, +the number of tasks and the amount of memory memory per cpu or GPU. +See the *preproc* job section above for details about specifying tasks +and memory. + +For a simple job, only requiring 1 GPU, the following example configuration +could be used: + +```bash +#SBATCH --account=MyProject +#SBATCH --job-name=SimpleGPUJob +#SBATCH --time=0-00:05:00 +#SBATCH --mem-per-cpu=1G +#SBATCH --partition=accel +#SBATCH --gpus=1 +``` + +The following example starts 2 tasks each with a single GPU. This is useful +for MPI enabled jobs where each rank should be assigned a GPU. + +```bash +#SBATCH --account=MyProject +#SBATCH --job-name=MPIGPUJob +#SBATCH --time=0-00:05:00 +#SBATCH --mem-per-cpu=1G +#SBATCH --ntasks=2 --gpus=2 +#SBATCH --partition=accel +``` + +There are other GPU related specifications that can be used, and that +parallel some of the cpu related specifications. The most useful are +probably: + +- `--gpus-per-node` How many GPUs the job should have on each node. +- `--gpus-per-task` How many GPUs the job should have per task. + Requires the use of `--ntasks` or `--gpus`. +- `--gpus-per-socket` How many GPUs the job should have on each + socket. Requires the use of `--sockets-per-node`. +- `--mem-per-gpu` How much RAM the job should have for each GPU. + Can be used *instead of* `--mem-per-cpu`, (but cannot be used + *together with* it). + +```{attention} +Due to a bug in Slurm `--gpus-per-task` is not working correctly on **Betzy**, jobs using +this option will be billed more core hours than what the job is actually using. + +Users should revert to using `--gpus` or `--gpus-per-node` on +**Betzy** for now. +``` + +See [sbatch](https://slurm.schedmd.com/sbatch.html) or `man sbatch` +for the details, and other GPU related specifications. + +(The old way of specifying GPUs: `--gres=gpu:N` is still supported, +but is less flexible than the above specification.) + + +(job_scripts_betzy_devel)= + +## Devel + +_devel_ jobs must specify `--qos=devel`. A _devel_ job is like a _normal_ +job, except that it has restrictions on job length and size. 
+ +For instance: + + #SBATCH --account=MyProject + #SBATCH --job-name=MyJob + #SBATCH --qos=devel + #SBATCH --time=00:30:00 + #SBATCH --nodes=2 --ntasks-per-node=128 diff --git a/_sources/jobs/job_scripts/environment_variables.md.txt b/_sources/jobs/job_scripts/environment_variables.md.txt new file mode 100644 index 000000000..414f7d0ec --- /dev/null +++ b/_sources/jobs/job_scripts/environment_variables.md.txt @@ -0,0 +1,26 @@ +--- +orphan: true +--- + +# Environment Variables in Job Scripts + +Here is a list of some useful environment variables that can be used +in job scripts. This is not a complete list. See the [sbatch +documentation](https://slurm.schedmd.com/sbatch.html) for more +variables. Another way to get a list of defined environment variables +is to run `env` in a job script and look at the output. + +- `SLURM_JOB_ID`: The jobid returned by `sbatch` +- `SLURM_ARRAY_TASK_ID`: The id of the current array task in an [array + job](array_jobs.md). +- `SLURM_JOB_NODELIST`: The list of nodes allocated to the job. +- `SLURM_NTASKS`: The number of tasks in the job. +- `SLURM_SUBMIT_DIR`: The directory where you ran `sbatch`. Usually + the place where the `slurm-.out` is located. +- `SCRATCH`: A per-job scratch directory on the shared file system. + See [work directory](work_directory.md) for details. +- `USERWORK`: A per-user scratch directory on the shared file system. + See [work directory](work_directory.md) for details. +- `OMP_NUM_THREADS`: The number of threads to use for OpenMP + programs. This is controlled by the `--cpus-per-task` parameter to + `sbatch`, and defaults to 1. diff --git a/_sources/jobs/job_scripts/fram/fram_job_placement.md.txt b/_sources/jobs/job_scripts/fram/fram_job_placement.md.txt new file mode 100644 index 000000000..becf9882f --- /dev/null +++ b/_sources/jobs/job_scripts/fram/fram_job_placement.md.txt @@ -0,0 +1,93 @@ +--- +orphan: true +--- + +(job-placement-fram)= + +# Job Placement on Fram + +The compute nodes on Fram are divided into four groups, called +*islands*. Each island has about the same number of nodes. The +Infiniband network throughput ("speed") within an island is higher than +the throughput between islands. Some jobs need high network throughput +between its nodes, and will usually run faster if they run within a +single island. + +## Default Setup + +Therefore, the queue system is configured to run each job within one +island, if that does not delay the job too much. It works like this: +When a job is submitted, the queue system lets the job wait until there +are enough free resources so that it can run within one island. If this +has not happened when the job has waited 7 days[^1], the job will be +started on more than one island. + +## Overriding the Setup + +The downside of requiring that all nodes belonging to a job should be in the +same island, is that the job might have to wait longer in the queue, +especially if the job needs many nodes. Some jobs do not need high network +throughput between its nodes. For such jobs, you can override the setup, +either for individual jobs or for all your jobs. + +### Individual Jobs + +For individual jobs, you can use the switch `--switches=N[@time]` *on the +command line* when submitting the job, where *N* is the maximal number of +islands to use (1, 2, 3 or 4), and *time* (optional) is the maximum time to +wait. See `man sbatch` for details. 
Two examples: + + --switches=2 # Allow two islands + --switches=1@4-0:0:0 # Change max wait time to 4 days + +The maximal possible wait time to specify is 28 days[^1]. *A longer time +will silently be truncated to 28 days!* + +Note that putting this option in an `#SBATCH` line in the job script will +**not** work (it will silently be overridden by the environment variables we +set to get the default behaviour)! + +On the other hand, you might want to guarantee that your job never, +ever, starts on more than one island. The easiest way to do that is to +specify `--constraint=[island1|island2|island3|island4]` instead (this option +can be used either on the command line or in the job script). + +### Changing the Defaults + +For changing the default for your jobs, you can change the +followin environment variables: + +- `SBATCH_REQ_SWITCH`: Max number of islands for `sbatch` jobs. +- `SALLOC_REQ_SWITCH`: Max number of islands for `salloc` jobs. +- `SRUN_REQ_SWITCH`: Max number of islands for `srun` jobs. +- `SBATCH_WAIT4SWITCH`: Max wait time for `sbatch` jobs. +- `SALLOC_WAIT4SWITCH`: Max wait time for `salloc` jobs. +- `SRUN_WAIT4SWITCH`: Max wait time for `srun` jobs. + +`salloc` and `srun` jobs are interactive jobs; see {ref}`interactive-jobs`. +As above, the maximal possible wait +time to specify is 28 days[^1], and any time longer than that will *silently be +truncated*. The change takes effect for jobs submitted after you change the +variables. For instance, to change the default to allow two islands, and wait +up to two weeks: + +```bash +export SBATCH_REQ_SWITCH=2 +export SALLOC_REQ_SWITCH=2 +export SRUN_REQ_SWITCH=2 +export SBATCH_WAIT4SWITCH=14-00:00:00 +export SALLOC_WAIT4SWITCH=14-00:00:00 +export SRUN_WAIT4SWITCH=14-00:00:00 +``` + +Note that we do *not* recommend that you unset these variables. If you want +your jobs to start on any nodes, whichever island they are on, simply set +`*_REQ_SWITCH` variables to 4. Specifically, if you unset the +`*_WAIT4SWITCH` variables, they will default to 28 days[^1]. Also, in the +future we might change the underlying mechanism, in which case unsetting these +variables will have no effect (but setting them will). + + +**Footnotes** + +[^1]: The limits might change in the future. diff --git a/_sources/jobs/job_scripts/fram/fram_sample_mpi_job.md.txt b/_sources/jobs/job_scripts/fram/fram_sample_mpi_job.md.txt new file mode 100644 index 000000000..c316b6f92 --- /dev/null +++ b/_sources/jobs/job_scripts/fram/fram_sample_mpi_job.md.txt @@ -0,0 +1,23 @@ +--- +orphan: true +--- + +# Sample MPI Batch Script + +Here is a sample batch script that demonstrates usage of various +variables and processes for a **normal** job on Fram. To run in +other job types, please read {ref}`job-scripts-on-fram`. + +```{eval-rst} +.. literalinclude:: files/fram_mpi_job.sh + :language: bash +``` + +Download the script: +```{eval-rst} +:download:`files/fram_mpi_job.sh` +``` + +The actual startup of MPI application differs for different MPI +libraries. Since this part is crucial for application performance, +please read about {ref}`running-mpi-applications`. 
diff --git a/_sources/jobs/job_scripts/fram_job_scripts.md.txt b/_sources/jobs/job_scripts/fram_job_scripts.md.txt new file mode 100644 index 000000000..3c72c63c2 --- /dev/null +++ b/_sources/jobs/job_scripts/fram_job_scripts.md.txt @@ -0,0 +1,175 @@ +--- +orphan: true +--- + +(job-scripts-on-fram)= + +# Job Scripts on Fram + +This page documents how to specify the queue system parameters for the +different job types on Fram. See {ref}`job-types-fram` +for information about the different job types on Fram. + + +(job_scripts_fram_normal)= + +## Normal + +The basic type of job on Fram is the *normal* job. Most of the other +job types are "variants" of a *normal* job. + +*Normal* jobs must specify account (`--account`), walltime limit +(`--time`) and number of nodes (`--nodes`). The jobs can specify how +many tasks should run per node and how many CPUs should be used by +each task. + +A typical job specification for a normal job would be + + #SBATCH --account=MyProject + #SBATCH --job-name=MyJob + #SBATCH --time=1-0:0:0 + #SBATCH --nodes=10 --ntasks-per-node=32 + +This will start 32 tasks (processes) on each node, one for each cpu on the node. + +All normal jobs gets exclusive access to whole nodes (all CPUs and +memory). If a job tries to use more (resident) memory than is +configured on the nodes, it will be killed. Currently, this limit is +59 GiB, *but it can change*. If a job would require more memory per +task than the given 59 GiB split by 32 tasks, the trick is to limit the +number of tasks per node the following way: + + #SBATCH --account=MyProject + #SBATCH --job-name=MyJob + #SBATCH --time=1-0:0:0 + #SBATCH --nodes=10 --ntasks-per-node=4 + +This example above will use only 4 tasks per node, giving each task 15 +GiB. Note that is the _total_ memory usage on each node that counts, +so one of the tasks can use more than 15 GiB, as long as the total is +less than 59 GiB. + +If your job needs more than 59 GiB per task, the only option on Fram +is to use a *bigmem* job (see below). + +To run multithreaded applications, use `--cpus-per-task` to allocate +the right number of cpus to each task. For instance: + + #SBATCH --account=MyProject + #SBATCH --job-name=MyJob + #SBATCH --time=1-0:0:0 + #SBATCH --nodes=4 --ntasks-per-node=2 --cpus-per-task=16 + +Note that setting `--cpus-per-task` does *not* bind the tasks to the +given number of cpus for _normal_ jobs; it merely sets +`$OMP_NUM_THREADS` so that OpenMP jobs by default will use the right +number of threads. (It is possible to override this number by setting +`$OMP_NUM_THREADS` in the job script.) + +The [Fram Sample MPI Job](fram/fram_sample_mpi_job.md) page has an example +of a _normal_ MPI job. + +See [Fram Job Placement](fram/fram_job_placement.md) for optional +parameters for controlling which nodes a _normal_ job is run on. + + +(job_scripts_fram_bigmem)= + +## Bigmem + +_Bigmem_ jobs must specify `--partition=bigmem`. In addition, they +must specify wall time limit, the number of tasks and the amount of +memory memory per cpu. A _bigmem_ job is assigned the requested cpus +and memory exclusively, but shares nodes with other jobs. If a +_bigmem_ job tries to use more resident memory than requested, it gets +killed. The maximal wall time limit for bigmem jobs is 14 days. 
+ +Here is an example that asks for 2 nodes, 3 tasks per node, 4 cpus per +task, and 32 GiB RAM per cpu: + + #SBATCH --account=MyProject --job-name=MyJob + #SBATCH --partition=bigmem + #SBATCH --time=1-0:0:0 + #SBATCH --nodes=2 --ntasks-per-node=3 --cpus-per-task=4 + #SBATCH --mem-per-cpu=32G + +Note that even though the memory specification is called `--mem-per-cpu`, the +memory limit the job gets on the node is for the total usage by all processes +on the node, so in the above example, it would get a limit of 3 * 4 * 32 GiB = +384 GiB. The queue system doesn't care how the memory usage is divided between +the processes or threads, as long as the total usage on the node is below the +limit. + +Also note that contrary to *normal* jobs, *bigmem* jobs _will_ be bound to the +cpu cores they are allocated, so the above sample job will have access to 12 +cores on each node. However, the three tasks are free to use all cores the job +has access to on the node (12 in this example). + +Here is a simpler example, which only asks for 16 tasks (of 1 cpu +each) and 32 GiB RAM per task; it does not care how the tasks are +allocated on the nodes: + + #SBATCH --account=MyProject --job-name=MyJob + #SBATCH --partition=bigmem + #SBATCH --time=1-0:0:0 + #SBATCH --ntasks=16 + #SBATCH --mem-per-cpu=32G + + +(job_scripts_fram_devel)= + +## Devel + +_devel_ jobs must specify `--qos=devel`. A _devel_ job is like a _normal_ +job, except that it has restrictions on job length and size. + +For instance: + + #SBATCH --account=MyProject + #SBATCH --job-name=MyJob + #SBATCH --qos=devel + #SBATCH --time=00:30:00 + #SBATCH --nodes=2 --ntasks-per-node=32 + + +(job_scripts_fram_short)= + +## Short + +_short_ jobs must specify `--qos=short`. A _short_ job is like a _normal_ +job, except that it has restrictions on job length and size. It +differs from _devel_ jobs in that it allows somewhat longer and larger +jobs, but typically have longer wait time. + +For instance: + + #SBATCH --account=MyProject + #SBATCH --job-name=MyJob + #SBATCH --qos=short + #SBATCH --time=2:00:00 + #SBATCH --nodes=8 --ntasks-per-node=32 + + +(job_scripts_fram_optimist)= + +## Optimist + +_Optimist_ jobs are specified just like _normal_ jobs, except that +they also must must specify `--qos=optimist`. They run on the same +nodes as *normal* jobs. + +An _optimist_ job can be scheduled if there are free resources at +least 30 minutes when the job is considered for scheduling. However, +it can be requeued before 30 minutes have passed, so there is no +_gurarantee_ of a minimum run time. When an _optimist_ job is requeued, +it is first sent a `SIGTERM` signal. This can be trapped in order to +trigger a checkpoint. After 30 seconds, the job receives a `SIGKILL` +signal, which cannot be trapped. + +A simple _optimist_ job specification might be: + + #SBATCH --account=MyProject + #SBATCH --job-name=MyJob + #SBATCH --qos=optimist + #SBATCH --nodes=4 --ntasks-per-node=32 + #SBATCH --time=2:00:00 diff --git a/_sources/jobs/job_scripts/saga/saga_sample_mpi_job.md.txt b/_sources/jobs/job_scripts/saga/saga_sample_mpi_job.md.txt new file mode 100644 index 000000000..fa9fbf83b --- /dev/null +++ b/_sources/jobs/job_scripts/saga/saga_sample_mpi_job.md.txt @@ -0,0 +1,23 @@ +--- +orphan: true +--- + +# Sample MPI Batch Script + +Here is a sample batch script that demonstrates usage of various +variables and processes for a **normal** job on Saga. (To run in +other job types, please read {ref}`job-scripts-on-saga`. + +```{eval-rst} +.. 
literalinclude:: files/saga_mpi_job.sh + :language: bash +``` + +Download the script: +```{eval-rst} +:download:`files/saga_mpi_job.sh` +``` + +The actual startup of MPI application differs for different MPI +libraries. Since this part is crucial for application performance, +please read about {ref}`running-mpi-applications`. diff --git a/_sources/jobs/job_scripts/saga_job_scripts.md.txt b/_sources/jobs/job_scripts/saga_job_scripts.md.txt new file mode 100644 index 000000000..969109140 --- /dev/null +++ b/_sources/jobs/job_scripts/saga_job_scripts.md.txt @@ -0,0 +1,207 @@ +--- +orphan: true +--- + +(job-scripts-on-saga)= + +# Job Scripts on Saga + +This page documents how to specify the queue system parameters for the +different job types on Saga. See {ref}`job-types-saga` +for information about the different job types on Saga. + +```{warning} +**On Saga use srun, not mpirun** + +mpirun can get the number of tasks wrong and also lead to wrong task +placement. We don't fully understand why this happens. When using srun +instead of mpirun or mpiexec, we observe correct task placement on Saga. +``` + + +(job_scripts_saga_normal)= + +## Normal + +The basic type of job on Saga is the *normal* job. + +_Normal_ jobs must specify account (`--account`), walltime limit +(`--time`) and how much memory is needed. Usually, they will also +specify the number of tasks (i.e., processes) to run (`--ntasks` - the +default is 1), and they can also specify how many cpus each task +should get (`--cpus-per-task` - the default is 1). + +The jobs can also specify how man tasks should be run per node +(`--ntasks-per-node`), or how many nodes the tasks should be +distributed over (`--nodes`). Without any of these two +specifications, the tasks will be distributed on any available +resources. + +Memory usage is specified with `--mem-per-cpu`, in _MiB_ +(`--mem-per-cpu=3600M`) or _GiB_ (`--mem-per-cpu=4G`). + +If a job tries to use more (resident) memory on a compute node than it +requested, it will be killed. Note that it is the _total_ memory +usage on each node that counts, not the usage per task or cpu. So, +for instance, if your job has two single-cpu tasks on a node and asks +for 2 GiB RAM per cpu, the total limit on that node will be 4 GiB. +The queue system does not care if one of the tasks uses more than 2 +GiB, as long as the total usage on the node is not more than 4 GiB. + +A typical job specification for a normal job would be + + #SBATCH --account=MyProject + #SBATCH --job-name=MyJob + #SBATCH --time=1-0:0:0 + #SBATCH --mem-per-cpu=3G + #SBATCH --ntasks=16 + +This will start 16 tasks (processes), each one getting one cpu and 3 +GiB RAM. The tasks can be distributed on any number of nodes (up to +16, of course). + +To run multithreaded applications, use `--cpus-per-task` to allocate +the right number of cpus to each task. `--cpus-per-task` sets the +environment variable `$OMP_NUM_THREADS` so that OpenMP programs by +default will use the right number of threads. (It is possible to +override this number by setting `$OMP_NUM_THREADS` in the job script.) +For instance: + + #SBATCH --account=MyProject + #SBATCH --job-name=MyJob + #SBATCH --time=1-0:0:0 + #SBATCH --mem-per-cpu=4G + #SBATCH --ntasks=8 --cpus-per-task=10 --ntasks-per-node=4 + +This job will get 2 nodes, and run 4 processes on each of them, each +process getting 10 cpus. All in all, that will be two whole nodes on +Saga. + +All jobs on Saga are allocated the requested cpus and memory +exclusively, but share nodes with other jobs. 
Also note that they are +bound to the cpu cores they are allocated. However, the tasks and +threads are free to use all cores the job has access to on the node. + +Note that the more restrictive one is in specifying how tasks are +placed on nodes, the longer the job might have to wait in the job +queue: In this example, for instance, there might be eight nodes with +10 idle cpus, but not two whole idle nodes. Without the +`--ntasks-per-node` specification, the job could have started, but +with the specification, it will have to wait. + +The [Saga Sample MPI Job](saga/saga_sample_mpi_job.md) page has an example +of a _normal_ MPI job. + + +(job_scripts_saga_bigmem)= + +## Bigmem and Hugemem + +_Bigmem_ and _Hugemem_ jobs are specified exactly like the _normal_ jobs except that +you also have to specify `--partition=bigmem` or `--partition=hugemem`. + +Here is a _bigmem_ example that asks for 2 tasks, 4 cpus per task, and 32 GiB +RAM per cpu: + + #SBATCH --account=MyProject --job-name=MyJob + #SBATCH --partition=bigmem + #SBATCH --time=1-0:0:0 + #SBATCH --ntasks=2 --cpus-per-task=4 + #SBATCH --mem-per-cpu=32G + +Please note that not all of the ordinary software modules will work on +the *hugemem* nodes, due to the different cpu type. If you encounter +any software-related issues, we are happy to help you at +support@nris.no. As an alternative, you can use the NESSI or +[EESSI](https://www.eessi.io/docs/) modules. These have been built to +support the cpus on the hugemem nodes. To activate the modules, do +`source /cvmfs/pilot.nessi.no/versions/2023.06/init/bash` (NESSI) or +`source /cvmfs/software.eessi.io/versions/2023.06/init/bash` (EESSI) +before you load modules. + +(job_scripts_saga_accel)= + +## Accel and A100 +_Accel_ and _A100_ jobs are specified just like *normal* jobs except that they +also have to specify `--partition=accel` (for P100 GPUs) or +`--partition=a100` (for A100 GPUs). In addition, they must also +specify how many GPUs to use, and how they should be distributed +across nodes and tasks. The simplest way to do that is, with +`--gpus=N` or `--gpus-per-node=N`, where `N` is the number of GPUs to +use. + +For a job simple job running one process and using one P100 GPU, the +following example is enough: + + #SBATCH --account=MyProject --job-name=MyJob + #SBATCH --partition=accel --gpus=1 + #SBATCH --time=1-0:0:0 + #SBATCH --mem-per-cpu=8G + +Here is an example that asks for 2 tasks and 2 A100 GPUs on one node: + + #SBATCH --account=MyProject --job-name=MyJob + #SBATCH --partition=a100 --gpus-per-node=2 + #SBATCH --time=1-0:0:0 + #SBATCH --ntasks-per-node=2 --nodes=1 + #SBATCH --mem-per-cpu=8G + +There are other GPU related specifications that can be used, and that +parallel some of the cpu related specifications. The most useful are +probably: + +- `--gpus-per-node` How many GPUs the job should have on each node. +- `--gpus-per-task` How many GPUs the job should have per task. + Requires the use of `--ntasks` or `--gpus`. +- `--gpus-per-socket` How many GPUs the job should have on each + socket. Requires the use of `--sockets-per-node`. +- `--mem-per-gpu` How much RAM the job should have for each GPU. + Can be used *instead of* `--mem-per-cpu`, (but cannot be used + *together with* it). + +See [sbatch](https://slurm.schedmd.com/sbatch.html) or `man sbatch` +for the details, and other GPU related specifications. + +(The old way of specifying GPUs: `--gres=gpu:N` is still supported, +but is less flexible than the above specification.) 
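+
+Putting the pieces above together, a complete job script for a simple
+single-GPU job on Saga could look like the minimal sketch below. The module
+and program names are placeholders only; replace them with whatever your
+application actually needs.
+
+```bash
+#!/bin/bash
+#SBATCH --account=MyProject --job-name=SimpleGPUJob
+#SBATCH --partition=accel --gpus=1
+#SBATCH --time=0-01:00:00
+#SBATCH --mem-per-cpu=8G
+
+set -o errexit                       # exit on errors
+set -o nounset                       # treat unset variables as errors
+module --quiet purge                 # reset the modules to the system default
+module load SomeProgram/SomeVersion  # placeholder module
+
+srun ./my_gpu_program                # on Saga, use srun rather than mpirun
+```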
+ +(job_scripts_saga_devel)= + +## Devel + +_Devel_ jobs must specify `--qos=devel`. A _devel_ job is like a _normal_ +job, except that it has restrictions on job length and size. + +For instance: + + #SBATCH --account=MyProject + #SBATCH --job-name=MyJob + #SBATCH --qos=devel + #SBATCH --time=00:30:00 + #SBATCH --ntasks=16 + + +(job_scripts_saga_optimist)= + +## Optimist + +_Optimist_ jobs are specified just like _normal_ jobs, except that +they also must must specify `--qos=optimist`. They can run on any +node on Saga. + +An _optimist_ job can be scheduled if there are free resources at +least 30 minutes when the job is considered for scheduling. However, +it can be requeued before 30 minutes have passed, so there is no +_gurarantee_ of a minimum run time. When an _optimist_ job is requeued, +it is first sent a `SIGTERM` signal. This can be trapped in order to +trigger a checkpoint. After 30 seconds, the job receives a `SIGKILL` +signal, which cannot be trapped. + +A simple _optimist_ job specification might be: + + #SBATCH --account=MyProject + #SBATCH --job-name=MyJob + #SBATCH --qos=optimist + #SBATCH --mem-per-cpu=3G + #SBATCH --ntasks=16 + #SBATCH --time=2:00:00 diff --git a/_sources/jobs/job_scripts/slurm_parameter.md.txt b/_sources/jobs/job_scripts/slurm_parameter.md.txt new file mode 100644 index 000000000..84e8d2a6b --- /dev/null +++ b/_sources/jobs/job_scripts/slurm_parameter.md.txt @@ -0,0 +1,200 @@ +--- +orphan: true +--- + +# Slurm Parameter and Settings + +Slurm supports a multitude of different parameters. This enables you to +effectively tailor your script to your need but also +means that is easy to get lost and waste your time and quota. + +The following parameters can be used as command line parameters with +`sbatch` and `srun` or in jobscripts. To use it in a jobscript, start +a newline with `#SBATCH` followed by the parameter. Replace <....> +with the value you want, e.g. `--job-name=test-job`. + + +## Slurm Parameter + +### Basic settings: + +| Parameter | Function | +| ------------------------------------ | ------------------------ | +| `--job-name=` | Job name to be displayed by for example `squeue` | +| `--output=` | Path to the file where the job (error) output is written to | + + +### Requesting Resources + +| Parameter | Function | +| -------------------------------------- | ------------------------------- | +| `--time=` | Time limit for job. Job will be killed by Slurm after time has run out. Format days-hours:minutes:seconds. | +| `--nodes=` | Number of nodes. Multiple nodes are only useful for jobs with distributed-memory (e.g. MPI). | +| `--mem=` | Minimum memory (RAM) per node. Number followed by unit prefix, e.g. 16G. | +| `--mem-per-cpu=` | Minimum memory (RAM) per requested physical CPU core Number followed by unit prefix, e.g. 4G. | +| `--ntasks-per-node=` | Number of (MPI) processes per node. More than one useful only for MPI jobs. Maximum number depends nodes (number of cores). | +| `--cpus-per-task=` | CPU cores per task. For MPI use one. For parallelized applications benchmark this is the number of threads. | + +### Accounting + +| Parameter | Function | +| -------------------------- | ----------------------------- | +| `--account=` | Project (not user) account the job should be charged to. | +| `--partition=` | Partition/queue in which to run the job. | +| `--qos=` | The *devel* or *short* QOS (quality of servive) can be used to submit short jobs for testing and debugging. | + +See also {ref}`projects-accounting` for more information. 
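+
+As a small illustration, the accounting-related parameters above typically
+appear together near the top of a job script, for example (the project name
+below is just a placeholder):
+
+```
+#SBATCH --account=nn9999k   # the project the job is charged to
+#SBATCH --qos=devel         # use the devel QOS for a short test job
+```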
+
+Slurm differs slightly from the previous Torque system with respect to the
+definitions of various parameters, and what was known as queues in Torque may
+be covered by either `--partition=...` or `--qos=...`.
+
+Check our cluster specific sites for an overview of the partitions and
+QOS of each system:
+
+- {ref}`job-types-betzy`
+- {ref}`job-types-fram`
+- {ref}`job-types-saga`
+
+
+### Advanced Job Control
+
+| Parameter | Function |
+| ---------------------------------- | --------------------------- |
+| `--array=` | Submit a collection of similar jobs, e.g. `--array=1-10` (sbatch command only). [More info](array_jobs.md). |
+| `--dependency=` | Wait with the start of the job until the specified dependencies have been satisfied, e.g. `--dependency=afterok:123456`. |
+| `--ntasks-per-core=2` | Enables hyperthreading. Only useful in special circumstances. |
+
+
+## Differences between CPUs and tasks
+
+As a new user writing your first Slurm job script, the difference between
+`--ntasks` and `--cpus-per-task` is typically quite confusing. Assuming you
+want to run your program on a single node with 16 cores, which Slurm
+parameters should you specify?
+
+The answer is: it depends on whether your application supports MPI. MPI
+(Message Passing Interface) is a communication interface used for developing
+parallel computing programs on distributed memory systems. It is necessary
+for applications running on multiple computers (nodes) to be able to share
+(intermediate) results.
+
+To decide which set of parameters you should use, check if your application
+utilizes MPI and therefore would benefit from running on multiple nodes
+simultaneously. If, on the other hand, you have a non-MPI-enabled application,
+or have made a mistake in your setup, it does not make sense to request more
+than one node.
+
+## Settings for OpenMP and MPI jobs
+
+### Single node jobs
+
+This applies to applications that are not optimized for HPC (high performance
+computing) systems, such as simple Python or R scripts and a lot of software
+that is optimized for desktop PCs.
+
+#### Simple applications and scripts
+
+Many simple tools and scripts are not parallelized at all and therefore
+won't profit from more than one CPU core.
+
+| Parameter | Function |
+| --------------------- | ---------------------------------------- |
+| `--nodes=1` | Start an unparallelized job on only one node. |
+| `--ntasks-per-node=1` | Only one task is necessary. |
+| `--cpus-per-task=1` | Just one CPU core will be used. |
+| `--mem=` | Minimum memory (RAM) per node. Number followed by unit prefix, e.g. 16G. |
+
+If you are unsure whether your application can benefit from more cores, try a
+higher number and observe the load of your job. If it stays at approximately
+one, there is no need to ask for more than one core.
+
+#### OpenMP applications
+
+OpenMP (Open Multi-Processing) is a multiprocessing library that is often
+used for programs on shared memory systems. Shared memory describes
+systems which share the memory between all processing units (CPU cores),
+so that each process can access all data on that system.
+
+| Parameter | Function |
+| -------------------------------------- | --------------------------- |
+| `--nodes=1` | Start a parallel job for a shared memory system on only one node. |
+| `--ntasks-per-node=1` | For OpenMP, only one task is necessary. |
+| `--cpus-per-task=` | Number of threads (CPU cores) to use. |
+| `--mem=` | Minimum memory (RAM) per node. Number followed by unit prefix, e.g. 16G. |
+
+### Multiple node jobs (MPI)
+
+For MPI applications.
+
+Depending on the frequency and bandwidth demands of your setup, you can choose
+between two distribution schemes:
+
+- Let Slurm determine where to put your parallel MPI tasks as it sees fit.
+
+- Force Slurm to group all MPI tasks on whole nodes.
+
+The latter approach of using whole nodes guarantees low latency and high
+bandwidth, but it usually results in a longer queuing time compared to a
+cluster-wide job. With the former approach, the Slurm manager distributes your
+tasks to maximize utilization. This usually results in shorter queuing times,
+but slower inter-task connection speeds and higher latency. Which approach is
+suitable for you depends entirely on how long you are willing to wait and on
+the requirements of the application you are running.
+
+However, if it is suitable for your application, we would recommend the former
+approach, as it makes the best use of the resources and gives the most
+predictable execution times. If your job requires more than the default
+available memory per core (for example, 32 GB/node gives 2 GB/core for 16-core
+nodes and 1.6 GB/core for 20-core nodes), you should adjust this with the
+following line: `#SBATCH --mem-per-cpu=4GB`. When doing this, the batch system
+will automatically allocate 8 or fewer cores per node.
+
+#### Task placement on whole nodes
+
+| Parameter | Function |
+| -------------------------------------- | --------------------------- |
+| `--nodes=` | Start a parallel job for a distributed memory system on several nodes. |
+| `--ntasks-per-node=` | Number of (MPI) processes per node. The maximum number depends on the node type. |
+| `--cpus-per-task=1` | Use one CPU core per task. |
+| `--exclusive` | The job will not share nodes with other running jobs. You don't need to specify memory, as you will get all memory available on the node. |
+
+#### General task placement
+
+| Parameter | Function |
+| ----------------------------- | ---------------------------------- |
+| `--ntasks=` | Number of (MPI) processes in total. With one CPU core per task, this equals the total number of cores requested. |
+| `--mem-per-cpu=` | Memory (RAM) per requested CPU core. Number followed by unit prefix, e.g. 2G. |
+
+### Scalability
+
+You should run a few tests to find the best compromise between minimizing
+runtime and maximizing your allocated cpu-quota; that is, you should not ask
+for more cpus for a job than you really can utilize efficiently. Try to run
+your job on 1, 2, 4, 8, 16, etc. cores to see when the runtime for your job
+starts tailing off. When you see less than 30% improvement in runtime when
+doubling the cpu count, you should probably not go any further.
+Recommendations for a few of the most used applications can be found in
+sw\_guides.
+
+### A few notes about memory
+
+It is possible to specify the memory requirement using either `mem` or
+`mem-per-cpu`. The former can give some surprises, in particular if it is not
+used together with `ntasks-per-node` or another flag that fixes the number of
+cores available to the job. For instance, if you set `mem=300G` and
+`ntasks=10`, you could either get 10 tasks on a single node with 300 GB, or
+one task on each of 10 nodes, each demanding 300 GB. You are always accounted
+for the effective CPU time. In this case, say that each CPU has 30 GB
+available (`memory_per_cpu`). Even though the job only runs on one CPU per
+node, you are accounted for 300 GB/`memory_per_cpu` on each node, i.e. 10 CPUs
+per node. In total you are thus accounted for the usage of 100 CPUs.
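+
+To make the difference concrete, here is a sketch of the two alternatives for
+a 10-task job (the numbers are examples only):
+
+```
+# Alternative 1: memory per core -- the total follows the number of tasks,
+# regardless of how Slurm distributes them
+#SBATCH --ntasks=10
+#SBATCH --mem-per-cpu=4G    # 10 x 4 GiB = 40 GiB in total
+
+# Alternative 2: memory per node -- pin down how many tasks run on each node,
+# otherwise you risk the accounting surprise described above
+#SBATCH --nodes=1 --ntasks-per-node=10
+#SBATCH --mem=40G
+```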
+ +### Troubleshooting + +#### "srun: Warning: can't honor --ntasks-per-node set to _X_ which doesn't match the requested tasks _Y_ with the number of requested nodes _Y_. Ignoring --ntasks-per-node." + +This warning appears when using the `mpirun` command with Intel MPI and +specifying `--ntasks-per-node` for jobs in the `normal` partition on Fram. As +far as we have seen, the job does *not* ignore the `--ntasks-per-node`, and +will run the specified number of processes per node. You can test it with, +e.g., `mpirun hostname`. Please let us know if you have an example where +`--ntasks-per-node` is *not* honored! + +So, if you get this when using `mpirun` with Intel MPI, our recommendation is +currently that the warning can be ignored. diff --git a/_sources/jobs/job_scripts/work_directory.md.txt b/_sources/jobs/job_scripts/work_directory.md.txt new file mode 100644 index 000000000..f3bb17a5e --- /dev/null +++ b/_sources/jobs/job_scripts/work_directory.md.txt @@ -0,0 +1,59 @@ +--- +orphan: true +--- + +(job-work-directory)= + +# Job work directory + +A job has multiple choices for its work directory, i.e., the directory +where it does its work: + +- Project area (`/custer/projects/`) +- `$USERWORK` (`/cluster/work/users/$USER`) +- `$SCRATCH` (`/cluster/work/jobs/$SLURM_JOB_ID`) + +There are different pros and cons with each of the choices. See +[Storage Areas](../../files_storage/clusters.md) for details. + +Currently, the recommended choice is to use the `$USERWORK` area. It +provides a nice balance between auto-cleanup and simplicity. Thus the +job script examples in this documentation will use `$USERWORK`. + +We do _not_ recommend running jobs in your home directory, mainly +because the home directory quotas are small, so you risk your jobs +failing due to not being able to write to disk. Also, the home +directories are private, so you would have to move the files to your +project area for others to be able to access them. + +When using `$USERWORK`, it is a good idea to make sure that each job +runs in its own subdirectory. This reduces the risk of jobs +interfering with each other. One easy way to do that is to use the +following in the job script: + + ## Create and move to work dir + workdir=$USERWORK/$SLURM_JOB_ID + mkdir -p $workdir + cd $workdir + +Please remember to copy result files that you want to keep from +`$USERWORK` to your project area after the job has finished, because +files in `$USERWORK` are removed after a number of days. + +If you are going to use `$SCRATCH`, there are two commands that can be +used in the job script to make sure result files are copied back even +if the job crashes before it finishes. (They don't give a 100% +guarantee: if the compute node itself crashes before the job finishes, +then the files will not be copied.) + + ## Make sure file1, file2, etc are copied back to + ## $SLURM_SUBMIT_DIR at the end of the job: + savefile ... + + ## Register a command to be run at the end of the job to copy + ## files somewhere + cleanup + +Both commands should be used in the job script _before_ starting the +main computation. Also, if they contain any special characters like +`*`, they should be quoted. diff --git a/_sources/jobs/job_types/betzy_job_types.md.txt b/_sources/jobs/job_types/betzy_job_types.md.txt new file mode 100644 index 000000000..ea797121d --- /dev/null +++ b/_sources/jobs/job_types/betzy_job_types.md.txt @@ -0,0 +1,103 @@ +--- +orphan: true +--- + +(job-types-betzy)= + +# Job Types on Betzy + +Betzy is designed to run highly parallelized jobs. 
If you need to run medium-sized jobs, than Fram is a better choice, while for serial jobs you shall use Saga. + +For a preprocessing or postprocessing job which only needs one or a few CPU cores, use a *preproc* job. + +For development or testing use the *devel* queue which is limited to small and short jobs. + +Here is a more detailed description of the different job types on Betzy: + + +(job_type_betzy_normal)= + +## Normal + +- __Allocation units__: whole nodes +- __Job Limits__: + - minimum 4 nodes + - maximum 512 nodes +- __Maximum walltime__: 4 days +- __Priority__: normal +- __Available resources__: 1340 nodes, each with 128 CPU cores and 244 GiB RAM +- __Parameter for sbatch/salloc__: + - None, _normal_ is the default +- __Job Scripts__: {ref}`job_scripts_betzy_normal` + +This is the default job type. In _normal_ jobs, the queue system hands out complete nodes. + +(job_type_betzy_accel)= + +## Accel +- __Allocation units__: CPUs, Memory and GPUs +- __Job Limits__: +- __Maximum walltime__: 7 days +- __Priority__: Normal +- __Available resources__: 4 nodes, each with 64 CPU cores, 494.5 GiB + RAM and 4 x Nvidia A100 GPUs with 40 GiB RAM +- __Parameter for sbatch/salloc__: + - `--partition=accel` + - `--gpus=N`, `--gpus-per-node=N` or similar, with `N` being the number of GPUs +- __Job Scripts__: {ref}`job_scripts_betzy_accel` + +Can be combined with `--qos=devel` for shorter development tasks which require +GPUs for testing. + +Note that *accel* jobs on Betzy are billed differently than *normal* jobs. +See the {ref}`accounting page` for more information. + + +(job_type_betzy_preproc)= + +## Preproc + +- __Allocation units__: cpus and memory +- __Job Limits__: + - maximum 128 billing units (CPU cores plus memory) per job + - maximum 1 node per job + - maximum 16 running jobs per user + - in total maximum 256 billing units in running jobs per user +- __Maximum walltime__: 1 day +- __Priority__: normal +- __Available resources__: 6 nodes, each with 128 CPU cores and 1 TiB RAM +- __Parameter for sbatch/salloc__: + - `--qos=preproc` +- __Job Scripts__: {ref}`job_scripts_betzy_preproc` + +*preproc* jobs are meant for small preprocessing or postprocessing +tasks. Typically, such jobs don't use many CPUs, so requiring them to +use 4 whole nodes would waste resources. + +Note that *preproc* jobs on Betzy are billed differently than *normal* jobs. +The details about how the billing units are calculated can be found +in [job accounting](../projects_accounting.md). + + +(job_type_betzy_devel)= + +## Devel + +- __Allocation units__: whole nodes +- __Job Limits__: + - minimum 1 node, maximum 4 nodes per job + - maximum 1 running job per user +- __Maximum walltime__: 60 minutes +- __Priority__: high +- __Available resources__: 4 nodes with 128 CPU cores and 244 GiB RAM +- __Parameter for sbatch/salloc__: + - `--qos=devel` +- __Job Scripts__: {ref}`job_scripts_betzy_devel` + +This is meant for small, short development or test jobs. + +Can be combined with `--partition=accel` to increase priority while +having max wall time and job limits of _devel_ job. + +If you have _temporary_ development needs that cannot be fulfilled by the _devel_ job type, please contact us at +[support@nris.no](mailto:support@nris.no). 
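+
+As noted above, `--qos=devel` can be combined with `--partition=accel` for
+short GPU tests. A minimal sketch of such a job header (adjust account, time
+and resources to your needs):
+
+```
+#SBATCH --account=MyProject
+#SBATCH --qos=devel         # devel limits and priority
+#SBATCH --partition=accel   # run on the GPU nodes
+#SBATCH --gpus=1
+#SBATCH --ntasks=1
+#SBATCH --mem-per-cpu=1G
+#SBATCH --time=00:30:00
+```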
diff --git a/_sources/jobs/job_types/fram_job_types.md.txt b/_sources/jobs/job_types/fram_job_types.md.txt new file mode 100644 index 000000000..ecf887798 --- /dev/null +++ b/_sources/jobs/job_types/fram_job_types.md.txt @@ -0,0 +1,154 @@ +--- +orphan: true +--- + +(job-types-fram)= + +# Job Types on Fram + +Fram is designed to run medium-sized parallel jobs. If you need to +run serial jobs or "narrow" parallel jobs, Saga is a better choice. + +Most jobs on Fram are *normal* jobs. + +Jobs requiring a lot of memory (> 4 GiB/cpu) should run as *bigmem* +jobs. Also, jobs requiring only a single cpu, can use a small *bigmem* job. + +Jobs that are very short, or implement checkpointing, can run as +*optimist* jobs, which means they can use resources that are idle for +a short time before they are requeued by a non-*optimist* job. + +For development or testing, there are two job types: *devel* usually +has the shortest wait time during office hours, but is limited to +small, short jobs. *short* allows slightly larger and longer jobs, +but will probably have longer wait times. + +Here is a more detailed description of the different job types on +Fram: + + +(job_type_fram_normal)= + +## Normal + +- __Allocation units__: whole nodes +- __Job Limits__: + - minimum 1 node, maximum 32 nodes (can be increased) +- __Maximum walltime__: 7 days +- __Priority__: normal +- __Available resources__: 996 nodes with 32 cpus and 59 GiB RAM +- __Parameter for sbatch/salloc__: + - None, _normal_ is the default +- __Job Scripts__: {ref}`job_scripts_fram_normal` + +This is the default job type. Most jobs are *normal* jobs. Most of +the other job types are "variants" of a *normal* job. + +In _normal_ jobs, the queue system hands out complete nodes. If a +project needs more than 32 nodes per job, and the application in +question can actually scale more than 32 nodes, please send a request +to [support@nris.no](mailto:support@nris.no). + + +(job_type_fram_bigmem)= + +## Bigmem + +- __Allocation units__: cpus and memory +- __Job Limits__: + - (none) +- __Maximum walltime__: 14 days +- __Priority__: normal +- __Available resources__: + - 8 nodes with 32 cpus and 494 GiB RAM +- __Parameter for sbatch/salloc__: + - `--partition=bigmem` +- __Job Scripts__: {ref}`job_scripts_fram_bigmem` + +*Bigmem* jobs are meant for jobs that need a lot of memory (RAM), +typically more than 4 GiB per cpu. (The _normal_ nodes on Fram have +slightly less than 2 GiB per cpu.) + +For _bigmem_ jobs, the queue system hands out cpus and memory, not +whole nodes. + + +(job_type_fram_devel)= + +## Devel + +- __Allocation units__: whole nodes +- __Job Limits__: + - minimum 1 nodes, maximum 8 nodes per job + - maximum 1 running job at a time per user +- __Maximum walltime__: 30 minutes +- __Priority__: high +- __Available resources__: 8 nodes with 32 cpus and 59 GiB RAM between + 07:00 and 21:00 on weekdays +- __Parameter for sbatch/salloc__: + - `--qos=devel` +- __Job Scripts__: {ref}`job_scripts_fram_devel` + +This is meant for small, short development or test jobs. *Devel* jobs +have access to a set of dedicated nodes on daytime in weekdays to +make the jobs start as soon as possible. On the other hand, there are +limits on the size and number of _devel_ jobs. + +If you have _temporary_ development needs that cannot be fulfilled by +the _devel_ or _short_ job types, please contact us at +[support@nris.no](mailto:support@nris.no). 
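For example, a minimal, hypothetical header for a short test job in the *devel* QOS on Fram could look like the sketch below. Since Fram hands out whole nodes, only the node count, walltime and account need to be specified; the account name is a placeholder:

```bash
#!/bin/bash -l
#SBATCH --account=nnABCDk        # replace with your project account
#SBATCH --job-name=devel-test
#SBATCH --qos=devel              # dedicated nodes on weekdays, high priority
#SBATCH --nodes=2                # devel allows 1 to 8 nodes
#SBATCH --ntasks-per-node=32     # the normal Fram nodes have 32 cpus
#SBATCH --time=00:20:00          # devel maximum is 30 minutes

# ... rest of the job script ...
```
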
+ + +(job_type_fram_short)= + +## Short + +- __Allocation units__: whole nodes +- __Job Limits__: + - minimum 1 nodes, maximum 10 nodes per job + - maximum 16 nodes in use at the same time +- __Maximum walltime__: 2 hours +- __Priority__: high (slightly lower than *devel*) +- __Available resources__: 16 nodes with 32 cpus and 59 GiB RAM + (shared with *normal*) +- __Parameter for sbatch/salloc__: + - `--qos=short` +- __Job Scripts__: {ref}`job_scripts_fram_short` + +This is also meant for development or test jobs. It allows slightly +longer and wider jobs than *devel*, but has slightly lower priority, +and no dedicated resources. This usually results in a longer wait +time than *devel* jobs, at least on work days. + + +(job_type_fram_optimist)= + +## Optimist + +- __Allocation units__: whole nodes +- __Job Limits__: + - minimum 1 node, maximum 32 nodes (can be increased) +- __Maximum Walltime__: None. The jobs will start as soon as + resources are available for at least 30 minutes, but can be + requeued at any time, so there is no guaranteed minimum run time. +- __Priority__: low +- __Available resources__: *optimist* jobs run on the *normal* nodes. +- __Parameter for sbatch/salloc__: + - `--qos=optimist` +- __Job Scripts__: {ref}`job_scripts_fram_optimist` + +The _optimist_ job type is meant for very short jobs, or jobs with +checkpointing (i.e., they save state regularly, so they can restart +from where they left off). + +_Optimist_ jobs get lower priority than other jobs, but will start as +soon as there are free resources for at least 30 minutes. However, +when any other non-_optimist_ job needs its resources, the _optimist_ +job is stopped and put back on the job queue. This can happen before +the _optimist_ job has run 30 minutes, so there is no _guaranteed_ +minimum run time. + +Therefore, all _optimist_ jobs must use checkpointing, and access to +run _optimist_ jobs will only be given to projects that demonstrate +that they can use checkpointing. If you want to run _optimist_ jobs, +send a request to [support@nris.no](mailto:support@nris.no). diff --git a/_sources/jobs/job_types/saga_job_types.md.txt b/_sources/jobs/job_types/saga_job_types.md.txt new file mode 100644 index 000000000..e1442d772 --- /dev/null +++ b/_sources/jobs/job_types/saga_job_types.md.txt @@ -0,0 +1,218 @@ +--- +orphan: true +--- + +(job-types-saga)= + +# Job Types on Saga + +Saga is designed to run serial and small ("narrow") parallel jobs, in +addition to GPU jobs. If you need to run "wider" parallel jobs, Fram +is a better choice. + +```{warning} +**On Saga use srun, not mpirun** + +mpirun can get the number of tasks wrong and also lead to wrong task +placement. We don't fully understand why this happens. When using srun +instead of mpirun or mpiexec, we observe correct task placement on Saga. +``` + +The basic allocation units on Saga are cpu and memory. +The details about how the billing units are calculated can be found +in {ref}`projects-accounting`. + +Most jobs on Saga are *normal* jobs. + +Jobs requiring a lot of memory (> 8 GiB/cpu) should run as *bigmem* +or *hugemem* jobs. + +Jobs that are very short, or implement checkpointing, can run as +*optimist* jobs, which means they can use resources that are idle for +a short time before they are requeued by a non-*optimist* job. 
+ +For development or testing, use a *devel* job + +Here is a more detailed description of the different job types on +Saga: + + +(job_type_saga_normal)= + +## Normal + +- __Allocation units__: cpus and memory +- __Job Limits__: + - maximum 256 units +- __Maximum walltime__: 7 days +- __Priority__: normal +- __Available resources__: + - 200 nodes with 40 cpus and 178.5 GiB RAM + - 120 nodes with 52 cpus and 178.5 GiB RAM +- __Parameter for sbatch/salloc__: + - None, _normal_ is the default +- __Job Scripts__: {ref}`job_scripts_saga_normal` + +This is the default job type. Most jobs are *normal* jobs. + + +(job_type_saga_bigmem)= + +## Bigmem + +- __Allocation units__: cpus and memory +- __Job Limits__: + - maximum 256 units +- __Maximum walltime__: 14 days +- __Priority__: normal +- __Available resources__: + - 28 nodes with 40 cpus and 362 GiB RAM + - 8 nodes with 64 cpus and 3021 GiB RAM +- __Parameter for sbatch/salloc__: + - `--partition=bigmem` +- __Job Scripts__: {ref}`job_scripts_saga_bigmem` + +*Bigmem* jobs are meant for jobs that need a lot of memory (RAM), +typically more than 8 GiB per cpu. (The _normal_ nodes on Saga have +slightly more than 4.5 GiB per cpu.) + +Can be combined with `--qos=devel` to get higher priority but maximum wall time (2h) +and resource limits of _devel_ apply. + + +(job_type_saga_hugemem)= + +## Hugemem + +- __Allocation units__: cpus and memory +- __Job Limits__: + - maximum 256 units +- __Maximum walltime__: 14 days +- __Priority__: normal +- __Available resources__: + - 2 nodes with 64 cpus and 6040 GiB RAM +- __Parameter for sbatch/salloc__: + - `--partition=hugemem` +- __Job Scripts__: {ref}`job_scripts_saga_bigmem` + +*Hugemem* jobs are meant for jobs that need even more memory (RAM) +than *bigmem* jobs. + +Can be combined with `--qos=devel` to get higher priority but maximum wall time (2h) +and resource limits of _devel_ apply. + +Please note that not all of the ordinary software modules will work on +the *hugemem* nodes, due to the different cpu type. If you encounter +any software-related issues, we are happy to help you at +support@nris.no. As an alternative, you can use the NESSI or +[EESSI](https://www.eessi.io/docs/) modules. These have been built to +support the cpus on the hugemem nodes. To activate the modules, do +`source /cvmfs/pilot.nessi.no/versions/2023.06/init/bash` (NESSI) or +`source /cvmfs/software.eessi.io/versions/2023.06/init/bash` (EESSI) +before you load modules. + + +(job_type_saga_accel)= + +## Accel + +- __Allocation units__: cpus, memory and GPUs +- __Job Limits__: + - maximum 256 units +- __Maximum walltime__: 14 days +- __Priority__: normal +- __Available resources__: 8 nodes with 24 cpus, 364 GiB RAM and 4 P100 + GPUs. +- __Parameter for sbatch/salloc__: + - `--partition=accel` + - `--gpus=N`, `--gpus-per-node=N` or similar, with _N_ being the number of GPUs +- __Job Scripts__: {ref}`job_scripts_saga_accel` + +*Accel* jobs give access to use the P100 GPUs. + +Can be combined with `--qos=devel` to get higher priority but maximum wall time (2h) +and resource limits of _devel_ apply. + + +(job_type_saga_a100)= + +## A100 + +- __Allocation units__: cpus, memory and GPUs +- __Job Limits__: + - maximum 256 units +- __Maximum walltime__: 14 days +- __Priority__: normal +- __Available resources__: 8 nodes with 32 cpus, 1,000 GiB RAM and 4 A100 + GPUs. 
+- __Parameter for sbatch/salloc__: + - `--partition=a100` + - `--gpus=N`, `--gpus-per-node=N` or similar, with _N_ being the number of GPUs +- __Job Scripts__: {ref}`job_scripts_saga_accel` + +*A100* jobs give access to use the A100 GPUs. + +Can be combined with `--qos=devel` to get higher priority but maximum wall time (2h) +and resource limits of _devel_ apply. + + +(job_type_saga_devel)= + +## Devel + +- __Allocation units__: cpus and memory and GPUs +- __Job Limits__: + - maximum 128 units per job + - maximum 256 units in use at the same time + - maximum 2 running jobs per user +- __Maximum walltime__: 2 hours +- __Priority__: high +- __Available resources__: *devel* jobs can run on any node on Saga +- __Parameter for sbatch/salloc__: + - `--qos=devel` +- __Job Scripts__: {ref}`job_scripts_saga_devel` + +This is meant for small, short development or test jobs. *Devel* jobs +get higher priority for them to run as soon as possible. On the other +hand, there are limits on the size and number of _devel_ jobs. + +Can be combined with either `--partition=accel`, `--partition=bigmem` +or `--partition=huemem` to increase +priority while having max wall time and job limits of _devel_ job. + +If you have _temporary_ development needs that cannot be fulfilled by +the _devel_ or _short_ job types, please contact us at +[support@nris.no](mailto:support@nris.no). + + +(job_type_saga_optimist)= + +## Optimist + +- __Allocation units__: cpus and memory +- __Job Limits__: + - maximum 256 units +- __Maximum Walltime__: None. The jobs will start as soon as + resources are available for at least 30 minutes, but can be + requeued at any time, so there is no guaranteed minimum run time. +- __Priority__: low +- __Available resources__: *optimist* jobs can run on any node on Saga +- __Parameter for sbatch/salloc__: + - `--qos=optimist` +- __Job Scripts__: {ref}`job_scripts_saga_optimist` + +The _optimist_ job type is meant for very short jobs, or jobs with +checkpointing (i.e., they save state regularly, so they can restart +from where they left off). + +_Optimist_ jobs get lower priority than other jobs, but will start as +soon as there are free resources for at least 30 minutes. However, +when any other non-_optimist_ job needs its resources, the _optimist_ +job is stopped and put back on the job queue. This can happen before +the _optimist_ job has run 30 minutes, so there is no _guaranteed_ +minimum run time. + +Therefore, all _optimist_ jobs must use checkpointing, and access to +run _optimist_ jobs will only be given to projects that demonstrate +that they can use checkpointing. If you want to run _optimist_ jobs, +send a request to [support@nris.no](mailto:support@nris.no). diff --git a/_sources/jobs/memory-bandwidth.md.txt b/_sources/jobs/memory-bandwidth.md.txt new file mode 100644 index 000000000..3ebaee840 --- /dev/null +++ b/_sources/jobs/memory-bandwidth.md.txt @@ -0,0 +1,192 @@ +# Efficient use of memory bandwidth on Betzy + +## Memory - NUMA and ccNUMA + +Betzy compute node is a 2-socket system running AMD EPYC 7742 64-Core +processors. Each compute node on Betzy has 256 GiB of memory, organised in 8 +banks of 32 GiB each. Every processor has four memory controllers, each +responsible for one bank. Furthermore, every virtual core in a processor is +assigned to one memory controller which results in different paths to access +memory. 
Memory accesses may have to traverse an intra-processor network +(another controller within the same processor is responsible for the memory +address being accessed) or an intra-node network (another controller of the +other processor is responsible for the memory address being accessed). This +memory organisation is referred to as non uniform memory access (NUMA) memory. + +This means that although all CPU cores can access all RAM, **the speed of the +access will differ: some memory pages are closer to each CPU, and some are +further away**. In contrast to a similar Intel-based system, where each socket +is one NUMA node, Betzy has 4 NUMA nodes per socket, and 8 in total. + +A NUMA node comprises of a memory bank and a subset of the virtual cores. The +best performance is achieved when processes and the memory they access +(frequently) are placed close to each other, or in other words, within one NUMA +node. + +Additionally, the compute nodes implement cache coherent NUMA (ccNUMA) memory +which ensures that programs see a consistent memory image. Cache coherence +requires intra-node communication if different caches store the same memory +location. Hence, the best performance is achieved when the same memory +location is not stored in different caches. Typically this happens when +processes need access to some shared data, e.g. at boundaries of regions they +iterate over. Limiting or even avoiding these accesses is often a challenge. + + +## Detailed information about NUMA nodes + +More detailed information about the NUMA +architecture can be obtained as follows: +``` +$ numactl -H + +available: 8 nodes (0-7) +node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 +node 0 size: 32637 MB +node 0 free: 30739 MB +node 1 cpus: 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 +node 1 size: 32767 MB +node 1 free: 31834 MB +node 2 cpus: 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 +node 2 size: 32767 MB +node 2 free: 31507 MB +node 3 cpus: 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 +node 3 size: 32755 MB +node 3 free: 31489 MB +node 4 cpus: 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 +node 4 size: 32767 MB +node 4 free: 31746 MB +node 5 cpus: 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 +node 5 size: 32767 MB +node 5 free: 31819 MB +node 6 cpus: 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 +node 6 size: 32767 MB +node 6 free: 31880 MB +node 7 cpus: 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 +node 7 size: 32767 MB +node 7 free: 31805 MB +node distances: +node 0 1 2 3 4 5 6 7 + 0: 10 12 12 12 32 32 32 32 + 1: 12 10 12 12 32 32 32 32 + 2: 12 12 10 12 32 32 32 32 + 3: 12 12 12 10 32 32 32 32 + 4: 32 32 32 32 10 12 12 12 + 5: 32 32 32 32 12 10 12 12 + 6: 32 32 32 32 12 12 10 12 + 7: 32 32 32 32 12 12 12 10 +``` + + +## Binding thread/processes to cores + +The above picture is further complicated by the fact that within the individual +NUMA nodes the memory access time is also not uniform. 
This can be verified by +running the [STREAM benchmark](https://github.com/jeffhammond/STREAM). As +reported above, each NUMA node has 16 physical cores (e.g. node 0, cores 0-15). + +Consider the following 2 STREAM experiments: + +1. start 8 threads, bind them to cores 0-8 +2. start 8 threads, bind them to cores 0,2,4,6,8,10,12,14 + +In terms of the OMP_PLACES directive the above is equivalent to: + +1. OMP_PLACES="{0:1}:8:1" OMP_NUM_THREADS=8 ./stream +2. OMP_PLACES="{0:1}:8:2" OMP_NUM_THREADS=8 ./stream + +On a standard Intel-based system the above two experiments would perform +identically. This is not the case on Betzy: the first approach is slower than the +second one: + +| Experiment | Function | Best Rate MB/s | Avg time | Min time | Max time | +| ---------- | -------- | -------------- | -------- | -------- | -------- | +| 1 | Copy | 37629.4 | 0.212833 | 0.212600 | 0.213007 | +| 1 | Triad | 35499.6 | 0.338472 | 0.338032 | 0.338771 | +| 2 | Copy | 42128.7 | 0.190025 | 0.189894 | 0.190152 | +| 2 | Triad | 41844.4 | 0.287000 | 0.286777 | 0.287137 | + +This shows that the memory access time is not uniform within a single NUMA node. + +Interestingly, the peak achievable memory bandwidth also depends on the number +of cores used, and is maximized for lower core counts. This is confirmed by the +following STREAM experiments running on one NUMA node: + +1. start 8 threads, bind them to cores 0,2,4,6,8,10,12,14 +2. start 16 threads, bind them to cores 0-15 + +In terms of the OMP_PLACES directive the above is equivalent to: + +1. OMP_PLACES="{0:1}:8:2" OMP_NUM_THREADS=8 ./stream +2. OMP_PLACES="{0:1}:16:1" OMP_NUM_THREADS=16 ./stream + +The results are: + +| Experiment | Function | Best Rate MB/s | Avg time | Min time | Max time | +| ---------- | -------- | -------------- | -------- | -------- | -------- | +| 1 | Copy | 42126.3 | 0.190034 | 0.189905 | 0.190177 | +| 1 | Triad | 41860.1 | 0.287013 | 0.286669 | 0.287387 | +| 2 | Copy | 39675.8 | 0.201817 | 0.201634 | 0.201950 | +| 2 | Triad | 39181.7 | 0.306733 | 0.306265 | 0.307508 | + +The above test demonstrates that memory bandwidth is maximized when using 8 out of 16 cores per NUMA node. + +The following experiments test the entire system: + +1. start 64 threads, bind them to cores 0,2,...126 +2. start 128 threads, bind them to cores 0,1,..127 + +In terms of the OMP_PLACES directive the above is equivalent to: + +1. OMP_PLACES="{0:1}:64:2" OMP_NUM_THREADS=64 ./stream +2. OMP_PLACES="{0:1}:128:1" OMP_NUM_THREADS=128 ./stream + +The results are: + +| Experiment | Function | Best Rate MB/s | Avg time | Min time | Max time | +| ---------- | -------- | -------------- | -------- | -------- | -------- | +| 1 | Copy | 334265.8 | 0.047946 | 0.047866 | 0.048052 | +| 1 | Triad | 329046.7 | 0.073018 | 0.072938 | 0.073143 | +| 2 | Copy | 315216.0 | 0.050855 | 0.050759 | 0.050926 | +| 2 | Triad | 309893.4 | 0.077549 | 0.077446 | 0.077789 | + +Hence on Betzy **memory bandwidth hungry applications will likely benefit from only using half of the cores (64)**. + +Note however that it is not enough to just use 64 cores instead of 128. **It is +important to bind the threads/ranks to cores correctly**, i.e., to run on every +second core. So a correct binding is either 0,2,4,...,126, or 1,3,5,...,127. +The above assures that the application runs on both NUMA-nodes in the most +efficient way. If instead you run the application on cores 0-63 only, then it +will be running at 50% performance as only one NUMA node will be used. 
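To make the recommended every-second-core placement concrete, the following is a minimal sketch of how the last experiment above could be launched for a pure OpenMP program on a whole Betzy node. The binary name `./stream` is taken from the benchmark used above and stands in for your own application:

```bash
# Use 64 threads placed on every second virtual core (0,2,...,126),
# i.e. one thread per physical core and half of the cores per NUMA node.
export OMP_NUM_THREADS=64
export OMP_PLACES="{0:1}:64:2"
export OMP_PROC_BIND=true   # keep each thread pinned to its place

./stream
```
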
## Monitoring thread/process placement

To monitor the placement of threads or processes/ranks, the `htop` utility is
useful: log in to a node running your application and issue the `htop`
command. By default `htop` numbers the cores from 1 through 256. This can be
confusing at times (it can be changed in `htop` by pressing F2, navigating to
display options and ticking "count from zero").

Cores 0-127 (counting from zero) are the first of the two SMT threads on the
AMD processor. Cores 128 to 255 are the second SMT thread and share the same
execution units as the first 128 cores.

Using this view it is easy to monitor how the ranks and threads are allocated
and mapped onto the compute cores.


## Memory bandwidth sensitive MPI applications

Some MPI applications are very sensitive to memory bandwidth and will
consequently benefit from having fewer ranks per node than the number of
cores, for example running only 64 ranks per node.

Use `#SBATCH --ntasks-per-node=64` and then launch with something like:
```
mpirun --map-by slot:PE=2 --bind-to core ./a.out
mpirun --map-by ppr:32:socket:pe=2 --bind-to core ./a.out
```

Tests have shown that more than 2x in performance is possible, but this
requires twice as many nodes. Twice the number of nodes yields twice the
aggregated memory bandwidth, but, needless to say, it also consumes twice as
many core hours.
diff --git a/_sources/jobs/mkl.md.txt b/_sources/jobs/mkl.md.txt new file mode 100644 index 000000000..b5bfd39f8 --- /dev/null +++ b/_sources/jobs/mkl.md.txt @@ -0,0 +1,94 @@
(using-mkl-efficiently)=

# Using MKL efficiently

The Intel Math Kernel Library (MKL) is a popular library for vector and matrix
operations, solving eigenvalue problems, and much more. On AMD processors it
is important to verify whether MKL is using its best performing routines. You
can also force it to, and below we discuss how this can be done.

MKL performs run time checks at startup to select the appropriate **Intel**
processor. When it cannot work out the processor type, a least common
instruction set is selected, yielding lower performance.


## AVX2 vector units

Each processor core on Betzy has two vector units. These are 256 bits wide and
can hence operate on four 64-bit floating point numbers simultaneously. With
two such units and fused multiply-add (FMA) instructions, up to 16 double
precision operations can be performed per clock cycle.

This yields a marketing-style theoretical performance of frequency times
number of cores times 16, i.e. 2.25 GHz * 128 * 16 = 4608 Gflops/s for a
single compute node (or 6.2 Pflops/s for the complete Betzy). No program
consists of only FMA instructions, so these numbers are inflated.

In any case the vector units are important for floating point performance, see
below.


## Mixing Intel compiler and MKL versions

Users are advised to check if there is any performance difference between
Intel 2019b and the 2020 versions. We recommend to **not mix different compiler
versions and Math Kernel Library (MKL) versions**,
e.g. building using 2020 compilers and then linking with MKL 2019.


## MKL_DEBUG_CPU_TYPE

To instruct MKL to use a more suitable instruction set, a debug variable can be
set, e.g. `export MKL_DEBUG_CPU_TYPE=5`.

However, the `MKL_DEBUG_CPU_TYPE` environment variable does not work for
Intel compiler distribution 2020 and newer.
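For toolchains where the variable still has an effect (MKL versions older than the 2020 distribution), a minimal sketch of how it could be set in a job script is shown below; `./my_mkl_program` is a hypothetical MKL-linked binary:

```bash
# Only effective with MKL versions older than the 2020 distribution;
# newer versions ignore MKL_DEBUG_CPU_TYPE (see the next section for an alternative).
export MKL_DEBUG_CPU_TYPE=5

srun ./my_mkl_program
```
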
+ + +## Forcing MKL to use best performing routines + +MKL issue a run time test to check for genuine Intel processor. If +this test fail it will select a generic x86-64 set of routines +yielding inferior performance. This is well documented +[here](https://en.wikipedia.org/wiki/Math_Kernel_Library) and +remedies are discussed in +[Intel MKL on AMD Zen](https://danieldk.eu/Posts/2020-08-31-MKL-Zen.html). + +It has been found that MKL calls a function called +`mkl_serv_intel_cpu_true()` to check the current CPU. If a genuine +Intel processor is found, it returns 1. + +The trick is to +bypass this by writing a dummy function which always +returns 1 and place this first in the search path (below we show how): +```c +int mkl_serv_intel_cpu_true() { + return 1; +} +``` + +Save this into a file called `trick.c` and compile it into a shared library +using the following command: +`gcc -shared -fPIC -o libtrick.so trick.c` + +To put the new shared library first in the search path we can use a preload environment variable: +`export LD_PRELOAD=/libtrick.so`. + +In addition, setting the environment variable `MKL_ENABLE_INSTRUCTIONS` to +`AVX2` can also have a significant effect on performance. Just changing it to +`AVX` can have a significant negative impact. + +Setting it to `AVX512` and launching it on AMD it does not fail, MKL probably +tests if the requested feature is available. + +The following table show the recorded performance obtained with the HPL (the +top500) test using a small problem size and a single Betzy node: + +| Settings | Performance | +|------------------------------------------------------:|:--------------:| +| None | 1.2858 Tflop/s | +| LD_PRELOAD=./libtrick.so | 2.7865 Tflop/s | +| LD_PRELOAD=./libtrick.so MKL_ENABLE_INSTRUCTIONS=AVX | 2.0902 Tflop/s | +| LD_PRELOAD=./libtrick.so MKL_ENABLE_INSTRUCTIONS=AVX2 | 2.7946 Tflop/s | diff --git a/_sources/jobs/monitoring.md.txt b/_sources/jobs/monitoring.md.txt new file mode 100644 index 000000000..2083fcec5 --- /dev/null +++ b/_sources/jobs/monitoring.md.txt @@ -0,0 +1,62 @@ +(monitoring-jobs)= + +# Monitoring jobs + + +## How to check whether your job is running + +To check the job status of all your jobs, you can use +[squeue](https://slurm.schedmd.com/squeue.html), i.e. by executing: + + squeue -u MyUsername + +You can also get a quick view of the status of a job + + squeue -j JobId + +where `JobId` is the job id number that `sbatch` returns. To see more +details about a job, use + + scontrol show job JobId + +Both commands will show the job state (**ST**), and can show a job reason for +why a job is pending. {ref}`job-states` describes a few +of the more common ones. + +While a job is running, it is possible to view some of its usage +statistics with the [sstat](https://slurm.schedmd.com/sstat.html) +command, and after it has finished, +[sacct](https://slurm.schedmd.com/sacct.html) will give you similar +information: + + sstat -j JobId + sacct -j JobId + +Both `sstat` and `sacct` have an option `--format` to select which +fields to show. See the documentation of the commands for the +available fields and what they mean. + +When a job has finished, the output file `slurm-JobId.out` will +contain some usage statistics from `sstat` and `sacct`. + + +## Cancelling jobs and putting jobs on hold + +You can cancel running or pending (waiting) jobs with [scancel](https://slurm.schedmd.com/scancel.html). 
For instance: + + scancel JobId # Cancel job with id JobId (as returned from sbatch) + scancel --user=MyUsername # Cancel all your jobs + scancel --account=MyProject # Cancel all jobs in MyProject + +The command [scontrol](https://slurm.schedmd.com/scontrol.html) can be +used to further control pending or running jobs: + +- `scontrol requeue JobId`: Requeue a running job. The job will be + stopped, and its state changed to pending. +- `scontrol hold JobId`: Hold a pending job. This prevents the queue + system from starting the job. The job reason will be set to `JobHeldUser`. +- `scontrol release JobId`: Release a held job. This allows the queue + system to start the job. + +It is also possible to submit a job and put it on hold immediately +with `sbatch --hold JobScript`. diff --git a/_sources/jobs/monitoring/job_states.md.txt b/_sources/jobs/monitoring/job_states.md.txt new file mode 100644 index 000000000..0c6ed2b08 --- /dev/null +++ b/_sources/jobs/monitoring/job_states.md.txt @@ -0,0 +1,45 @@ +--- +orphan: true +--- + +(job-states)= + +# Job States + +Commands like `squeue`, `sacct` and `scontrol show job` will show a +_state_ of each job. All job states are explained in the [JOB STATE +CODES section of the squeue documentation +page](https://slurm.schedmd.com/squeue.html#lbAG). + +Here is a table with the most common ones + +| Name | Long name | Description | +|:----:|:-------------:|---------------------------------------------------------------------| +| PD | Pending | Job is waiting to be started | +| CF | Configuring | The queue system is starting up the job | +| R | Running | Job is running | +| CG | Completing | Job is finishing | +| CD | Completed | Job has finished | +| CA | Cancelled | Job has been cancelled, either before or after it started | +| F | Failed | Job has exited with a non-zero exit status | +| TO | Timeout | Job didn't finish in time, and was cancelled | +| PR | Preemepted | Job was requeued because a higher priority job needed the resources | +| NF | Node_fail | Job was requeued because of a problem with one of its comput nodes | +| OOM | Out_of_memory | Job was cancelled because it tried to use too much memory | + +The commands can also give a _reason_ why a job is in the state it is. +This is most useful for pending jobs. All these reasons are explained +in the [JOB REASON CODES section of the squeue documentation +page](https://slurm.schedmd.com/squeue.html#lbAF). + +Here is a table with the most common ones + +| Name | Description | +|------------------------|----------------------------------------------------------------------------------------------------------------------| +| Resources | The job is waiting for resources to become idle | +| Priority | There are jobs with higher priority than this job. The job might be started, if it does not delay any of those jobs | +| AssocGrpBillingMinutes | There is not enough hours left on the quota to start the job | +| ReqNodeNotAvail | One or more of the job's required nodes is currently not available, typically because it is down or reserved | +| Dependency | The job is waiting for jobs it depend on to start or finish. | +| JobHeldUser | The job has been put on hold by the user | +| JobHeldAdmin | The job has been put on hold by an admin. Please contact support if you don't know why it is being held. | diff --git a/_sources/jobs/overview.rst.txt b/_sources/jobs/overview.rst.txt new file mode 100644 index 000000000..98fe7a2de --- /dev/null +++ b/_sources/jobs/overview.rst.txt @@ -0,0 +1,27 @@ +.. 
_running-jobs: + +Running jobs +============ + +.. toctree:: + :maxdepth: 1 + + submitting.md + choosing_job_types.md + job_scripts/array_jobs.md + job_scripts.md + monitoring.md + slurm_output.md + choosing-memory-settings.md + choosing-number-of-cores.md + common_job_failures.md + performance.md + mkl.md + memory-bandwidth.md + parallel-calculations.md + interactive_jobs.md + checkpointing.md + projects_accounting.md + scratchfiles.md + guides.md + slurm_commands.md diff --git a/_sources/jobs/parallel-calculations.md.txt b/_sources/jobs/parallel-calculations.md.txt new file mode 100644 index 000000000..b3c4a6942 --- /dev/null +++ b/_sources/jobs/parallel-calculations.md.txt @@ -0,0 +1,274 @@ +# Efficient use of processors and network on Betzy + + +## Interconnect - InfiniBand + +A cluster (using Betzy as the example) contains a rather large number of nodes +(Betzy consists of 1344 nodes) with an interconnect that enables efficient +delivery of messages (message passing interface, MPI) between the nodes. On +Betzy, Mellanox InfiniBand is used, in a HDR-100 configuration. The HDR (high +data rate) standard is 200 Gbits/s and HDR-100 is half of this. This is a +trade-off, as each switch port can accommodate two compute nodes. All the +compute nodes are connected in a Dragonfly topology. + +While not fully symmetrical, tests have shown that the slowdown by spreading +the ranks randomly around the compute nodes had less than the 10% specified by +the tender. Acceptance tests showed from 8 to zero percent slow-down depending +on the application. Hence **for all practical purposes there is no need to pay +special attention to schedule jobs within a single rack/cell etc**. + + +## Processors and cores + +Each compute node on Betzy contains two sockets with a 64 core AMD processor per socket. +Every processor has 64 cores each supporting 2-way [simultaneous multithreading](https://en.wikipedia.org/wiki/Simultaneous_multithreading) +(SMT). +To not confuse these *threading* capabilities in hardware with +threads in software (e.g., pthreads or OpenMP), we use the term *virtual core* +from now on. + +For applications it looks as if every compute node has 256 independent +*virtual cores* numbered from 0 to 255. Due to SMT, always two of these seemingly +independent virtual cores form a pair and share the executing units of a core. If both of +these two virtual cores are used in parallel by an application, the +application's performance will be the same as if it used only one virtual core +(and the other one is left idle) or if two different applications would use one +virtual core each, each of the two applications would achieve only half of the +performance of a core. To achieve the maximum performance from each core, it is +therefore important to pay attention to the mapping of processes to cores, that is, +**any two processes (or software threads) of an application must not share the +two virtual cores** or, in other words, one of the two virtual cores in a pair +shall be kept idle. + +The following command provides information about the numbering of virtual cores: +``` +cat /proc/cpuinfo | sed '/processor\|physical id\|core id/!d' | sed 'N;N;s/\n/ /g' +``` + +The first 128 entries (processor 0-127) correspond to the first virtual core. +Accordingly, the second 128 entries (processor 128-255) correspond to the second +virtual core. So, if one limits the placement of processes to processor +numbers 0-127, no process will share executional units with any other process. 
Both Intel MPI and OpenMPI provide means to achieve such placements, and below
we show how.


## Available MPI implementations

The MPI to use is selected at application build time. Two MPI implementations
are supported:
* Intel MPI
* OpenMPI

Behavior of Intel MPI can be adjusted through environment variables, which
start with *I_MPI*
([more information](https://software.intel.com/content/www/us/en/develop/documentation/mpi-developer-reference-linux/top/environment-variable-reference.html)).

OpenMPI uses both environment variables (which must be used when running
through *srun*) and command line options (for use with *mpirun*). Command line
options override both the config files and environment variables. For a
complete list of parameters run `ompi_info --param all all --level 9`, or see
[the documentation](https://www.open-mpi.org/faq/?category=tuning).


## Slurm: Requesting pure MPI or hybrid MPI + OpenMP jobs


### Pure MPI

```
#SBATCH --ntasks=4096
#SBATCH --nodes=32
#SBATCH --ntasks-per-node=128
```

This will request 32 nodes with 128 ranks per compute node, giving a total of
4096 ranks/tasks. The `--ntasks` is not strictly needed (if missing, it will
be calculated from `--nodes` and `--ntasks-per-node`).

To get the total number of cores in a pure MPI job script, the environment
variable `$SLURM_NTASKS` is available.

For well behaved MPI applications the job scripts are relatively simple. The
only important thing to notice is that processes (MPI ranks) should be mapped
with one rank per two SMT threads (also often referred to as physical cores).

For both Intel MPI and OpenMPI the simplest command line would be
`mpirun ./a.out`. This launches the MPI application with the default settings;
while not optimal for performance, it will run the application using the
resources requested in the run script.


### Hybrid MPI + OpenMP

For large core counts a pure MPI solution is often not optimal. As with HPL
(the top500 benchmark), the hybrid model is often the highest performing
option.

```
#SBATCH --nodes=1200
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=16
```

This will request 1200 nodes, placing 8 MPI ranks per node and providing 16
OpenMP threads to each MPI rank, a total of 128 cores per compute node. 1200
times 128 is 153600 cores.

To get the total number of cores in a hybrid MPI + OpenMP job script, one can
multiply the environment variables `$SLURM_NTASKS` and `$SLURM_CPUS_PER_TASK`.

To generate a list of all the Slurm variables, just issue an `env` command in
the job script and all environment variables will be listed.

Most OpenMP or threaded programs respond to the environment variable
`OMP_NUM_THREADS`, and you can set it to the number of CPUs per task
set by Slurm: `export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK`

The mapping of ranks and OpenMP threads onto the cores of the compute node can
often be tricky. There are many ways of dealing with this, from the simplest
solution of just relying on the defaults to explicit placement of the ranks
and threads on precisely specified cores.


#### Intel MPI

There are a number of environment variables to be used with Intel MPI; they
all start with *I_MPI*:
* `I_MPI_PIN`
* `I_MPI_PIN_DOMAIN`
* `I_MPI_PIN_PROCESSOR_EXCLUDE_LIST`

The variable `I_MPI_PIN_DOMAIN` is useful when running hybrid codes; setting
it to the number of threads per rank helps the launcher place the ranks
correctly.
Setting `I_MPI_PIN_PROCESSOR_EXCLUDE_LIST=128-255` makes sure that only
cores 0-127 are used for MPI ranks. This ensures that no two ranks
share the same physical core.


#### OpenMPI

There are currently some issues with the mapping of threads started by MPI
processes. These threads are scheduled/placed on the same core as the MPI rank
itself. The issue seems to be related to GNU OpenMP, and we are working to
resolve it.


## Binding/pinning processes

Since not all memory is equally "distant", some sort of binding to keep each
process located on cores close to its memory is normally beneficial.


### Intel MPI

Binding/pinning to cores can be requested with an environment flag, `I_MPI_PIN=1`.

To limit the ranks to only the first SMT thread, i.e. using only cores 0 to 127,
set the Intel MPI environment variable `I_MPI_PIN_PROCESSOR_EXCLUDE_LIST` to
128-255, e.g.:
```
export I_MPI_PIN_PROCESSOR_EXCLUDE_LIST=128-255
```


### OpenMPI

The simplest solution is just to request binding at the command line:
```
mpirun --bind-to core ./a.out
```

To learn more about the binding options, try issuing the following command:
```
mpirun --help binding
```

## Optimizing collective MPI operations

For OpenMPI, setting the variable `OMPI_MCA_coll_hcoll_enable` to 0 (disable)
or 1 (enable) can have a significant effect on the performance of your MPI
application. Most of the time it is beneficial to enable it by including
`export OMPI_MCA_coll_hcoll_enable=1` in the run script.



## srun vs mpirun on Betzy

Most of the time the `mpirun` command can be used. `mpirun` sets up the
MPI environment and makes sure that everything is ready for the MPI function
`MPI_Init()` when it is called at the start of any MPI program.

As Slurm is built with MPI support, `srun` will also set up the MPI environment.

Both `mpirun` and `srun` launch the executable on the requested nodes. While
there is a large range of opinions on this matter, it is hard to make a final
statement about which one is best. If you do development on small systems like
a laptop or a stand-alone machine, there is generally no Slurm and `mpirun` is
the only option, so `mpirun` will work on everything from Raspberry Pis through
laptops to Betzy.

Performance testing does not show any significant difference when launching
jobs in the normal way.


## Creating a hostfile or machinefile

Some applications ask for a list of hosts to distribute the tasks
across nodes and cores.

To make a *host-* or *machinefile*, you can use `srun`:
```
srun /bin/hostname | uniq
```

A more complicated example is a *nodelist* file for the molecular mechanics application NAMD:
```
srun /bin/hostname | uniq | awk -F\. 'BEGIN {print "group main"};{print "host ", $1}' > nodelist
```


## Transport options for OpenMPI

Most of the following is hidden behind some command line options, but in case
more information is needed about the subject of transport, a few links will
provide more insight.
+ +For detailed list of settings a good starting point is here: + + +OpenMPI 4.x uses UCX for transport, this is a communication library: + + +Transport layer PML (point-to-point message layer): + + +Transport layer UCX: + + +Collective optimisation library hcoll from Mellanox is also an option: + + +Setting the different devices and transports can be done using environment variables: +``` +export UCX_IB_DEVICE_SPECS=0x119f:4115:ConnectX4:5,0x119f:4123:ConnectX6:5 +export UCX_NET_DEVICES=mlx5_0:1 +export OMPI_MCA_coll_hcoll_enable=1 +export UCX_TLS=self,sm,dc +``` + +From the UCX documentation the list of internode transports include: +* rc +* ud +* dc + +The last one is Mellanox scalable offloaded dynamic connection transport. The +self is a loopback transport to communicate within the same process, while *sm* +is all shared memory transports. There are two shared memory transports +installed +* cma +* knem + +Selecting *cma* or *knem* might improve performance for applications that use a +high number of MPI ranks per node. With 128 cores and possibly 128 MPI ranks +per node the intra node communication is quite important. + +Depending on the communication pattern of your application, the use of point-to-point or +collectives, the usage of Mellanox optimised offload collectives can have an +impact. diff --git a/_sources/jobs/performance.md.txt b/_sources/jobs/performance.md.txt new file mode 100644 index 000000000..164d9df0c --- /dev/null +++ b/_sources/jobs/performance.md.txt @@ -0,0 +1,219 @@ +(arm-performance-reports)= + +# How to check the performance and scaling using Arm Performance Reports + +[Arm Performance Reports](https://developer.arm.com/docs/101137/latest/contents) +is a performance evaluation tool which is simple to use, produces a +clear, single-file report, and it is used to obtain a +high-level overview of the performance characteristics. + +It can report CPU time spent on various types of instructions (e.g., +floating-point), communication time (MPI), multi-threading level and +thread synchronization overheads, memory bandwidth, and IO performance. +Such a report can help spotting certain bottlenecks in the +code and highlight potential optimization directions, but also suggest +simple changes in how the code should be executed to better utilize +the resources. Some typical examples of the suggestions are + +> The CPU performance appears well-optimized for numerical +computation. The biggest gains may now come from running at larger +scales. + +or + +> Significant time is spent on memory accesses. Use a profiler +to identify time-consuming loops and check their cache performance. + +A successful Arm Performance Reports run will produce two files, a HTML summary +and a text file summary, like in this example: + +``` +example_128p_4n_1t_2020-05-23_18-04.html +example_128p_4n_1t_2020-05-23_18-04.txt +``` + + +## Do I need to recompile the code? + +- You can use Arm Performance Reports on dynamically linked binaries without + recompilation. However, you may have to recompile statically linked binaries + (for this please consult the + [official documentation](https://developer.arm.com/docs/101137/2003)). +- Due to a bug in older versions of OpenMPI, on Fram Arm Performance + Reports works only with OpenMPI version 3.1.3 and newer. If you have + compiled your application with OpenMPI 3.1.1, you don't need to + recompile it. Simply load the 3.1.3 module - those versions are + compatible. 
+ + +## Profiling a batch script + +Let us consider the following example job script as your +usual computation which you wish to profile: + +```bash +#!/bin/bash -l + +# all your SBATCH directives +#SBATCH --account=myaccount +#SBATCH --job-name=without-apr +#SBATCH --time=0-00:05:00 +#SBATCH --nodes=4 +#SBATCH --ntasks-per-node=32 + +# recommended bash safety settings +set -o errexit # make bash exit on any error +set -o nounset # treat unset variables as errors + +srun ./myexample.x # <- we will need to modify this line +``` + +To profile the code you don't need to modify any of the `#SBATCH` part. +All we need to do is to load the `Arm-PerfReports/20.0.3` module +and to modify the `srun` command to instead use +[perf-report](https://developer.arm.com/docs/101137/latest/running-with-an-example-program): + +```bash +#!/bin/bash -l + +# ... +# we kept the top of the script unchanged + +# we added this line: +module load Arm-PerfReports/20.0.3 + +# we added these two lines: +echo "set sysroot /" > gdbfile +export ALLINEA_DEBUGGER_USER_FILE=gdbfile + +# we added perf-report in front of srun +perf-report srun ./myexample.x +``` + +This works the same way on Saga, Fram, and Betzy. +In other words, add 3 lines, replace `srun` or `mpirun -n ${SLURM_NTASKS}` by +`perf-report srun`. + +That's it. + +Why are these two lines needed? +```bash +echo "set sysroot /" > gdbfile +export ALLINEA_DEBUGGER_USER_FILE=gdbfile +``` +We have a Slurm plug-in that (deliberately) detaches a job from the global mount +name space in order to create private versions of `/tmp` and `/var/tmp` (i.e., +bind mounted) for each job. This is done both so jobs cannot see other jobs' +`/tmp` and `/var/tmp`, and also so that we avoid filling up (the global) `/tmp` +and `/var/tmp` (since we allow more than one job per compute node, we cannot +clean these directories after each job - we don't know which job created the +files). However, for `perf-report` to work with this setup we need to set GDB's +sysroot to `/`. + + +## Profiling on an interactive compute node + +To run interactive tests one needs to submit +[an interactive job](interactive_jobs.md) +to Slurm using `srun` (**not** using `salloc`), e.g.: + +First obtain an interactive compute node (adjust "myaccount"), on Saga: +```bash +$ srun --nodes=1 --ntasks-per-node=4 --mem-per-cpu=1G --time=00:30:00 --qos=devel --account=myaccount --pty bash -i +``` +or Fram: +```bash +$ srun --nodes=1 --ntasks-per-node=32 --time=00:30:00 --qos=devel --account=myaccount --pty bash -i +``` +or Betzy: +```bash +$ srun --nodes=1 --ntasks-per-node=128 --time=00:30:00 --qos=devel --account=myaccount --pty bash -i +``` + +Once you get the interactive node, you can run the profile: +```bash +$ module load Arm-PerfReports/20.0.3 +$ echo "set sysroot /" > gdbfile +$ export ALLINEA_DEBUGGER_USER_FILE=gdbfile +$ perf-report srun ./myexample.x +``` + + +## Use cases and pitfalls + +We demonstrate some pitfalls of profiling, and show how +one can use profiling to reason about the performance of real-world +codes. 
+ +- [STREAM benchmark](arm-perf/stream.md) (measures the memory bandwidth) +- [LINPACK benchmark](arm-perf/linpack.md) (measures the floating-point capabilities) +- [OSU benchmark](arm-perf/osu.md) (measures the interconnect performance) +- [Quantifying the profiling overhead](arm-perf/overhead.md) +- {ref}`Finite Volume Community Ocean Model (fvcom) ` + +### Known issues + +ARM Performance reports may fail if too many processes are generated on a single node, due to the +`ulimit -u` default value (4096). This can easily be fixed by setting `ulimit -u` to a high number. +E.g., add the line `ulimit -u 40960` in your jobscript. + +There seems to be a compatibility issue between ARM Performance reports and the `Intel/20XX` modules. +If you are using this module and having trouble with APR, you might want to test alternative modules. +In some cases, loading the ARM Performance reports module after the Intel module, might fix the issue. + + +## What if the job timed out? + +The job has to finish within the allocated time for the report to be generated. +So if the job times out, there is a risk that no report is generated. + +If you run a job that always times out by design (in other words the job never +terminates itself but is terminated by Slurm), there is a workaround **if you +are running the profile on Fram on no more than 64 cores**: + +As an example let us imagine we profile the following example: + +```bash +# ... +#SBATCH --time=0-01:00:00 +# ... + +module load Arm-PerfReports/20.0.3 + +echo "set sysroot /" > gdbfile +export ALLINEA_DEBUGGER_USER_FILE=gdbfile + +perf-report srun ./myexample.x # <- we will need to change this +``` + +Let's imagine the above example code (`./myexample.x`) always times out, +and we expect it to time out after 1 hour (see `--time` above). +In this case we get no report. + +To get a report on Fram, we can do this instead: + +```bash +# ... +#SBATCH --time=0-01:00:00 +# ... + +module load Arm-MAP/20.0.3 # <- we changed this line + +echo "set sysroot /" > gdbfile +export ALLINEA_DEBUGGER_USER_FILE=gdbfile + +# we changed this line and tell map to stop the code after 3500 seconds +map --profile --stop-after=3500 srun ./myexample.x +``` + +We told map to stop the code after 3500 seconds but to still have some time to +generate a map file. This run will generate a file with a `.map` suffix. From +this file we can generate the profile reports on the command line (no need to +run this as a batch script): + +```bash +$ module load Arm-PerfReports/20.0.3 +$ perf-report prefix.map # <- replace "prefix.map" to the actual map file +``` + +The latter command generates the `.txt` and `.html` reports. diff --git a/_sources/jobs/projects_accounting.md.txt b/_sources/jobs/projects_accounting.md.txt new file mode 100644 index 000000000..80bfc33d7 --- /dev/null +++ b/_sources/jobs/projects_accounting.md.txt @@ -0,0 +1,228 @@ +(projects-accounting)= + +# Projects and accounting + +```{contents} Table of Contents +``` + +## What is quota and why is it needed? + +**Our compute clusters are shared and limited resources** and therefore we +divide the available compute resources in quotas and we specify compute quota +in "billing units". You can think of a billing unit as something that +corresponds to for how long one processing core could be used. For example, if +your project received 100 billing units you could use one processing core for +100 hours. You could also use 10 processing cores for 10 hours or 20 processing +cores for 5 hours or ... 
+ + +## TL;DR - how to use billing units well + +How billing units are computed is described below but here is what this means for you: + +```{admonition} This is important +- {ref}`Do not ask for a lot more memory than you need `, + otherwise you can get billed for a lot more than you use, and your jobs may + queue for a lot longer than you would like to. In addition this can also + block resources for others. + +- **You get billed for the resources you asked for, not what you used, + with one exception: time**. + Slurm cannot know how long your job will take. If you ask for 5 days but + only use 2 hours, it will subtract "5 days worth of billing units" from your + project/account once your job starts. 2 hours later, it will return to you + the unused quota once your job ends. This means that if you ask for a lot + more time than you actually need, you and your project colleagues may not be + able to get other jobs scheduled in the meantime since Slurm will not let you + overspend your quota. +``` + + +## Projects + +All jobs are run in a _project_ or _account_ (the Slurm queue system calls +projects _accounts_) and the account is something we always specify in our job +scripts to select which project the job should be "billed" in: +```{code-block} bash +--- +emphasize-lines: 4 +--- +#!/bin/bash -l + +# account name +#SBATCH --account=nnABCDk + +# max running time in d-hh:mm:ss +# this helps the scheduler to assess priorities and tasks +#SBATCH --time=0-00:05:00 + +# ... rest of the job script +``` + +Each project has a **CPU hour quota**, and when a job runs, CPU hours are +subtracted from the project quota. If there is not enough hours left on the +quota, the job will be left pending with a reason `AssocGrpBillingMinutes`. + +To see which projects you have access to on a cluster, run (the list of your projects will differ): +```console +$ projects + +nn9997k +nn9999k +nn9999o +``` + + +## How to list available quota + +The command `cost` gives an overview of the CPU hour quota. It can be +run in different ways: + +Show quota information for all projects you have access to: +```console +$ cost +``` + +Show quota information for a project: +```console +$ cost -p YourProject +``` + +Get information about how much each user has run: +```console +$ cost --details +``` + +See `cost --man` for other options, and explanation of the output. +The `cost` command only shows usage in the current _allocation +period_. Historical usage can be found [here](https://www.metacenter.no/mas/projects). + + +## How billing units are computed + +**The term "CPU hour" above is an over-simplification. Jobs are generally accounted for both CPU and memory usage, as well as usage of GPUs.** +The accounting tries to assign a fair "price" to the amount of resources a job +requested. + +Accounting is done in terms of _billing units_, and the quota is in +_billing unit hours_. Each job is assigned a number of billing units +based on the requested CPUs, memory and GPUs. The number that is +subtracted from the quota is the number of billing units multiplied +with the (actual) wall time of the job. + +The number billing units of a job is calculated like this: + +1. Each requested CPU is given a cost of 1. +2. The requested memory is given a cost based on a _memory cost factor_ + (see below). +3. Each requested GPU is given a cost based on a _GPU cost factor_ + (see below). +4. The number of billing units is the _maximum_ of the CPU cost, memory + cost and GPU cost. 
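As a concrete, hypothetical illustration: a job requesting 16 CPUs and 100 GiB of memory on a partition with a memory cost factor of 0.25 units per GiB would get max(16, 100 * 0.25) = 25 billing units; if it then runs for 10 hours, 250 billing unit hours are subtracted from the project quota. The actual cost factors per cluster and partition are listed below.
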
+ +The _memory cost factor_ and _GPU cost factor_ vary between the partitions on the +clusters. + +### Fram + +- Jobs on Fram are only accounted for their CPU usage. + +### Saga + +- The `normal` partition: memory factor is 0.2577031 units per GiB. Thus + the memory cost of a job asking for all memory on a node will + be 46. This is a compromise between the two node types in the + normal partition; they have 40 and 52 CPUs. + +- For the `bigmem` partition, the factor is + 0.1104972 units per GiB. This means that for a job requesting all + memory on one of the "small" bigmem nodes, the memory cost is 40, + while for a job requesting all memory on one of the large nodes, + it is 320. + +- For the `hugemem` partition, the factor is 0.01059603 units per GiB. + This means that for a job requesting all memory on a node, the + memory cost is 64, the number of CPUs on the node. + +- On the `accel` partition, the memory factor is 0.06593407 units per + GiB, and the GPU factor is 6. This means that a job asking for all + memory on a node, or all GPUs on a node, gets a cost of 24, the + number of CPUs on the node. + +- The `optimist` partition has the same memory factor as the `normal` + partition. + + +### Betzy + +- In the `normal` partition, only whole nodes are handed out, so each + job is accounted for 128 units per node, and there is no memory + factor. + +- The `preproc` partition has a memory factor of 0.5245902 units per + GiB, so a job asking for all memory on the node would have a cost of + 128, the number of CPUs on the node. + +- The `accel` partition has a memory factor of 0.1294237 units per GiB, while + the GPU factor is 16 units per GPU. This means that when one reserves 1 GPU + on Betzy the billing is equivalent to reserving 16 CPU cores. + + +## Finding out how many billing units your job consumes + +This only works for running and pending jobs. Here is an example (43 billing units): +```{code-block} console +--- +emphasize-lines: 19 +--- +$ scontrol show job 123456 + +JobId=123456 JobName=example + UserId=... GroupId=... MCS_label=N/A + Priority=19760 Nice=0 Account=nnABCDk QOS=nnABCDk + JobState=RUNNING Reason=None Dependency=(null) + Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0 + RunTime=5-00:08:56 TimeLimit=7-00:00:00 TimeMin=N/A + SubmitTime=2022-10-03T13:43:14 EligibleTime=2022-10-03T13:43:14 + AccrueTime=2022-10-03T13:43:14 + StartTime=2022-10-03T13:43:14 EndTime=2022-10-10T13:43:14 Deadline=N/A + PreemptEligibleTime=2022-10-03T13:43:14 PreemptTime=None + SuspendTime=None SecsPreSuspend=0 LastSchedEval=2022-10-03T13:43:14 Scheduler=Main + Partition=normal AllocNode:Sid=login-2:23457 + ReqNodeList=(null) ExcNodeList=(null) + NodeList=c10-38 + BatchHost=c10-38 + NumNodes=1 NumCPUs=40 NumTasks=1 CPUs/Task=40 ReqB:S:C:T=0:0:*:* + TRES=cpu=40,mem=172000M,node=1,billing=43 + Socks/Node=* NtasksPerN:B:S:C=1:0:*:* CoreSpec=* + MinCPUsNode=40 MinMemoryNode=172000M MinTmpDiskNode=0 + Features=(null) DelayBoot=00:00:00 + OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null) + Command=/cluster/home/... + WorkDir=/cluster/home/... + StdErr=/cluster/home/... + StdIn=/dev/null + StdOut=/cluster/home/... + Power= +``` + + +## How do I get compute and storage quota? + +The process is described here: {ref}`applying-computing-storage`. If you are +unsure about how much to ask for and on which cluster, do not hesitate to +{ref}`contact us `. + +If you exhaust your quota and need more, the project manager can apply for +[additional quota](https://www.sigma2.no/extra-allocation). 
+
+
+## For how long can I use the compute quota?
+
+Compute quota is always handed out for an allocation period. Allocation periods run for six months (from April 1 to September 30, or October 1 to March 31). Unused compute quota is not transferred to the next allocation period. The project manager has to ask for new compute quota for every allocation period.
+
+If you need a small allocation to experiment, you don't need to wait until April or October, but can also apply in-between ({ref}`contact `).
diff --git a/_sources/jobs/scratchfiles.md.txt b/_sources/jobs/scratchfiles.md.txt new file mode 100644 index 000000000..0cf2acaeb --- /dev/null +++ b/_sources/jobs/scratchfiles.md.txt @@ -0,0 +1,206 @@
+(scratchfiles)=
+
+
+# Local storage for scratch files
+
+## Why it matters
+
+Using local storage for scratch files can have a dramatic impact on run time!
+```{note}
+localscratch is only available on Saga and Fram.
+```
+![Different scratch locations](img/localscratch.png)
+
+As illustrated in the figure, random read operations are extremely slow on a global parallel file system, while reading and writing large sequential files is fast. Any file operation on a parallel file system incurs some kind of metadata operation, which is centralised and needs to be coordinated with locks to ensure file integrity. Adding to this is the fact that a large system has a large number of jobs running, each of which might open thousands of files. No wonder most global parallel file systems struggle.
+
+Local handling of files, on the other hand, not only scales with the number of nodes used, it also leads to far less demand on the parallel file system, benefiting all users.
+
+The test run that produced the results in the figure above took 47 minutes using `$LOCALSCRATCH`, while it took 20 hours using `$SCRATCH`.
+
+
+## How to choose location for scratch files
+
+A job typically uses several types of files, including:
+
+- the job script itself
+- the Slurm output file (default: `slurm-<jobid>.out`)
+- input files
+- temporary files
+- output files
+
+There are multiple choices for where to keep files.
+
+| Name | Path | Size | Description |
+| ----------------- | ---------------------------------------- | --------------------- | -------------------------------------------- |
+| Project area | `/cluster/projects/` | quota per project | main project area, for permanent files |
+| User work area | `/cluster/work/users/` | no quota | for temporary user files |
+| Job scratch area | (`$SCRATCH`) `/cluster/work/jobs/` | no quota | for temporary job files |
+| Job scratch area on local disk | (`$LOCALSCRATCH`) `/localscratch/` | few 100GBs per node | a fast disk on the node where the job runs |
+
+An overall description of the areas is given in {ref}`storage-areas`.
+
+Each location has its advantages and disadvantages, depending on usage. The parallel file system is very good for sequential read and write of large files. It is by nature (very) slow for random read and write operations and for metadata operations (handling of a large number of files).
+
+The local file system (`$LOCALSCRATCH`) is far better suited for this. In addition, the parallel file system needs to serve all users, so placing a very high metadata load on it makes the file system slow for all users. On the other hand, the local file system is local to each compute node, and cannot easily be shared between nodes (but see below).
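+
+As a minimal illustration of the staging pattern discussed above (and detailed in the recommendations below), a job-script fragment that routes files through `$LOCALSCRATCH` could look like this sketch. It is not a complete Slurm script; the program name, file names and requested size are placeholders:
+
+```bash
+#SBATCH --gres=localscratch:20G   # request local scratch space (size is a placeholder)
+
+## Stage the input file onto the fast local disk of the compute node:
+cp input.dat ${LOCALSCRATCH}/
+cd ${LOCALSCRATCH}
+
+## Run the application against the local copy (placeholder program name):
+MyProgram input.dat > output.dat
+
+## Copy results back (here: to the submit directory, alternatively the project
+## area) before the job ends and $LOCALSCRATCH is deleted:
+cp output.dat ${SLURM_SUBMIT_DIR}/
+```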
+
+## Recommendations
+
+We recommend that the job script itself and the Slurm output file (`slurm-<jobid>.out`) are kept on the parallel file system. The default location for the Slurm output file is the directory where one runs sbatch. You can also keep both of these files in your home directory, but be aware that the disk quota for home directories is quite small.
+
+
+### Input files
+
+Where to keep input files depends on how they are used.
+
+If an input file is read sequentially (i.e., from start to end), it is best to keep it in the work/home or project area.
+
+```{warning}
+The storage location pointed to by `$LOCALSCRATCH` is limited in size; if a job fails due to lack of space on `$LOCALSCRATCH`, use `$SCRATCH` instead.
+```
+
+If there are a lot of random reads of an input file, it is best to let the job script copy the file to `$LOCALSCRATCH`.
+
+### Temporary files
+
+By temporary files we mean files created by the job, and that are not needed after the job has finished.
+
+Temporary files should normally be created in `$LOCALSCRATCH`, since this is the fastest disk. This is especially important if there are a lot of random reads and/or writes of the files.
+
+If other users need access to files while a job runs, you should create files in the user work area or the project area. Files in the project area are readable by users of the same project, and files in the user work area can be made available to other users.
+
+```{warning}
+Files in the user work area are deleted after some time.
+```
+
+### Output files
+
+By output files we mean files created by the job, and that are needed after the job has finished.
+
+As with input files, if an output file is written sequentially (i.e., from start to end), it is best to create it in the project area.
+
+If there are a lot of random writes (or reads) of an output file, it is best to create it in `$LOCALSCRATCH`, and let the job script copy the file to the project area when the job finishes.
+
+
+### Files in `$LOCALSCRATCH` and `$SCRATCH`
+
+The `$LOCALSCRATCH` area (`/localscratch/`) for each job is created automatically when the job starts, and deleted afterwards. It is located on solid state storage on the compute nodes. Such storage is orders of magnitude faster than normal spinning-disk storage for random access operations. For streaming operations, like writing or reading large sequential amounts of data, the parallel file system is comparable; even tape drives are comparable for streaming/sequential access.
+
+A potential drawback of this scratch area is its limited size. As solid state storage has a higher cost than spinning disks, the area is rather small.
+
+The `$SCRATCH` area (`/cluster/work/jobs/`) is part of the global parallel file system, with its own traits, like far more space and the possibility of sharing files.
+
+Files placed in all scratch areas will automatically be deleted after the job finishes.
+
+
+
+### More on Output files
+
+Output files can also be placed in `$SCRATCH` or `$LOCALSCRATCH` for increased speed (see above).
+
+For files in `$LOCALSCRATCH`, you must use `cp` in the job script to copy the file back to the project area before the job exits.
+
+For files in `$SCRATCH`, you can use the command `savefile filename` in the job script, where `filename` is the name of the file, relative to the `$SCRATCH` area. (`savefile` does *not* work with `$LOCALSCRATCH`.) The command should be placed before the main computational commands in the script.
I.e.,
+```
+savefile MyOutputFile
+MyProgram > MyOutputFile
+```
+This ensures that the file `${SCRATCH}/MyOutputFile` is copied back to the submit directory (the directory you were in when you ran the sbatch command). The file will be copied back even if the job crashes (however, if the compute node itself crashes, the file will not be copied back).
+
+If you want more flexibility, it is possible to register a command to be run to copy the file where you want it, by using `cleanup` followed by that command instead of using the `savefile` command. It should also be placed before the main computational commands. This can also be used for files in `$LOCALSCRATCH`, i.e.,
+```
+cleanup cp $LOCALSCRATCH/MyOutputFile /cluster/projects/ec/mydir
+MyProgram > $LOCALSCRATCH/MyOutputFile
+```
+
+If the `savefile` or `cleanup` commands contain any special characters like `*`, these should be quoted.
+
+
+### Jobs using `$LOCALSCRATCH` with more than one node
+
+As the `$LOCALSCRATCH` area is local to each node, files cannot be shared between nodes like when using `$SCRATCH`. A job running on several nodes will get one `$LOCALSCRATCH` area on each node.
+
+Slurm provides utilities for distributing files to the local scratch areas on several nodes and for gathering files back again. Here is an example to illustrate how this might look:
+```bash
+#!/bin/bash
+#SBATCH --account=YourProject
+#SBATCH --ntasks-per-node=2
+#SBATCH --nodes=2
+#SBATCH --mem-per-cpu=500M
+#SBATCH --time=00:02:00
+#SBATCH --gres=localscratch:100G
+
+## Print the hostnames where each task is running:
+srun hostname
+
+## This copies "hello.c" from your submit dir to $LOCALSCRATCH on each node:
+sbcast hello.c ${LOCALSCRATCH}/hello.c
+
+## Simulate output files created on the $LOCALSCRATCH areas on each node
+## by copying $LOCALSCRATCH/hello.c to $LOCALSCRATCH/bye.c once on each node:
+srun --ntasks-per-node=1 --ntasks=$SLURM_NNODES cp ${LOCALSCRATCH}/hello.c ${LOCALSCRATCH}/bye.c
+
+## This copies the "bye.c" files back to the submit dir:
+sgather ${LOCALSCRATCH}/bye.c bye.c
+```
+
+Slurm `sgather` will append `$HOSTNAME` to each of the files gathered to avoid overwriting anything. Note that you have to set up ssh keys for `sgather` to work, because under the hood, it uses `scp` to transfer the files.
diff --git a/_sources/jobs/slurm_commands.md.txt b/_sources/jobs/slurm_commands.md.txt new file mode 100644 index 000000000..0189ff388 --- /dev/null +++ b/_sources/jobs/slurm_commands.md.txt @@ -0,0 +1,68 @@
+(slurm_commands)=
+
+# Useful Slurm commands and tools for managing jobs
+
+## Slurm
+
+Slurm, the job scheduler used on the HPC clusters, has a number of useful commands for managing jobs. Here is a growing collection of useful Slurm commands; more can be found in the official [Slurm documentation](https://slurm.schedmd.com/documentation.html).
+
+### Commands for sbatch
+
+```{note}
+There are two ways of giving sbatch a command. One way is to include the command in the job script by adding `#SBATCH` before the command (just like you would do for the required sbatch commands such as `#SBATCH --job-name`, `#SBATCH --nodes`, etc.). The other way is to give the command on the command line when submitting a job script. For example, for the command `--test-only`, you would submit the job with `sbatch --test-only job_script.sh`.
+```
+
+#### `--test-only`
+
+Validates the script, reports any missing information (misspelled input files, invalid arguments, etc.) and gives an estimate of when the job will start running.
It will not actually submit the job to the queue.
+
+#### `--gres=localscratch:<size>`
+
+A job on **Fram or Saga** can request a scratch area on local disk on the node it is running on to speed up I/O intensive jobs. This command is not useful for jobs running on more than one node. Currently, there are no special commands to ensure that files are copied back automatically, so one has to do that with cp commands or similar in the job script. More information on using this command is found here: {ref}`job-scratch-area-on-local-disk`.
+
+### Other Slurm commands
+
+#### `sstat` and `sacct`
+
+Job statistics can be found with `sstat` for _running_ jobs and with `sacct` for _completed_ jobs. In the command line, use `sstat` or `sacct` with the option `-j` followed by the job id number.
+
+```console
+$ sstat -j JobId
+$ sacct -j JobId
+```
+
+Both `sstat` and `sacct` have an option `--format` to select which fields to show. See the documentation on `sstat` [here](https://slurm.schedmd.com/sstat.html) and on `sacct` [here](https://slurm.schedmd.com/sacct.html).
+
+## Tools
+
+Here is a growing collection of useful tools available on the command line.
+
+### seff
+
+`seff` is a nice tool which we can use on **completed jobs**. For example, here we ask for a summary of job number 4200691:
+
+```console
+$ seff 4200691
+```
+
+```{code-block}
+---
+emphasize-lines: 9-10
+---
+Job ID: 4200691
+Cluster: saga
+User/Group: user/user
+State: COMPLETED (exit code 0)
+Cores: 1
+CPU Utilized: 00:00:01
+CPU Efficiency: 2.70% of 00:00:37 core-walltime
+Job Wall-clock time: 00:00:37
+Memory Utilized: 3.06 GB
+Memory Efficiency: 89.58% of 3.42 GB
+```
+
+### Slurm Job Script Generator
+
+A tool for generating Slurm job scripts tailored for our HPC clusters: [Slurm Job Script Generator](https://open.pages.sigma2.no/job-script-generator/)
diff --git a/_sources/jobs/slurm_output.md.txt b/_sources/jobs/slurm_output.md.txt new file mode 100644 index 000000000..accd23304 --- /dev/null +++ b/_sources/jobs/slurm_output.md.txt @@ -0,0 +1,173 @@
+# Understanding the job output file
+
+When a job begins to run, it will create a file called `slurm-<jobid>.out`. All output, both standard output and standard error, is sent to this file. In this way we can monitor what our application is doing when it is running under Slurm. Slurm will also ensure that, no matter how many tasks or nodes are used, everything printed to standard output is collected into this output file.
+
+```{tip}
+When contacting support with a job error, always attach the Slurm script used to submit the job as well as the Slurm output file. This helps us understand the error and reduces miscommunication.
+```
+
+To illustrate the output we will use the following Slurm script:
+
+```{eval-rst}
+.. literalinclude:: slurm_output/run.slurm
+   :language: bash
+```
+
+This script does nothing special and follows the best practices for a Slurm script as described in {ref}`our introduction to Slurm scripts`. We ran the script on Saga, which produced an output file called `slurm-4677199.out`.
+
+The anatomy of the output file is as follows (we will explain each part in detail below):
+
+1. Header created when the application launched
+2. Application specific output
+3. CPU and memory statistics
+4. Disk read/write statistics
+5. (Optional) GPU statistics
+
+## Output header
+
+Below, we have attached the first 15 lines of the output from our sample application above, which contain some shared information and some job-specific output.
The first highlighted line shows which node(s) the job ran on and when it started; this is always added to the output. Next we have highlighted the beginning of the `module list` output. This output can be very useful for checking that the job does what you expect, and it allows you to see which software versions you were using.
+
+```{eval-rst}
+.. literalinclude:: slurm_output/slurm-4677199.out
+   :language: bash
+   :lines: 1-15
+   :emphasize-lines: 1, 8-15
+```
+
+After this follows the application output, which we will not show here as that is application specific. However, know that using standard output to log what your application is doing at different times can be a good way to better understand how your application is running on the HPC machines.
+
+## CPU statistics
+
+```{note}
+Slurm collects statistics every second, which means that for applications that run for a short amount of time the CPU, memory, disk and GPU statistics can be a bit misleading. Keep this in mind when developing your Slurm script and when scaling up.
+```
+
+Once the application finishes running, either successfully or not, some statistics about resource usage are printed. The first of these are the CPU and memory statistics. Using our example from above, here are the CPU and memory statistics.
+
+```{eval-rst}
+.. literalinclude:: slurm_output/slurm-4677199.out
+   :language: bash
+   :lines: 82-94
+```
+
+In the above output we can see a few different things, but let's first explain the meaning of `batch` and `extern`. When you submit a Slurm script, the script itself is counted as `batch` by Slurm. That means that any resources used by your Slurm script are accounted for under this heading. If you run a command directly within the Slurm script, the time this command used will be accounted for under `batch`. Looking back at our Slurm script above, we can see that the main application was `pytest`. We could alternatively have used `srun pytest`, which would create a new line in the output accounting for everything under the `srun` call (multiple `srun` calls are accounted in separate line items in the output). `extern` is everything outside these two usages and should be fairly small. One example of `extern` usage is SSH-ing to the node where your code is running to inspect it; the resources used during the inspection would be accounted for under `extern`.
+
+---
+
+Below, we have highlighted the `batch` step in both the CPU and memory statistics; this is most likely where you would find the most useful information. In the CPU statistics we can see that we allocated 16 CPU cores (`AllocCPUS`) for 1 task (`NTasks`), that the script ran for 1 minute (`Elapsed`), but that the 16 CPU cores ran for around 2 minutes in total (`AveCPU`). Lastly, we can see that the job ended successfully since the `ExitCode` is `0:0` (an exit code of `0` means success on Linux).
+
+```{eval-rst}
+.. literalinclude:: slurm_output/slurm-4677199.out
+   :language: bash
+   :lines: 82-94
+   :emphasize-lines: 5, 12
+```
+
+The other line we highlighted above is the memory footprint of our application.
+The most pertinent information here is the maximum memory used (`MaxRSS`) (note that this is the maximum memory used by any task and _not_ the total memory footprint of the application as a whole) and the average memory used (`AveRSS`), which is the average Resident Set Size over all tasks in the given job step (for example, if you run `srun` several times in a job, each will get its own `AveRSS`, and the job script itself is counted as the `batch` job step). Above, we can see that our application used a maximum of `150344 KiB` (or around `147 MiB`). This information is very important as it can be used to optimize your Slurm script to request less memory (and thus be scheduled quicker).
+
+## Disk statistics
+
+The next information shows our application's disk usage. This information is important because it can help you reduce unnecessary disk access, which is slow, particularly for small files.
+
+```{eval-rst}
+.. literalinclude:: slurm_output/slurm-4677199.out
+   :language: bash
+   :lines: 96-101
+   :emphasize-lines: 5
+```
+
+As we can see, our test application reads a bit of data, but writes very little.
+
+## GPU statistics
+
+```{note}
+GPU statistics are only output if your Slurm script {ref}`requests one or more GPUs`.
+
+Currently, only Saga supports automatic GPU metric collection. We are working on enabling the same support on Betzy. One alternative that works on both Saga and Betzy, but requires manual intervention, {ref}`is described in our CUDA introduction`.
+```
+
+To give some indication of how your application interacts with GPUs, the following GPU statistics are collected during a run. The first thing to note in the GPU output is that statistics are collected for all GPUs requested, and each GPU is displayed separately. Below we have highlighted the information of most interest.
+
+```{eval-rst}
+.. literalinclude:: slurm_output/slurm-4677199.out
+   :language: bash
+   :lines: 103-141
+   :emphasize-lines: 14, 17-20, 34-36
+```
+
+`Max GPU Memory Used` is quite self-explanatory, but can be important to check (maybe your application can trade higher memory usage for more performance). `SM Utilization` describes how well our application used the GPU compute resources. If the maximum value here is low, this could be an indication that your application is not utilizing the requested GPUs. For our example application, not all steps utilized a GPU, so we get a medium average utilization. The other variable to pay extra attention to is `Memory Utilization`, which describes how well our application utilized the GPU memory bandwidth. For longer-running applications, optimizing memory transfers is one of the first optimizations that should be undertaken.
+
+```{note}
+The information under `PID` tends to not be as accurate as the other information, so take this with a grain of salt.
+```
diff --git a/_sources/jobs/submitting.md.txt b/_sources/jobs/submitting.md.txt new file mode 100644 index 000000000..04e302ea0 --- /dev/null +++ b/_sources/jobs/submitting.md.txt @@ -0,0 +1,107 @@
+(submitting-jobs)=
+
+# Submitting jobs
+
+The HPC clusters are resources that are shared between many users, and to ensure fair use, everyone must do their computations by submitting jobs through a queue system (batch system) that will execute the applications on the available resources. In our case [Slurm](https://slurm.schedmd.com/) is used as workload manager and job scheduler.
+
+When you log in to a cluster, you are logged in to a _login_ node shared by all users. The login nodes are meant for logging in, copying files, editing, compiling, running short tests (no more than a couple of minutes), submitting jobs, checking job status, etc. If you are unsure about the basic interaction with Unix-like systems, [here](https://effective-shell.com/) is a good resource to start with. Jobs started via Slurm run on the _compute nodes_.
+
+Note that it is _not_ allowed to run jobs directly on the login nodes.
+
+```{note}
+Email notification from completed Slurm scripts is currently disabled **on all machines** and it looks like it will take quite a while (months?) before we can re-enable it. Sorry for the inconvenience. The reason is technical, due to the way the infrastructure is set up. It is non-trivial for us to re-enable this in a good and robust and secure way.
+```
+
+
+## Jobs
+
+It is possible to run commands interactively on the cluster, which can be a good way to test your commands, or work with interactive applications like MATLAB. See [interactive](interactive_jobs.md) for more details. However, the normal way to run a computation on the cluster is to submit a [job script](job_scripts.md) into a _job queue_, and the job is started when one or more suitable _compute nodes_ are available.
+
+Job scripts are submitted with the [sbatch](https://slurm.schedmd.com/sbatch.html) command:
+
+    sbatch YourJobscript
+
+The `sbatch` command returns a _jobid_, a number that identifies the submitted job. The job will be waiting in the job queue until there are free compute resources it can use. A job in that state is said to be _pending_ (PD). When it has started, it is called _running_ (R). Any output (stdout or stderr) of the job script will be written to a file called `slurm-<jobid>.out` in the directory where you ran `sbatch`, unless otherwise specified.
+
+It is also possible to pass arguments to the job script, like this:
+
+    sbatch YourJobscript arg1 arg2
+
+These will be available as the variables `$1`, `$2`, etc. in the job script, so in this example, `$1` would have the value `arg1` and `$2` the value `arg2`.
+
+All commands in the job script are performed on the compute node(s) allocated by the queue system. The script also specifies a number of requirements (memory usage, number of CPUs, run-time, etc.), used by the queue system to find one or more suitable machines for the job.
+
+
+### More information about Slurm
+
+- For more information about the Slurm parameters and job script settings, see [Slurm parameter](job_scripts/slurm_parameter.md).
+
+- A more detailed description of the queue system can be found in [Queue System Concepts](submitting/queue_system_concepts.md).
+
+- If you are already used to PBS/Torque, but not Slurm, you might find [Porting from PBS/Torque](guides/porting_from_pbs.md) useful.
+
+
+## Job Queue
+
+Jobs in the job queue are started on a priority basis, and a job gets higher priority the longer it has to wait in the queue. A detailed description can be found in [Job Scheduling](submitting/job_scheduling.md).
+
+To see the list of running or pending jobs in the queue, use the command [squeue](https://slurm.schedmd.com/squeue.html). Some useful `squeue` options:
+
+    -j jobids    show only the specified jobs
+    -w nodes     show only jobs on the specified nodes
+    -A projects  show only jobs belonging to the specified projects
+    -t states    show only jobs in the specified states (pending, running, suspended, etc.)
+    -u users     show only jobs belonging to the specified users
+
+All specifications can be comma-separated lists. Examples:
+
+    squeue -j 14132,14133 # shows jobs 14132 and 14133
+    squeue -w c23-11      # shows jobs running on c23-11
+    squeue -u foo -t PD   # shows pending jobs belonging to user 'foo'
+    squeue -A bar         # shows all jobs in the project 'bar'
+
+To see all pending jobs, in priority order, you can use `pending`, which is a small wrapper for `squeue`. See `pending --help` for details and options.
+
+For a description of common job states, see {ref}`job-states`.
diff --git a/_sources/jobs/submitting/job_scheduling.md.txt b/_sources/jobs/submitting/job_scheduling.md.txt new file mode 100644 index 000000000..165107461 --- /dev/null +++ b/_sources/jobs/submitting/job_scheduling.md.txt @@ -0,0 +1,62 @@
+---
+orphan: true
+---
+
+# Scheduling
+
+Jobs are scheduled based on their priority. In addition, lower priority jobs can be back-filled if there are idle resources.
+
+Note that job priority is only affected by the {ref}`job-types` and how long the job has been pending in the queue. Notably, job size and fair share usage do _not_ affect the priorities.
+
+## Job Priorities
+
+The priority setup has been designed to be as predictable and easy to understand as possible, while trying to maximize the utilisation of the cluster.
+
+The principle is that a job's priority increases 1 point per minute the job is waiting in the queue[^1], and once the priority reaches 20,000, the job gets a reservation in the future. It can start before that, and before it gets a reservation, but only if it does not delay any jobs with reservations.
+
+The different job types start with different priorities:
+
+- _devel_ jobs start with 20,000, so get a reservation directly
+- _short_ jobs start with 19,880, so get a reservation in 2 hours
+- _normal_, _bigmem_, _hugemem_, _accel_ and _preproc_ jobs start with 19,760, so get a reservation in 4 hours
+- "Unpri" _normal_, _bigmem_, _hugemem_, _accel_ and _preproc_ jobs[^2] start with 19,040, so get a reservation in 16 hours
+- _optimist_ jobs start with 1 and end at 10,080, so they never get a reservation
+
+The idea is that once a job has been in the queue long enough to get a reservation, no other lower priority job should be able to delay it. But before it gets a reservation, the queue system backfiller is free to start any other job that can start now, even if that will delay the job. Note that there are still factors that can change the estimated start time, for instance running jobs that exit sooner than expected, or nodes failing.
+
+
+## Job Placement on Fram
+
+Note that this section _only_ applies to Fram!
+
+The compute nodes on Fram are divided into four groups, or *islands*. The network bandwidth within an island is higher than the throughput between islands. Some jobs need high network throughput between their nodes, and will usually run faster if they run within a single island. Therefore, the queue system is configured to run each job within one island, if possible. See {ref}`job-placement-fram` for details and for how this can be overridden.
+
+
+**Footnotes**
+
+[^1]: Currently, only the priority of 10 jobs for each user within each project increases with time. As jobs start, more priorities start to increase. This is done in order to avoid problems if a user submits a large number of jobs over a short time.
Note that the limit is per user and project, so if a user has jobs in several projects, 10 of the user's jobs from each project will increase in priority at the same time. This limit might change in the future.
+
+[^2]: In some cases when a project applies for extra CPU hours because it has spent its allocation, it will get "unprioritized" hours, meaning its jobs will have lower priority than standard jobs (but still higher than `optimist` jobs).
diff --git a/_sources/jobs/submitting/queue_system_concepts.md.txt b/_sources/jobs/submitting/queue_system_concepts.md.txt new file mode 100644 index 000000000..2a75840a6 --- /dev/null +++ b/_sources/jobs/submitting/queue_system_concepts.md.txt @@ -0,0 +1,89 @@
+---
+orphan: true
+---
+
+(queue-system)=
+
+# Queue system concepts
+
+This page explains some of the core concepts of the Slurm queue system.
+
+For an overview of the Slurm concepts, Slurm has a beginner's guide: [Quick Start](https://slurm.schedmd.com/quickstart.html).
+
+## Partition
+The nodes on a cluster are divided into sets, called _partitions_. The partitions can be overlapping.
+
+Several {ref}`job-types` on our clusters are implemented as partitions, meaning that one specifies `--partition` to select the job type -- for instance _bigmem_, _accel_ and _optimist_.
+
+## QoS - Quality of Service
+A _QoS_ is a way to assign properties and limitations to jobs. It can be used to give jobs different priorities, and to add or change the limitations on the jobs, for instance the size or length of jobs, or the number of jobs running at one time.
+
+Several {ref}`job-types` on our clusters are implemented as a QoS, meaning that one specifies `--qos` to select the job type -- for instance _preproc_, _devel_ and _short_. The jobs will then (by default) run in the standard (_normal_) partition, but have different properties.
+
+## Account
+An _account_ is an entity that can be assigned a quota for resource usage. All jobs run in an account, and the job's usage is subtracted from the account's quota.
+
+Accounts can also have restrictions, like how many jobs can run in them at the same time, or which reservations their jobs can use.
+
+On our clusters, each project has its own account, with the same name as the project ("nnXXXXk"). Some projects also have an account "nnXXXXo" for running _optimist_ jobs. We use accounts mainly for accounting resource usage.
+
+Read more about {ref}`projects-accounting`.
+
+## Jobs
+Jobs are submitted to the job queue, and start running on assigned compute nodes when there are enough resources available.
+
+### Job step
+A job is divided into one or more _job steps_. Each time a job runs `srun` or `mpirun`, a new job step is created. Job steps are normally executed sequentially, one after the other. But please also see {ref}`running-job-steps-parallel`. In addition to these, the batch job script itself, which runs on the first of the allocated nodes, is considered a job step (`batch`).
+
+`sacct` will show the job steps. For instance:
+
+    $ sacct -j 357055
+           JobID    JobName  Partition    Account  AllocCPUS      State ExitCode
+    ------------ ---------- ---------- ---------- ---------- ---------- --------
+    357055              DFT     normal    nn9180k        256  COMPLETED      0:0
+    357055.batch      batch               nn9180k         32  COMPLETED      0:0
+    357055.exte+     extern               nn9180k        256  COMPLETED      0:0
+    357055.0      pmi_proxy               nn9180k          8  COMPLETED      0:0
+
+The first line here is the job allocation.
Then comes the job script step (`batch`), an artificial step that we can ignore here (`extern`), and finally a job step corresponding to an `mpirun` or `srun` (step 0). Further steps would be numbered 1, 2, etc.
+
+### Tasks
+Each job step starts one or more _tasks_, which correspond to processes. So, for instance, the processes (MPI ranks) in an MPI job step are tasks. This is why one specifies `--ntasks` etc. in job scripts to select the number of processes to run in an MPI job.
+
+Each task in a job step is started at the same time, and they run in parallel on the nodes of the job. `srun` and `mpirun` will take care of starting the right number of processes on the right nodes.
+
+(Unfortunately, Slurm also refers to the individual instances of {ref}`array-jobs` as _array tasks_.)
diff --git a/_sources/known-issues.md.txt b/_sources/known-issues.md.txt new file mode 100644 index 000000000..198a01f8e --- /dev/null +++ b/_sources/known-issues.md.txt @@ -0,0 +1,48 @@
+---
+orphan: true
+---
+
+(known-issues)=
+# Known issues
+
+For the latest changes and events, please follow the [OpsLog page](), which tracks changes and events.
+
+This "Known issues" page, on the other hand, is meant to list longer-term problems that we know about and either work on or where we recommend work-arounds, so that you don't have to scroll the [OpsLog page]() weeks back to find a known issue.
+
+
+- **Email notification from completed Slurm scripts is currently disabled** on all machines and it looks like it will take quite a while (months?) before we can re-enable it. Sorry for the inconvenience. The reason is technical, due to the way the infrastructure is set up. It is non-trivial for us to re-enable this in a good and robust and secure way.
+
+- **InfiniBand problems on Betzy**: There is a problem with high core count jobs failing due to an InfiniBand problem which emits "excess retry" messages. This is an ongoing problem, but shows up (more) on high core count jobs.
+
+- **Running jobs hang and consume compute resources**: on Fram and Betzy there is a randomly occurring problem that results in zombie / unkillable processes. Among other cases, this happens when some of the application processes execute `MPI_Abort`, or otherwise crash while other ranks are performing MPI communication. With Intel MPI, this often results in the job hanging forever, or until it runs out of the Slurm-allocated time. At this stage, to avoid this issue users should either make sure that all ranks call `MPI_Abort` at the same time (which might of course be impossible), or use OpenMPI. In the latter case, although the zombie processes might also be created, we believe this does not result in a hanging application and waste of compute resources.
+
+- **Slow performance with netCDF4 files**: A few users have reported this, and it seems to be related to the problem described [here](https://github.com/Unidata/netcdf-c/issues/489). A solution seems to be to convert the files from the netcdf-4 format to the netcdf-64bit offset format with the command `$ nccopy -k nc6 file1 file2` (see the sketch right after this list).
+
+- Don't use `--hint=nomultithread` in jobs on Fram, at least not with Intel MPI. If you do, the result is that all the tasks (ranks) will be bound to the first CPU core on each compute node.
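+
+As a usage sketch for the netCDF conversion mentioned above (assuming the netCDF command-line tools are available, e.g. through a module; file names are placeholders):
+
+```console
+$ # load a netCDF module first if the tools are not already in your PATH
+$ ncdump -k input.nc                    # reports the format kind of the file
+$ nccopy -k nc6 input.nc converted.nc   # converts to the 64-bit offset format
+```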
diff --git a/_sources/nird_archive/user-guide.md.txt b/_sources/nird_archive/user-guide.md.txt new file mode 100644 index 000000000..706a85b91 --- /dev/null +++ b/_sources/nird_archive/user-guide.md.txt @@ -0,0 +1,475 @@ +(research-data-archive)= + +# Research Data Archive (NIRD RDA) + +## Introduction + +The Research Data Archive (hereafter called 'the Archive') is a repository that provides long-term storage for research data and is compliant with the Open Archival +Information System (OAIS) reference model {ref}`[1] `. + +The aim of the archive is to provide (public) access to published research data and to promote cross-disciplinary studies. This document is a user-guide for deposit, search and retrieval of datasets. Each of the steps in the archival process are described in the following sections. + +## Depositing a Dataset + +The process for depositing a dataset in the Archive consists of the following stages: + +- {ref}`Identify the dataset. ` +- {ref}`Choose file formats. ` +- {ref}`Log onto the web interface.` +- {ref}`Agree to the terms and conditions. ` +- {ref}`Provide primary metadata. ` +- {ref}`Upload the dataset. ` +- {ref}`Provide secondary metadata. ` +- {ref}`Publish the dataset. ` + +The following sub-sections describe these stages. + +### Important Note + +All steps prior to and including *Upload Dataset* need to be completed during the same session. It is currently not possible to complete the *Provide Primary Metadata* step, log off and come back at a later point in time to perform the *Upload Dataset* step. Certain fields in the *Provide Primary Metadata* step can be left out and completed at a later time (after uploading the dataset). + +(Identify-the-dataset-Archive)= + +### Identify Dataset + +Before archiving a dataset you will need to define it, make sure you have approval to archive the data and understand which type of access license should be applied. + +A dataset must be a collection of related data. Typically, this consists of a collection of files. How a dataset is arranged will vary within communities, research groups and projects. However, a guideline as to what would be accepted for archival is: + +- Datasets resulting from research which is fully or in part supported by the Norwegian Research + Council. + +- Datasets of lasting value from any research discipline. + +- Datasets that are not in the process of being created. Datasets should be in a state where they are + well established (or mature) and will not be altered nor be supplemented by additional data. + Datasets that are related to each other (a series) can be associated in a 'data collection'. + +- Datasets with preferably no access restrictions so that a larger audience can make use of the data + (i.e. it has public access). However, the Archive recognises that certain datasets of restricted + use to a given community may be eligible for archiving. + +(Choose-file-formats)= + +### Choose File Formats + +You should choose open file formats for your data if possible. Open file formats follow an open licence which makes it easier for people to reuse your data as it is more likely that openly available applications exist to read the data (or applications can easily be written to access the data). A list of open file formats can be found on [Wikipedia](https://en.wikipedia.org/wiki/List_of_open_file_formats). You can find more information about open data formats on the [Open Data Formats](https://opendataformats.org) site. 
+ +(Log-onto-the-web-interface-Archive)= + +### Log onto the Web Interface + +To access the Archive web interface, direct your browser to: [https://archive.sigma2.no](https://archive.sigma2.no). You should arrive at the front page shown in Figure 1. You will need to authenticate using your FEIDE or other account either by logging on via the *LOGIN* button on the top-right or via the *DEPOSIT* icon. + +![archive_web_interface](imgs/figure_1_screenshot_of_the_archive_web_interface_front_page.png "archive web interface") +Figure 1: Screenshot of the Archive web interface front page + +The *DEPOSIT* button provides access to the set of pages required for depositing your dataset in the Archive. These pages are accessible once you have authenticated. + +### Request Approval + +If you have never used the Archive before you will be presented with a page informing you that you are not registered. You can submit a request approval from this page. Only approved users are allowed to deposit datasets in the Archive. The Archive Manager will contact you if additional information is required. Approval should be granted within 3 business days (and usually much sooner). + +(Agree-to-the-terms-and-conditions-Archive)= + +### Agree to Terms & Conditions + +Once approval has been granted you will be able to deposit datasets. If you now click the *DEPOSIT* button you will be presented with a page containing a short introduction to the Archive and a link to the Terms and Conditions as shown in Figure 2. The Terms and Conditions outline your responsibilities and those of the Archive. You will need to agree to these before you can start the deposit process. + +![the_terms_and_conditions_page](imgs/figure_2_screenshot_of_the_terms_and_conditions_page.png "the terms and conditions page") +Figure 2: Screenshot of the Terms and Conditions page + +(Provide-primary-metadata-Archive)= + +### Provide Primary Metadata + +The primary metadata is divided into two parts. The first part covers the publication that best describes the dataset (see Figure 3). The goal of the Archive is to provide long-term storage for datasets that are considered to be of lasting value, and it's quite likely that these datasets will have resulted in publications. You have the option to add more publications in the Secondary Metadata phase if your dataset is best described by more than one publication. The second part of the primary metadata covers the information required by the Archive to identify the dataset so that queries regarding the dataset can be addressed by the appropriate person. + +![the_publication_form](imgs/figure_3_screenshot_of_the_publication_form.png "the_publication_form") +Figure 3: Screenshot of the publication form + +The publication metadata allows you to provide a link to the primary article based on the dataset which allows further, useful information to be associated to the dataset. This information can be very useful to potential users of your dataset. The primary article can be in one of a number of states at the time the dataset is being uploaded to the Archive. For example, an article may be accepted for publication, but not actually published in which case you would select the *Accepted for publication* button and fill in the URI for the article preprint. You can also specify *No publication*, but in this case the Archive will require justification as to why the dataset should be stored. 
For the *Paper in preparation* state, you should update the state of the publication before requesting that the dataset be published, or the Archive may delay publication of the dataset until the article has been published. The Archive will publish datasets in cases where you need the dataset DOI before the paper can be published.
+
+After completing the publication metadata you will be presented with fields for the second part of the primary metadata (see Figure 4). Each of the metadata fields has a help button (the i icon) containing a short description and example of the term. Some of the metadata fields are automatically populated. An Internal Identifier is generated and used by the archive to identify the dataset. The Depositor is also automatically filled in (the information is extracted from the identity provided by FEIDE or OpenIdP) and cannot be altered. Additional Depositors can be provided.
+
+The currently available licences covering access to a dataset are the Creative Commons Attribution 4.0 International (CC BY 4.0 {ref}`[2] `) and the Norwegian Licence for Open Data (NLOD {ref}`[3] `). The licence provides liberal access to the data, which maximises the impact of the data you have published, and it ensures that proper recognition is given to you. The default licence is CC BY 4.0.
+
+By default, metadata for published datasets are considered public and access is granted to all researchers. The Data Manager and Rights Holder that you nominate will be informed by email of the nomination and asked whether they wish to accept the role. The Archive will periodically remind them of their nomination, and you will receive an email when they accept or decline the nomination. It is your responsibility to find suitable entities (persons or organisations) that can accept the roles. Your institute may be able to offer guidance on suitable Data Managers and Rights Holders, or you can email [archive.manager@nris.no](mailto:archive.manager@nris.no) for guidance.
+
+![the_primary_metadata_form](imgs/figure_4_screenshot_of_the_primary_metadata_form_0.png "primary_metadata_form")
+Figure 4: Screenshot of the Primary Metadata form
+
+Click the *Save dataset information* button to save the metadata in the archive. You can modify or update the information at any point before you submit the dataset for publication.
+
+#### Note on restricted access data
+
+It is possible to restrict access to both the dataset and metadata to a group of researchers (although the Archive would encourage you to consider very carefully whether such restrictions are necessary). Note that restricting access to a dataset may require the Archive to impose special terms and conditions on the dataset.
+
+(Section-Upload-Dataset)=
+
+### Upload Dataset
+
+Once the basic metadata has been provided, you will be presented with the dataset upload page (see Figure 5). You can choose to upload a dataset that exists in the NIRD project area [https://www.sigma2.no/data-storage](https://www.sigma2.no/data-storage) or a remote dataset.
+
+![the_upload_dataset_page](imgs/figure_5_screenshot_of_the_upload_dataset_page.png "upload_dataset_page")
+Figure 5: Screenshot of the upload dataset page
+
+#### Upload Dataset from Local Machine
+
+The upload consists of two steps:
+
+- Assembling your dataset into a form the upload mechanism can handle. If your dataset consists of a large number of files (typically more than 10) and directories containing files, it is best to aggregate these into a single file using the tar or zip command.
+
+- Uploading your dataset. There is no restriction on the size of the dataset, but bear in mind that the larger the dataset is, the longer it will take to upload if you are on a slow network.
+
+- **NOTE:** there is a current limitation that does not allow datasets consisting of a tar or zip file and individual files or directories to be uploaded. If your dataset contains a mixture of individual files and tar or zipped files, you can tar or zip the entire collection and upload the dataset.
+
+**NOTE:** For the local machine method you will need to keep the browser window open whilst the upload is in progress. If you have large datasets (e.g. 200 GB or above) you might want to contact the Archive ([archive.manager@nris.no](mailto:archive.manager@nris.no)) to see if there's a more efficient way of transferring the data.
+Choosing *Local machine* for uploading a dataset will result in the upload interface appearing (see Figure 6). Use the *select* button to select files from a file chooser. Once you have selected all the files for upload, click the *Submit Dataset* button to send the data to the archive.
+
+![the_upload_for_feide_users](imgs/figure_6_screenshot_of_the_upload_for_feide_users.png "_upload_for_feide_users")
+Figure 6: Screenshot of the upload for FEIDE users
+
+**NOTE:** once you have clicked *Submit Dataset*, it is currently not possible to make changes to the dataset. If you need to make changes, you will have to cancel the dataset (which will delete the data and metadata) and create a new dataset.
+
+#### Upload Dataset from Project Area
+
+The archive supports deposit of datasets from the Project Area.
+
+In order to ingest data to the Archive from the Project Area:
+
+- Complete the basic metadata form for deposit of a new dataset
+- By choosing the *Project Area* option and clicking the *Submit Dataset* button, you will receive an email containing the dataset identifier (the UUID) and instructions on how to archive your dataset
+- After receiving the ingest dataset mail:
+- Log onto a machine that has access to the project area storage (e.g. *ssh login.nird.sigma2.no* ).
+- Create a manifest file containing the paths to the files that make up the dataset. The structure of the paths should be valid arguments for the UNIX “find ! -type d” command, which is used by the *ArchiveDataset* script. For example, if we define our dataset to consist of all gzipped tar files in the NS1234K project, then the manifest file should contain the line:
+/projects/NS1234K/ -name *.tar.gz
+
+The manifest file can contain more than one line if the dataset spans more than one project or different types of files etc.
+
+- By default, the files that make up the dataset will contain the full path excluding the leading '/' (e.g. projects/NS1234K/subdir1/file1.dat). You can indicate that the root part of the path be removed by adding a “//” where the root path ends.
+
+E.g. to remove “/projects/NS1234K” from “/projects/NS1234K/subdir1/file1.dat” you would add the following to your manifest file: “/projects/NS1234K///subdir1/file1.dat”. This can be used in combination with the regular expressions and globbing that are recognised by the find command. To remove “/projects/NS1234K” from the pattern which will archive all “.tar.gz” files in the directory “/projects/NS1234K/subdir1”, specify the following: “/projects/NS1234K///subdir1 -name *.tar.gz”.
+
+- Run the command:
+
+`ArchiveDataset UUID manifest-file`
+
+This will result in a special file being created that is used by the archiver cron-job that copies the dataset from the project area to the archive. Depending on the size of the dataset, the copy can take quite a bit of time.
+
+- To get the status of your project-area datasets that have been submitted to the queue for archival, use the command:
+
+`ListArchiveDataset [UUID]`
+
+The UUID is optional.
+
+- Once the copy has completed you will receive an email with the results of the copy: how much data was copied and whether the copy was successful or not. At this point the dataset has been safely uploaded to the archive and you can log back onto the web interface to complete the archiving process.
+- You can cancel a request with the command:
+
+`CancelArchiveDataset UUID`
+
+Only datasets that are pending or are in the process of being archived can be cancelled. It is not possible to cancel a dataset that has been archived.
+
+- You will receive an email once the upload has completed. The email will report whether the upload was successful and how much data was uploaded.
+
+**NOTE:** once a dataset has been archived using the *ArchiveDataset* script, it is considered closed and it is not possible to add more files to the dataset. You will need to create a new dataset if you wish to update the dataset.
+
+### Modifying the dataset's data
+
+The archive allows an archived dataset's data to be updated.
+If you find that you need to make a change to data that has been uploaded to the archive, please contact the archive manager at [archive.manager@nris.no](mailto:archive.manager@nris.no) and they will help you to update your dataset.
+
+### List view of Datasets
+
+After uploading your dataset you will be presented with a list of all the datasets that you are associated with (either as a Depositor, Data Manager or Rights Holder), as shown in Figure 7. By default, the list is sorted by date. You can reverse the order by clicking on the *Date submitted* column. Clicking on a row will provide a detailed description of the selected dataset as well as the forms for supplying the secondary and optional metadata for unpublished datasets.
+
+![the_list_of_datasets](imgs/figure_7_screenshot_of_the_list_of_datasets.png "_list_of_datasets")
+Figure 7: Screenshot of the list of datasets
+
+(Provide-Secondary-Optional-Metadata)=
+
+### Provide Secondary & Optional Metadata
+
+Once you have selected a dataset from the table shown in Figure 7, you will be presented with tabbed forms. The first tab is for the mandatory secondary metadata (see Figure 8). The second tab allows you to provide further, optional metadata that you think is necessary for your dataset. The third tab contains a tabular list of the contents of the dataset. Some of the secondary metadata fields, such as the *Published On, Publisher, Phase* and *Lifetime*, are automatically filled in by the Archive.
+
+Clicking the *Update metadata* button will save any changes you made to the metadata and cause the *Last updated* field to be updated. You can change the nominations for the Data Manager or Rights Holder roles provided the existing nominees have not accepted their roles. Some of the metadata terms require you to register the term in the form first. For example, if you wish to add a Creator, you first click the + sign to expand that field. Then you fill in the values for the term and click the *Add Person as Creator* button to register the term.
Then you can click the *Update metadata* button to save the metadata.
+
+The *Optional Information* tab consists of a form of optional metadata. The *Geo-location* is useful for datasets that have spatial relevance. Spatial datasets can be described with Point {ref}`[5] ` or Box {ref}`[6] ` types, where the box type could be used to describe a geographical region and the point type a particular location. Temporal coverage datasets can be described with the Period {ref}`[7] `.
+
+![the_secondary_metadata_form](imgs/figure_8_screenshot_of_the_secondary_metadata_form.png "_secondary_metadata_form")
+Figure 8: Screenshot of the secondary metadata form
+
+Datasets with spatial relevance could be, for example, population in Oslo or marine environmental data from the Norwegian Sea. Temporal data could be the population of Oslo from 1800-1900. For spatial data, the coverage can be defined by a box or by a point. Temporal data can be defined by a period (for example, for geological data, the Devonian period). Click the *Update metadata* button to save changes.
+
+The *Table of contents* tab (see Figure 9) contains a tabular list of the contents of the dataset. If the dataset was deposited as a tar file, the contents of the tar file are displayed. However, if a tar file consisting of tar files is deposited, then only the contents of the outer tar file are displayed (which would be a list of tar files).
+
+![the_table_of_contents](imgs/figure_9_screenshot_of_the_table_of_contents.png "_table_of_contents")
+Figure 9: Screenshot of the table of contents
+
+(Publish-the-dataset-Archive)=
+
+### Publish Dataset (Archiving Data)
+
+Once you have supplied all mandatory metadata for the dataset and a Rights Holder and Data Manager have accepted their roles, you will see a *Publish dataset* button appear in the *Action* column in the list of your datasets (as shown in Figure 7). Pressing the *Publish dataset* button will result in a request being sent to the Archive Manager to publish the dataset. The Archive Manager will verify the metadata information, check that the stakeholders agree to the publication, assign a public identifier to the dataset and make the metadata publicly available (or available to a restricted set of users if that option has been chosen).
+
+## Versioning Datasets
+
+You can create a new version of any published dataset for which you are a stakeholder. This can be useful if you find that there are errors in the dataset and you need to archive a corrected dataset, or you wish to add more data to a dataset. If you only wish to update metadata, please contact [archive.manager@nris.no](mailto:archive.manager@nris.no).
+
+**NOTE:** currently, you will need to upload the complete dataset, as it is not possible to upload only modified data files.
+
+To create a new version of a dataset, navigate to the published dataset either using the DOI or from the *List datasets* option on the web interface, and click on the *Manage* menu (see Figure 10 and Figure 11). You should choose *Create version of dataset* to create a new version of the dataset (see Figure 12). This will bring you to the dataset upload, where you can choose between Project Area upload and local machine upload (see Section {ref}`Upload Dataset ` ). Once you have uploaded your data you will be presented with the metadata form.
+
+![the_edit_dataset_page](imgs/figure_10_screenshot_of_the_edit_dataset_page._the_manage_menu_is_indicated_by_a_black_box.png "_edit_dataset_page")
+Figure 10: Screenshot of the edit dataset page. The *Manage* menu is indicated by a black box
+
+You can then update the metadata accordingly (see Section {ref}`Provide Secondary & Optional Metadata `).
+
+**NOTE:** If you do not wish to make any changes to the metadata, you should click the *Update metadata* button, which will result in the *Publish dataset* button appearing. You can then submit the dataset for publication.
+
+![the_landing_page](imgs/figure_11_screenshot_of_the_landing_page._the_manage_link_is_indicated_by_black_box.png "_landing_page")
+Figure 11: Screenshot of the landing page. The Manage link is indicated by a black box
+
+![the_manage_dataset_menu](imgs/figure_12_screenshot_of_the_manage_dataset_menu.png "_manage_dataset_menu")
+Figure 12: Screenshot of the manage dataset menu
+
+![the_version_dataset_upload](imgs/figure_13_screenshot_of_the_version_dataset_upload.png "_version_dataset_upload")
+Figure 13: Screenshot of the version dataset upload
+
+Once the new version has been published, you will see additional text on the landing page indicating that the dataset replaces a previous version, with a link to the version it replaces. A similar link to the previous dataset will also appear when you access the dataset from the *List datasets* menu item.
+
+
+## Cloning Metadata
+
+If you want to create a new dataset, but wish to use the metadata from an existing dataset, you can choose the *Copy metadata of dataset* menu option in the *Manage* menu (see Figure 13). This will require you to upload a new dataset and choose whether to use the same Data Manager and Rights Holder.
+
+**NOTE:** the Data Manager and Rights Holder will still need to accept their roles, as this is considered a new dataset and not a version.
+
+Once the dataset has been uploaded, you will be presented with a pre-filled metadata form which you can modify. Once you have completed filling in the metadata, you can submit the dataset for publication.
+
+(Section-Terminating-Datasets)=
+
+## Terminating or Withdrawal of Datasets
+
+Normally, published datasets will not be deleted before the end of the retention period specified in the [depositor agreement](https://www.sigma2.no/research-data-archive-depositor-agreement). Beyond the retention period, datasets may be deleted for compelling technical reasons. In this case the deletion is first announced on the Research Data Archive front page and on the dataset’s landing page. This announcement is visible for a period of one year. During this grace period, anyone with an interest in maintaining the dataset can renew the retention time by contacting [archive.manager@nris.no](mailto:archive.manager@nris.no).
+
+Within the retention period, only exceptional reasons (such as copyright violation, see the [depositor agreement](https://www.sigma2.no/research-data-archive-depositor-agreement)) will be considered valid for the deletion or withdrawal of datasets from the archive. In this case you can request termination of your published dataset by filling in the termination request page, which is accessible either from the landing page by clicking the *Manage* link, or from the *List datasets* menu by selecting the dataset you want to terminate and clicking the *Manage* menu (see Figure 10).
+
+**NOTE:** the *Manage* menu is only available for published datasets.
Clicking the *Request termination* link will display the dataset termination request page. You will need to supply a reason why you wish the dataset to be terminated.
+
+Once you have submitted your request it will be reviewed by the Archive Manager, who may contact you to discuss the request further. All stakeholders (the creators of the dataset and the rights holders) will be informed of the request and, provided there are no objections, access to the dataset will be removed.
+
+In these cases the metadata for the terminated dataset will still be visible, but there will be a clear indication that the dataset has been terminated, as well as a reason for the termination. This is to ensure that existing articles referencing the dataset have valid links.
+
+## Searching and Accessing Datasets
+
+The search interface can be accessed either as an authenticated user or anonymously from the front page. The search interface has two tabs: *Basic search* and *Advanced search*. The basic search uses the metadata terms Title, Label, Description and Subject to search for keywords. You can construct more complex queries over the full range of metadata terms using the advanced search (see Figure 14). The search is executed by clicking the *Search* button at the bottom of the page.
+
+The search interface allows searches over partial words, which you can do by placing a \* before, after, or on both sides of the partial word. For example, a search for datasets containing the word test could be achieved by searching for tes\*, and a search for norwegian could be achieved by \*weg\*.
+
+**NOTE:** the advanced search interface requires you to register the search term before executing the search (this entails clicking the +, entering the search term in the appropriate field, clicking the *add* button to register the search term, and then clicking the *Search* button).
+
+![the_advanced_search_interface](imgs/figure_14_screenshot_of_the_advanced_search_interface.png "_advanced_search_interface")
+Figure 14: Screenshot of the advanced search interface
+
+The search is performed over all published datasets and the results are presented as a tabulated list. Clicking on one of the search results will direct you to a landing page which contains a subset of the metadata for the dataset; clicking on *[more]* will display the expanded landing page, including links to the table of contents and dataset download (see Figure 11).
+
+The landing page also contains graphical views for the coverage (if it has been provided) and for the table of contents (see Figure 15). The table of contents provides an immediate view of the composition of the dataset. The chart in the lower-left pane can be navigated to explore the file structure of the dataset. The table in the lower-right pane displays the total size of each sub-directory and is updated as the chart is navigated.
+
+![the_landing_page_table_of_contents](imgs/figure_15_screenshot_of_the_landing_page_table_of_contents.png "_landing_page_table_of_contents")
+Figure 15: Screenshot of the landing page table of contents
+
+### Downloading a Published Dataset
+
+Anyone can download published, public datasets either to their local machine or to the Project Area. If you are a member of a NIRD project [https://www.sigma2.no/data-storage](https://www.sigma2.no/data-storage) you also have the possibility to download the dataset to your project area.
+
+To download a dataset, first navigate to the landing page for the dataset you want to download and click on the *download* link.
The focus on the landing page will change to an index of the dataset contents that can be downloaded (see Figure 16). + +![the_download_index_of_files](imgs/nird-archive-figure-16.png "download index of files") +Figure 16: Screenshot of the download index of files. + +You can click on the link to anonymously download the file to your local machine. To download to another machine or to the NIRD project area you can use the command-line application such as *wget* , or *curl*. For example: + +```bash +wget -P +``` + +or + +```bash +curl -o +``` + +If your dataset has more than one file, you will need to download each file separately (it may be better to create a script to download all the files in your dataset). + +## Citations + +If you use a dataset it is good practice to cite the dataset in any articles you publish. The archive provides a *cite* button on the landing page (see Figure 17) that contains a citation string for the dataset. Currently, two formats for the citation are provided. However, a link to an external service that provides many more formats is provided (you will need to cut and paste the dataset's DOI into the input field). + +![the_landing_page_with_dataset_citation](imgs/figure_17_screenshot_of_landing_page_with_dataset_citation.png "landing page with dataset citation") +Figure 17: Screenshot of landing page with dataset citation + +## Archive API + +The archive provides a set of APIs for programmatic access. The publicly accessible APIs focus on search and access: +- `https://search-api.web.sigma2.no/norstore-archive/metadata/api/basic-search/dois?` + - A GET request that returns the list of DOIs for published datasets as a JSON string with the following schema: + ``` + {"Total Datasets": , + "DOIs": [{"status": , + "date_published": , + "doi": + }]} + ``` + - The API can take the optional arguments: `before=` to return published datasets before a given date and `after=` + - A GET request that returns the metadata for a given dataset DOI. The dataset schema is: + ``` + {"Dataset": {"Category": , + "Publication": [{ + "Status": , + "isPrimary": , + "Reference": {"URL": , + "DOI": , + "Citation": } + }], + "Title": [], + "License": {"Name":, + "URI": , + }, + "Label":[], + "State": , + "Contents-Link": , + "Description": [], + "Access_Rights": {"Public": }, + "download_url":, + "Extent": , + "Publisher": , + "Language": [{"Long_Name": , + "Shors_Name: + }], + "Created": , + "Rights_Holder": {"Person": { + "First_Name: , + "Last_Name: + }}, + "Submitted": , + "Data_Manager": {"Person": { + "First_Name": , + "Last_Name": + }}, + "Identifier": + "Creator": [{"Person": { + "First_Name": , + "Last_Name": + }}], + "Contributor": [{"Person: { + "First_Name: , + "Last_Name": + }}], + "Subject": [{"Domain": , + "Field": , + "Subfield": }] + + }} + ``` + - The Contributor, Rights Holder, Data Manager and Creator can also hold organisations with the schema: + ``` + "Organisation": { + "Long_Name": , + "Short_Name": + } + ``` + - The `download_url` contains a link to the S3 bucket containing all the data that can be downloaded. +- `https://search-api.web.sigma2.no/norstore-archive/metadata/api/basic-search/tableofcontents?identifier=` + - A GET request to return the JSON string containing the tableofcontents for a given DOI. 
The schema of the tableofcontents is: + ``` + {"Total_Files": , + "Previous_Page" , + "Next_Page": , + "TableOfContents": [{ + "Fixity_Algorithm": , "File_Name": , + "Format": , + "Extent": , + "Fixity": + }] + } + ``` +- `https://search-api.web.sigma2.no/norstore-archive/oai/v1.0?verb=` + - A GET request that adopts the OAI-PMH protocol (see {ref}`[10] `). The API is primarily used for harvesting metadata for other registries. The `metadataPrefix=oai_dc` should be used as only the terms that correspond to Dublin Core are returned. The output format is XML. + +## Appendix A: Metadata Schema for Datasets + +The Research Data Archive uses the Dublin Core metadata standard (ISO 15836-1:2017, see {ref}`[8] `) as part of the metadata schema for datasets. The set of mandatory metadata terms that need to be provided by researchers is given in the table below. The terms without a DCMI Reference are defined only for the research data archive. + +| Term | DCMI Reference | Multiplicity | Description | +| ------------- | -------------- | ------------ | ------------- | +| Access Rights | [http://purl.org/dc/terms/accessRights](http://purl.org/dc/terms/accessRights) | 1..n | Information on who can access the dataset if the dataset is private. Requires information on the user (first and last name and email). Default is Public. | +| Article | | 1 | The article that either describes the dataset, or for which the dataset was created. Articles can either be Published, accepted for publication, in preparation, a conference proceeding, or no publication. Ideally, the dataset should be used in a publication. | +| Category | | 1 | The rough category that the dataset fits into. This can be: Calibration, Experiment, Image, Model, Observation, Simulation, Software. | +| Created on| [http://purl.org/dc/terms/created](http://purl.org/dc/terms/created) | 1 | The date that the dataset was created. This should be the date when you created or generated the data and not the date when the data was assembled for archiving.| +| Creator | [http://purl.org/dc/terms/creator](http://purl.org/dc/terms/creator)| 1...n | The people or person and/or organisation that created the data. The creators appear in the citation which describes how the dataset should be cited.| +| Data Manager | | 1...n | The contact person that can rely queries on the dataset to the relevant researchers able to answer the question. The data manager is responsible for ensuring the data remain usable.| +| Depositor | [http://purl.org/dc/terms/contributor](http://purl.org/dc/terms/contributor) | 1...n | The people or person that archives the dataset (either uploading the data, or supplying the metadata or both). In some cases a dataset may consist of data from more than one source where different researchers have access to the different datasets resulting in different researchers uploading the data to the same dataset.| +| Description | [http://purl.org/dc/terms/description](http://purl.org/dc/terms/description) | 1 | A description of the dataset. Ideally, the description should cover what the dataset is, what data is contained and how to use it (or links to resources describing how to use the data, or a reference to a file in the dataset describing how to use the data).| +| Language | [http://purl.org/dc/terms/language](http://purl.org/dc/terms/language) | 1...n | The language any text material in the dataset is written in. This can either be English or Norwegian. 
| +| License | [http://purl.org/dc/terms/license](http://purl.org/dc/terms/license) | 1 | A link to the license for the dataset that governs the use and distribution of the dataset. A selection of licenses is provided with the default being CC-BY-4.0. | +| Rights | [http://purl.org/dc/terms/rights](http://purl.org/dc/terms/rights)| 1 | A description of the various property rights associated to the resource. Ideally, this should be a link to a document describing the rights for a dataset.| +| Rights Holder | [http://purl.org/dc/terms/rightsHolder](http://purl.org/dc/terms/rightsHolder) | 1 | The person or organisation that either holds rights on the dataset, or can act as a contact person for queries on the dataset rights. | +| State | | 1 | The coarse description of dataset state. This can either be raw in the case of unprocessed data, or derived in the case of processed data where the original content is not archived.| +| Subject | [http://purl.org/dc/terms/subject](http://purl.org/dc/terms/subject) | 1...n | The subject is described by a domain, field and subfield that best matches the dataset. More than one subject can be provided in the case the dataset spans overlapping subjects. | +| Title | [http://purl.org/dc/terms/title](http://purl.org/dc/terms/title) | 1 | The title for the dataset.| + +Table A1: The mandatory metadata terms for the NIRD research data archive. Terms without a DCMI Metadata reference are defined only for the research data archive. + +A set if optional metadata can also be supplied for a dataset. The list of optional metadata are given in the table below. + +| Term | DCMI Metadata | Multiplicity | Description | +|------|---------------|--------------|-------------| +| Bibliographic Citation | [http://purl.org/dc/terms/bibliographicCitation](http://purl.org/dc/terms/bibliographicCitation) | 1 | The bibliographic citation can optionally be provided here as a URL that points to a document describing how the dataset should be cited. By default, the archive automatically generates a citation using the list of creators, title and DOI for the dataset.| +| Conforms to | [http://purl.org/dc/terms/conformsTo](http://purl.org/dc/terms/conformsTo) | 1...n | The standards that the dataset conforms to (in the case that the dataset follows a standard structure). | +| Comment | | 1...n | Any comments on the dataset. For example, comments on the quality of the dataset. | +| Geo-location | [https://www.dublincore.org/specifications/dublin-core/dcmi-box/](https://www.dublincore.org/specifications/dublin-core/dcmi-box/) and [https://www.dublincore.org/specifications/dublin-core/dcmi-point/](https://www.dublincore.org/specifications/dublin-core/dcmi-point/) | 1...n | The geospatial coverage for the dataset which can be a point, or as a box. Currently, WGS84 and UTM are supported.| +| Label | | 1 | A short title or label for the dataset (this could be an acronym that is meaningful to the domain).| +| Project | | 1...n | The names of the projects that supported the creation of the dataset. | +| Provenance | [http://purl.org/dc/terms/provenance](http://purl.org/dc/terms/provenance) | 1...n | Information on how the dataset was created, or modified and any change of ownership. | +| Source | [http://purl.org/dc/terms/source](http://purl.org/dc/terms/source) | 1 | This term is relevant for datasets derived from existing datasets. 
It's value should be a persistent URI (eg DOI) to the source dataset.| +| Temporal coverage | [http://purl.org/dc/terms/temporal](http://purl.org/dc/terms/temporal) | 1...n | The time period relevant to the dataset. The period is defined in [https://www.dublincore.org/specifications/dublin-core/dcmi-period/](https://www.dublincore.org/specifications/dublin-core/dcmi-period/).| + +Table A2: The optional metadata terms for the NIRD research data archive. Terms without a DCMI Metadata reference are defined only for the research data archive. + +Metadata for the dataset files is automatically extracted from the file metadata (file size, name, modification date) and stored in the table of contents [http://purl.org/dc/terms/tableOfContents](http://purl.org/dc/terms/tableOfContents) for the dataset. + +## Contacts + +In case of questions or comments please email the archive manager at: [archive.manager@nris.no](mailto:archive.manager@nris.no) + +(references-archive)= + +## References + +[1] Reference Model for an Open Archival Information System. +[https://public.ccsds.org/pubs/650x0m2.pdf](https://public.ccsds.org/pubs/650x0m2.pdf) + +[2] Creative Commons Attribution 4.0 International. +[https://creativecommons.org/licenses/by/4.0/legalcode](https://creativecommons.org/licenses/by/4.0/legalcode) + +[3] Norwegian License for Open Government Data. +[https://data.norge.no/nlod/en/1.0](https://data.norge.no/nlod/en/1.0) + +[5] DCMI Point Encoding Scheme. +[https://dublincore.org/documents/dcmi-point](https://dublincore.org/documents/dcmi-point) + +[6] DCMI Box Encoding Scheme. +[https://dublincore.org/documents/dcmi-box](https://dublincore.org/documents/dcmi-box) + +[7] DCMI Period Encoding Scheme. +[https://dublincore.org/documents/dcmi-period](https://dublincore.org/documents/dcmi-period) + +[8] Dublin Core Metadata Element Set, Version 1.1: Reference Description. +[https://www.dublincore.org/specifications/dublin-core/dces/](https://www.dublincore.org/specifications/dublin-core/dces/) + +[9] DCMI: DCMI Metadata Terms. +[https://www.dublincore.org/specifications/dublin-core/dcmi-terms/](https://www.dublincore.org/specifications/dublin-core/dcmi-terms/) + +[10] Open Archives Initiative Protocol for Metadata Harvesting [http://www.openarchives.org/pmh/](http://www.openarchives.org/pmh/) \ No newline at end of file diff --git a/_sources/nird_toolkit/about-packages/custom-image/about.md.txt b/_sources/nird_toolkit/about-packages/custom-image/about.md.txt new file mode 100644 index 000000000..425fb38f6 --- /dev/null +++ b/_sources/nird_toolkit/about-packages/custom-image/about.md.txt @@ -0,0 +1,25 @@ +--- +orphan: true +--- + +# Configuration + +The custom image package allows you to deploy an arbitrary [Docker](https://docs.docker.com/get-started/overview/) container. + +## Configuring the image + +The common parameters such as host name and machine types are configured in the same way as other application types. +Moreover, you will provide the [name of the docker image](https://docs.docker.com/engine/reference/commandline/tag/) to be deployed, +see image below: + +![Custom image](./custom-image.png) + +### Custom commands, arguments and environment + +If you need to override the command and/or arguments of the container, or provide custom environment variables, +this can be done by using advanced configuration, also shown in the image. + +# Usage + +The URL of the service deployed is accessible in the usual manner through the application's overview page. +Its usage depends on the container deployed. 
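+
+As a minimal sketch of what such a container might look like (the base image, service and port below are hypothetical examples, not NIRD Toolkit defaults), a Dockerfile for a simple web service could be:
+
+```
+# Hypothetical example image for the custom image package:
+# a small Flask web service served by gunicorn.
+FROM python:3.10-slim
+
+# Install the service dependencies
+RUN pip install --no-cache-dir flask gunicorn
+
+# Copy the application code into the image
+COPY app.py /srv/app.py
+WORKDIR /srv
+
+# The service listens on this port; the toolkit routes traffic to the container
+EXPOSE 8080
+
+# Default command; it can be overridden through the advanced configuration described above
+CMD ["gunicorn", "--bind", "0.0.0.0:8080", "app:app"]
+```
+
+Once such an image has been pushed to a public registry, its name and tag are what you provide as the docker image to be deployed.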
diff --git a/_sources/nird_toolkit/about-packages/deep-learning-tools/about.md.txt b/_sources/nird_toolkit/about-packages/deep-learning-tools/about.md.txt new file mode 100644 index 000000000..f4fcee226 --- /dev/null +++ b/_sources/nird_toolkit/about-packages/deep-learning-tools/about.md.txt @@ -0,0 +1,70 @@ +--- +orphan: true +--- + +# Configuration +See the `Configuration` section of the +{ref}`Jupyter package ` . + +# Usage +## What can the deep-learning-tools package be used for? +The purpose of the deep-learning-tools package is to provide a +pre-configure environment for performing deep-learning related tasks. +Widely used deep-learning libraries such as PyTorch, Tensorflow, Keras, CNTK, +mxnet, Theano and caffe2 are pre-installed. This package also provides all the +necessary drivers for using these tools with GPUs, as well as common +data-science libraries such as pandas, scipy and numpy (and many more). + +This package is an extension of the {ref}`Jupyter package ` , so it may be worth +reading its documentation first, as it contains basic information on how to +use the Jupyter notebook. + +### Using GPUs in Jupyter +In order to use a GPU, select a machine type which provides a GPU when +installing or reconfiguring. The machine types that provide GPU are suffixed +with `w/ GPU`. + +![GPU machine type](./deep_learning_gpu_machine_type.png) + +After installing, navigate to the Jupyter notebook, and run the following code +to test whether a GPU is found. The code will output the model name of the GPU. +` +import torch +torch.cuda.get_device_name(0) +` +This example uses PyTorch, but all the other libraries should also be able to find and use GPUs. +The output should be similar to `Pascal Titan X`. + +## Useful introductions to various deep learning libraries +- [Deep learning with PyTorch: A 60 minute blitz](https://pytorch.org/tutorials/beginner/deep_learning_60min_blitz.html) +- [Getting started with Tensorflow + Keras](https://www.tensorflow.org/guide/keras) +- [MXNet - In-depth guide](https://github.com/zackchase/mxnet-the-straight-dope) + + +## Using Jupyter with Apache Spark +The `Connecting a Jupyter notebook to Apache Spark` section of the following +link describes how to use +{ref}`Apache Spark with Jupyter ` . + +## How to add new packages +In case you are missing some packages from the default application image, you can add those packages yourself by creating a custom docker image. +See {ref}`this tutorial ` for generic instructions on how to add packages. + +After having read the tutorial above, you can use the dockerfile below as a starting point when creating the dockerfile that adds new packages. +``` +# See the value of dockerImage in +# +# https://github.com/UninettSigma2/helm-charts/blob/master/repos/stable/deep-learning-tools/values.yaml +# +# to determine the latest base image + +FROM quay.io/nird-toolkit/deep-learning-tools2: + +# Install system packages +USER root +RUN apt update && apt install -y vim + +# Install other packages +USER notebook +RUN pip install fastai +``` diff --git a/_sources/nird_toolkit/about-packages/desktop-vnc/about.md.txt b/_sources/nird_toolkit/about-packages/desktop-vnc/about.md.txt new file mode 100644 index 000000000..191757ca0 --- /dev/null +++ b/_sources/nird_toolkit/about-packages/desktop-vnc/about.md.txt @@ -0,0 +1,30 @@ +--- +orphan: true +--- + +# Usage +## What can the desktop-vnc package be used for? 
+The purpose of the desktop-vnc package is to provide a lightweight linux desktop environment +for performing interactive tasks involving graphical applications and to speed-up their display on local desktop machines. +Note: 3D hardware acceleration is not supported. So it is not suitable for use with heavy 3D rendering application. + + +## How to add new packages +In case you are missing some packages from the default application image, you can add those packages yourself by creating a custom docker image. +See {ref}`this tutorial ` for generic instructions on how to add packages. + +After having read the tutorial above, you can use the dockerfile below as a starting point when creating the dockerfile that adds new packages. +``` +# See the value of dockerImage in +# +# https://github.com/UNINETTSigma2/helm-charts/blob/master/repos/stable/desktop-vnc/values.yaml +# +# to determine the latest base image + +FROM quay.io/nird-toolkit/desktop-vnc: + +# Install system packages +USER root +RUN apt update && apt install -y some_package +USER vncuser +``` diff --git a/_sources/nird_toolkit/about-packages/jupyter/about.md.txt b/_sources/nird_toolkit/about-packages/jupyter/about.md.txt new file mode 100644 index 000000000..9444c2246 --- /dev/null +++ b/_sources/nird_toolkit/about-packages/jupyter/about.md.txt @@ -0,0 +1,116 @@ +--- +orphan: true +--- + +# Configuration + +(Configuration-Jupyter-package)= + +## Enabling JupyterLab +JupyterLab is the next-generation of Jupyter notebooks and provides a more +flexible and powerful user-interface. To enable JupyterLab, go to `Configuration` while installing, and check the `jupyterLab` checkbox. + +## Usage +The Jupyter package provides a pre-configured development environment based +around Jupyter notebooks. Using this notebook, you are able to develop using +Python, Scala and R. + +Below are some tutorials describing how to use Jupyter notebooks for various different things: + +### Installing custom packages +In some cases the pre-installed libraries are not enough. Jupyter notebooks +allow you to run arbitrary shell commands directly from inside the notebook. +In order to run a command, in this case `pip install plotly`, prefix the +command with an exclamation mark, like so: `!pip install plotly`. +![Installing plotly](./jupyter_pip_install.png) + +You should then be able to use the library as you normally would. +![Using plotly](./jupyter_plotly.png) + +Note: in most cases the method outlined above will work, but if it doesn't see the following link for more details: +https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/ + +### Connecting a Jupyter notebook to Apache Spark + +(Apache-Spark-with-Jupyter)= + +#### Prerequisites +1. Knowledge of how to install a package {ref}` Link to package install tutorial ` + is useful, but not needed. + +#### Creating the Spark cluster +First, we need to create an Apache Spark cluster. We will do this by installing +the `spark` package, but you can also connect to any publicly accessible +Spark cluster. + +Fill in the `Name` and `URL` as desired. If you need more than one +worker, or more resources are required for the workers and master, go to the +`Configuration` section, and change these to your desired values. +[Image of spark install form with values filled in] + +When the application is running, copy the Spark master URL in the +`Application notes` section. This will later be used to specify which cluster +the notebook will attempt to connect to. 
+[Image of the Spark application with emphasis on the master URL] + +#### Creating the notebook +It is now time to install Jupyter. Navigate back to the package library, and +go to the Jupyter installation page. Fill in the `Name` and `URL` +once again. + +#### Connecting to Apache Spark +Now go to the `Configuration` section and paste in +the Spark master URL you obtained in one of the previous steps. +The package is now ready to be installed, so go ahead and press `Install`. + +![Jupyter application overview](../../imgs/application_overview.png) + +Click on the link in the `URL` column in order to access the Jupyter notebook. +In order to test the connection to Spark, you first need to create a +notebook. Then, you can use the following code sample in order to use Spark: +```python +import pyspark +import random + +def inside(p): + x, y = random.random(), random.random() + return x*x + y*y < 1 + +sc = pyspark.SparkContext(appName="Pi") +num_samples = 1000000 + +count = sc.parallelize(range(0, num_samples)).filter(inside).count() +pi = 4 * count / num_samples +print(pi) + +sc.stop() +``` +While running the application, you can visit the Spark cluster dashboard by +first navigating to the application overview for the previously created Spark +application, and the visiting the URL in the application overview. +[Image of the Spark dashboard overview] +Here you will be able to see all applications registered with your Spark +Cluster, and you are able to access the Spark logs. (This should probably be a tutorial on its own) + +### How to add new packages +In case you are missing some packages from the default application image, you can add those packages yourself by creating a custom docker image. +See {ref}`this tutorial ` for generic instructions on how to add packages. + +After having read the tutorial above, you can use the dockerfile below as a starting point when creating the dockerfile that adds new packages. +``` +# See the value of dockerImage in +# +# https://github.com/UninettSigma2/helm-charts/blob/master/repos/stable/jupyter/values.yaml +# +# to determine the latest base image + +FROM quay.io/nird-toolkit/jupyter-spark: + +# Install system packages +USER root +RUN apt update && apt install -y vim + +# Install other packages +USER notebook +RUN pip install scikit-learn +``` diff --git a/_sources/nird_toolkit/about-packages/jupyterhub/about.md.txt b/_sources/nird_toolkit/about-packages/jupyterhub/about.md.txt new file mode 100644 index 000000000..a640ef48f --- /dev/null +++ b/_sources/nird_toolkit/about-packages/jupyterhub/about.md.txt @@ -0,0 +1,104 @@ +--- +orphan: true +--- + +# Configuration +## Sharing data among users +To share data among the users of JupyterHub, when installing or reconfiguring, +start by selecting a volume in the persistent storage field. +Then, navigate to the `Configuration` section and check the `enabled` +checkbox in the `sharedData` subsection. + +![Reconfigure jupyter with shared data](./jupyterhub_shared_data.png) + +The shared data is by default not editable by other users, but you can enable +this by unchecking the `readOnly` checkbox. Note that this will allow any +users to add files or edit existing files. + +If you want to mount a subfolder in the persistent storage, specify the path +to the subfolder in the `subPath` field. Note that this path is relative to +the root of the persistent storage. + +### Sharing access to GPUs +In many cases it is useful to share GPUs among all users. 
In order to allow users to access GPUs, +[ipyparallel](https://ipyparallel.readthedocs.io/en/latest/) is used. +ipyparallel works by running separate Jupyter kernels on remote nodes. +This enables a user to with very little modification run their notebook on another GPU enabled machine. +To enable ipyparallel, ensure that `enabled` is ticked in the `ipyparallel` section of the `Advanced configuration`. + +To use ipyparallel within a notebook, run the following script inside a notebook +``` +import ipyparallel as ipp +import random +rc = ipp.Client() +worker_id = random.randint(0, len(rc.ids)-1) +rc = rc.activate(targets=[worker_id]) +``` +which will configure the ipyparallel client and select a single node which is used for execution. + +You are then able to run all code in a notebook cell by using the `%%px` notebook command. Output will be handled in the same manner as if the code was executed on the local kernel. +Each execution node will mount the same volumes as the notebook, but will run as user with very limited access. It is therefore important to make all files used by the code readable to all users. + +So, a cell containing the following code +``` +%%px +with open('/home/e19b7304-2d13ec-2d490f-2d9fa0-2de6d6b4c3a858/file.txt') as f: + print(f.readlines()) +``` +will be able to read the file stored in the users home directory. + +To read more about how to use ipyparallel with Jupyter notebooks, see the following [link](https://ipyparallel.readthedocs.io/en/latest/tutorial/magics.html). +The `%%px` notebook magic also works with regular notebook magic commands, so commands like +``` +%%px +!/home/e19b7304-2d13ec-2d490f-2d9fa0-2de6d6b4c3a858/gpu-enabled-binary +``` +will work if you need to execute other binaries on a GPU enabled machine. + + +# Usage +## Accessing the admin panel +Jupyterhub allows you to administer and access the notebooks that are +associated with the Jupyterhub server. This can for instance be useful in +order to help users with debugging, or shutting down notebooks that use a lot +of resources. + +To access the admin panel, begin by navigating to the `Control Panel` which +should be in on the right-hand side of the Jupyter navigation bar on the top +of the page. + +![Jupyter notebook overview](./jupyterhub_jup.png) + +Then, click on the `Admin` tab in the navigation bar. This tab should be on +the left side of the navigation bar. + +![Jupyter control panel](./jupyterhub_control_panel.png) + +You should then be taken to a page which +allows you to add new users, stop, access and edit the notebooks of individual +users, as well as shutting down the Hub. + +![Jupyter admin panel](./jupyterhub_admin.png) + +### How to add new packages +In case you are missing some packages from the default application image, you can add those packages yourself by creating a custom docker image. +See {ref}`this tutorial ` for generic instructions on how to add packages. + +After having read the tutorial above, you can use the dockerfile below as a starting point when creating the dockerfile that adds new packages. 
+``` +# See the value of userImage in +# +# https://github.com/UninettSigma2/helm-charts/blob/master/repos/stable/jupyterhub/values.yaml +# +# to determine the latest base image + +FROM quay.io/nird-toolkit/jupyterhub-singleuser: + +# Install system packages +USER root +RUN apt update && apt install -y vim + +# Install other packages +USER notebook +RUN pip install scikit-learn +``` diff --git a/_sources/nird_toolkit/about-packages/minio/about.md.txt b/_sources/nird_toolkit/about-packages/minio/about.md.txt new file mode 100644 index 000000000..1b95ba594 --- /dev/null +++ b/_sources/nird_toolkit/about-packages/minio/about.md.txt @@ -0,0 +1,42 @@ +--- +orphan: true +--- + +# Configuration +## Setting a more secure password +In order to set up a more secure password and username, go to the +`Configuration` section, and enter a more random `secretKey` and +`accessKey`. Keep in mind that these will be stored in plain-text and be +accessible in the application `Configuration` tab, so don't reuse these +anywhere. +The `accessKey` and `secretKey` you entered can then be used when login into +minio. + +# Usage +## Uploading files +Begin by creating a new bucket by clicking on the circle with a plus sign in +the right-hand corner, and selecting `Create bucket`. + +![Minio file overview](./minio_overview.png) + +You will then be prompted for a bucket name. + +![Minio add bucket](./minio_bucket.png) + +After giving the bucket a name and pressing enter, you will be able to access +the bucket by selecting it in sidebar on the left side of the screen. + +![Minio selected bucket](./minio_my_bucket.png) + +To upload a file, click the circle again, and select `Upload file`. You should +then be able to select which file(s) to upload. To upload multiple files, hold +`shift` while selecting files. + +![Minio file upload success](./minio_upload_file.png) + +## Sharing files +To share a file with a friend (or enemy), click the three dots furthest to the +right of the filename. You will then be given a link which can be used to +share the file. + +![Minio file sharing](./minio_sharing.png) diff --git a/_sources/nird_toolkit/about-packages/rstudio/about.md.txt b/_sources/nird_toolkit/about-packages/rstudio/about.md.txt new file mode 100644 index 000000000..926a7072a --- /dev/null +++ b/_sources/nird_toolkit/about-packages/rstudio/about.md.txt @@ -0,0 +1,53 @@ +--- +orphan: true +--- + +# Usage + +## How to use RStudio +For a general tutorial on how to use RStudio, +see [RStudio 101](https://dss.princeton.edu/training/RStudio101.pdf). +For a webinar with an in-depth explanation of RStudio can be found, see +[RStudio webinars](https://www.rstudio.com/collections/rstudio-essentials/). + +## Serving Shiny applications +The rstudio package has built-in support for serving [Shiny web applications](https://shiny.rstudio.com/). +Shiny can be used to create interactive web applications directly in R. +By default, a collection of sample applications are hosted on `shiny-`. + +![Shiny default apps](./rstudio_shiny.png) + +By default, all shiny apps are served from the `/srv/shiny-server` +directory. In order to add and host a new app, copy your application to +`/srv/shiny-server`. One way of doing this is using the terminal which is built +into Rstudio. The terminal can accessed through the `Terminal` tab, which is +found besides the `Console` tab in the upper part of the UI. 
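+
+For example, assuming your app lives in a directory called `my_app` in your home directory (a hypothetical path), copying it into the served directory from that terminal might look like:
+
+```bash
+# Copy the app into the directory served by shiny-server (paths are illustrative)
+cp -r ~/my_app /srv/shiny-server/my_app
+```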
+ +![Shiny add app](./rstudio_shiny_app.png) + +For information about how to create new Shiny apps, +see [Learn Shiny](https://shiny.rstudio.com/tutorial/). + +### How to add new packages +In case you are missing some packages from the default application image, you can add those packages yourself by creating a custom docker image. +See {ref}`this tutorial ` for generic instructions on how to add packages. + +After having read the tutorial above, you can use the dockerfile below as a starting point when creating the dockerfile that adds new packages. +``` +# See the value of dockerImage in +# +# https://github.com/UninettSigma2/helm-charts/blob/master/repos/stable/rstudio/values.yaml +# +# to determine the latest base image + + +FROM quay.io/nird-toolkit/rstudio-server: + +# Install system packages +USER root +RUN apt update && apt install -y emacs && rm -rf /tmp/downloaded_packages + +# Install R packages +USER rstudio +RUN install2.r randomForest +``` diff --git a/_sources/nird_toolkit/about-packages/spark/about.md.txt b/_sources/nird_toolkit/about-packages/spark/about.md.txt new file mode 100644 index 000000000..c42f406af --- /dev/null +++ b/_sources/nird_toolkit/about-packages/spark/about.md.txt @@ -0,0 +1,75 @@ +--- +orphan: true +--- + +# Configuration +## Adding more workers +By default, only one worker is created. To create more workers, set `replicas` +(in the `Configuration` panel) to the desired number of workers when +installing the package, or when reconfiguring an existing application. More +workers will then automatically be associated with the cluster. + +# Usage +## Using Apache Spark from a Jupyter notebook +The `Connecting a Jupyter notebook to Apache Spark` section of the following +link describes how to use +{ref}`Apache Spark with Jupyter ` . + +## Accessing the Apache Spark cluster dashboard +Sometimes it is necessary to debug a Spark application. The Spark UI provides +a lot of different information about the Spark cluster, and may thus be useful +to found out why something went wrong. +Assuming you have already installed the Spark package in order to create an +Apache Spark cluster, the dashboard can be found by visiting the application +overview page, and clicking the link in the `URL` column. + +![Spark jobs overview](./spark_jobs_overview.png) + +After clicking the link, and logging in, you will be taken to the Spark +dashboard. Here you will see the available workers, and which applications are +registered with the cluster. + +By clicking on the application name in the dashboard, you will be able to see +the workers currently working on the application. To access their logs, click +on `stdout` or `stderr`. + +![Spark application overview](./spark_app_overview.png) + +To see more details about what the application is currently doing, click on +`Application Detail UI`, which will provide a detailed description of exactly +which functions each worker is currently executing, and at which stage in the +execution they are. + +![Spark application overview](./spark_app_details.png) + +In most cases the worker logs and `Application Detail UI` will provide enough +information to debug your application, but if you need more information, +the following link further explains each part of the UI. +[Apache Spark web UI detailed guide](https://www.ibm.com/support/knowledgecenter/en/SS3H8V_1.1.0/com.ibm.izoda.v1r1.azka100/topics/azkic_c_webUIs.htm) + +## Apache Spark for machine learning +Apache Spark can be used for various machine-learning related tasks. 
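+
+As a small illustration (a sketch with made-up toy data, not taken from the guide below), fitting a logistic regression with Spark MLlib from a connected notebook could look roughly like this:
+
+```python
+from pyspark.sql import SparkSession
+from pyspark.ml.classification import LogisticRegression
+from pyspark.ml.linalg import Vectors
+
+spark = SparkSession.builder.appName("mllib-example").getOrCreate()
+
+# A tiny toy dataset of (label, features) rows
+training = spark.createDataFrame(
+    [(1.0, Vectors.dense(0.0, 1.1)),
+     (0.0, Vectors.dense(2.0, 1.0)),
+     (1.0, Vectors.dense(0.1, 1.2))],
+    ["label", "features"])
+
+# Fit the model and inspect the learned coefficients
+model = LogisticRegression(maxIter=10).fit(training)
+print(model.coefficients)
+
+spark.stop()
+```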
+The following guide provides various examples on [how to use Apache Spark for machine learning](https://spark.apache.org/docs/2.3.1/ml-statistics.html). + +## How to add new packages +In case you are missing some packages from the default application image, you can add those packages yourself by creating a custom docker image. +See {ref}`this tutorial ` for generic instructions on how to add packages. + +After having read the tutorial above, you can use the dockerfile below as a starting point when creating the dockerfile that adds new packages. +``` +# See the value of masterImage/workerImage in +# +# https://github.com/UninettSigma2/helm-charts/blob/master/repos/stable/spark/values.yaml +# +# to determine the latest base image + +FROM quay.io/nird-toolkit/jupyter-spark: + +# Install system packages +USER root +RUN apt update && apt install -y vim + +# Install other packages +USER notebook +RUN pip install scikit-learn +``` diff --git a/_sources/nird_toolkit/application-management.md.txt b/_sources/nird_toolkit/application-management.md.txt new file mode 100644 index 000000000..2b927d1b7 --- /dev/null +++ b/_sources/nird_toolkit/application-management.md.txt @@ -0,0 +1,64 @@ +# Application management + +## Listing all applications +To list all applications, go to the `Applications` tab in the main navigation menu. + +![Application list](imgs/application_list.png) + +On this page, applications in all projectspaces you have access to is listed. +By default, only applications created by you is shown. +To show all applications within a projectspace, click on the `All` button in +the application filtering bar. In the filtering bar, you will also find +functions for filtering applications based on for instance status, +projectspace or general metadata. + +## Viewing the details of a specific application +To view the details of a specific application, click on the name of the application. +You will then be taken to a page containing various information about the +application. + +![Application overview](imgs/application_overview.png) +As seen above, the first section of this page contains general information about the +application, such as the URL you can visit in order to use the application, +when it was created, which projectspace it exists in and email of whoever +created the application. + +Below, you will find a list of the volumes that are +accessible to various parts of the application. + +At the bottom of the page is a list of events. These events show information +about what is currently happening to the application, such as that it +is pulling a Docker image, or mounting a volume. +Most of this information is only useful when the application is initializing or +failing, so most of the time there will be few or none events. + +## Upgrading an application to the newest version +Sometimes it may be useful to reconfigure your application after it has been +created. By clicking on `Modify` and then the `Reconfigure` button, you will be navigated to a page +very similar to the installation page, which allows you to change the +configuration of a package. This is for instance useful if a new version of +the package has been released. + +![Application reconfigure](imgs/application_reconfigure.png) +Note that the application will be restarted and locally stored data will be +lost when reconfiguring. See the `Installing a package`-page for more details +as to what each field means. 
+ +## Starting and stopping an application +If you want to keep the application configuration, but stop using the +resources associated with it, you can stop the application by navigating to +the `Stop` button. After stopping the application, you can start it again by +going to the `Start` button. This will reuse your previous configuration. + +## Deleting an application +When you are done using your application, you can delete it by going to the +`Delete`-button. After the application has been deleted, you may for some time be +able to find the application by listing all applications, and then choose +`All` in the status filtering bar. + +## *Advanced* - Viewing the Kubernetes resources an application is using +To view the Kubernetes resources created by your application, navigate to the +`Resources` tab. This page shows some details about each resource, as well as +linking to the dashboard URL of the different resources (when available). + +![Application Kubernetes resources](imgs/application_resources.png) diff --git a/_sources/nird_toolkit/custom-docker-image.md.txt b/_sources/nird_toolkit/custom-docker-image.md.txt new file mode 100644 index 000000000..65f5af788 --- /dev/null +++ b/_sources/nird_toolkit/custom-docker-image.md.txt @@ -0,0 +1,27 @@ +(custom-docker-image)= + +# Using a custom docker image to add new packages +In case you are missing some packages from the default application image, you can add those packages yourself. +To do so, you need to create a custom docker image that use the default application image as the base image. +[This tutorial](https://docs.docker.com/get-started/part2/) shows how to build a docker image and push it to a container registry. + +See the documentation of the specific package (ex. Jupyter) you want to add packages to for information about what the base dockerfile should look like. + +Typically, the dockerfile looks similar to the following +``` +# The image to use as a base image +FROM quay.io/nird-toolkit/example-docker-image:20230102-xxxxxxx + +# Install system packages +USER root +RUN apt update && apt install -y vim + +# Install other packages +USER notebook +RUN pip install scikit-learn +``` + +You need to have this image pushed to a public repository e.g. [Docker hub](https://hub.docker.com/) or [Quay Registry](https://quay.io). +Once pushed, you can use the docker image by specifying the `dockerImage` under `Configuration` button on the `Installation/Reconfigure` page. +Note that the exact name of the field may very, but the field name should end with `Image` (ex. `workerImage`, `userImage` etc.). +After specifying your custom image and applying those changes, your image will be used in the given instance of application and have all the newly added packages. diff --git a/_sources/nird_toolkit/getting_started_guide.md.txt b/_sources/nird_toolkit/getting_started_guide.md.txt new file mode 100644 index 000000000..795603e48 --- /dev/null +++ b/_sources/nird_toolkit/getting_started_guide.md.txt @@ -0,0 +1,40 @@ +(nird-toolkit-getting-started)= + +# Get ready to deploy a service through the NIRD Toolkit + +Only the project leader or the executive officer of a NSxxxxK project can deploy applications through the NIRD Toolkit. +The project leader/executive officer can also decide who else shall be authorized to deploy application through +the NIRD Toolkit and who shall be able to run the deployed application. +This is done by creating a group in Feide Innsyn and connecting it to the resources in MAS (NSxxxxK). 
+Those members of the group who hold administrative rights will deploy applications, ordinary members will run applications. + +1. Go to [Feide Innsyn](https://innsyn.feide.no/groups) and select the institution you belong to from the drop-down menu. If your institution does not appear there (Feide login), then select "Feide guest users" from the drop-down menu in the "Other alternative login". You will then be redirected to the OpenIdP page. Create an account in OpenIdP by following the procedure and, once the account has been created, use it to log in to Feide innsyn as Feide guest users. + + ![Feide login](imgs/Log-in-feide-innsyn.png "Feide login") + +2. Once logged in, you will be redirected to the Feide Innsyn dashboard. Create a new group by clicking on the link on the bottom of the page ("Create Ad-hoc Group"). + + ![Feide Innsyn dashboard](imgs/feide-innsyn-groups.png "Feide Innsyn dashboard") + + Once the group is created, the dashboard will show information about the group, including the "Group Code" (fc:adhoc:xxxx-xxx-xxxxx-xxxxx). + **Send the Group name, Group Code and preferred short name to contact@sigma2.no to be authorized to deploy a service through the NIRD Toolkit. Please specify which of your NSxxxxK projects you want this group to have access to.** + +3. You can now authorize other co-workers to run the deployed application, by adding them to the newly created group. Click on "Manage Group" and you will be redirected to a page that contains an "Invite other members (email)" section. Enter the email address of the person that you want to invite into the group, and an invitation link will be sent to them. + + ![Feide-innsyn-share-link](imgs/manage-feide-ad-hoc-group.png "Feide Innsyn share-link") + + Once the invited person accepts the invitation, they will appear as a member of the group. + You can make any member the administrator of the ad-hoc group. + +4. Your Feide or Feide OpenIdP that you add to the ad-hoc group needs to be connected to your MAS account in order to deploy applications. Check your account details in [MAS here](https://www.metacenter.no/mas/user/profile) and verify that you are registered through your Feide e-mail. + +5. In order to deploy the application you shall log in to the [NIRD Toolkit](https://apps.sigma2.no) via your Feide or your OpenIdP account, previously added to the group in Feide Innsyn. Select the group previously created in Feide Innsyn in the "Authorized Groups". Now you are able to run your application, which will be connected to the project area NSxxxxK. + + + ![Authorized groups](imgs/application_reconfigure.png "Authorized groups") + +**Log in with OpenIdP** +For those that use OpenIdP, at the login page in the [Toolkit](https://apps.sigma2.no/), choose "Feide guest users" and log in with your OpenIdP account. + +![Feide guest users](imgs/feide-guest-authentication.png "Feide guest users") + diff --git a/_sources/nird_toolkit/overview.rst.txt b/_sources/nird_toolkit/overview.rst.txt new file mode 100644 index 000000000..4ddfbd94e --- /dev/null +++ b/_sources/nird_toolkit/overview.rst.txt @@ -0,0 +1,17 @@ +.. _nird-toolkit: + +NIRD Toolkit +============ + +.. 
toctree:: + :maxdepth: 1 + + overview_nird_toolkit.md + getting_started_guide.md + package-install.md + persistent-storage.md + application-management.md + package-usage.md + custom-docker-image.md + terms.md + diff --git a/_sources/nird_toolkit/overview_nird_toolkit.md.txt b/_sources/nird_toolkit/overview_nird_toolkit.md.txt new file mode 100644 index 000000000..2b3b7521c --- /dev/null +++ b/_sources/nird_toolkit/overview_nird_toolkit.md.txt @@ -0,0 +1,33 @@ +# What is the NIRD Toolkit ? +Say you want to install an application and make it +accessible to the outside world. To do this, you most likely need to have +some knowledge of the computer (such as the OS, hardware specs, network rules etc.) +you will be installing it on, as well as some application specific +knowledge. + +Often, the installation process is error-prone and takes a lot of +time. The NIRD Toolkit attempts to automate this by providing a set of +pre-configured application templates (in the NIRD Toolkit world this is +known as a package), making it possible to install an +application instantly without much configuration. + +Each application created through the NIRD Toolkit provides a user accessible web +interface, as well as the ability to limit access using services similar to +Dataporten. + +The NIRD Toolkit is a Kubernetes based cloud infrastructure, just alike Google, Azure or Amazon kubernetes Clouds. The software runs into containers to ensure high portability of the tools and reproducibility of the results. It is highly customizable, meaning you can have the tools you want with the version you want. + +The NIRD toolkit allows pre/post processing analysis, data intensive processing, visualization, artificial intelligence and machine learning platform. + +To install your first package, go to the [Installing a package](package-install.md) section. + + +## Getting Access + +If you already have access to Sigma2 resources and a project allocation, you can use the NIRD Toolkit. Please see the +{ref}`nird-toolkit-getting-started`. + +If your research team has access to Sigma2 resources, but you do not have a user-account, please visit: https://www.sigma2.no/how-apply-user-account + +If you do not yet have access to Sigma2 resources, please visit these pages in order to apply for access: https://www.sigma2.no/apply-e-infrastructure-resources. + diff --git a/_sources/nird_toolkit/package-install.md.txt b/_sources/nird_toolkit/package-install.md.txt new file mode 100644 index 000000000..af379b2e7 --- /dev/null +++ b/_sources/nird_toolkit/package-install.md.txt @@ -0,0 +1,134 @@ +(Configuration-general-package)= + +# Installing a package +Begin by navigating to the package library in the [Toolkit](https://apps.sigma2.no/). +![Package library](imgs/library.png) + +Here, you will be presented with a selection of different packages. To find a +specific package, you can use the search bar to search for the package name, +keywords or similar features. + +## Getting an overview of a package +Each package in the package list consists of two components. A description of +the package, and a `Install` button. By clicking on the `Install`-button, you will +be taken to the installation page, where you can configure and install the package. + +In many cases it might be useful to visit the package overview package before +installing the package, as this contains information about exactly what will +be installed, and may contain useful information about the configuration of +a package. To find out more about a package, click on Read more. 
+ +After navigating to the package overview page, you will be greeted with a +high-level description of what the package is and what it installs. +![Jupyter package](imgs/jupyter_package.png) + +Near the center of the page, is the package description. This is +the most important part of the package, and explains different features of the +package, as well as how to configure different values. +Various metadata, such as who maintains the package can be found besides the +package description. + + +## Actually installing a package +After reading this description, we should have all the information required to +install the package. To navigate to the package installation page, click on the +`Installation`-button in the menu. + +![Jupyter install page](imgs/jupyter_package_install.png) +To install a package, you are required to fill in some values. +Some common values are present in most packages, and the meaning of these is covered in +{ref}`Appendix A ` . + +The only values that you are required to fill are the `Name` and +`URL` fields. There are however several optional values that can be useful in +some use-cases. Some of these values reside in the `Configuration` +section. What these values means should be covered in +{ref}`Appendix A ` . + +After filling in the `Name` and `URL` fields, you can install the +application by clicking the `Install Package` button. You are then taken to a page +showing an overview of your newly created application. + +![Application overview](imgs/application_overview.png) +When the application is in a running state, the URL you entered on the +installation page will appear in the application overview, and you will be +able to visit and use the application. + +Congratulations, you have now installed your first package! +To find out how to manage your newly created application, +go to the [Managing applications](application-management.md) section. + +(appendix-a-the-meaning-of-each-of-common-fields-in-the-installation-form)= + +## Appendix A: the meaning of each of common fields in the installation form + +### Required fields +#### Name +To make it easier to determine identify applications, you are required give +your application a name. This name is meant as a human friendly way of +describing the application. + +#### Projectspace +In many cases it is useful to group applications belonging to the same project together. +By specifying a projectspace, an application will be grouped in a specific project. +A projectspace may have different resources available to it, and you may not +be able to install applications in all projectspaces. + +Note: if you are familiar with Kubernetes namespaces, then it is worth noting +that a projectspace is just a different name for a namespace. + +#### URL +In order to make it easier to access each application, each application hosted +behind the URL specified using the URL field. + +### Optional fields +#### Authorized groups +Sometimes it may be useful to only allow some groups access to the +application. +the 'authorized groups' selector allows you to specify which groups +are allowed to access the application. + + +#### Storage +Allows you to attach a volume that can be used to persistently +store data. The [enabling persistent storage section](persistent-storage.md) +goes into more detail as to what this does. + +#### Machine type +For an application to work properly, it must allocate some resources. 
+ +![Machine types](imgs/machine_type.png) + +The machine type is used to determine how much resources will be +allocated to the application. a machine type consists of three +resources: + + cpu -> the number of CPU(s) to allocate + memory -> the amount of RAM to allocate + gpu -> (only available in some packages) the number of GPUs to allocate + +To specify how much of each resource is to be allocate, you can select +different resource flavours. Every projectspace as a different amount of +resource available. To see how much of each resource is available, look at the +`Requested resources` field. + +#### Requested resources +The bars in the 'requested resources' section shows how much of each +resource is available. + +![Requested resources](imgs/requested_resources.png) + +Green is used to indicate how much of +the resource you are about to allocate, and orange indicates how much +is already in use. red indicates that you are about to attempt to +allocate more resources than what is available. + +### Common advanced values + +Each package should have an explanation of its non-standard values on the +package overview page. + +#### dockerImage +In order to use custom images, it is in some packages possible to specify a +custom Docker image to use when creating the application. This image should be +publicly available. diff --git a/_sources/nird_toolkit/package-usage.md.txt b/_sources/nird_toolkit/package-usage.md.txt new file mode 100644 index 000000000..31f194331 --- /dev/null +++ b/_sources/nird_toolkit/package-usage.md.txt @@ -0,0 +1,11 @@ +# Packages +* List of available packages: + * [Desktop VNC](about-packages/desktop-vnc/about.md) + * [Jupyter](about-packages/jupyter/about.md) + * [JupyterHub](about-packages/jupyterhub/about.md) + * [Deep learning tools](about-packages/deep-learning-tools/about.md) + * [Apache Spark](about-packages/spark/about.md) + * [Minio](about-packages/minio/about.md) + * [RStudio](about-packages/rstudio/about.md) + + \ No newline at end of file diff --git a/_sources/nird_toolkit/persistent-storage.md.txt b/_sources/nird_toolkit/persistent-storage.md.txt new file mode 100644 index 000000000..f16958fb4 --- /dev/null +++ b/_sources/nird_toolkit/persistent-storage.md.txt @@ -0,0 +1,25 @@ +# Enabling persistent storage +By default, all applications store their data locally. This means that if an +application is stopped or restarted, all local data is lost. Most packages do +however how support for enabling persistent storage. You can enable persistent +storage when installing (or reconfiguring) by using the `Persistent storage` field +to select the volume your application should be using. After selecting the +volume, you will be able to specify whether the volume should be mounted +read-only, and which part of the volume to mount. + +![Persistent storage](imgs/persistent_storage.png) + + +By default, the entire volume is attached (i.e. `/` is +attached). by specifying a subpath, a subpath of the volume can be +mounted. for instance, if you have the following file system: + + / + etc/ + data/ + +and only want to mount the `/data/` folder, you can specify `/data/` as +the subpath in order to just mount this folder. + +Each volume is associated with a projectspace, so if there are no mountable +volumes, contact NIRD Toolkit . 
diff --git a/_sources/nird_toolkit/terms.md.txt b/_sources/nird_toolkit/terms.md.txt new file mode 100644 index 000000000..f8d5f575a --- /dev/null +++ b/_sources/nird_toolkit/terms.md.txt @@ -0,0 +1,26 @@ +# Glossary +## Package +A package represents a configuration of an application. +For instance, installing the Jupyter package will create a +Jupyter notebook (i.e. the application) that you can access through the +web browser. +You may sometimes see packages referred to as a chart. A package is an +extension of a chart. + +Several different packages are available. There are for +instance packages for creating an Apache Spark cluster, setting up a GPU +enabled deep-learning environment, and creating a personal cloud storage +server. + +## Application +An application represents a specific installation of a package. That is, +it encapsulates the configuration specified by the user when installing +the application. + +## Projectspace +Each application belongs to a single projectspace. A projectspace is essentially a +way of grouping applications together. Every projectspace is allocated a given +amount of resources (that is, RAM, CPUs and GPUs) and volumes. + +Note: if you are familiar with Kubernetes namespaces, then it is worth noting +that a projectspace is just a different name for a namespace. diff --git a/_sources/services/easydmp-user-documentation-rfk.md.txt b/_sources/services/easydmp-user-documentation-rfk.md.txt new file mode 100644 index 000000000..7c8e1cd3e --- /dev/null +++ b/_sources/services/easydmp-user-documentation-rfk.md.txt @@ -0,0 +1,100 @@ +--- +orphan: true +--- + +(EasyDMP-User-Documentation-for-the-RFK)= +# EasyDMP User Documentation for the RFK + +- {ref}`Access ` +- {ref}`Create a new DMP ` +- {ref}`Review and manage your plans ` +- {ref}`Export your plan ` +- {ref}`Request support ` +- {ref}`Example plan ` + + +(access-easydmp-rfk)= + +## Access + +The service is free of charge and accessible to everyone. + +The EasyDMP service is accessible here: + +Members from every institution supporting Feide federated identity will be able to login through DataPorten. + +Members from European institutions can authenticate through B2ACCESS login (more info about B2ACCESS [here](https://www.eudat.eu/services/b2access)). + + +(Create-a-DMP-rfk)= +## Create a DMP + +Click on the “+ Create new plan” button shown in the red rectangle in the figure below. + + +![Picture1](imgs/Picture1.png "Picture1") + + +- You will then be presented with a list of templates. You should use the **“Sigma2 Data Management Plan”** template. +- You will then need to fill in the Title of your plan and optionally an abbreviation for your plan. +- The template is divided into 5 Sections: + - **General Project Information** – that covers general information about your project. Some of this information you will have put into your MAS form request. We apologise for this. We are currently working on integrating the DMP tool with MAS so you do not have to provide duplicate information. There are 7 questions in this section, two which are optional. + - **Data** – which covers information on the data that your project will create or use. There are 4 questions in this section, all of which are optional at this stage. + - **Documentation and Metadata** – which covers the metadata and documentation that you will use to describe your data. There are 2 questions in this section that are optional at this stage. + - **Storage** – covers how and where you will manage your data during your project. 
There are up to 8 questions in this section, one of which is optional. + - **End of project** – covers how you will make your data available after your project finishes. There are up to 7 questions in this section, one of which is optional. + +- Note: in some sections, you may notice the question counter at the bottom of the page jump (e.g. 1/8 to 4/8). This is not a bug. The way the questionnaire is designed, all questions are represented as one long chain. Depending on your response to some questions, you may jump to different points in the chain, skipping the questions that are irrelevant. +- Only the questions marked with a red asterisk are required at this time, but all questions are necessary for a complete data management plan. +- You do not have to answer all questions in one go. You can press ‘Save’ and come back to your plan at a later point in time. +- Click on **“Go to plan summary”** to go to a summary view of the plan. From the summary you can edit any question in your plan. +- Some questions have a **“More information”** text box (see the image below) where you can supply additional information relevant to the question. + + +![Picture2](imgs/Picture2.png "Picture2") +The last question in Section 5 will take you to a summary of your plan. If you have completed all the required questions you should see green ticks by the sections (see the figure below). + + + + +![Picture3](imgs/Picture3.png "Picture3") + +(Management-Options-rfk)= + +## Export your plan + +You can click on the ‘cog’ next to your plan title (see the above figure). Choose the **“Export”** option, which will result in your plan being displayed in your browser. You can then either copy the text and paste it into a document or use your browser’s print button to print to a PDF file. + + + + +(Review-and-manage-your-plan-rfk)= + +## Review and manage your plan + + +![Picture4](imgs/Picture4.png "Picture4") + +- You can view all your plans by clicking on **“Your plans”** in the menu bar. You can view any plan by clicking on the title of that plan. From there, you can edit any questions in your plan. +- You can find more management options (see the image below) for your plan by clicking the ‘cog’ icon in the right column. The options are: + - You can share your plan with other users by selecting the **"People"** option. The invited person will receive a mail with a link to edit the DMP. + - You can rename, copy or delete a plan with the **"Rename"**, **"Duplicate"** or **"Delete"** options respectively. + - The **"Export"** option results in the plan being displayed as an HTML document. You can use your browser's "Print" option to print the document to a file. Or, you can cut and paste the text into a new document. + NOTE: better support for exporting documents will appear in the near future. + + + + + +(Request-Support-easydmp-rfk)= +## Request Support + +If you experience any problems, or have any questions, please contact . +Your request will be handled within the next 24 hours on working days. + + + +(Example-plan-rfk)= +## Example plan + +An example plan can be found [here](https://www.sigma2.no/sites/default/files/imce/Support/ExamplePlan.pdf).
diff --git a/_sources/services/easydmp-user-documentation.md.txt b/_sources/services/easydmp-user-documentation.md.txt new file mode 100644 index 000000000..86bb8f7e1 --- /dev/null +++ b/_sources/services/easydmp-user-documentation.md.txt @@ -0,0 +1,108 @@ +(easydmp)= + +# EasyDMP User Documentation + + + +- {ref}`Access ` +- {ref}`DMP templates ` + - {ref}`Science Europe ` + - {ref}`Horizon 2020 ` + - {ref}`Institutional Templates ` + - {ref}` Sigma2 RFK Template ` + +- {ref}`Create a new DMP ` +- {ref}`Review and manage your plans ` +- {ref}`Add a co-editor to a plan ` +- {ref}`Export your plan ` +- {ref}`Publish your plan ` +- {ref}`Request support ` + + +(access-easydmp)= +## Access + +The service is free of charge and accessible to everyone. + +The EasyDMP service is accessible here: + +Members from every institution supporting Feide federated identity will be able to log in through DataPorten. Members from European institutions can authenticate through B2ACCESS login. More info about B2ACCESS [here](https://www.eudat.eu/services/b2access). + +To get the B2ACCESS credentials, select B2ACCESS as the login option in easyDMP. At B2ACCESS, select "Register a new user" on the right-hand side. + +![B2ACCESS](imgs/b2access1.png "B2ACCESS") + + +From the menu, select "Create B2ACCESS account". + +![Create B2ACCESS account](imgs/b2access2.png "Create B2ACCESS account") + +Fill in the form by choosing a username and a password, and agree to the B2ACCESS Terms of Use. In a few seconds you will receive a mail with a link. Once you click on the link, your credentials (username/password) will be active, and you can use them to log in to easyDMP. + +(Select-a-template)= +## Select a template + +EasyDMP is a web form consisting of a series of questions grouped into a number of sections. The questionnaire is dynamic, meaning that the type and number of questions you are presented with at each stage depend on the answers you gave at the previous stage. For example, the Horizon 2020 template consists of up to 70 questions split into six sections. So the maximum number of questions is 70, but most likely you will see far fewer than that. Many of the questions are designed as simple “yes/no” questions and each response results in pre-written text that will appear in your plan. + +(Science-Europe-template)= +### Science Europe template + +Science Europe is an association of European Research Funding Organisations (RFO) and Research Performing Organisations (RPO), based in Brussels. Science Europe has recently published the [Practical Guide to the International Alignment of Research Data Management](https://www.scienceeurope.org/our-resources/practical-guide-to-the-international-alignment-of-research-data-management/), aiming to align the RDM requirements across research funding and research organisations in Europe. The guide is the result of a working group of experts from Science Europe Member Organisations, in consultation with the broader research stakeholder community, including the Norwegian Research Council. The Science Europe guidelines are recommended by the Norwegian Research Council. + +The template in easyDMP that implements the Science Europe guidelines is called *"Science Europe"*. {ref}`A practical guide ` complemented with a set of examples is available to support the creation of DMPs in compliance with Science Europe. The created DMP can be attached to applications for funding to several European funding agencies, including the Norwegian Research Council.
+ +(EU-Horizon-2020-template)= +### EU Horizon 2020 template + +This template implements the **EU Horizon 2020** recommendations with regard to data management. We offer two different flavours of the Horizon 2020 template: a linear template, with a fixed set of guidelines and instructions for filling in the required fields, and a dynamic one that guides you through the process of making the DMP by customizing every question on the basis of your previous answers. + +(Institutional-and-community-specific-template)= +### Institutional and community specific template + +The service can support multiple forms of the questionnaire (referred to as *"templates"*). For example, it is possible to integrate institution-specific or community-specific recommendations into the templates available in easyDMP. If you want to know more about this, contact us: . + +(Create-a-DMP)= +## Create a DMP +- To create a new plan, click on the *“Create a new plan”* menu in the menu bar, or click on *“+ Create new”*: + +![Create a DMP](imgs/yourplans.png "Create a DMP") + + +- There is one question per page. Pressing the *“Prev”* or *“Next”* button takes you to the previous or next question, and saves your answers as well. +- You do not have to answer all questions in one go. You can stop, close your browser and come back to your plan at a later point in time. Remember to save your answer to the question you are on by clicking the *“Next”* button. +- You do not have to answer all questions in order. You can skip questions and come back to them later. +- Click on *“Go to plan summary”* to go to a summary view of the plan. From the summary you can edit any question in your plan. Your changes will be saved if you make a change and press the *“Next”* or *“Prev”* button. +- The light blue bar below the title shows the sections of the current template. You can jump to a section by clicking on the ball indicating the number of that section. +- The *“More Information”* text box allows you to supply additional information relevant to the question. Clicking the *“Next”* or *“Prev”* buttons will result in the question response being saved: + + +![Metadata](imgs/plan3.png "Metadata") + + +(Review-and-manage-your-plan)= +## Review and manage your plan + +- You can view all your plans by clicking on *“Your plans”* in the menu bar. You can view any plan by clicking on the title of that plan. From there, you can edit any questions in your plan. +- You can find more management options for your plan by clicking the ‘cog’ icon in the right column. + + + +![Review and manage your plan](imgs/yourplans2.png "Review and manage your plan") + + +(Management-Options)= +### Management Options: +- You can share your plan with other users by selecting the **"Invite coworkers"** option. The invited person will receive a mail with a link to edit the DMP. +- You can rename, copy or delete a plan with the *"Rename"*, *"Copy"* or *"Delete"* options respectively. +- The **"Publish"** option will just result in the plan being made read-only. NOTE: currently a Digital Object Identifier (DOI) is not issued. +- The **"Export"** option results in the plan being displayed as an HTML document. You can use your browser's *"Print"* option to print the document to a file. Or, you can cut and paste the text into a new document. + NOTE: better support for exporting documents will appear in the near future.
+ + +(Request-Support-easydmp)= +## Request Support + +If you experience any problems, or you have any question, please contact . +Your request will be handled within the next 24 hours during working days. + + diff --git a/_sources/services/science-europe-template-practical-guide.md.txt b/_sources/services/science-europe-template-practical-guide.md.txt new file mode 100644 index 000000000..caa5f734e --- /dev/null +++ b/_sources/services/science-europe-template-practical-guide.md.txt @@ -0,0 +1,268 @@ +--- +orphan: true +--- + +(Science-Europe-template-Practical-Guide)= +# Science Europe template - Practical Guide + + +## How to use the guide + +The Science Europe guidelines consist of six core requirements - Data Collection, Data Quality, Data Storage, Ethical and Legal requirements, Data Sharing and Long Term Preservation, Data Management - translated into 13 questions. The present guide provides for each question (highlighted in italics) a high level description of the required information (in bullet and point), followed by the formulation of the question as it appears in easyDMP (in bold). Finally, one or more examples are provided to offer practical guidance for the formulation of the answer. + +The structure is therefore the following: +### Core requirement + +#### *Question* +- Description: +- ... +- ... + +- **Questions:** + + - **This is the question as it appears in easyDMP** + + - **Example Responses:** + + - ..... + - ... + +**All the question marked with a (\*) are mandatory.** +## The core requirements + +- {ref}`Administrative information ` +- {ref}`Data description and collection or reuse of existing data ` +- {ref}`Documentation and data quality ` +- {ref}`Storage and backup during the research process ` +- {ref}`Legal and ethical requirements, codes of conduct ` +- {ref}`Data sharing and long term preservation ` +- {ref}`Data management responsibilities and resources ` + +(Administrative-information)= +## Administrative information + +- Provide information such as name of applicant, project number, funding programme, (version of DMP). +- **Questions:** + - **Please give the title of the project.** * + - **Example Response:** + - The Example Project + - **Please give the project number, if known.** + - **Example Response:** + - Project 12345678 + - **Please give the funding programme.** * + - Example Response: + - The Norwegian Research Council + +(Data-description-and-collection-or-reuse-of-existing-data)= +## Data description and collection or reuse of existing data + +### *How will new data be collected or produced and/or how will existing data be re-used?* + + - Explain which methodologies or software will be used if new data are collected or produced. + - State any constraints on re-use of existing data if there are any. + - Explain how data provenance will be documented. + - Briefly state the reasons if the re-use of any existing data sources has been considered but discarded. + - **Questions:** + - **Describe how you plan to collect new or use existing data.** * + - **Example Response:** + - The project will use data from three sources: + - Experimental data produced by an X-ray crystallography on our sample materials. The data will be collected at the European Synchrotron Radiation Facility (ESRF ) and will be processed at the facility using the CCP4 () suite of software. Information on when the data was produced, and the conditions will be recorded in electronic notebooks at the ESRF. 
ASCII Log files will be produced by the CCP4 processing (the log files produced standardised output that is described in CCP4 documentation). + - Simulation data on the molecular dynamics of the sample material. Simulation data does exist, but we need to produce more simulation samples to increase the statistical accuracy. For the existing simulations, information on how the data were produced is recorded in log files that are maintained on central servers at institution X (e.g. `https://example.org/institution/simulationData`) that are publicly accessible. Information on the additional simulation produced by this project will be stored in ASCII log files. The information in simulation log files follows the structure described in the documentation at institution X. The reused simulation data and the simulation data produced by this project will be publicly accessible at no cost. All simulations are produced with the GROMACS program (). + - Images of the objects from which the samples were taken. The images will be taken with conventional digital photography. The images contain information on the location and date when they were taken. In addition, information on the conditions in which the photographs were taken will also be electronically recorded in an e-logbook. + + +### *What data (for example the kind, formats, and volumes), will be collected or produced?* + + - Give details on the kind of data: for example numeric (databases, spreadsheets), textual (documents), image, audio, video, and/or mixed media. + - Give details on the data format: the way in which the data is encoded for storage, often reflected by the filename extension (for example pdf, xls, doc, txt, or rdf). + - Justify the use of certain formats. For example, decisions may be based on staff expertise within the host organisation, a preference for open formats, standards accepted by data repositories, widespread usage within the research community, or on the software or equipment that will be used. + - Give preference to open and standard formats as they facilitate sharing and long-term re-use of data (several repositories provide lists of such ‘preferred formats’). + - Give details on the volumes (they can be expressed in storage space required (bytes), and/or in numbers of objects, files, rows, and columns). + - **Questions:** + - **Describe how much data, and the type of data you plan to collect or produce.** * + - **Example Response:** + - The project will create (approximately): + - 100 GB of high-resolution images in the standard JPEG () format. The JPEG format is chosen as a wide variety of software programs can read this format. + - 5 TB simulation data produced by the project will be stored in the GROMACS (GRO ) format that is standard in the community and is open. In addition, we will use 2 TB of existing simulation data. So, a total of 7 TB simulation data. + - 10 TB of experimental data produced by the CCP4 program and stored in the standardised Crystallographic Information File (CIF ) format that has an open license. + +(Documentation-and-data-quality)= +## Documentation and data quality + +### *What metadata and documentation (for example the methodology of data collection and way of organising data) will accompany the data?* + + - Indicate which metadata will be provided to help others identify and discover the data. + - Indicate which metadata standards (for example DDI, TEI, EML, MARC, CMDI) will be used. + _ Use community metadata standards where these are in place. 
+ - Indicate how the data will be organised during the project, mentioning for example conventions, version control, and folder structures. Consistent, well-ordered research data will be easier to find, understand, and re-use. + - Consider what other documentation is needed to enable re-use. This may include information on the methodology used to collect the data, analytical and procedural information, definitions of variables, units of measurement, and so on. + - Consider how this information will be captured and where it will be recorded for example in a database with links to each item, a ‘readme’ text file, file headers, code books, or lab notebooks. + - **Questions:** + - **Describe how you will organise and document your data.** * + - **Example Response:** + - The experimental data will be arranged in directories according to sample, synchrotron beam run and processing run. Information will be recorded in a relational database at the project leader’s institution with access control that enables collaborators to access the information. The metadata schema will be extracted from the ESRF ICAT metadata catalogue and will follow the ICAT schema that is an agreed standard and contains all the information necessary to understand the data. + - The simulation data will be stored in the central repository in institution X and will follow their layout. Metadata information will follow the metadata schema adopted by institution X which is used by many projects in this field. Documentation on the schema is widely available to researchers in the field and enables use of the data. + - The e-logbook information on the digital photographs will follow the Dublin Core metadata standard () to record information on the images. Each image will have a unique identifier that will match the Dublin Core identifier term making it easy for researchers to match the metadata to the data. + - Tutorials and documentation on the tools necessary to analyse the data will be maintained on the project web-site. In some cases, these will be links to widely-used tools. + + + +### *What data quality control measures will be used?* + + - Explain how the consistency and quality of data collection will be controlled and documented. This may include processes such as calibration, repeated samples or measurements, standardised data capture, data entry validation, peer review of data, or representation with controlled vocabularies. + - **Questions:** + - **Describe how you will control the consistency and quality of your data.** * + - **Example Response:** + - For the experimental data the quality of the data is recorded in the ESRF ICAT metadata catalogue and will be replicated to the project metadata catalogue. This metadata contains information on the position of the sample, the experimental station, beam conditions etc which is sufficient to understand the experimental data. The sample itself will be labelled and kept at the project leader’s institution for reference. + - Simulation data quality will be recorded in log files and reference data will be produced during each simulation that will be compared with existing reference data to ensure simulations remain within tolerance. Information on the machines the simulations ran on and when is recorded in the log files which will be archived at institution X. + - Digital photographs will be visually inspected on-site by project collaborators that have the right collect the images to ensure the images are of sufficient quality. 
A checklist of features will be drawn-up by experts in digital photography and each image will require approval by the WP leader responsible for acquiring the images. + +(Storage-and-backup-during-the-research-process)= +## Storage and backup during the research process + +### *How will data and metadata be stored and backed up during the research?* + + - Describe where the data will be stored and backed up during research activities and how often the backup will be performed. It is recommended to store data in least at two separate locations. + - Give preference to the use of robust, managed storage with automatic backup, such as provided by IT support services of the home institution. Storing data on laptops, stand-alone hard drives, or external storage devices such as USB sticks is not recommended. + - Explain how the data will be recovered in the event of an incident. + - Explain who will have access to the data during the research and how access to data is controlled, especially in collaborative partnerships. + - **Questions:** + - **Describe how you will securely store and back up and recover your data during your project.** * + - **Example Response:** + - The experimental data will initially be stored on the ESRF storage facility during data collection. The data will be subject to the ESRF storage and backup procedures () that project members will run in accordance with advice from ESRF. The ESRF provides access control which the project will control. Once the experiment has completed data will be transferred to the Norwegian Infrastructure for Research Data (NIRD) project storage where data backup is provided by NIRD. NIRD provides storage with access control, only project collaborators will be provided access to the NIRD storage for the project. In the case of NIRD and ESRF project storage data being lost the backup procedures provide restore capabilities where lost data can be identified and recovered. + - The simulation data will be produced on the Norwegian High-Performance Computing Facility (FRAM) which will store data on the NIRD project storage to which NIRD applies backup and recovery procedures. During the production data will be subject to access control restrictions to project collaborators. Once the simulation data are validated it will be transferred to the central storage at institution X which provides safe long-term storage for simulation data. Backup and recovery of NIRD storage is provided by NIRD and institution X will enforce the same backup and recovery procedures for the project’s data. + - The images will be transferred from digital camera to the University of Oslo Sensitive Data facility (TSD) as the data contain information of a personal sensitive nature in accordance to Norwegian privacy regulation. Data will be backed-up according to TSD policies and only approved researchers will have access to the images. The project has been approved by the Regional Ethical committee (REK SØ 2019/1234). + + + +### *How will data security and protection of sensitive data be taken care of during the research?* + + - Consider data protection, particularly if your data is sensitive for example containing personal data, politically sensitive information, or trade secrets. Describe the main risks and how these will be managed. + - Explain which institutional data protection policies are in place. 
+ - **Questions:** + - **If your project uses sensitive data describe how you will take care of data protection and security.** + - **Example Response:** + - The images collected are data objects that are of a sensitive nature and therefore are subject to Norwegian legislation for handling and managing sensitive data as implemented by the University of Oslo (). The image data objects will be encrypted and imported inside the TSD by using an encrypted protocol. The camera removable hard-drives containing the images will be scrubbed and destroyed once the data has been transferred to the TSD. Only authorized collaborators will be provided access to the images, and export from the TSD server will be by no means possible. The images will be anonymised in accordance with the sensitive data legislation before exporting them out from the secure TSD area. + +(Legal-and-ethical-requirements-codes-of-conduct)= +## Legal and ethical requirements, codes of conduct + +### *If personal data are processed, how will compliance with legislation on personal data and on security be ensured?* + + - Ensure that when dealing with personal data, data protection laws (for example GDPR) are complied with: + - Gain informed consent for preservation and/or sharing of personal data. + - Consider anonymisation of personal data for preservation and/or sharing (truly anonymous data are no longer considered personal data). + - Consider pseudonymisation of personal data (the main difference with anonymisation is that pseudonymisation is reversible). + - Consider encryption which is seen as a special case of pseudonymisation (the encryption key must be stored separately from the data, for instance by a trusted third party). + - Explain whether there is a managed access procedure in place for authorised users of personal data. + - **Questions:** + - **If your project uses personal data describe how you will ensure compliance with legislation on personal data and security.** + - **Example Response:** + - The project has been approved by the Regional Ethical Committee (REK SØ 2019/1234). Sensitive Personal data will be stored inside TSD and access to the data will be strictly controlled by the Project principal investigator. + - The project will handle sensitive personal data containing information about individuals eating habitudes and health condition. Data collected through questionnaire will be pseudorandomized, and the key-file will be stored in a separate area inside TSD accessible only by the project principal investigator. + - The personal data (including mail address and private addresses) handled in the project will be collected after the explicit consent of the data owner, according to the GDPR regulation. Data will be deleted or anonymised after a maximum of four weeks. + + + + + +### *How will other legal issues, such as intellectual property rights and ownership, be managed? What legislation is applicable?* + + - Explain who will be the owner of the data, meaning who will have the rights to control access: + - Explain what access conditions will apply to the data? Will the data be openly accessible, or will there be access restrictions? In the latter case, which? Consider the use of data access and re-use licenses. + - Make sure to cover these matters of rights to control access to data for multi-partner projects and multiple data owners, in the consortium agreement. + - Indicate whether intellectual property rights (for example Database Directive, sui generis rights) are affected. 
If so, explain which and how will they be dealt with. + - Indicate whether there are any restrictions on the re-use of third-party data. + - **Questions:** + - **Describe how you plan to address other legal issues such as intellectual property rights and ownership.** * + - **Example Response:** + - The project will abide by the University’s intellectual property rights policy (), and the data are subject to no other IPR claims. + - During the course of the project data will remain restricted access until publication of research results. The data used in the publication will be submitted to an archive such as the NIRD research data archive where it will be publicly accessible. A Creative Commons license BY 4.0 () will apply to all the data. + + + +### *What ethical issues and codes of conduct are there, and how will they be taken into account?* + + - Consider whether ethical issues can affect how data are stored and transferred, who can see or use them, and how long they are kept. Demonstrate awareness of these aspects and respective planning. + - Follow the national and international codes of conducts and institutional ethical guidelines, and check if ethical review (for example by an ethics committee) is required for data collection in the research project. + - **Questions:** + - **If your data are impacted by ethical issues and codes of conduct describe how you will take account of them.** + - **Example Response:** + - The project will abide by the recommendations described in the EU Ethics and data protection guidelines to ensure the sensitive image data are correctly handled and will seek ethical review by the University of our plan for handling the sensitive image data. + + + +(Data-sharing-and-long-term-preservation)= +## Data sharing and long term preservation + +### *How and when will data be shared? Are there possible restrictions to data sharing or embargo reasons?* + + - Explain how the data will be discoverable and shared (for example by depositing in a trustworthy data repository, indexed in a catalogue, use of a secure data service, direct handling of data requests, or use of another mechanism). + - Outline the plan for data preservation and give information on how long the data will be retained. + - Explain when the data will be made available. Indicate the expected timely release. Explain whether exclusive use of the data will be claimed and if so, why and for how long. Indicate whether data sharing will be postponed or restricted for example to publish, protect intellectual property, or seek patents. + - Indicate who will be able to use the data. If it is necessary to restrict access to certain communities or to apply a data sharing agreement, explain how and why. Explain what action will be taken to overcome or to minimise restrictions. + - **Questions:** + - **Describe how and when you will share your data, and relevant information, including data you intend to preserve.** * + - **Example Response:** + - The project intends to use the NIRD Research Data Archive to store data, it and the community considers to be of lasting value. This will include data used in publications and data used to derive the results. The project will supply metadata to the archive that will be made publicly accessible and searchable. The archived data will be issued a DOI and made publicly accessible. The simulation data will be deposited in the institution X long-term repository along with the log files and metadata. It will be given an DOI by institution X and will be publicly accessible. 
The archived data will include documentation on the tools that can be used and how to use the data. + - Data will be published in the NIRD archive and made publicly available after the relevant articles have been published. One year after the end of the project the remaining data will be published in the archive. + - The images are sensitive and will only be accessible upon request from the TSD. + - The project will nominate a data manager responsible for fielding questions on the published data. + + + + + +### *How will data for preservation be selected, and where data will be preserved long-term (for example a data repository or archive)?* + + - Indicate what data must be retained or destroyed for contractual, legal, or regulatory purposes. + - Indicate how it will be decided what data to keep. Describe the data to be preserved long-term. + - Explain the foreseeable research uses (and/ or users) for the data. + - Indicate where the data will be deposited. If no established repository is proposed, demonstrate in the data management plan that the data can be curated effectively beyond the lifetime of the grant. It is recommended to demonstrate that the repositories policies and procedures (including any metadata standards, and costs involved) have been checked. + + + + + +### *What methods or software tools are needed to access and use data?* + + - Indicate whether potential users need specific tools to access and (re-)use the data. Consider the sustainability of software needed for accessing the data. + - Indicate whether data will be shared via a repository, requests handled directly, or whether another mechanism will be used? + + + +### *How will the application of a unique and persistent identifier (such as a Digital Object Identifier (DOI)) to each data set be ensured?* + + - Explain how the data might be re-used in other contexts. Persistent identifiers should be applied so that data can be reliably and efficiently located and referred to. Persistent identifiers also help to track citations and re-use. + - Indicate whether a persistent identifier for the data will be pursued. Typically, a trustworthy, long-term repository will provide a persistent identifier. + - **Questions:** + - **Describe how you will assign persistent identifiers to your data.** * + - **Example Response:** + - Publication of data in the NIRD archive will result in a DOI being issued for the data. Users interested in using the data will be able to discover the data through the publicly available metadata and download the data with the link provided. The dataset includes documentation on how to use the data. A contact person, the data manager, will be available to respond to queries about the data. + +(Data-management-responsibilities-and-resources)= +## Data management responsibilities and resources + +### *Who (for example role, position, and institution) will be responsible for data management (i.e. the data steward)?* + + - Outline the roles and responsibilities for data management/stewardship activities for example data capture, metadata production, data quality, storage and backup, data archiving, and data sharing. Name responsible individual(s) where possible. + - For collaborative projects, explain the co-ordination of data management responsibilities across partners. + - Indicate who is responsible for implementing the DMP, and for ensuring it is reviewed and, if necessary, revised. + - Consider regular updates of the DMP. 
+ - **Questions:** + - **Describe who will be responsible for the management of your data.** * + - **Example Response:** + - The project identifies a work package tasked with project data management. The work package will be responsible for ensuring the data are prepared, securely stored, are of sufficient quality, metadata is collected and data are published once articles have been published. The data manager will be the work package leader and will be a member of the project steering board. + - The data manager will be responsible for quarterly updates of the data management plan. + + + +### *What resources (for example financial and time) will be dedicated to data management and ensuring that data will be FAIR (Findable, Accessible, Interoperable, Re-usable)?* + + - Explain how the necessary resources (for example time) to prepare the data for sharing/preservation (data curation) have been costed in. Carefully consider and justify any resources needed to deliver the data. These may include storage costs, hardware, staff time, costs of preparing data for deposit, and repository charges. + - Indicate whether additional resources will be needed to prepare data for deposit or to meet any charges from data repositories. If yes, explain how much is needed and how such costs will be covered. + - **Questions:** + - **Describe the resources that will be dedicated to the management of your data such that it follows the FAIR (Findable, Accessible, Interoperable, Reusable) principles.** * + - **Example Response:** + - The project has factored into the project timeline the data management through the inclusion of the data management work package that includes personnel funded by the project. The data manager will be a permanent member of staff who will be able to field questions on the data once the project has concluded. The project proposal also includes a request for funds for NIRD and TSD storage to be used during the lifetime of the project. The NIRD archive where the published data will be stored, and the Institution X repository for the simulation data are free to use. + diff --git a/_sources/software/appguides.md.txt b/_sources/software/appguides.md.txt new file mode 100644 index 000000000..bef333fc4 --- /dev/null +++ b/_sources/software/appguides.md.txt @@ -0,0 +1,44 @@ +(appguides)= + +# Application guides + +For a general explanation on how to make an application available for use, the +module system, and information about changes in application software see +{ref}`module-scheme`. + +```{important} +If you are using an application not on the list, or if you are using an +application that can take advantage of GPU compute, we would like to hear from +you so that we can document the application. 
+ +Contact us at [support@nris.no](mailto:support@nris.no) +``` + +**List of applications:** +```{toctree} +:maxdepth: 1 +application_guides/beast.md +application_guides/bioinfo.md +application_guides/castep/castep.md +application_guides/cdo.md +application_guides/cesm.md +application_guides/gaussian/gaussian.md +application_guides/gdal.md +application_guides/gromacs.md +application_guides/lammps.md +application_guides/matlab.md +application_guides/namd.md +application_guides/nco.md +application_guides/ncview.md +application_guides/nwchem.md +application_guides/opendrift.md +application_guides/openfoam.md +application_guides/paraview_server.md +application_guides/paraview_x11.md +application_guides/paraview_web.md +application_guides/proj.md +application_guides/schrodinger/schrodinger.md +application_guides/vasp.md +application_guides/visit.rst +application_guides/wrf.md +``` diff --git a/_sources/software/application_guides/beast.md.txt b/_sources/software/application_guides/beast.md.txt new file mode 100644 index 000000000..9a81fade2 --- /dev/null +++ b/_sources/software/application_guides/beast.md.txt @@ -0,0 +1,138 @@ +# Beast + +> BEAST 2 is a cross-platform program for Bayesian phylogenetic analysis of +> molecular sequences. It estimates rooted, time-measured phylogenies using +> strict or relaxed molecular clock models. It can be used as a method of +> reconstructing phylogenies but is also a framework for testing evolutionary +> hypotheses without conditioning on a single tree topology. BEAST 2 uses +> Markov chain Monte Carlo (MCMC) to average over tree space, so that each tree +> is weighted proportional to its posterior probability. BEAST 2 includes a +> graphical user-interface for setting up standard analyses and a suit of +> programs for analysing the results. + +[More information can be found on Beast2's homepage.](https://www.beast2.org/) + +## Running Beast + +To run Beast load one of the available modules, discoverable with: + +```console +$ module spider beast +``` + +### CPU job script + +The following script can be used to run Beast on CPUs only. + +```{note} +The script requests a full node's worth of CPUs on Saga, `--cpus-per-task`, you +should test for your self with your own input to see if this is necessary. The +same is also true for the `--mem-per-cpu` parameter, other input might require +more or less memory. +``` + +`````{tabs} +````{group-tab} Saga + +```bash +#!/bin/bash + +#SBATCH --account=nnk +#SBATCH --job-name=beast_cpu +#SBATCH --time=04:00:00 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem-per-cpu=1G + +## Recommended safety settings: +set -o errexit # Make bash exit on any error +set -o nounset # Treat unset variables as errors + +module --quiet purge +module load Beast/2.6.7-GCC-10.3.0-CUDA-11.3.1 +module list + +beast -beagle_cpu -threads $SLURM_CPUS_PER_TASK test.xml +``` +```` +````` + +```{eval-rst} +:download:`test.xml <./beast/test.xml>` +``` + +### GPU job script + +Beast supports GPU acceleration through a GPU accelerated version of +[`beagle-lib`](https://github.com/beagle-dev/beagle-lib). If your input +supports it, using GPUs should improve the performance, but always test before +submitting long running jobs as GPU jobs "cost" more than pure CPU jobs. 
+ +`````{tabs} +````{group-tab} Saga + +```bash +#!/bin/bash + +#SBATCH --account=nnk +#SBATCH --job-name=beast_gpu +#SBATCH --time=00:30:00 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=1 +#SBATCH --mem-per-cpu=4G +#SBATCH --partition=accel +#SBATCH --gpus=1 + +## Recommended safety settings: +set -o errexit # Make bash exit on any error +set -o nounset # Treat unset variables as errors + +module --quiet purge +module load Beast/2.6.7-GCC-10.3.0-CUDA-11.3.1 +module list + +beast -beagle_gpu test.xml +``` +```` +````` + +```{eval-rst} +:download:`test.xml <./beast/test.xml>` +``` + +#### Performance increase + +Some timing examples are listed below for the example sequences in `test.xml`, +linked above. The requested resources are chosen based on billing units, see +{ref}`projects-accounting`, where one GPU is the equivalent of six CPU cores. +Then the memory is chosen such that it will *not* be the determining factor for +the overall billing. Note that `CPU-hrs` is what your job is billed for, i.e. what +you pay for the job. + +| Configuration | CPUs | GPUs | MEM | Runtime | Speedup | Billing | CPU-hrs | +|---------------|------|------|-----|--------:|---------|---------|--------:| +| Reference | 1 | 0 | 4G | 13716.573s | 1.0 | 1 | 3.8 | +| 1 GPU equivalent | 6 | 0 | 6G | 4328.023s | 3.2 | 6 | 7.2 | +| Half normal node | 20 | 0 | 20G | 3998.535s | 3.4 | 20 | 22.2 | +| Full normal node | 40 | 0 | 40G | 6103.471s | 2.3 | 40 | 67.8 | +| 1 GPU | 1 | 1 | 4G | 1174.682s | 11.7 | 6 | 2.0 | +| 2 GPUs | 2 | 2 | 8G | 4153.796s | 3.3 | 12 | 13.8 | + +The impression we get from the above data is that Beast is well suited for +running on GPUs, but limited to a single GPU. By using GPU acceleration for our +test example we reduce our CPU hour spending (`2.0` compared to `3.8`) while +also seeing an increase in performance (lower total runtime). In other words, +using GPUs is _both cheaper and faster_ than using the pure CPU implementation. + +## License information + +Beast 2 is available under the [GNU Lesser General Public License (LGPL) +version 2.1](https://github.com/CompEvol/beast2/blob/master/COPYING). + +It is the **user’s** responsibility to make sure they adhere to the license +agreements. + +## Citation + +The homepage contains [information about how to cite +Beast](https://www.beast2.org/citation/). diff --git a/_sources/software/application_guides/bioinfo.md.txt b/_sources/software/application_guides/bioinfo.md.txt new file mode 100644 index 000000000..609d37292 --- /dev/null +++ b/_sources/software/application_guides/bioinfo.md.txt @@ -0,0 +1,29 @@ +# Bioinformatics Software & Databases + +## Bioinformatics Databases +To save your storage quota and to reduce the load on our file system a bit, we offer some +bioinformatics databases in an openly readable folder on Saga. We try to keep +them updated, but if you find them to be outdated, please write to us at +support@nris.no + +All databases for the following programs can be found on Saga under +`/cluster/shared/databases/`.
+ +- Bowtie2: database coming shortly +- EMBL database excluding human & environmental records +- NCBI taxonomy database +- ecoPCR database reformatted for the ecotag step of [OBITools + pipeline](bioinfo/obitools.md) +- UniProt complete database + +## Installed Bioinformatics Software +```{toctree} +:maxdepth: 1 +bioinfo/alphafold.md +beast.md +bioinfo/blast.md +bioinfo/busco.md +bioinfo/kraken2.md +bioinfo/obitools.md +bioinfo/rosettafold.md +``` diff --git a/_sources/software/application_guides/bioinfo/alphafold.md.txt b/_sources/software/application_guides/bioinfo/alphafold.md.txt new file mode 100644 index 000000000..bb2c70451 --- /dev/null +++ b/_sources/software/application_guides/bioinfo/alphafold.md.txt @@ -0,0 +1,31 @@ +# AlphaFold +AlphaFold v2 is a protein structure prediction software based on a neural network +approach. + +[To find out more, visit the AlphaFold +website.](https://github.com/deepmind/alphafold) + +## Running AlphaFold + +| Module | Version | +| :------------- | :------------- | +| AlphaFold | 2.0.0.1
2.1.1| + +To see available versions when logged into Saga, use: + + module spider alphafold + +To use AlphaFold type + + module load Alphafold/ + +## License Information + +AlphaFold is available under the [Apache License, Version +2.0](https://www.apache.org/licenses/LICENSE-2.0). + +It is the user's responsibility to make sure they adhere to the license agreements. + +## Citation + +When publishing results obtained with the software referred to, please do check the developers web page in order to find the correct citation(s). diff --git a/_sources/software/application_guides/bioinfo/blast.md.txt b/_sources/software/application_guides/bioinfo/blast.md.txt new file mode 100644 index 000000000..c39d98057 --- /dev/null +++ b/_sources/software/application_guides/bioinfo/blast.md.txt @@ -0,0 +1,30 @@ +# BLAST +Basic Local Alignment Search Tool, or BLAST, is an algorithm + for comparing primary biological sequence information, such as the amino-acid + sequences of different proteins or the nucleotides of DNA sequences. + +[To find out more, visit the blast website.](https://blast.ncbi.nlm.nih.gov/) + +## Running BLAST + +| Module | Version | +| :------------- | :------------- | +| BLAST+ | 2.8.1-foss-2018b
2.8.1-intel-2018b
2.9.0-gompi-2019a
2.9.0-gompi-2019b
2.9.0-iimpi-2019a
2.9.0-iimpi-2019b
2.10.1-gompi-2020a
2.10.1-iimpi-2020a
2.11.0-gompi-2020b
2.11.0-gompi-2021a
2.12.0-gompi-2021b | + +To see available versions when logged into Saga, write + + module spider BLAST+ + +To use BLAST+ type + + module load BLAST+/ + +## License Information + +The NCBI BLAST+ software suite is available freely under the public domain. + +It is the user's responsibility to make sure they adhere to the license agreements. + +## Citation + +When publishing results obtained with the software referred to, please do check the developers web page in order to find the correct citation(s). diff --git a/_sources/software/application_guides/bioinfo/busco.md.txt b/_sources/software/application_guides/bioinfo/busco.md.txt new file mode 100644 index 000000000..1d2d72856 --- /dev/null +++ b/_sources/software/application_guides/bioinfo/busco.md.txt @@ -0,0 +1,30 @@ +# BUSCO +BUSCO is a tool to assess genome assembly and annotation completeness with +single-copy orthologs. + +[To find out more, visit the BUSCO +website.](https://busco.ezlab.org/) + +## Running BUSCO + +| Module | Version | +| :------------- | :------------- | +| BUSCO | 3.0.2-intel-2018b-Python-2.7.15
4.0.5-foss-2019b-Python-3.7.4
4.1.4-foss-2020a-Python-3.8.2
5.0.0-foss-2020b | + +To see available versions when logged into Saga issue command + + module spider BUSCO + +To use BUSCO type + + module load BUSCO/ + +## License Information + +BUSCO is available under the [MIT license](https://opensource.org/licenses/MIT). + +It is the user's responsibility to make sure they adhere to the license agreements. + +## Citation + +When publishing results obtained with the software referred to, please do check the developers web page in order to find the correct citation(s). diff --git a/_sources/software/application_guides/bioinfo/kraken2.md.txt b/_sources/software/application_guides/bioinfo/kraken2.md.txt new file mode 100644 index 000000000..1b462223f --- /dev/null +++ b/_sources/software/application_guides/bioinfo/kraken2.md.txt @@ -0,0 +1,30 @@ +# Kraken +Kraken is a system for assigning taxonomic labels to short DNA sequences, +usually obtained through metagenomic studies. + +[To find out more, visit the Kraken +website.](https://github.com/DerrickWood/kraken2/wiki) + +## Running Kraken + +| Module | Version | +| :------------- | :------------- | +| Kraken2 | 2.0.8-beta-foss-2018b-Perl-5.28.0
2.0.9-beta-foss-2018b-Perl-5.28.0
2.1.1-foss-2020b-Perl-5.32.0
2.1.2-gompi-2021a | + +To see available versions when logged into Saga issue command + + module spider Kraken2 + +To use Kraken type + + module load Kraken2/ + +## License Information + +Kraken is available under the [MIT license](https://opensource.org/licenses/MIT). + +It is the user's responsibility to make sure they adhere to the license agreements. + +## Citation + +When publishing results obtained with the software referred to, please do check the developers web page in order to find the correct citation(s). diff --git a/_sources/software/application_guides/bioinfo/obitools.md.txt b/_sources/software/application_guides/bioinfo/obitools.md.txt new file mode 100644 index 000000000..62b0f2c02 --- /dev/null +++ b/_sources/software/application_guides/bioinfo/obitools.md.txt @@ -0,0 +1,32 @@ +# OBITools +The OBITools package is a set of programs specifically designed for +analyzing NGS data in a DNA metabarcoding context, taking into account +taxonomic information. + +[To find out more, visit the OBITools +website.](https://pythonhosted.org/OBITools/welcome.html) + +## Running OBITools + +| Module | Version | +| :------------- | :------------- | +| OBITools | 1.2.12-foss-2018b-Python-2.7.15 | + +To see available versions when logged into Saga issue command + + module spider OBITools + +To use OBITools type + + module load OBITools/ + +## License Information + +OBITools is available under the [CeCILL v2 +license](https://cecill.info/licences/Licence_CeCILL_V2-en.html) + +It is the user's responsibility to make sure they adhere to the license agreements. + +## Citation + +When publishing results obtained with the software referred to, please do check the developers web page in order to find the correct citation(s). diff --git a/_sources/software/application_guides/bioinfo/rosettafold.md.txt b/_sources/software/application_guides/bioinfo/rosettafold.md.txt new file mode 100644 index 000000000..7d4203382 --- /dev/null +++ b/_sources/software/application_guides/bioinfo/rosettafold.md.txt @@ -0,0 +1,31 @@ +# RoseTTAFold +RoseTTAFold is a collection of deep learning models and related scripts for +accurate prediction of protein structures and interactions using a 3-track +network. + +[To find out more, visit the RoseTTAFold +website.](https://github.com/RosettaCommons/RoseTTAFold/) + +## Running RoseTTAFold + +| Module | Version | +| :------------- | :------------- | +| RoseTTAFold |1.0.0.1| + +To see available versions when logged into Saga issue command + + module spider RoseTTAFold + +To use RoseTTAFold type + + module load RoseTTAFold/ + +## License Information + +RoseTTAFold is available under the [MIT license]( +https://opensource.org/licenses/MIT) +It is the user's responsibility to make sure they adhere to the license agreements. + +## Citation + +When publishing results obtained with the software referred to, please do check the developers web page in order to find the correct citation(s). diff --git a/_sources/software/application_guides/castep/castep.md.txt b/_sources/software/application_guides/castep/castep.md.txt new file mode 100644 index 000000000..ca3505a59 --- /dev/null +++ b/_sources/software/application_guides/castep/castep.md.txt @@ -0,0 +1,119 @@ +(castep)= + +# CASTEP + +```{note} +If you want to contribute to the documentation of this code, please contact us at [support@nris.no](mailto:support@nris.no) +``` + +```{contents} Table of Contents +``` +## Introduction + +CASTEP is a leading code for calculating the properties of materials from first principles. 
Using density functional theory, it can simulate a wide range of materials properties, including energetics, structure at the atomic level, vibrational properties, electronic response properties etc. In particular, it has a wide range of spectroscopic features that link directly to experiment, such as infra-red and Raman spectroscopies, NMR, and core level spectra. + +More information on the [CASTEP +website](http://www.castep.org). + + +(access)= +### License and access + +The CASTEP Developers' Group (CDG) and Cambridge Enterprise have announced a cost-free worldwide source code license to CASTEP and NMR CASTEP for academic use. Detailed information about this is given [here](http://www.castep.org/CASTEP/GettingCASTEP). + +To get access to CASTEP, you need to follow the procedure described below. + +If, however, you wonder whether you are already in the castep group of users, you can find out by typing: + +```bash +id | tr "," "\n" | grep castep +``` +If this command comes out with a result, then you are in the group and may use the code - if not, go through the following procedure: + +``` +Access to CASTEP is limited by membership in the castep group. +In order to use the software on our infrastructure, first make sure you have access to a valid license. + +**When you have access to a valid CASTEP license** +Send an email to + +contact@sigma2.no with the following information: +* Full name +* E-mail address +* ORCID + +We will then have to manually verify this information with STFC UK/CASTEP before granting access. As such, there may unfortunately be some waiting time. +``` + +### Citation + +For the recommended citation, please consult the [CASTEP referencing page](http://www.castep.org/CASTEP/ReferencingAndLogo). + +## CASTEP on NRIS machinery + +Currently, **CASTEP** is installed on {ref}`fram` and {ref}`saga`. To see available versions when logged on to the machine in question, use the ```module avail``` or ```module spider``` commands as shown below: + +```bash +module avail castep +``` +If you are in the castep group of users, you may use CASTEP by typing: + +```bash +module load CASTEP/ +# (eg. module load CASTEP/22.1.1-intel-2022b) +``` +specifying one of the available versions. + +### Running CASTEP on NRIS machines + +**Please inspect the job script examples and/or jobscripts before submitting jobs!** + +
+Testing CASTEP: Ethene + +
+To test CASTEP, we have borrowed the Ethene-example from [www.mjr19.org.uk/castep/test.html](https://www.mjr19.org.uk/castep/test.html). To perform this test, you need two files; + +One file called **ethene.cell** with the contents + +```{literalinclude} ethene.cell +:language: bash +``` +and one called ethene.param with the contents + +```{literalinclude} ethene.param +:language: bash +``` + +Running **CASTEP** would produce an ethene.castep file (amongst others) within seconds for the running examples provided below. Towards the end of this file, there are final structure energy outputs printed, ```Final energy, E```; values here should be in the range of -378.015eV. + +A subset of the benchmark sets, the medium set [al3x3](http://www.castep.org/CASTEP/Al3x3) and [solid benzene](http://www.castep.org/CASTEP/SolidBenzene) together with the ethene-example used here has been added to the CASTEP home folder on both Fram and Saga. You get them into your working directory by typing + +```bash +cp /cluster/software/CASTEP/benchmarks/* . +``` + +
+ +
+Running CASTEP on Fram + +
+On Fram, you currently run exclusively on nodes by default. Note that means that you are using the nodes exclusively - thus if you ask for less than a full node, you might experience that more than one job is stacked on one node. This is something that you should keep in mind when submitting jobs. + +```{literalinclude} fram_castep.sh +:language: bash +``` +
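
If the job script shown above is saved as `fram_castep.sh` (the name used in the example) in the directory containing the ethene input files, it could be submitted with something like:

```bash
sbatch fram_castep.sh   # submit the CASTEP job to the queue
squeue --me             # check the status of your jobs
```
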
+ +
+Running CASTEP on Saga + +
+On Saga, the nodes have more memory than on Fram and you are allowed to share nodes with others. Thus the specs on memory in runscript example below. Note that, due to the higher memory amount you may be able to use more cores/node on Saga. Note, however, that there is a 256 core limit on Saga. + +```{literalinclude} saga_castep.sh +:language: bash +``` +
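
After a run has finished, a quick sanity check is to extract the final structure energy from the generated output file, which for the ethene example above should be around -378.015 eV:

```bash
grep "Final energy" ethene.castep
```
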
+ diff --git a/_sources/software/application_guides/cdo.md.txt b/_sources/software/application_guides/cdo.md.txt new file mode 100644 index 000000000..c94e0b1d6 --- /dev/null +++ b/_sources/software/application_guides/cdo.md.txt @@ -0,0 +1,31 @@ +# CDO + +CDO is a collection of tools for analyzing Climate and NWP model Data. + +[More information here.](https://code.zmaw.de/projects/cdo) + +## Running CDO + +| Module | Version | +| :------------- | :------------- | +| CDO |1.8.2-intel-2017a
1.9.3-intel-2018a| + +To see available versions when logged into Fram issue command + + module spider cdo + +To use CDO type + + module load CDO/ + +specifying one of the available versions. + +## License Information + +CDO is available under the [GNU General Public License](https://www.gnu.org/licenses/old-licenses/gpl-2.0.html), version 2 (GPLv2). + +It is the user's responsibility to make sure they adhere to the license agreements. + +## Citation + +When publishing results obtained with the software referred to, please do check the developers web page in order to find the correct citation(s). diff --git a/_sources/software/application_guides/cesm.md.txt b/_sources/software/application_guides/cesm.md.txt new file mode 100644 index 000000000..a1d742ca8 --- /dev/null +++ b/_sources/software/application_guides/cesm.md.txt @@ -0,0 +1,31 @@ +# CESM + +Community Earth System Model (CESM) is a software suite for simulating Earth climate. + +[More information here.](https://www.cesm.ucar.edu/) + +## Running CESM + +| Module | Version | +| :------------- | :------------- | +| CESM |1.2.2.1-intel-2017a| + +To see available versions when logged into Fram issue command + + module spider cesm + +To use CESM type + + module load CESM/ + +specifying one of the available versions. + +## License Information + +CESM is public domain software, however, portions of the CESM package are under other licenses. For more information, visit http://www.cesm.ucar.edu/models/cesm1.0/copyright.html + +It is the user's responsibility to make sure they adhere to the license agreements. + +## Citation + +When publishing results obtained with the software referred to, please do check the developers web page in order to find the correct citation(s). diff --git a/_sources/software/application_guides/gaussian/gaussian.md.txt b/_sources/software/application_guides/gaussian/gaussian.md.txt new file mode 100644 index 000000000..c9456c4a7 --- /dev/null +++ b/_sources/software/application_guides/gaussian/gaussian.md.txt @@ -0,0 +1,36 @@ +# The GAUSSIAN program system + +* {ref}`gaussian-job-examples` +* {ref}`gaussian-resource-utilization` +* {ref}`gaussian-tuning` +* {ref}`gaussview` + +[Gaussian](https://gaussian.com/) is a versatile program package for for electronic structure modeling and computational chemistry, and frequently used on NRIS Hardware platforms. + +**Official documentation:** + +## License and access + +The installed license of GAUSSIAN on NRIS machines is an academic license and consists of four individual site licenses bought by the four partner Universities of NRIS (NTNU, UiB, UiO, UiT). Users from these institutions should be autmatically added to the `gaussian`group of users. Other users need to document valid access to the relevant license (academic and/or full commercial) before being granted access to Gaussian in NRIS. + +- To have access to the code, you need to be in the `gaussian` group of + users.
Check this with the command `id | grep gaussian`. +- If not in the group, {ref}`contact us ` and ask to be added. +- Provide necessary documentation to be added in the group. + +## Citation + +For the recommended citation, please consult [gaussian.com/citation](https://gaussian.com/citation/) + + +## Gaussian on NRIS machinery + +Currently, the Gaussian software is installed on {ref}`fram` and {ref}`saga`. We use a slightly unorthodox setup for Gaussian - redirecting LD library path to rsocket instead of socket library before loading and starting binaries, which ensures securing satisfactory scaling beyond the 2 nodes/Linda instances (See whitepaper: [Improving Gaussian’s parallel performance using Infiniband](gaussianoverib.pdf)). + +So, if you see this warning in your Slurm output, there is not a reason for concern: +```text +ntsnet: WARNING: /cluster/software/Gaussian/g16_C.01/linda-exe/l302.exel may +not be a valid Tcp-Linda Linda executable. +``` + +Also note that there are internal differences between the different NRIS machines in terms of better practice for running Gaussian jobs. This will be further discussed in the {ref}`gaussian-job-examples`sections and/or the {ref}`gaussian-resource-utilization` section. diff --git a/_sources/software/application_guides/gaussian/gaussian_job_example.md.txt b/_sources/software/application_guides/gaussian/gaussian_job_example.md.txt new file mode 100644 index 000000000..8569e8162 --- /dev/null +++ b/_sources/software/application_guides/gaussian/gaussian_job_example.md.txt @@ -0,0 +1,152 @@ +--- +orphan: true +--- + +(gaussian-job-examples)= + +# Gaussian NRIS machines Job Examples + +```{note} +Here we present tested examples for various job types on the different NRIS machines. +This will be under more or less continoues development, and if you find things missing +and/or not working as expected, do not hesitate to {ref}`contact us `. +``` + +## Expected knowledge base + +Before you run any Gaussian calculations, or any other calculations on NRIS machines for that matter, you are expected to update yourself on NRIS machinery specifics. A decent minimal curriculum is as follows: + +* {ref}`hardware-overview` +* {ref}`getting-started` +* {ref}`getting-started` +* {ref}`running-jobs` +* {ref}`job-scripts` + +### Finding available Gaussian versions and submitting a standard Gaussian job +To see which versions of Gaussian software which are available on a given machine; type the following command after logged into the machine in question: + + module avail Gaussian + +To use Gaussian, type + + module load Gaussian/ + +specifying one of the available versions. + +**Please inspect the job script examples before submitting jobs!** + +To run an example - create a directory, step into it, create an input file (for example for water - see below), download a job script (for example the fram cpu job script as shown below) and submit the script with: + + $ sbatch fram_g16.sh + + +## Gaussian input file examples + +- Water input example (note the blank line at the end; `water.com`): + +```{literalinclude} water.com +``` + +- Caffeine input example (note the blank line at the end; `caffeine.com`): + +```{literalinclude} caffeine.com +``` + +## Running Gaussian on Fram + +On Fram, you currently run exclusively on nodes by default. Note that means that you are using the nodes exclusively - thus if you ask for less than a full node, you might experience that more than one job is stacked on one node. 
This is something that you should keep in mind when submitting jobs.


- Job script example (`fram_g16.sh`):

```{literalinclude} fram_g16.sh
:language: bash
```


## Running Gaussian on Saga

On Saga there are more restrictions and tricky situations to consider than on Fram. First, the setup is heterogeneous, with some nodes having 52 cores and most nodes having 40 cores. Second, Saga has a 256-core limit per job, which effectively limits the maximum useful number of nodes for a Gaussian job on Saga to 6. Third, since nodes are shared by default, you need a way to set resource allocations in a shared environment that is not necessarily homogeneous across the nodes you are given.

We are currently working on a solution to all of these challenges; for now our advice is as follows:
jobs on up to and including 2 nodes can follow the standard advice for running jobs on Saga.
For 3 nodes and above you either need to run with full nodes or use the Slurm exclusive flag `#SBATCH --exclusive`. We prefer the latter due to robustness.

To facilitate this, the g16 wrapper has been edited to be both backwards compatible and adapted to this more recent insight on our side. If you are not using this wrapper, please look into it to find the syntax to use in your job script. The wrapper(s) are available in the Gaussian software folder; the current name is `g16.ib`.


- Job script example (`saga_g16.sh`):

```{literalinclude} saga_g16.sh
:language: bash
```


## Running Gaussian on GPUs on Saga

Both of the current `g16` versions on Saga support GPU offloading, and we have provided
an alternative wrapper script for launching the GPU version. The only things that
need to change in the run script are the resource allocation, by adding `--gpus=N`
and `--partition=accel`, and using the `g16.gpu` wrapper script instead of `g16.ib`.
The `g16.gpu` script is available through the standard Gaussian modules, `Gaussian/g16_B.01`
and `Gaussian/g16_C.01` (the latter will likely have better GPU performance since it is
the more recent version).

There are some important limitations for the current GPU version:

- It can only be run single-node (up to 24 CPU cores + 4 GPUs), so please specify `--nodes=1`
- The number of GPUs must be specified with the `--gpus=N` flag (not `--gpus-per-task`)
- The billing ratio between GPUs and CPUs is 6:1 on Saga, so the natural way to increment
resources is to add 6 CPUs per GPU
- Not all parts of Gaussian are able to utilize GPU resources. From the official [docs](https://gaussian.com/gpu/):
```text
GPUs are effective for larger molecules when doing DFT energies, gradients and frequencies
(for both ground and excited states), but they are not effective for small jobs. They are
also not used effectively by post-SCF calculations such as MP2 or CCSD.
```

- Run script example (`gpu_g16.sh`):
```{literalinclude} gpu_g16.sh
:language: bash
:emphasize-lines: 5-8, 30
```


Some timing examples are listed below for a single-point energy calculation on the
caffeine molecule using a large quadruple-zeta basis set. The requested resources are
chosen based on billing units, see {ref}`projects-accounting`, where one GPU is the
equivalent of six CPU cores. The memory is then chosen such that it will *not* be the
determining factor for the overall billing.
+ +| Configuration | CPUs | GPUs | MEM | Run time | Speedup | Billing | CPU-hrs | +|:----------------------:|:--------:|:------:|:---------:|:-------------:|:---------:|:---------:|:------------:| +| Reference | 1 | 0 | 4G | 6h51m26s | 1.0 | 1 | 6.9 | +| 1 GPU equivalent | 6 | 0 | 20G | 1h00m45s | 6.8 | 6 | 6.1 | +| 2 GPU equivalents | 12 | 0 | 40G | 36m08s | 11.4 | 12 | 7.2 | +| 3 GPU equivalents | 18 | 0 | 60G | 30m14s | 13.6 | 18 | 9.1 | +| 4 GPU equivalents | 24 | 0 | 80G | 19m52s | 20.7 | 24 | 7.9 | +| Full normal node | 40 | 0 | 140G | 13m05s | 31.4 | 40 | 8.7 | +| 1/4 GPU node | 6 | 1 | 80G | 22m41s | 18.1 | 6 | 2.3 | +| 1/2 GPU node | 12 | 2 | 160G | 15m44s | 26.2 | 12 | 3.1 | +| 3/4 GPU node | 18 | 3 | 240G | 12m03s | 34.1 | 18 | 3.6 | +| Full GPU node | 24 | 4 | 320G | 10m12s | 40.3 | 24 | 4.1 | + +The general impression from these numbers is that Gaussian scales quite well for this +particular calculation, and we see from the last column that the GPU version is +consistently about a factor two more efficient than the CPU version, when comparing +the actual consumed CPU-hours. This will of course depend on the conversion factor from +CPU to GPU billing, which will depend on the system configuration, but at least with +the current ratio of 6:1 on Saga it seems to pay off to use the GPU over the CPU +version (queuing time not taken into account). + +If you find any issues with the GPU version of Gaussian, please contact us at {ref}`our support line `. + +```{note} +The timings in the table above represent a single use case, and the behavior might be +very different in other situations. Please perform simple benchmarks to check that +the program runs efficiently with your particular computational setup. Also do not +hesitate to contact us if you need guidance on GPU efficiency, see our extended +{ref}`extended-support-gpu`. +``` + diff --git a/_sources/software/application_guides/gaussian/gaussian_resources.md.txt b/_sources/software/application_guides/gaussian/gaussian_resources.md.txt new file mode 100644 index 000000000..58c1b66e0 --- /dev/null +++ b/_sources/software/application_guides/gaussian/gaussian_resources.md.txt @@ -0,0 +1,98 @@ +--- +orphan: true +--- +(gaussian-resource-utilization)= +# Memory and Core Utilization + +This page contains info about special features related to the Gaussian install +made on NRIS machines, but also general issues related to Gaussian only +vaguely documented elsewhere. + + +## Gaussian over Infiniband + +First note that the installed Gaussian suites are currently +Linda parallel versions, so they scale out of single nodes. In additidon, our +Gaussian installation is done with a little trick, where loading of the executable +is intercepted before launch and an alternative socket library is loaded. We have also taken care of the rsh/ssh setup in our installation procedure, to avoid `.tsnet.config` dependency on user level. This +enables Gaussian to run on Infiniband network with native IB protocol, giving us two +advantages: + +* The parallel fraction of the code scales to more cores. +* The shared memory performance is significantly enhanced (small scale performance). + +To run Gaussian in parallel on two or more nodes requires the +additional keywords `%LindaWorkers` and `%NProcshared` in the Link 0 part of +the input file. In addition, since we do the interception-trick, we need to add the specific IB network node address into the input file. 
This is taken care of by a wrapper script (`g16.ib`) around +the original binary in each individual version folder. Please use this example(s) as starting point(s) when submitting jobs. + +Advised job submit syntax using the wrapper: + + g16.ib $input.com > g16_$input.out + +## Parallel scaling + +Gaussian is a rather large program system with a range of different binaries, +and users need to verify whether the functionality they use is parallelized and +how it scales both in terms of core and memory utilization. + +### Core utilization + +Due to the preload Infiniband trick, we have a somewhat more generous policy +when it comes to allocating cores/nodes to Gaussian jobs. + +- We strongly advice users to first +study the scaling of the code for a representative system. +- **Please do not reuse scripts inherited from others without studying the +performance and scaling of your job**. We recommend to take our +{ref}`gaussian-job-examples` as a starting point. + +Due to its versatility, it is hard to be general about Gaussian binary scaling. We do know that plain DFT-jobs with rather non-complicated molecules like caffeine scales easily up to the core-count limit on Saga and into the range of 16 nodes on Fram. On the other side, jobs with transition-metal containing molecules like Fe-corroles scales moderately outside of 2 full nodes on Saga. On a general note, the range on 2-4 nodes seems to be within decent scaling behaviour for most linda-executables (the LXYZ.exel-binaries, see the Gaussian home folder on each NRIS machine). Also note that due to different node-sharing policies for Fram and Saga there will be the need for setting an additional slurm-flag when running Gaussian jobs on Saga. + +### Memory utilization: + +The `%mem` allocation of memory in the Gaussian input file means two things: + +- In general, it means memory/node – for share between `nprocshared`, and + additional to the memory allocated per process. This is also documented by + Gaussian. +- For parallel jobs it also means the memory allocation hte LindaWorker will make, but from Linda9 and onwards the need for a memory allocation on the master node has been removed. + +However, setting %mem to a value of less than 80% of the physical memory (the actual number depends on the actual node since we have standard-, medium-memory-, and big-memory nodes, see {ref}`fram` and {ref}`saga`) is good practice since other system buffers, disk I/O and others can avoid having the system swap more than necessary. This is especially true for post-SCF calculations. To top this, the `%mem` tag is also influencing on performance; too high makes the job go slower, too low makes the job +fail. + +Please consider the memory size in your input if jobs fail. Our job example is +set up with 500MB (which is actually a bit on the small side), test-jobs were +ran with 2000MB. Memory demand also increases with an increasing number of +cores. But this would also limit the size of problems possible to run at a +certain number of cores. For a DFT calculation with 500-1500 basis sets, `%mem=25GB` +can be a reasonable setup. + +**Note that "heap size" is also eating from the memory +pool of the node (see below).** + + +## Management of large files + +### On Fram + +As commented in the {ref}`storage-performance`-page, there is an issue with very +large temporary output files (termed RW files in Gaussian). It is advisable to +slice them into smaller parts using the `lfs setstripe` command. 
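
A minimal sketch of such striping, assuming the RW files are written to the current working directory (the stripe count of 8 is only an illustrative value):

```bash
lfs setstripe --stripe-count 8 .   # stripe new files in this directory across 8 OSTs
lfs getstripe .                    # verify the striping settings
```
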
+ +### On Saga: +The corresponding situtation for Saga is described here; {ref}`saga-filesystem`. + +## Important aspects of Gaussian NRIS setup + + +### On Fram: + +On Fram, we have not allocated swap space on the nodes, meaning that the heap +size for the Linda processes in Gaussian is very important for making parallel +jobs run. The line + + export GAUSS_LFLAGS2="--LindaOptions -s 20000000" + +contains info about the heap size for the Linda communication of Gaussian +and the setting 20 GB (the number above) is sufficient for most calculations. This is the double of the standard amount for Gaussian, but after testing this seems necessary when allocating more than 4 nodes (more than 4 Linda-workers) and sufficient up to 8 nodes. Above 8 nodes, Linda-communication seems to need 30 - which amounts to half of the standard node physical memory and reducing the available amount for `%mem`accordingly. \ No newline at end of file diff --git a/_sources/software/application_guides/gaussian/gaussian_tuning.md.txt b/_sources/software/application_guides/gaussian/gaussian_tuning.md.txt new file mode 100644 index 000000000..13eb4b2aa --- /dev/null +++ b/_sources/software/application_guides/gaussian/gaussian_tuning.md.txt @@ -0,0 +1,172 @@ +--- +orphan: true +--- +(gaussian-tuning)= +# Gaussian performance tuning using single node shared memory + +###**The Linda version is not covered on this page**. +The local Linda installation +contain some local adaptations, like wrapper script etc not present in the +threaded shared memory version. **A tuning guide for the Linda versions is in preparation**. + +## Introduction + +Gaussian is a widely used and well known application. The current (pt. 2021) +implementation is Gaussian 16. This version can utilise several cores +using an OpenMP threading model and hence confined to a single machine using +shared memory. + +This guide is dealing with the shared memory version running on single node. +Gaussian is compiled using the PGI (now NVIDIA) compiler using OpenMP. +For those of you who know OpenMP well, be aware that the PGI compiler only support a +limited set of OpenMP environment flags. + +The Gaussian manual describe several options which is important for the performance. +The keywords most associated with parallel execution is +- `%NPROCSHARED=xx` +- `%MEM=yyyyMB` + +* `%NPROCSHARED` set how many OpenMP threads Gaussian can launch, as the name +suggest it's shared memory (which implies a single node). +* `%MEM` set how much memory (often called core) Gaussian can use in total, this include +memory for 2-electron integrals which might be calculated in-core if possible. + + +There are also environment variables that handle OpenMP and PGI compiler settings. +* `OMP_PROC_BIND` can be tried to improve performance (have been reported to fail with some SLURM jobs, +use with caution). +* `PGI_FASTMATH_CPU` should be set to the architecture used, see later. Some performance improvement. + +Recomputing 2-electron integrals, direct SCF is now default as it's faster to +compute these than reading them from disk. In addition doing the calculation +in memory (in-core) is also default (if possible). This can have a major +impact of performance. See later about memory allocation. + +Please consult the [Gaussian manual](https://gaussian.com) about specific settings. + + +## Scaling +Single node shared memory OpenMP is known to have limited scaling in most usage scenarios. 
+With the current processors it's common to have far more cores available in +each compute node than what Gaussian can effectively utilise. The figure below +show examples of scaling, not all applications scale linearly. Some even run +slower with a high core count. +![Types of scaling](figures/scaling.png "Types of scaling") + +Hence, care must be taken not to waste cores by requesting too many. +A scaling study should be undertaken and the optimal core count for the input +in question should be established. As Gaussian has a huge number of +different modes and methods clear guidelines cannot be given, only that +Gaussian (using OpenMP) does not scale to high core counts. + +It's important to set the same number of cores in `both` the SLURM file and the +Gaussian input file, or at least as many as specified in the Gaussian input. +The Linda version have a wrapper for this, but that is an exception. Please verify +that the number of cores in slurm `--tasks-per-node` and `%NPROCSHARED` are the same. +Gaussian will launch NPROCSHARED threads, if less allocated cores +some threads will share cores with severe performance implications. + +Quite common findings is that a relatively small number of cores are +optimal as the scaling is not very good for any higher number of cores. +These extra cores are essential wasted and could be used by other jobs. + +Below is an example of a real Gaussian run: + +![G16 run times](figures/g16-runtimes.png "G16 run times") + +![G16 speedup](figures/g16-speedup.png "G16 speedup") + +There is a significant speedup with core count from one core and up to +a core count of about 32. Using more than 40 cores seems counterproductive +for shared-memory parallelization. With Linda parallelization, we can go +further. +Even for problems with fairly long run times in the order of hours. It's not +expected that this will change with run times in the order of days as this is +an iterative process. + +After about 16 cores the return of applying more cores levels +off. Hence about 16 cores seems like a sweet spot. Running at 128 +cores waste 7/8 of the CPU resources. However, there is also the +amount of memory needed to run efficiently to be considered (see next +section). While cores might be idle when huge amount of memory is +need this is the nature of two limited resources. Both are resources, +but cores have historically been perceived as the most valuable. + +How many cores to use is an open question, most runs are different. A scaling study +should be undertaken at the beginning of a compute campaign. Running at 1,2,4,8 cores etc +and plotting speedup vs cores to find an optimal core count. + +## Memory specification +The Gaussian manual states that `"Requests a direct SCF calculation, in +which the two-electron integrals are recomputed as needed"`. This is +the default. In addition it states `"... SCF be performed storing the full +integral list in memory"`. This is done automatically if enough memory is requested, +see Gaussian manual on SCF : [Gaussian manual SCF](https://gaussian.com/scf/) . + +The figure below show a dramatic increase in performance at the memory size +where the 2-electron fit in the memory. From 8 to 6 hours (depending on +memory requested) down to less than 3 hours at the memory size where +all the 2-electrons integrals fit in the memory. 
+ +![Effect of requesting memory](figures/g16-mem.png "Performance and memory requested") + +The problem is then to find how much memory is needed to fit the integrals in memory, the real gain +in performance is when enough memory is allowed to keep the 2-electron integrals in core. +This require a large amount of memory as seen from the figure above. +Another possibility is to review the `SLURM` log (all SLURM log files emit memory statistics) +and look for maximum resident memory at the end of the log file. When the job is running it's possible +to log in to the node the job is running on and run tools like `top` or `htop` and look at +memory usage for your application. See also our page on {ref}`choosing-memory-settings`. + +Just requesting far too much memory is a waste of resources. +We advise spending some time to get a handle of this threshold value +for your relevant input and use this a guideline for future runs. + +As the number of nodes with large amount of memory is limited it will +always be a trade off between queue time and run time. How much gain +(lower run time) will adding extra memory yield? + +The Sigma2 systems have a range of nodes with different amount of +memory, see the {ref}`hardware-overview`. + +It might be beneficial to check different nodes and associated +memory capabilities. Both Saga and Fram have Gaussian installed and both systems +have nodes with more memory installed. + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/_sources/software/application_guides/gaussian/gaussview.md.txt b/_sources/software/application_guides/gaussian/gaussview.md.txt new file mode 100644 index 000000000..eea922211 --- /dev/null +++ b/_sources/software/application_guides/gaussian/gaussview.md.txt @@ -0,0 +1,19 @@ +--- +orphan: true +--- +(gaussview)= +# GaussView + +Gaussview is a visualization program that can be used to open Gaussian output files and checkpoint files (.chk) to display structures, molecular orbitals, normal modes, etc. You can also set up jobs and submit them directly. + +**Official documentation:** [https://gaussian.com/gaussview6/](https://gaussian.com/gaussview6/) + +## License and access +The license for Gaussian is commercial/proprietary and currently only UiT holds a site license. Thus GaussView on Sigma2 machines is available for UiT users only, unless a user or a group holds an invividual license to the code. + +### GaussView on Fram +To load and run GaussView on Fram, load the relevant Gaussian module, and then call GaussView: + + $ module avail GaussView + $ module load GaussView/6.0.16 + $ gview diff --git a/_sources/software/application_guides/gdal.md.txt b/_sources/software/application_guides/gdal.md.txt new file mode 100644 index 000000000..5f78fa549 --- /dev/null +++ b/_sources/software/application_guides/gdal.md.txt @@ -0,0 +1,31 @@ +# GDAL + +Geospatial Data Abstraction Library (GDAL) is a library for handling geospatial data in vector form. + +[To find out more, visit the GDAL website.](https://gdal.org/) + +## Running GDAL + +| Module | Version | +| :------------- | :------------- | +| GDAL |2.2.0-foss-2017a-Python-2.7.13-HDF5-1.8.18
2.2.0-intel-2017a-Python-2.7.13-HDF5-1.8.18
2.2.2-intel-2017b-Python-2.7.14-HDF5-1.8.19
2.2.3-intel-2018a-Python-2.7.14-HDF5-1.8.19
| + +To see available versions when logged into Fram issue command + + module spider gdal + +To use GDAL type + + module load GDAL/ + +specifying one of the available versions. + +## License Information + +GDAL is available under the MIT License (X11). For more information, visit https://trac.osgeo.org/gdal/wiki/FAQGeneral#WhatlicensedoesGDALOGRuse + +It is the user's responsibility to make sure they adhere to the license agreements. + +## Citation + +When publishing results obtained with the software referred to, please do check the developers web page in order to find the correct citation(s). diff --git a/_sources/software/application_guides/gromacs.md.txt b/_sources/software/application_guides/gromacs.md.txt new file mode 100644 index 000000000..85a07e03d --- /dev/null +++ b/_sources/software/application_guides/gromacs.md.txt @@ -0,0 +1,141 @@ +# GROMACS + + +GROMACS is a versatile package to perform molecular dynamics, i.e. simulate the +Newtonian equations of motion for systems with hundreds to millions of +particles. + +[More information can be found on GROMACS' homepage.](https://www.gromacs.org) + +## Running GROMACS + +| Module | Version | +| :------------- | :------------- | +| GROMACS |2018.1-foss-2018b
2019-foss-2018b
2019.4-foss-2019b
2020-foss-2019b
2020-fosscuda-2019b
2021-foss-2020b
| + +To see available versions when logged in, issue the following command: +```bash +module spider gromacs +``` + +To use GROMACS, type: + +```bash +module load GROMACS/ +``` + +specifying one of the available versions in the table above. + +### Sample GROMACS Job Script + +```bash +#!/bin/bash +#SBATCH --account=nnk +#SBATCH --job-name=topol +#SBATCH --time=1-0:0:0 +#SBATCH --nodes=10 + +## Recommended safety settings: +set -o errexit # Make bash exit on any error +set -o nounset # Treat unset variables as errors + +module --quiet purge +module load GROMACS/ +module list + +case=$SLURM_JOB_NAME + +mpirun gmx_mpi mdrun $case.tpr +``` + +## Accelerating using GPUs +GROMACS is capable of speeding up by using attached accelerators, like the +Nvidia P100 GPU cards in Saga, or the A100 Nvidia GPUs in Betzy. Very little +adaptation is required on the user's side, as GROMACS is able to detect the GPUs +when available. Simply load a version of GROMACS with `fosscuda` in the name, +like `GROMACS/2020-fosscuda-2019b`, and then request GPUs with +`--partition=accel` and `--gpus-per-task=1`. + +```{note} +GROMACS can use multiple GPUs, but these must be attached to separate MPI +ranks. By using the `--gpus-per-task` flag we can request one GPU per MPI rank. +Keep in mind that both Saga and Betzy only have 4 GPUs per node which limits +the number of ranks per node. +``` + +```bash +#!/bin/bash +#SBATCH --account=nnk +#SBATCH --job-name=topol +#SBATCH --time=1-0:0:0 +## Total number of MPI ranks, can be more than 4, but should be multiple of 2 +#SBATCH --ntasks=1 +## Setup number of tasks and CPU cores per task +#SBATCH --ntasks-per-node=4 +#sbatch --cpus-per-task=2 # Minimum number of cores per MPI rank for GROMACS +## GPU setup +#SBATCH --partition=accel +#SBATCH --gpus-per-task=1 + +## Recommended safety settings: +set -o errexit # Make bash exit on any error +set -o nounset # Treat unset variables as errors + +module --quiet purge +module load GROMACS/2020-fosscuda-2019b +module list + +case=$SLURM_JOB_NAME + +mpirun gmx_mpi mdrun $case.tpr +``` + +```{note} +In the above job script we combined `--ntasks`, `--ntasks-per-node`, +`--cpus-per-task` and `--gpus-per-task`. This might seem counter intuitive, but +we did it for good reason. + +First, by combining `--ntasks` and `--ntasks-per-node` the `--ntasks` takes +precedence and determines the number of MPI ranks to start. The +`--ntasks-per-node` then acts as limitation, determining the maximum number of +tasks per node +([reference](https://slurm.schedmd.com/sbatch.html#OPT_ntasks-per-node)). This +means that if we asked for `--ntasks=6` with `--ntasks-per-node=4` we would +still get 6 MPI ranks, but Slurm would have to reserve two nodes for us. + +We then used `--cpus-per-task`, this was done since GROMACS requires at least +two threads per MPI rank so that each MPI rank has one computation thread and +one communication thread. We could give GROMACS more CPUs per MPI rank, since +GROMACS supports shared memory parallelization in addition to GPU acceleration, +but that is something that each project/experiment needs to test for themselves +to determine the utility. + +Lastly, all of this combined ensures that if we want to use multiple GPUs +(which GROMACS support) we can simply increase the number of `--ntasks` and all +other parameters will be correct. There will only ever be 4 tasks per node +which corresponds to the number of GPUs per node and each of these MPI ranks +will have the necessary cores to run all bookkeeping tasks. 
+``` + +Using accelerators can give a nice speed-up, depending on the problem. As an +example we modeled a box of water with constant temperature and pressure. To +measure the difference in performance we compared a full CPU node on Saga with +a single GPU. + +| Configuration | Wall time (s) | ns/day | Speed-up | +|:--------------|:--------------|:-------|---------:| +| 40 CPU cores | 1892 | 456.427| `1x` | +| 1 GPU + 2 CPU cores | 823 | 1049.088| `2.3x` | + +## License Information + +GROMACS is available under the [GNU Lesser General Public License +(LGPL)](http://www.gnu.org/licenses/lgpl-2.1.html), version 2.1. + +It is the user's responsibility to make sure they adhere to the license +agreements. + +## Citation + +When publishing results obtained with the software referred to, please do check +the developers web page in order to find the correct citation(s). diff --git a/_sources/software/application_guides/lammps.md.txt b/_sources/software/application_guides/lammps.md.txt new file mode 100644 index 000000000..bfd6b5601 --- /dev/null +++ b/_sources/software/application_guides/lammps.md.txt @@ -0,0 +1,122 @@ +# LAMMPS + +LAMMPS is a classical molecular dynamics code, and an acronym for Large-scale +Atomic/Molecular Massively Parallel Simulator. LAMMPS has potentials for +solid-state materials (metals, semiconductors) and soft matter (biomolecules, +polymers) and coarse-grained or mesoscopic systems. + +It can be used to model atoms or, more generically, as a parallel particle +simulator at the atomic, meso, or continuum scale. LAMMPS runs on single +processors or in parallel using message-passing techniques and a +spatial-decomposition of the simulation domain. The code is designed to be easy +to modify or extend with new functionality. LAMMPS is distributed as an open +source code under the terms of the GPL. + +[More information here.](https://www.lammps.org) + +## Running LAMMPS + +| Module | Version | +| :------------- | :------------- | +| LAMMPS |11Aug17-foss-2017a
13Mar18-foss-2018a
| + +To see available versions when logged into Fram issue command + + module spider lammps + +To use LAMMPS type + + module load LAMMPS/ + +specifying one of the available versions. + +## Running on GPUs + +```{note} +We are working on compiling the LAMMPS module on Saga and Betzy with GPU +support, however, since that could take some time we are, for the time being, +presenting an alternative solution below. +``` + +LAMMPS is capable of running on GPUs using the [`Kokkos` +framework](https://github.com/kokkos/kokkos). However, the default distribution +of LAMMPS on Saga and Betzy are not compiled with GPU support. We will therefor +use {ref}`singularity` to use LAMMPS with GPUs. + +### Preparations + +We will first start by creating our LAMMPS singularity image based on [Nvidia's +accelerated images](https://catalog.ngc.nvidia.com/orgs/hpc/containers/lammps). +The following command will download the image and create a singularity +container with the name `lammps.sif`. + +```console +[user@login-X.SAGA ~]$ singularity pull --name lammps.sif docker://nvcr.io/hpc/lammps:29Sep2021 +``` + +Then, to retrieve the input test file (`in.lj.txt`) execute the following: + +```console +[user@login-X.SAGA ~]$ wget https://lammps.sandia.gov/inputs/in.lj.txt +``` + +### Slurm script + +To run this we will use the following Slurm script: + +```sh +#!/bin/bash + +#SBATCH --job-name=LAMMPS-Singularity +#SBATCH --account=nnk +#SBATCH --time=10:00 +#SBATCH --ntasks=2 +#SBATCH --mem-per-cpu=2G +#SBATCH --ntasks-per-node=4 +#SBATCH --partition=accel +#SBATCH --gpus=2 + +set -o errexit # Exit the script on any error +set -o nounset # Treat any unset variables as an error + +module --quiet purge # Reset the modules to the system default +# Note: We don't need any additional modules here as Singularity is always +# available + +srun singularity run --nv -B ${PWD}:/host_dir lammps.sif\ + lmp -k on g 2 -sf kk -pk kokkos cuda/aware on neigh full comm device binsize 2.8\ + -var x 8 -var y 8 -var z 8 -in /host_dir/in.lj.txt +``` + +In the script above note that `--gpus=X` (or `--gpus-per-node=X`) needs to be +the same as the parameter `-k on g X` for LAMMPS. We also recommend that users +have one MPI rank per GPU so that `--ntasks=X` is either equal to `--gpus=X` or +use `--gpus-per-task=1`. + +### Performance increase + +We modified the above input file to run for a bit longer (increased the number +of steps to `1000`). This gave us the following speed-ups compared to +`LAMMPS/3Mar2020-foss-2019b-Python-3.7.4-kokkos` on Saga. + +| Node configuration | Performance (`tau/day`) | Speed up | +|--------------------|-------------------------|----------| +| 40 CPU cores | 1439.286 | 1.0x | +| 1 GPU | 2670.089 | 1.8x | +| 2 GPUs | 4895.459 | 3.4x | +| 4 GPUs | 11610.741 | 8.0x | + +## License Information + +LAMMPS is available under the GNU Public License (GPLv3). For more information, +visit [the LAMMPS documentation +pages](https://docs.lammps.org/Intro_opensource.html). + +It is the user's responsibility to make sure they adhere to the license +agreements. + +## Citation + +When publishing results obtained with the software referred to, please do check +the [developers web page in order to find the correct +citation(s)](https://docs.lammps.org/Intro_citing.html). 
diff --git a/_sources/software/application_guides/matlab.md.txt b/_sources/software/application_guides/matlab.md.txt new file mode 100644 index 000000000..a92aca03c --- /dev/null +++ b/_sources/software/application_guides/matlab.md.txt @@ -0,0 +1,87 @@ +# MATLAB + +MATLAB is a platform for solving engineering, mathematical, and graphical +problems. To find out more, visit the [MATLAB +website](https://se.mathworks.com/help/matlab). + +## License Information + +### Academic users + +Academic users need a link to a MATLAB license server for their university +(UiB, UiT, UiO or NTNU). Users from other universities can also use MATLAB on +Fram. Send an email to + +support@nris.no + +and ask for the link for your university and the license name to use when +submitting jobs. Add this link to the environment variable `MLM_LICENSE_FILE`: + +```bash +export MLM_LICENSE_FILE= +``` + +Add this environment variable setting into your `~/.bashrc`. When submitting +a job with, e.g., sbatch, use `sbatch --licenses=`. + + +#### Third-Party Access for Collaborative Research in Academia +See this [link (external +page)](https://se.mathworks.com/support/collaborative-research-academia.html). + +#### Commercial users +Commercial users need to sign a Hosting Provider agreement. Contact: +[contact@sigma2.no](mailto:contact@sigma2.no) + +## Running MATLAB + +| Module | Version | +| :------------- | :------------- | +| MATLAB |2017a| +| MATLAB |2018a| +| MATLAB |2018b| +| MATLAB |2019a| +| MATLAB |2020b| +| MATLAB |2021a| +| MATLAB |2021b| +| MATLAB |2022b| +| MATLAB |2023a| + +To see available versions when logged into Fram issue command +```bash +module spider matlab +``` +To use MATLAB type +```bash +module load MATLAB/ +# (eg. module load MATLAB/2021a) +``` + +specifying one of the available versions. + +### Sample MATLAB Job Script +```bash +#!/bin/bash +#SBATCH --account=nnNNNNk +#SBATCH --job-name=jobname +#SBATCH --time=1-0:0:0 +#SBATCH --nodes=1 + +## Recommended safety settings: +set -o errexit # Make bash exit on any error +set -o nounset # Treat unset variables as errors + +module restore system +module load MATLAB/ + +matlab -nodisplay -nodesktop -nojvm -r "myprogram" + +## Note: if you are using the Parallel Computing Toolbox, remove -nojvm + +``` + +## MPI for Matlab +MPI for Matlab is installed on Fram/Saga/Betzy (for parallelizing on many +compute nodes). The following guides can help you get started: +- [Distributed Matlab (for non MPI programmers)](https://www.hpc.ntnu.no/matlab-for-hpc/distributed-matlab-using-mpi/) +- [Matlab MPI](https://www.hpc.ntnu.no/matlab-for-hpc/matlab-mpi/) diff --git a/_sources/software/application_guides/namd.md.txt b/_sources/software/application_guides/namd.md.txt new file mode 100644 index 000000000..22a0779cb --- /dev/null +++ b/_sources/software/application_guides/namd.md.txt @@ -0,0 +1,114 @@ +# NAMD +NAMD is designed for simulating large biomolecular systems. NAMD scales to many cores and is compatible with AMBER. + +[More information here.](https://www.ks.uiuc.edu/Research/namd/) + +## Running NAMD + +| Module | Version | +| :------------- | :------------- | +| NAMD |2.12-foss-2017a-mpi
2.12-intel-2018a-mpi
2017-11-06-foss-2017a-mpi
2018-02-15-intel-2018a-mpi
| + +To see available versions when logged into Fram issue command + + module spider namd + +To use NAMD type + + module load NAMD/ + +specifying one of the available versions. + +### Sample NAMD Job Script +``` +#!/bin/bash +#SBATCH --account=nnNNNNk +#SBATCH --job-name=ubq_ws +#SBATCH --time=1-0:0:0 +#SBATCH --nodes=10 + +## Recommended safety settings: +set -o errexit # Make bash exit on any error +set -o nounset # Treat unset variables as errors + +module restore system +module load NAMD/ + +case=$SLURM_JOB_NAME + +## Prepare input files +cp $case.* $SCRATCH +cp par_all27_prot_lipid.inp $SCRATCH +cd $SCRATCH + +mpirun namd2 $case.conf + +## Copy results back to the submit directory +cleanup "cp $SCRATCH/* $SLURM_SUBMIT_DIR" +``` + +## GPU acceleration + +NAMD optionally supports acceleration on Saga and Betzy, using the attached +Nvidia GPUs of those systems. For users, very little adaptation is required, +mostly focused on the launch parameters needed (see below). To use the GPU +accelerated NAMD library, load any version of NAMD with the `fosscuda` +toolchain. Unfortunately this toolchain does not support MPI so if your problem +benefits more from wide scaling this is not applicable (please contact us +{ref}`if this applies to you `). + +```{note} +NAMD can utilize multiple GPUs, on a single node, but can also benefit from +running with multiple threads per GPU. Therefore we recommend testing with the +`--gpus=X` flag when selecting GPUs so that the number of threads are +independent from the number of GPUs requested. +``` + +### Example Slurm GPU script + +```bash +#!/usr/bin/bash + +#SBATCH --job-name=apoa1 +#SBATCH --account=nnk +#SBATCH --time=00:30:00 + +#SBATCH --cpus-per-task=8 +#SBATCH --mem-per-cpu=1G + +#SBATCH --partition=accel +#SBATCH --gpus=1 + +## Set up job environment: +set -o errexit # Exit the script on any error +set -o nounset # Treat any unset variables as an error + +module --quiet purge # Reset the modules to the system default +module load NAMD/2.14-fosscuda-2019b +module list + +namd2 +p$SLURM_CPUS_PER_TASK +devices $CUDA_VISIBLE_DEVICES apoa1.namd +``` + +### Potential speed-up + +We ran the [`APOA1` +benchmark](https://www.ks.uiuc.edu/Research/namd/utilities/apoa1/) on Saga +comparing a full node (40 CPUs) against a few GPU configurations and got the +following performance. + +| Node configuration | Wall time (s) | Speed up | +|--------------------|---------------|----------| +| 40 CPU cores (MPI) | 235 | 1x | +| 1 CPU core + 1 GPU | 181 | 1.3x | +| 8 CPU cores + 1 GPU | 60 | 3.9x | +| 8 CPU cores + 2 GPUs | 56 | 4.2x | +| 24 CPU cores + 4 GPUs | 33 | 7.1x | + +Note that depending on your setup you might not see the same performance, we +urge researchers to test with GPU to see if they can benefit and +{ref}`contact us for assistance in getting started ` if necessary. + +## Citation + +When publishing results obtained with the software referred to, please do check the developers web page in order to find the correct citation(s). diff --git a/_sources/software/application_guides/nco.md.txt b/_sources/software/application_guides/nco.md.txt new file mode 100644 index 000000000..33e582c94 --- /dev/null +++ b/_sources/software/application_guides/nco.md.txt @@ -0,0 +1,28 @@ +# NCO +NetCDF Operators (NCO) is a collection of tools that operate on NetCDF and other data types. + +[To find out more, visit the NCO website.](http://nco.sourceforge.net/) + +## Running NCO + +| Module | Version | +| :------------- | :------------- | +| NCO |4.6.6-intel-2017a
4.7.2-intel-2018a| + +To see available versions when logged into Fram issue command + + module spider nco + +To use NCO type + + module load NCO/ + +## License Information + +NCO is available under the [GNU General Public License (GPLv3)](https://www.gnu.org/licenses/gpl.html). + +It is the user's responsibility to make sure they adhere to the license agreements. + +## Citation + +When publishing results obtained with the software referred to, please do check the developers web page in order to find the correct citation(s). diff --git a/_sources/software/application_guides/ncview.md.txt b/_sources/software/application_guides/ncview.md.txt new file mode 100644 index 000000000..db665785a --- /dev/null +++ b/_sources/software/application_guides/ncview.md.txt @@ -0,0 +1,31 @@ +# Ncview + +Ncview is a visual browser for NetCDF files. + +[More information here.](http://meteora.ucsd.edu/~pierce/ncview_home_page.html) + +## Running Ncview + +| Module | Version | +| :------------- | :------------- | +| ncview |2.1.7-intel-2017a
2.1.7-intel-2018a
| + +To see available versions when logged into Fram issue command + + module spider ncview + +To use ncview type + + module load ncview/ + +specifying one of the available versions. + +## License Information + +Ncview is available under the [GNU General Public License (GPLv3)](https://www.gnu.org/licenses/gpl.html). + +It is the user's responsibility to make sure they adhere to the license agreements. + +## Citation + +When publishing results obtained with the software referred to, please do check the developers web page in order to find the correct citation(s). diff --git a/_sources/software/application_guides/nwchem.md.txt b/_sources/software/application_guides/nwchem.md.txt new file mode 100644 index 000000000..667d63bf9 --- /dev/null +++ b/_sources/software/application_guides/nwchem.md.txt @@ -0,0 +1,55 @@ +# NWChem + +The North Western program system for computational Chemistry (NWChem) is ab initio computational chemistry software package, which also includes quantum chemical and molecular dynamics functionality. + +[More information here.](https://www.nwchem-sw.org) + + +## Running NWChem + +| Module | Version | +| :------------- | :------------- | +| NWChem |6.6.revision27746-intel-2017a-2015-10-20-patches-20170814-Python-2.7.13
6.8.revision-v47-intel-2018a-2017-12-14-Python-2.7.14| + +To see available versions when logged into Fram issue command + + module spider nwchem + +To use NWChem type + + module load NWChem/ + +specifying one of the available versions. + +### Sample NWChem Job Script +``` +#!/bin/bash +#SBATCH --account=nnNNNNk +#SBATCH --job-name=tce_benzene_2emet_1 +#SBATCH --time=1-0:0:0 +#SBATCH --nodes=10 + +## Recommended safety settings: +set -o errexit # Make bash exit on any error +set -o nounset # Treat unset variables as errors + +module restore system +module load NWChem/ + +case=$SLURM_JOB_NAME + +## Prepare input files +cp $case.nw $SCRATCH +cd $SCRATCH +mkdir $SCRATCH/tmp +export SCRATCH_DIR=$SCRATCH/tmp + +mpirun nwchem $case.nw + +## Copy results back to the submit directory +cleanup "cp $SCRATCH/* $SLURM_SUBMIT_DIR" +``` + +## Citation + +When publishing results obtained with the software referred to, please do check the developers web page in order to find the correct citation(s). diff --git a/_sources/software/application_guides/opendrift.md.txt b/_sources/software/application_guides/opendrift.md.txt new file mode 100644 index 000000000..df8b6b179 --- /dev/null +++ b/_sources/software/application_guides/opendrift.md.txt @@ -0,0 +1,41 @@ +# OpenDrift + +This recipe for a **containerized** [OpenDrift](https://opendrift.github.io/) +was provided by a user and is hopefully useful for others. + +In a project folder (the home folder is likely not large enough), run the +following: +```console +$ singularity pull docker://opendrift/opendrift +``` + +This downloads a large file (`opendrift_latest.sif`) which provides OpenDrift +in a container image. + +Then create a Python script which imports opendrift: +```python +import opendrift + +print("the import worked well") +``` + +This script can then be run using: +```console +$ ./opendrift_latest.sif python myscript.py +``` + +It is also possible to open python and run OpenDrift interactively using: +```console +$ ./opendrift_latest.sif python +``` + +For this to work, you might have to mount specific catalogues (for example +where the ocean model forcing files are) using `SINGULARITY_BIND`: +```console +$ export SINGULARITY_BIND="/cluster" +``` + +If more directories are needed, they can be added through: +```console +$ export SINGULARITY_BIND="/cluster,/opt,/data" +``` diff --git a/_sources/software/application_guides/openfoam.md.txt b/_sources/software/application_guides/openfoam.md.txt new file mode 100644 index 000000000..e8b4d7b5a --- /dev/null +++ b/_sources/software/application_guides/openfoam.md.txt @@ -0,0 +1,61 @@ +# OpenFOAM +OpenFOAM is a computational fluid dynamics (CFD) software suite for physical and +engineering simulations. The suite consists of mechanical, electronics, and +embedded software simulation applications. + +[To find out more, visit the OpenFOAM website.](https://www.openfoam.com/) + +## Running OpenFOAM + +| Module | Version | +| :------------- | :------------- | +| OpenFOAM |4.1-intel-2017a
5.0-intel-2017a
1712-foss-2018a
| +| OpenFOAM-Extend |4.0-intel-2017a | + +To see available versions when logged into Fram issue command + + module spider openfoam + +To use OpenFOAM type + + module load OpenFOAM/ + +specifying one of the available versions. + +### Sample OpenFOAM Job Script +``` +#!/bin/bash + +#SBATCH --job-name=damBreak +#SBATCH --nodes=4 +#SBATCH --ntasks-per-node=32 +#SBATCH --time=01:00:00 +#SBATCH --account=nnNNNNk + +## Recommended safety settings: +set -o errexit # Make bash exit on any error +set -o nounset # Treat unset variables as errors + +module restore system +module load OpenFOAM/1712-foss-2018a +source $FOAM_BASH + +case=$SLURM_JOB_NAME + +cp -r $case/* $SCRATCH +cd $SCRATCH + +mpirun interFoam -parallel + +reconstructPar + +mkdir -p $SLURM_SUBMIT_DIR/$SLURM_JOB_ID +cleanup "cp -r $SCRATCH/constant $SCRATCH/system $SCRATCH/[0-9]* $SLURM_SUBMIT_DIR/$SLURM_JOB_ID" + +``` + +## License Information + +OpenFOAM is available under the [GNU General Public License](https://www.gnu.org/licenses/gpl.html) (GPL). For more information, visit http://www.openfoam.com/legal/open-source.php + +It is the user's responsibility to make sure they adhere to the license agreements. diff --git a/_sources/software/application_guides/paraview_server.md.txt b/_sources/software/application_guides/paraview_server.md.txt new file mode 100644 index 000000000..0f2a54ade --- /dev/null +++ b/_sources/software/application_guides/paraview_server.md.txt @@ -0,0 +1,187 @@ +(paraview-server)= + +```{contents} Table of Contents +``` + +# ParaView Server + +## Context + +Running ParaView using remote desktop software on our clusters is far from ideal because it adds an unnecessary layer of virtualization, making the application run slower and taxing the server and users running other programs. + +Running ParaView using the built-in server option has a few advantages: +- You do not rely on IT support to install a particular version of the software; +- It is possible to run the latest version, as long as you download the newer version from ParaView website on your local machine and on the server; +- You can specify exactly how much resources you need (including CPUs and also GPUs where available) and they will be allocated to your project; +- It runs much better on your, already familiar, local computer. + + +## Download ParaView software + +### Linux version on the cluster + +All our servers have version 5.10.1 installed as the most up-to-date version. However, we recommend the download and usage of "osmesa" or "egl" versions available on [ParaView website](https://www.paraview.org/download/), as they have CPU and GPU acceleration and the 3D rendering on ParaView happens much faster. + +In fact, we ran some benchmarks: + +``` +Version: 5.10.1-MPI +Allocation command: `salloc --nodes=1 --ntasks-per-node=1 --cpus-per-task=1 --time=00:30:00 --mem=20G --account=nn9999k` +PV Server command: `srun ./pvserver --server-port=7755 --force-offscreen-rendering` +Message: "Display is not accessible on the server side. Remote rendering will be disabled." 
+Time: 1m57s + +Version: 5.10.1-osmesa +Allocation command: `salloc --nodes=1 --ntasks-per-node=1 --cpus-per-task=1 --time=00:30:00 --mem=20G --account=nn9999k` +PV Server command: `srun ./pvserver --server-port=7755 --force-offscreen-rendering` +Message: "None" +Time: 59s + +Version: 5.10.1-egl +Allocation command: `salloc --nodes=1 --ntasks-per-node=1 --cpus-per-task=1 --time=00:30:00 --mem=20G --account=nn9999k` +PV Server command: `srun ./pvserver --server-port=7755 --force-offscreen-rendering` +Time: did not run +``` + +``` +Version: 5.10.1-MPI +Allocation command: `salloc --nodes=1 --ntasks-per-node=1 --cpus-per-task=1 --time=00:30:00 --mem=20G --partition=accel --gpus=1 --account=nn9999k` +PV Server command: `srun ./pvserver --server-port=7755 --force-offscreen-rendering` +Message: "Display is not accessible on the server side. Remote rendering will be disabled." +Time: 1m46s + +Version: 5.10.1-osmesa +Allocation command: `salloc --nodes=1 --ntasks-per-node=1 --cpus-per-task=1 --time=00:30:00 --mem=20G --partition=accel --gpus=1 --account=nn9999k` +PV Server command: `srun ./pvserver --server-port=7755 --force-offscreen-rendering` +Message: "None" +Time: 48s + +Version: 5.10.1-egl +Allocation command: `salloc --nodes=1 --ntasks-per-node=1 --cpus-per-task=1 --time=00:30:00 --mem=20G --partition=accel --gpus=1 --account=nn9999k` +PV Server command: `srun ./pvserver --server-port=7755 --force-offscreen-rendering` +Message: "None" +Time: 47s +``` + +``` +Version: 5.10.1-MPI +Allocation command: `salloc --nodes=1 --ntasks-per-node=1 --cpus-per-task=1 --time=00:30:00 --mem=20G --partition=a100 --gpus=1 --account=nn9999k` +PV Server command: `srun ./pvserver --server-port=7755 --force-offscreen-rendering` +Message: "Display is not accessible on the server side. Remote rendering will be disabled." +Time: 1m11s + +Version: 5.10.1-osmesa +Allocation command: `salloc --nodes=1 --ntasks-per-node=1 --cpus-per-task=1 --time=00:30:00 --mem=20G --partition=a100 --gpus=1 --account=nn9999k` +PV Server command: `srun ./pvserver --server-port=7755 --force-offscreen-rendering` +Message: "None" +Time: 20s + +Version: 5.10.1-egl +Allocation command: `salloc --nodes=1 --ntasks-per-node=1 --cpus-per-task=1 --time=00:30:00 --mem=20G --partition=a100 --gpus=1 --account=nn9999k` +PV Server command: `srun ./pvserver --server-port=7755 --force-offscreen-rendering` +Message: "None" +Time: 19s +``` + +Download the version you desire for your operating system and the **same** version for Linux. You will need to upload the .tar.gz file to your home or project directory and extract it with the command `tar -xvf nameOfFile.tar.gz` + +**TIP**: you can speed up the extraction process on the server by extracting first on your computer the `.tar` file inside it. Then, upload the file and extract this one following the same procedures. + + +### Windows client for your local computer + +If your local machine runs Windows, you have to install ParaView executable or, in case you don't have Administrator permissions to install the software, you can also download the zip file, extract it and run the `paraview.exe` file inside the /bin folder + +Also, if an error appears when opening the program saying a dll is missing `msmpi.dll` , you will need to download and install `msmpisetup.exe` from this link: https://www.microsoft.com/en-us/download/details.aspx?id=105289 . Ask for an administrator to install it for you. 
+ + +## Allocating resources for the project + +Run the following command: `salloc --nodes=1 --ntasks-per-node=1 --cpus-per-task=1 --time=00:30:00 --mem=20G --partition=a100 --gpus=1 --account=nnXXXXk` + +If the command above doesn't work, take a look at this [documentation](https://documentation.sigma2.no/jobs/interactive_jobs.html#requesting-an-interactive-job) or at the benchmarks above, where we have provided the allocation commands we ran. + +Please note that here we are asking for 1 CPU and 1 GPU for 30 minutes only. **If you need more resources and time, adjust the parameters accordingly.** +Also, the A100 GPU is faster but, if resources are not available, try the P100 GPU by changing the `--partition` flag from "a100" to "accel". + +The output will be similar to this one: + +``` +salloc: Pending job allocation 5442258 +salloc: job 5442258 queued and waiting for resources +salloc: job 5442258 has been allocated resources +salloc: Granted job allocation 5442258 +salloc: Waiting for resource configuration +salloc: Nodes gpu-12-8 are ready for job +``` + +## Loading "libOpenGL.so.0" with A100 "egl" package + +Due to a different architecture, our A100 GPUs are paired with AMD CPUs. You can find more details about the hardware [on this page](https://documentation.sigma2.no/hpc_machines/saga.html). + +If you want to run the "egl" package, you will have to: + +- Switch environments by running the following commands: +``` +module purge +module --force swap StdEnv Zen2Env +``` + +- Manually [install](https://documentation.sigma2.no/software/userinstallsw/easybuild.html) `libglvnd`, which is available in [EasyBuild](https://docs.easybuild.io/version-specific/supported-software/#libglvnd) + +- Load the module with the following commands (please adapt the paths and the project number): +``` +module use /cluster/projects/nnXXXXk/EasyBuild +module avail | grep -i libglvnd +module load modules/all/libglvnd/1.4.0-GCCcore-11.3.0 +``` + +**NOTE:** In the simulation we tested, the difference between the "osmesa" and "egl" packages was negligible, as was the effect of increasing the number of allocated GPUs (only used by "egl"). Do your own tests to find the optimal resources and ParaView version for your case. + +## Running ParaView Server + +Go to the /bin folder with the `cd` command and run the following (replace "XXXX" with a random 4-digit number): + +`srun ./pvserver --server-port=XXXX --force-offscreen-rendering` + +**NOTE:** You can read more about the `--force-offscreen-rendering` option [here](https://kitware.github.io/paraview-docs/latest/cxx/Offscreen.html). It seems to render slightly faster. + +You should see a message like this: + +``` +Waiting for client... +Connection URL: cs://gpu-12-8:5678 +Accepting connection(s): gpu-12-8:5678 +``` + +## Creating a tunnel between your PC and the Server + +The ParaView server needs to communicate with your computer. For this, simply open a new terminal window and type: + +`ssh -N -L XXXX:node:XXXX username@login-Y.server.sigma2.no` + +Please replace the following: + +``` +`XXXX` - with the chosen 4-digit port +`node` - the node that was allocated to you. You can see it in the "Accepting connection(s)" or salloc messages.
+`username` - your username on our servers +`Y` - the login node you connected to previously (you can see it in the pvserver message below) +`server` - replace with betzy, fram or saga +``` + +## Connect ParaView's local client to the server + +Open the ParaView executable and follow the steps below: +- Click on File > Connect +- Add Server +- Choose a name and fill in "Host" with `localhost` and "Port" with the 4-digit number you chose +- Click Configure > Save +- A new connection will be created. Now, click on "Connect" + +If everything is configured correctly, you should see the connected client on the left side under "Pipeline Browser" and a message in the terminal saying "Client connected". + +Here's an image with an overview of the whole process: + +![paraviewServer_overview](pvserver_overview.png) diff --git a/_sources/software/application_guides/paraview_web.md.txt b/_sources/software/application_guides/paraview_web.md.txt new file mode 100644 index 000000000..a09585a79 --- /dev/null +++ b/_sources/software/application_guides/paraview_web.md.txt @@ -0,0 +1,89 @@ +# ParaViewWeb with Apptainer (singularity) + +**Attention**: ParaViewWeb is different from the regular ParaView software. ParaViewWeb is a Web framework to build applications with interactive scientific visualization inside the Web browser. +In case you want to run regular ParaView, use it inside X2Go or follow the guide on how to run it using X11 Forwarding. + +## Context + +Running ParaView using remote desktop software on our clusters is far from ideal because it adds an unnecessary layer of virtualization, making the application run slower and taxing the server and users running other programs. + +Running ParaView through a container has a few advantages: +- You do not rely on IT support to install a particular version of the software; +- It is possible to run the latest version, as long as the container image is also updated; +- You can specify exactly how many resources you need (including CPUs and also GPUs where available) and they will be allocated to your project; +- It runs much better in your already familiar browser; +- There is no need to maintain old software that will probably stop working over time; +- You can use the same container image on different hosts (i.e. what is described here can be adapted for other platforms), and always remain in the exact same software environment. + +In this guide, we are going to use containers provided on NVIDIA NGC, which is a hub for GPU-optimized software for deep learning, machine learning, and HPC: https://catalog.ngc.nvidia.com/orgs/nvidia-hpcvis/containers/paraview + +**NOTE**: We are going to use `$USERWORK` for space reasons. Please remember that this folder is subject to automatic clean-up: https://documentation.sigma2.no/files_storage/clusters.html#user-work-area . It might be necessary to download the container again at some point in the future (it will be available as long as Nvidia maintains it), but DO NOT store important data under this directory.
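+
+Because `$USERWORK` is cleaned up automatically, one way to avoid having to re-download the image later (a suggestion, not a requirement; the file name matches the pull command in the next section and the project path is a placeholder) is to keep a copy of the pulled `.sif` file in your project area:
+
+```
+# Keep a backup of the container image in the project area, which is not cleaned automatically
+cp $USERWORK/paraview_egl-py3-5.11.0.sif /cluster/projects/nnXXXXk/
+
+# Later, copy it back to the work area before running it again
+cp /cluster/projects/nnXXXXk/paraview_egl-py3-5.11.0.sif $USERWORK/
+```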
+ + +## Pulling ParaView image + +First, log in to your preferred server via SSH (in this example, we are going to use Fram): `ssh -i ~/.ssh/ssh_key username@fram.sigma2.no` + +The first time, you will have to pull the container image, and since these can be quite large it is often better not to use your $HOME but the $USERWORK instead: `cd $USERWORK` + +Also, let's set some variables so there won't be issues while downloading the container: +``` +mkdir -p $USERWORK/.apptainer +export APPTAINER_CACHEDIR=$USERWORK/.apptainer +export APPTAINER_TMPDIR=$USERWORK/.apptainer +``` + +Now, pull a ParaView container image with Apptainer: `apptainer pull docker://nvcr.io/nvidia-hpcvis/paraview:egl-py3-5.11.0` + +This will create a `.sif` file in the directory from where you pulled it (you can rename this file as much as you want, and also move it where you want, it will still work): `ls -lrtah paraview*` + +**WARNING**: If you want to run a different ParaView version, you can do so by replacing the url after "docker://", copying the new one from here: https://catalog.ngc.nvidia.com/orgs/nvidia-hpcvis/containers/paraview/tags. + +However, if you do this, be careful to use the correct PATH for Paraview because for tags `egl-py3-5.9.0` and `egl-py3-5.8.0`, Paraview was installed in `/opt/paraview` whereas for tags `egl-py3-5.11.0`, `glx-5.6.0rc3` and `egl-5.6.0rc` it is installed in `/usr/local/paraview`, so modify the PATH in "[Apptainer exec command](https://documentation.sigma2.no/software/application_guides/paraview.html#running-the-container)" accordingly. + + +## Allocating resources for the project + +Log in again to your preferred server and run the following command: `salloc --nodes=1 --ntasks-per-node=1 --cpus-per-task=1 --time=00:30:00 --qos=devel --account=nnxxxxk` + +Please, note that here we are asking 1 CPU only for 30 minutes in the Devel queue. **If you need more resources and time, adjust the parameters accordingly.** + +The output will be similar to this one: + +``` +salloc: Pending job allocation 5442258 +salloc: job 5442258 queued and waiting for resources +salloc: job 5442258 has been allocated resources +salloc: Granted job allocation 5442258 +salloc: Waiting for resource configuration +salloc: Nodes c84-5 are ready for job +``` + +**NOTE**: Remember which node was allocated for the job, it will be needed later. In this case, the allocated node was "c84-5". +Also, you can allocate resources after logging in with SSH or right before executing the container in the step below. + + +## Running the container + +Select a **random port number**, say 7412. Also, for this guide, we will assume your data is located in `$USERWORK/data` + +**REMEMBER**: Please, adjust the command accordingly with the chosen port, data directory and the Paraview image you want to use. If you see an error because the port is already in use, select another port number. + +`apptainer exec --bind $USERWORK/data:/data $USERWORK/paraview_egl-py3-5.11.0.sif /usr/local/paraview/bin/pvpython /usr/local/paraview/share/paraview-5.11/web/visualizer/server/pvw-visualizer.py --content /usr/local/paraview/share/paraview-5.11/web/visualizer/www --port 7412 --data /data -i 0.0.0.0` + +The command above is binding the port and the data folder to the container, so that it can see the information outside of it (by default, a container is relatively isolated from "outside world", meaning we have to specify which folders from the host machine we want to "see" from inside the container). 
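+
+For readability, the same `apptainer exec` command can also be written over several lines (a sketch using exactly the same port, image and paths as above; adjust them to your own setup):
+
+```
+apptainer exec \
+  --bind $USERWORK/data:/data \
+  $USERWORK/paraview_egl-py3-5.11.0.sif \
+  /usr/local/paraview/bin/pvpython \
+  /usr/local/paraview/share/paraview-5.11/web/visualizer/server/pvw-visualizer.py \
+  --content /usr/local/paraview/share/paraview-5.11/web/visualizer/www \
+  --port 7412 \
+  --data /data \
+  -i 0.0.0.0
+```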
+ +The first folder `$USERWORK/data` is only known outside the container and `/data` is only known inside the container, we are binding them together with `--bind $USERWORK/data:/data` but it is **the same folder** therefore changes made in `/data` are actually done to `$USERWORK/data` and hence permanent. + +**From a second terminal window**, log in again to the server but, this time, **forwarding** the port you used for the container: `ssh -L 7412:localhost:7412 -i ~/.ssh/ssh_key username@fram.sigma2.no` + +Now, forward again the same port from the compute node that you were allocated, run the following: `ssh -L 7412:localhost:7412 c84-5`. + +**Remember to replace the last part (c84-5) with the allocated node in the beginning** + + +## Executing ParaView + +Finally, on your computer's browser, type the following address (replacing the chosen port): `127.0.0.1:7412` + +You should see ParaViewWeb interface loading on your browser. diff --git a/_sources/software/application_guides/paraview_x11.md.txt b/_sources/software/application_guides/paraview_x11.md.txt new file mode 100644 index 000000000..5550495b4 --- /dev/null +++ b/_sources/software/application_guides/paraview_x11.md.txt @@ -0,0 +1,65 @@ +# ParaView using X11 Forwarding + +## X Server for running the application + +You first need to download an X server so the GUI can be forwarded and you can interact with ParaView. + +**For Windows, you can use Xming or VcXsrv:** If you use the latter, select "One large window", "Start no client", uncheck "Native opengl" and check "Disable access control" + +**For Mac, you can use XQuartz** + +More information here: https://documentation.sigma2.no/getting_started/ssh.html#x11-forwarding and here: https://documentation.sigma2.no/jobs/interactive_jobs.html#graphical-user-interface-in-interactive-jobs + +Make sure the application is running and it says, when you hover the mouse over it: "nameOfTheMachine:0.0" + + +## Running SSH with forwarding capabilities + +### Windows PowerShell +Open Windows PowerShell and run the following commands: +``` +$env:DISPLAY = "localhost:0" +ssh -X -Y username@server.sigma2.no #(replace "server" with fram, betzy or saga) +``` + +In case the connection is not very stable while running with PowerShell, you can try with Putty + +### Putty +- Install the software from https://www.putty.org/ +- On "Session" tab, under "Host Name", write down `betzy.sigma2.no` (or fram or saga) +- On "Connection" tab, write 240 on "Seconds between keepalives". Also enable "Enable TCP keepalives (SO_KEEPALIVE option)" +- On "SSH > X11" tab, check "Enable X11 forwarding" and write down on "X display location": localhost:0.0 +- Go back to the "Session" tab, write a name for the session under "Saved Sessions" and click "Save" +- Click "Open" and log in normally + + +## Allocating resources for the project + +Run the following command: ```salloc --nodes=1 --ntasks-per-node=1 --cpus-per-task=1 --time=00:30:00 --qos=devel --account=nnxxxxk``` + +If the command above doesn't work, take a look at this [documentation](https://documentation.sigma2.no/jobs/interactive_jobs.html#requesting-an-interactive-job). + +Please, note that here we are asking 1 CPU only for 30 minutes in the Devel queue. 
**If you need more resources and time, adjust the parameters accordingly.** + +The output will be similar to this one: + +``` +salloc: Pending job allocation 5442258 +salloc: job 5442258 queued and waiting for resources +salloc: job 5442258 has been allocated resources +salloc: Granted job allocation 5442258 +salloc: Waiting for resource configuration +salloc: Nodes c84-5 are ready for job +``` + + +## Running ParaView + +Run the following commands: +``` +ml avail | grep ParaView +module load ParaView/versionDesired #(replace "versionDesired" with the options available) +paraview +``` + +The ParaView user interface should load on the X Server within a few seconds. diff --git a/_sources/software/application_guides/proj.md.txt b/_sources/software/application_guides/proj.md.txt new file mode 100644 index 000000000..2222fac18 --- /dev/null +++ b/_sources/software/application_guides/proj.md.txt @@ -0,0 +1,31 @@ +# PROJ + +PROJ is a library for handling cartographic projections typically used for map-making and global coordinates. + +[To find out more, visit the PROJ website.](https://proj.org/) + +## Running PROJ + +| Module | Version | +| :------------- | :------------- | +| PROJ |4.9.3-foss-2017a
4.9.3-intel-2017a
4.9.3-foss-2017b
4.9.3-intel-2017b
5.0.0-foss-2018a
5.0.0-intel-2018a
| + +To see available versions when logged into Fram issue command + + module spider proj + +To use PROJ type + + module load PROJ/ + +specifying one of the available versions. + +## License Information + +PROJ is available under the MIT License. For more information, visit https://trac.osgeo.org/proj/wiki/WikiStart#License + +It is the user's responsibility to make sure they adhere to the license agreements. + +## Citation + +When publishing results obtained with the software referred to, please do check the developers web page in order to find the correct citation(s). diff --git a/_sources/software/application_guides/r.md.txt b/_sources/software/application_guides/r.md.txt new file mode 100644 index 000000000..539f8b804 --- /dev/null +++ b/_sources/software/application_guides/r.md.txt @@ -0,0 +1,64 @@ +--- +orphan: true +--- + +# R +R is a programming environment for performing statistical operations. + +To find out more, visit the R website at: https://www.r-project.org + +## Running R + +| Module | Version | +| :------------- | :------------- | +| R |3.4.0-intel-2017a-X11-20170314
3.4.3-intel-2017b-X11-20171023
3.4.4-intel-2018a-X11-20180132
| + +To see the available versions when logged into Fram, issue the command + +    module spider R + +To use R, type + +    module load R/ + +## How to install packages +There might be some packages missing in the R module we have installed, or maybe you need a different +version than what we have. In that case you can install the packages yourself. For example, +the following is the procedure for installing a package called XYZ as the user *user1* on SAGA. +Please remember to use your own username instead of *user1*. + + - Log in to Saga + - Load the module + +``` + [user1@login-1.SAGA ~]$ module restore + [user1@login-1.SAGA ~]$ module load R/4.0.0-foss-2020a + [user1@login-1.SAGA ~]$ mkdir /cluster/home/user1/R + [user1@login-1.SAGA ~]$ R +``` + + - Use the R prompt to install the package + +``` + #Set the location for the packages to be installed + > .libPaths("/cluster/home/user1/R") + #Install the package + > install.packages("XYZ", repos="https://cran.uib.no") + #Check if the package can be loaded + > library(XYZ) +``` + + - How to use an installed package + After installing, every time the package needs to be accessed, + the `.libPaths("/cluster/home/user1/R")` setting must be applied first. + When submitting an R script as a job, call `.libPaths("/cluster/home/user1/R")` before + loading the package. + + +## License Information + +R is available under several open-source licenses. For more information, visit https://www.r-project.org/Licenses/ + +It is the user's responsibility to make sure they adhere to the license agreements. + + diff --git a/_sources/software/application_guides/schrodinger/host_file_settings.md.txt b/_sources/software/application_guides/schrodinger/host_file_settings.md.txt new file mode 100644 index 000000000..0eaca3938 --- /dev/null +++ b/_sources/software/application_guides/schrodinger/host_file_settings.md.txt @@ -0,0 +1,39 @@ +--- +orphan: true +--- + +# Keywords for schrodinger.hosts file settings + +| Keyword | Description | +| ------------- |:-------------:| +|base |Name of an entry (the base entry) that is the basis for the current entry. All the keywords from the base entry are inherited by the current entry, and new keywords may be added, in any order. A base entry can include another base entry.| +|env | Environment variables to be set on the host. The syntax for the environment variables is variable=value, regardless of the shell used. List each environment variable on a separate env line.| +|gpgpu | Specify a graphics processor (GPU) to use on the host. One instance should be used for each GPU specified. The specification is in the form id, description, where id is the numerical GPU id, usually starting from 0, and description is the description of the GPU, for example Tesla V100.| +|host |Host name. This entry is only needed if it is different from the name setting or if the queueing software is only available on a particular host. Not valid in the localhost entry.| +|serverhost |Name of host used to stage job output when the host from which the job was submitted is offline. This might be the head node of a cluster, for example. This setting is ignored if the job submission host does not have offline job management enabled.| +|include | Name of an auxiliary hosts file to be included in the current hosts file. The inclusion is done by replacing the include line with the contents of the specified file.| +|knime |Path to an external KNIME installation (i.e.
an installation other than the one in the Schrödinger installation).| +|name| Name of the host entry or batch queue. For a host this is usually the host name. This name is displayed in the Start dialog box. The name must not contain spaces. The value localhost is a special name that means the host on which the job is launched.| +|nodelist| List of entry names, used to define a multiple-host entry. A name may be followed by a colon and a number of processors. Can be combined with a host setting.| +|parallel| Specify whether the host supports MPI parallel jobs or not. The value can be specified as yes or no, true or false, 1 or 0.| +|port| Server port to use when sending jobs to a server (Used by KNIME only).| +|processors| Number of processors available on the host. If the host is part of a cluster, this number should be the total number of processors available on the cluster. For multicore processors, the number should be the total number of cores available. The default is 1, except for the localhost entry, where the default is the number of available processors (or cores).| +|processors_per_node| Number of processors (cores) per node available to a batch queue. This setting is used by applications that support threaded parallel execution (OpenMP).| +|proxyhost| Host on which to run jproxy. This setting should be made when the host from which a job is launched cannot open a socket connection to the host on which the job is actually run. By default, jproxy is run on the host specified by the host keyword, and is only run when using a queuing system. This setting is only needed in cases where using the default is impossible or impractical. Only valid when the host entry also contains a queue setting.| +|proxyport| Specify the port or range of ports that jproxy may use. Ports can be specified as comma or colon-separated lists without spaces. Ranges can be specified with a dash, for example, 5987:5989-5992:5994. Only valid when the host entry also contains a queue setting.| +|qargs| Arguments to be used when submitting jobs to a batch queue. These arguments should specify any parameters that define the queue.| +|queue| Queuing system name, which is the subdirectory of $SCHRODINGER/queues that contains the support files for the queuing system. PBS10.4, SGE, LSF, Torque, and Slurm are the supported systems. Not valid in the localhost entry.| +|recoverjobs| Disable recovery of failed jobs if set to no. Use this setting only for jobs where job recovery might not be possible (such as on the cloud).| +|schrodinger| The path to the Schrödinger software installation on the host. Not valid in the localhost entry.| +|tmpdir| Base directory for temporary or scratch files, also called the scratch directory. The file system on which this directory is mounted should be large enough for the largest temporary files, should be mounted locally, and should be writable by the user.| +|user| User name to use on the host. This should never be set in the hosts file in the installation directory.
It is required if the user has a different user name on the defined host than on the host on which the job is launched.| + +## + +### Go to: +* [Schrodinger main page](schrodinger.md) +* [Using the Schrodinger suite](schrodinger_usage.md) +* [Setting up the Hosts file](schrodinger_hosts.md) +* [Hosts file keywords](host_file_settings.md) +* [Job control facility](job_control.md) +* [Tuning](tuning.md) \ No newline at end of file diff --git a/_sources/software/application_guides/schrodinger/job_control.md.txt b/_sources/software/application_guides/schrodinger/job_control.md.txt new file mode 100644 index 000000000..b830e5977 --- /dev/null +++ b/_sources/software/application_guides/schrodinger/job_control.md.txt @@ -0,0 +1,85 @@ +--- +orphan: true +--- + +# The Schrödinger job control facility +The job control facility sits on top of the slurm queuing system. Thus, when running Schrodinger jobs, please use +the job control facility to check, cancel or update your jobs. + +## List my jobs +* `$SCHRODINGER/jobcontrol -list` + +If you have subjobs, append -c (child of the main job/driver): +* `$SCHRODINGER/jobcontrol -list -c` + +This will print something like: + +![Job control facility](figures/jobcontrol.png "Output from control") + + +In the above example the driver (main thread) is running on localhost (login-1 in this example) and 20 subjobs on the +compute nodes on SAGA. Each Schrodinger job gets its own `JobId` that is internally used by the job control facility. +When using the job control facility, you point to the JobId. The `BatchId` is the normal job id generated by SLURM and +is what you will see in the queue from the command `squeue -u $USER`. The jobcontrol will also display the progress +status of each job (`Status`), errors (`Err`), `Runtime` and compute nodes/Host (`Host`). + +## Cancel my jobs +If you for some reason need to cancel a job completely, you need to kill the main thread (driver). From the print above, +the command would be: +* `$SCHRODINGER/jobcontrol -kill login-1-0-61288ba6` + +This will kill the main thread and the subjobs, and also remove them from the slurm queue. + + +### List of jobcontrol commands +`$SCHRODINGER/jobcontrol -keyword` + +| Keyword | Description | +| ------------- |:-------------:| + | -list |list the jobid, job name and status. Use '-children' flag to include subjobs.| + |-show | show the basic information about the job| + |-showsys | show system information about the job (pids, ports, etc.)| + |-dump | display the complete job record| + |-db | use the given jobdb, rather than ~/.schrodinger/.jobdb2| + |-reindex | recreate the jobdb index for the current host| + |-reindexall | recreate the jobdb index for all hosts| + |-cancel | cancel a job that has been launched, but not started| + |-kill | stop the job immediately (and recover output files)| + |-killnooutput | stop the job immediately and abandon output files| + |-abort | emergency stop, abandon output files and the job record| + |-stop | ask the job to stop itself as soon as possible (bmin,desmond,multisim only)| + |-pause | suspend the job temporarily| + |-resume | continue running a paused job| + |-monitor | ask for monitoring files to be sent every sec. Specify zero seconds to turn off monitoring| + |-files | list the files related to the job, with their labels| + |-view