diff --git a/.clang-tidy b/.clang-tidy
index a996e64c0a4..8d269d52fa9 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -10,6 +10,8 @@ Checks: |
    -bugprone-implicit-widening-of-multiplication-result,
    -bugprone-macro-parentheses,
    -bugprone-reserved-identifier,
+   -bugprone-switch-missing-default-case,
+   -bugprone-unchecked-optional-access,
    clang-analyzer-alpha.*,
    modernize-deprecated-headers,
    modernize-make-shared,
@@ -41,7 +43,6 @@ Checks: |
    readability-function-size'
 WarningsAsErrors: '*,-clang-analyzer-core.StackAddrEscapeBase,-clang-analyzer-optin.mpi.MPI-Checker'
 HeaderFilterRegex: '.*'
-AnalyzeTemporaryDtors: false
 FormatStyle:     none
 User: espresso
 CheckOptions:    
diff --git a/.github/actions/build_and_check/action.yml b/.github/actions/build_and_check/action.yml
index 8b41a8ad119..5e87f0e7ad0 100644
--- a/.github/actions/build_and_check/action.yml
+++ b/.github/actions/build_and_check/action.yml
@@ -6,7 +6,7 @@ runs:
     - run: |
        brew install boost boost-mpi fftw
        brew install hdf5-mpi
-       pip3 install -c requirements.txt numpy "cython<3.0" h5py scipy
+       pip3 install -c requirements.txt "cython<3.0" numpy scipy h5py packaging
       shell: bash
       if: runner.os == 'macOS'
     - run: |
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index 06de904ae70..7ab35c52b74 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -16,7 +16,7 @@ jobs:
       - name: Install pandoc
         uses: r-lib/actions/setup-pandoc@v2
       - name: Setup SSH agent
-        uses: webfactory/ssh-agent@v0.7.0
+        uses: webfactory/ssh-agent@v0.9.0
         with:
           ssh-private-key: ${{ secrets.GH_PAGES_SSH_PRIVATE_KEY }}
       - name: Checkout
diff --git a/.github/workflows/push_pull.yml b/.github/workflows/push_pull.yml
index 1a2eb4e39fd..3cbd6fc2d46 100644
--- a/.github/workflows/push_pull.yml
+++ b/.github/workflows/push_pull.yml
@@ -9,7 +9,7 @@ permissions:
 
 jobs:
   macos:
-    runs-on: macos-12
+    runs-on: macos-13
     if: ${{ github.repository == 'espressomd/espresso' }}
     steps:
       - name: Checkout
@@ -20,22 +20,22 @@ jobs:
           key: macos
           save: ${{ github.ref == 'refs/heads/python' }}
       - name: Setup Python environment
-        uses: actions/setup-python@v4.3.1
+        uses: actions/setup-python@v5.1.0
         with:
-          python-version: '3.9'
+          python-version: '3.12'
       - name: Get runner specifications
         run: system_profiler SPHardwareDataType
       - name: Build and check
         uses: ./.github/actions/build_and_check
         env:
-          build_procs: 3
-          check_procs: 3
+          build_procs: 4
+          check_procs: 4
           with_ccache: 'true'
 
   debian:
     runs-on: ubuntu-latest
     container:
-      image: ghcr.io/espressomd/docker/debian:339903979196fd7e72127f2cb5bfb27759d129f9-base-layer
+      image: ghcr.io/espressomd/docker/debian:f7f8ef2c0ca93c67aa16b9f91785492fb04ecc1b-base-layer
       credentials:
          username: ${{ github.actor }}
          password: ${{ secrets.github_token }}
@@ -74,7 +74,7 @@ jobs:
     runs-on: ubuntu-latest
     if: ${{ github.repository == 'espressomd/espresso' }}
     container:
-      image: ghcr.io/espressomd/docker/ubuntu-wo-dependencies:339903979196fd7e72127f2cb5bfb27759d129f9-base-layer
+      image: ghcr.io/espressomd/docker/ubuntu-wo-dependencies:f7f8ef2c0ca93c67aa16b9f91785492fb04ecc1b-base-layer
       credentials:
          username: ${{ github.actor }}
          password: ${{ secrets.github_token }}
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index e80fe191cf5..ea94df3ffd8 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,4 +1,4 @@
-image: ghcr.io/espressomd/docker/ubuntu-22.04:339903979196fd7e72127f2cb5bfb27759d129f9
+image: ghcr.io/espressomd/docker/ubuntu:f7f8ef2c0ca93c67aa16b9f91785492fb04ecc1b
 
 stages:
   - prepare
@@ -18,6 +18,7 @@ stages:
 
 .notification_job_template: &notification_job_definition
   <<: *global_job_definition
+  image: ghcr.io/espressomd/docker/fedora:f7f8ef2c0ca93c67aa16b9f91785492fb04ecc1b
   variables:
     GIT_SUBMODULE_STRATEGY: none
   before_script:
@@ -33,7 +34,7 @@ variables:
   GIT_SUBMODULE_STRATEGY: recursive
   CCACHE_DIR: /cache
   CCACHE_MAXSIZE: 100G
-  with_ccache: "true"
+  with_ccache: 'true'
 
 status_pending:
   <<: *notification_job_definition
@@ -80,9 +81,9 @@ default:
   <<: *global_job_definition
   stage: build
   variables:
-     CC: 'gcc-10'
-     CXX: 'g++-10'
-     GCOV: 'gcov-10'
+     CC: 'gcc-13'
+     CXX: 'g++-13'
+     GCOV: 'gcov-13'
      with_cuda: 'false'
      myconfig: 'default'
      with_coverage: 'true'
@@ -101,11 +102,10 @@ maxset:
   <<: *global_job_definition
   stage: build
   variables:
-     CC: 'gcc-10'
-     CXX: 'g++-10'
-     GCOV: 'gcov-10'
+     CC: 'gcc-13'
+     CXX: 'g++-13'
+     GCOV: 'gcov-13'
      with_cuda: 'false'
-     with_cxx_standard: '20'
      myconfig: 'maxset'
      with_coverage: 'true'
      with_scafacos: 'true'
@@ -113,7 +113,7 @@ maxset:
      with_stokesian_dynamics: 'true'
      with_caliper: 'true'
      check_skip_long: 'true'
-     cmake_params: '-D ESPRESSO_TEST_NP=8'
+     cmake_params: '-D CMAKE_CXX_STANDARD=23 -D ESPRESSO_TEST_NP=8'
   script:
     - bash maintainer/CI/build_cmake.sh
   tags:
@@ -126,9 +126,9 @@ no_rotation:
   <<: *global_job_definition
   stage: build
   variables:
-     CC: 'gcc-10'
-     CXX: 'g++-10'
-     GCOV: 'gcov-10'
+     CC: 'gcc-13'
+     CXX: 'g++-13'
+     GCOV: 'gcov-13'
      with_cuda: 'false'
      myconfig: 'no_rotation'
      with_coverage: 'true'
@@ -141,16 +141,17 @@ no_rotation:
     - no-cuda
     - numa
 
-fedora:36:
+fedora:40:
   <<: *global_job_definition
   stage: build
-  image: ghcr.io/espressomd/docker/fedora:339903979196fd7e72127f2cb5bfb27759d129f9
+  image: ghcr.io/espressomd/docker/fedora:f7f8ef2c0ca93c67aa16b9f91785492fb04ecc1b
   variables:
      with_cuda: 'false'
      with_gsl: 'false'
      myconfig: 'maxset'
      make_check_python: 'true'
      with_stokesian_dynamics: 'true'
+     cmake_params: '-D CMAKE_INCLUDE_PATH=/usr/include/mpich-x86_64 -D CMAKE_PREFIX_PATH=/usr/lib64/mpich/lib/'
   script:
     - bash maintainer/CI/build_cmake.sh
   tags:
@@ -163,19 +164,21 @@ clang-sanitizer:
   <<: *global_job_definition
   stage: build
   variables:
-     CC: 'clang-14'
-     CXX: 'clang++-14'
-     CUDAXX: 'clang++-14'
+     CC: 'clang-18'
+     CXX: 'clang++-18'
+     CUDACXX: 'clang++-18'
      myconfig: 'maxset'
      with_cuda: 'true'
-     with_coverage: 'false'
+     with_cuda_compiler: 'clang'
      with_static_analysis: 'true'
      check_skip_long: 'true'
-     with_asan: 'true'
+     with_asan: 'false'
      with_ubsan: 'true'
      with_scafacos: 'true'
      with_walberla: 'true'
      with_stokesian_dynamics: 'true'
+     with_coverage: 'false'
+     with_coverage_python: 'false'
   script:
     - bash maintainer/CI/build_cmake.sh
   timeout: 2h
@@ -188,9 +191,9 @@ fast_math:
   <<: *global_job_definition
   stage: build
   variables:
-     CC: 'gcc-10'
-     CXX: 'g++-10'
-     GCOV: 'gcov-10'
+     CC: 'gcc-12'
+     CXX: 'g++-12'
+     GCOV: 'gcov-12'
      myconfig: 'maxset'
      with_cuda: 'true'
      with_coverage: 'false'
@@ -204,13 +207,13 @@ fast_math:
     - cuda
   when: manual
 
-cuda11-coverage:
+cuda12-coverage:
   <<: *global_job_definition
   stage: build
   variables:
-     CC: 'gcc-10'
-     CXX: 'g++-10'
-     GCOV: 'gcov-10'
+     CC: 'gcc-12'
+     CXX: 'g++-12'
+     GCOV: 'gcov-12'
      myconfig: 'maxset'
      with_cuda: 'true'
      with_coverage: 'true'
@@ -225,13 +228,13 @@ cuda11-coverage:
     - cuda
     - numa
 
-cuda11-maxset:
+cuda12-maxset:
   <<: *global_job_definition
   stage: build
   variables:
-     CC: 'gcc-10'
-     CXX: 'g++-10'
-     GCOV: 'gcov-10'
+     CC: 'gcc-12'
+     CXX: 'g++-12'
+     GCOV: 'gcov-12'
      myconfig: 'maxset'
      with_cuda: 'true'
      with_coverage: 'false'
@@ -254,14 +257,15 @@ cuda11-maxset:
     - cuda
     - numa
     - avx2
+    - reuse-artifacts-same-arch
 
 tutorials-samples-maxset:
   <<: *global_job_definition
   stage: build
   variables:
-     CC: 'gcc-10'
-     CXX: 'g++-10'
-     GCOV: 'gcov-10'
+     CC: 'gcc-12'
+     CXX: 'g++-12'
+     GCOV: 'gcov-12'
      myconfig: 'maxset'
      with_cuda: 'true'
      with_coverage: 'false'
@@ -288,9 +292,9 @@ tutorials-samples-default:
   <<: *global_job_definition
   stage: build
   variables:
-     CC: 'gcc-10'
-     CXX: 'g++-10'
-     GCOV: 'gcov-10'
+     CC: 'gcc-12'
+     CXX: 'g++-12'
+     GCOV: 'gcov-12'
      myconfig: 'default'
      with_cuda: 'true'
      with_coverage: 'false'
@@ -317,9 +321,9 @@ tutorials-samples-empty:
   <<: *global_job_definition
   stage: build
   variables:
-     CC: 'gcc-10'
-     CXX: 'g++-10'
-     GCOV: 'gcov-10'
+     CC: 'gcc-12'
+     CXX: 'g++-12'
+     GCOV: 'gcov-12'
      myconfig: 'empty'
      with_cuda: 'true'
      with_coverage: 'false'
@@ -344,9 +348,9 @@ tutorials-samples-no-gpu:
   <<: *global_job_definition
   stage: build
   variables:
-     CC: 'gcc-10'
-     CXX: 'g++-10'
-     GCOV: 'gcov-10'
+     CC: 'gcc-12'
+     CXX: 'g++-12'
+     GCOV: 'gcov-12'
      myconfig: 'maxset'
      with_cuda: 'true'
      with_coverage: 'false'
@@ -373,9 +377,9 @@ installation:
   <<: *global_job_definition
   stage: build
   variables:
-     CC: 'gcc-10'
-     CXX: 'g++-10'
-     GCOV: 'gcov-10'
+     CC: 'gcc-12'
+     CXX: 'g++-12'
+     GCOV: 'gcov-12'
      myconfig: 'maxset'
      with_cuda: 'true'
      with_coverage: 'false'
@@ -383,6 +387,7 @@ installation:
      make_check_python: 'false'
      with_scafacos: 'true'
      with_walberla: 'true'
+     with_walberla_avx: 'true'
      with_stokesian_dynamics: 'true'
      srcdir: '${CI_PROJECT_DIR}'
      build_type: 'Release'
@@ -391,31 +396,26 @@ installation:
     - cd build
     - make install
     - cmake . -D ESPRESSO_BUILD_TESTS=ON
-    # get path of installed files
-    - CI_INSTALL_DIR="/tmp/espresso-unit-tests"
-    - CI_INSTALL_PYTHON_PATH=$(dirname $(find "${CI_INSTALL_DIR}/lib" -name espressomd))
-    - CI_CORES=$(cmake -L . | grep ESPRESSO_CTEST_ARGS | grep --color=never -Po '(?<=-j)[0-9]+')
-    # deploy object-in-fluid module
-    - cp -r "src/python/object_in_fluid" "${CI_INSTALL_PYTHON_PATH}/object_in_fluid"
-    # run all tests with the installed files
-    - sed -i "s|$(pwd)/pypresso|${CI_INSTALL_DIR}/bin/pypresso|" testsuite/{python,scripts/samples,scripts/tutorials}/CTestTestfile.cmake
+    - sed -i "s|$(pwd)/pypresso|/tmp/espresso-unit-tests/bin/pypresso|" testsuite/{python,scripts/samples,scripts/tutorials}/CTestTestfile.cmake
     - make check_python_skip_long
     - make check_samples
     - make check_tutorials
   tags:
     - espresso
     - cuda
+    - avx2
   when: manual
 
 empty:
   <<: *global_job_definition
   stage: build
   variables:
-     CC: 'clang-14'
-     CXX: 'clang++-14'
-     CUDAXX: 'clang++-14'
+     CC: 'clang-18'
+     CXX: 'clang++-18'
+     CUDACXX: 'clang++-18'
      myconfig: 'empty'
      with_cuda: 'true'
+     with_cuda_compiler: 'clang'
      with_static_analysis: 'true'
      with_scafacos: 'false'
      with_walberla: 'false'
@@ -433,7 +433,7 @@ check_sphinx:
   <<: *global_job_definition
   stage: additional_checks
   needs:
-    - cuda11-maxset
+    - cuda12-maxset
   when: on_success
   script:
     - cd ${CI_PROJECT_DIR}/build
@@ -450,12 +450,13 @@ check_sphinx:
     - espresso
     - cuda
     - numa
+    - reuse-artifacts-same-arch
 
 run_tutorials:
   <<: *global_job_definition
   stage: additional_checks
   needs:
-    - cuda11-maxset
+    - cuda12-maxset
   when: on_success
   script:
     - cd ${CI_PROJECT_DIR}/build
@@ -470,10 +471,12 @@ run_tutorials:
     paths:
     - build/doc/tutorials
     expire_in: 1 week
+  timeout: 2h
   tags:
     - espresso
     - cuda
     - numa
+    - reuse-artifacts-same-arch
   only:
     - schedules
 
@@ -481,7 +484,7 @@ run_doxygen:
   <<: *global_job_definition
   stage: additional_checks
   needs:
-    - cuda11-maxset
+    - cuda12-maxset
   when: on_success
   only:
     - python
@@ -496,13 +499,14 @@ run_doxygen:
     - espresso
     - no-cuda
     - numa
+    - reuse-artifacts-same-arch
 
 maxset_no_gpu:
   <<: *global_job_definition
   stage: additional_checks
   when: on_success
   needs:
-    - cuda11-maxset
+    - cuda12-maxset
   script:
     - export CUDA_VISIBLE_DEVICES=""
     - cd ${CI_PROJECT_DIR}/build
@@ -511,13 +515,14 @@ maxset_no_gpu:
     - espresso
     - no-cuda
     - numa
+    - reuse-artifacts-same-arch
 
 maxset_3_cores:
   <<: *global_job_definition
   stage: additional_checks
   when: on_success
   needs:
-    - cuda11-maxset
+    - cuda12-maxset
   script:
     - cd ${CI_PROJECT_DIR}/build
     - cmake -D ESPRESSO_TEST_NP=3 .
@@ -526,6 +531,7 @@ maxset_3_cores:
     - espresso
     - cuda
     - numa
+    - reuse-artifacts-same-arch
 
 status_success:
   <<: *notification_job_definition
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9212797e37e..43720dfacec 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -19,7 +19,7 @@ repos:
         always_run: false
         files: '.*\.(py|pyx|pxd)'
         exclude: '\.pylintrc|.*.\.py\.in|^libs/'
-        args: ["--ignore=E266,E402,E701,W291,W293", "--in-place", "--aggressive"]
+        args: ["--ignore=E266,E402,E701,W291,W293", "--in-place"]
 
     -   id: cmake-format
         name: cmake-format
diff --git a/.pylintrc b/.pylintrc
index c6667ebc3aa..27461e3c92b 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -1,16 +1,21 @@
-[MASTER]
+[MAIN]
+
+# Analyse import fallback blocks. This can be used to support both Python 2 and
+# 3 compatible code, which means that the block might have code that exists
+# only in one or another interpreter, leading to false positives when analysed.
+analyse-fallback-blocks=no
 
 # A comma-separated list of package or module names from where C extensions may
 # be loaded. Extensions are loading into the active Python interpreter and may
 # run arbitrary code.
-extension-pkg-whitelist=
+extension-pkg-allow-list=
 
-# Add files or directories to the blacklist. They should be base names, not
-# paths.
-ignore=CVS build
+# Files or directories to be skipped. They should be base names, not paths.
+ignore=build
 
-# Add files or directories matching the regex patterns to the blacklist. The
-# regex matches against base names, not paths.
+# Files or directories matching the regular expression patterns are skipped.
+# The regex matches against base names, not paths. The default value ignores
+# Emacs file locks.
 ignore-patterns=
 
 # Python code to execute, usually for sys.path manipulation such as
@@ -18,23 +23,25 @@ ignore-patterns=
 #init-hook=
 
 # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
-# number of processors available to use.
-jobs=2
+# number of processors available to use, and will cap the count on Windows to
+# avoid hangs.
+jobs=4
 
 # Control the amount of potential inferred values when inferring a single
 # object. This can help the performance when dealing with large functions or
 # complex, nested conditions.
 limit-inference-results=100
 
-# List of plugins (as comma separated values of python modules names) to load,
+# List of plugins (as comma separated values of Python module names) to load,
 # usually to register additional checkers.
 load-plugins=
 
 # Pickle collected data for later comparisons.
 persistent=yes
 
-# Specify a configuration file.
-#rcfile=
+# Minimum Python version to use for version dependent checks. Will default to
+# the version used to run pylint.
+py-version=3.10
 
 # When enabled, pylint would attempt to guess common misconfiguration and emit
 # user-friendly hints instead of false-positive error messages.
@@ -45,218 +52,21 @@ suggestion-mode=yes
 unsafe-load-any-extension=no
 
 
-[MESSAGES CONTROL]
-
-# Only show warnings with the listed confidence levels. Leave empty to show
-# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
-confidence=
-
-# Disable the message, report, category or checker with the given id(s). You
-# can either give multiple identifiers separated by comma (,) or put this
-# option multiple times (only on the command line, not in the configuration
-# file where it should appear only once). You can also use "--disable=all" to
-# disable everything first and then reenable specific checks. For example, if
-# you want to run only the similarities checker, you can use "--disable=all
-# --enable=similarities". If you want to run only the classes checker, but have
-# no Warning level messages displayed, use "--disable=all --enable=classes
-# --disable=W".
-disable=all
-
-# Enable the message, report, category or checker with the given id(s). You can
-# either give multiple identifier separated by comma (,) or put this option
-# multiple time (only on the command line, not in the configuration file where
-# it should appear only once). See also the "--disable" option for examples.
-enable=dangerous-default-value, # W0102
-       duplicate-key, # W0109
-       wildcard-import, # W0401
-       assert-on-tuple, # W0199
-       unused-import, # W0611
-       unused-variable, # W0612
-       unused-argument, # W0613
-       unused-wildcard-import, # W0614
-       deprecated-method, # W1505
-       cyclic-import, # R0401
-       trailing-comma-tuple, # R1707
-       bad-classmethod-argument, # C0202
-       undefined-variable, # E0602
-
-
-[REPORTS]
-
-# Python expression which should return a note less than 10 (10 is the highest
-# note). You have access to the variables errors warning, statement which
-# respectively contain the number of errors / warnings messages and the total
-# number of statements analyzed. This is used by the global evaluation report
-# (RP0004).
-evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
-
-# Template used to display messages. This is a python new-style format string
-# used to format the message information. See doc for all details.
-#msg-template=
-
-# Set the output format. Available formats are text, parseable, colorized, json
-# and msvs (visual studio). You can also give a reporter class, e.g.
-# mypackage.mymodule.MyReporterClass.
-output-format=text
-
-# Tells whether to display a full report or only the messages.
-reports=no
-
-# Activate the evaluation score.
-score=no
-
-
-[REFACTORING]
-
-# Maximum number of nested blocks for function / method body
-max-nested-blocks=5
-
-# Complete name of functions that never returns. When checking for
-# inconsistent-return-statements if a never returning function is called then
-# it will be considered as an explicit return statement and no message will be
-# printed.
-never-returning-functions=sys.exit
-
-
-[VARIABLES]
-
-# List of additional names supposed to be defined in builtins. Remember that
-# you should avoid defining new builtins when possible.
-additional-builtins=
-
-# Tells whether unused global variables should be treated as a violation.
-allow-global-unused-variables=yes
-
-# List of strings which can identify a callback function by name. A callback
-# name must start or end with one of those strings.
-callbacks=cb_,
-          _cb
-
-# A regular expression matching the name of dummy variables (i.e. expected to
-# not be used).
-dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
-
-# Argument names that match this expression will be ignored. Default to name
-# with leading underscore.
-ignored-argument-names=_.*|^ignored_|^unused_
-
-# Tells whether we should check for unused import in __init__ files.
-init-import=no
-
-# List of qualified module names which can have objects that can redefine
-# builtins.
-redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
-
-
-[TYPECHECK]
-
-# List of decorators that produce context managers, such as
-# contextlib.contextmanager. Add to this list to register other decorators that
-# produce valid context managers.
-contextmanager-decorators=contextlib.contextmanager
-
-# List of members which are set dynamically and missed by pylint inference
-# system, and so shouldn't trigger E1101 when accessed. Python regular
-# expressions are accepted.
-generated-members=
-
-# Tells whether missing members accessed in mixin class should be ignored. A
-# mixin class is detected if its name ends with "mixin" (case insensitive).
-ignore-mixin-members=yes
-
-# Tells whether to warn about missing members when the owner of the attribute
-# is inferred to be None.
-ignore-none=yes
-
-# This flag controls whether pylint should warn about no-member and similar
-# checks whenever an opaque object is returned when inferring. The inference
-# can return multiple potential results while evaluating a Python object, but
-# some branches might not be evaluated, which results in partial inference. In
-# that case, it might be useful to still emit no-member and other checks for
-# the rest of the inferred objects.
-ignore-on-opaque-inference=yes
-
-# List of class names for which member attributes should not be checked (useful
-# for classes with dynamically set attributes). This supports the use of
-# qualified names.
-ignored-classes=optparse.Values,thread._local,_thread._local
-
-# List of module names for which member attributes should not be checked
-# (useful for modules/projects where namespaces are manipulated during runtime
-# and thus existing member attributes cannot be deduced by static analysis. It
-# supports qualified module names, as well as Unix pattern matching.
-ignored-modules=
-
-# Show a hint with possible names when a member name was not found. The aspect
-# of finding the hint is based on edit distance.
-missing-member-hint=yes
-
-# The minimum edit distance a name should have in order to be considered a
-# similar match for a missing member name.
-missing-member-hint-distance=1
-
-# The total number of similar names that should be taken in consideration when
-# showing a hint for a missing member.
-missing-member-max-choices=1
-
-
-[SPELLING]
-
-# Limits count of emitted suggestions for spelling mistakes.
-max-spelling-suggestions=4
-
-# Spelling dictionary name. Available dictionaries: de (aspell), de_AT
-# (aspell), de_CH (aspell), de_DE (aspell), en (aspell), en_AU (aspell), en_CA
-# (aspell), en_GB (aspell), en_US (aspell)..
-spelling-dict=
-
-# List of comma separated words that should not be checked.
-spelling-ignore-words=
-
-# A path to a file that contains private dictionary; one word per line.
-spelling-private-dict-file=
-
-# Tells whether to store unknown words to indicated private dictionary in
-# --spelling-private-dict-file option instead of raising a message.
-spelling-store-unknown-words=no
-
-
-[MISCELLANEOUS]
-
-# List of note tags to take in consideration, separated by a comma.
-notes=FIXME,
-      XXX,
-      TODO
-
-
-[SIMILARITIES]
-
-# Ignore comments when computing similarities.
-ignore-comments=yes
-
-# Ignore docstrings when computing similarities.
-ignore-docstrings=yes
-
-# Ignore imports when computing similarities.
-ignore-imports=no
-
-# Minimum lines number of a similarity.
-min-similarity-lines=4
-
-
 [BASIC]
 
 # Naming style matching correct argument names.
 argument-naming-style=snake_case
 
 # Regular expression matching correct argument names. Overrides argument-
-# naming-style.
+# naming-style. If left empty, argument names will be checked with the set
+# naming style.
 #argument-rgx=
 
 # Naming style matching correct attribute names.
 attr-naming-style=snake_case
 
 # Regular expression matching correct attribute names. Overrides attr-naming-
+# style. If left empty, attribute names will be checked with the set naming
 # style.
 #attr-rgx=
 
@@ -272,20 +82,22 @@ bad-names=foo,
 class-attribute-naming-style=any
 
 # Regular expression matching correct class attribute names. Overrides class-
-# attribute-naming-style.
+# attribute-naming-style. If left empty, class attribute names will be checked
+# with the set naming style.
 #class-attribute-rgx=
 
 # Naming style matching correct class names.
 class-naming-style=PascalCase
 
 # Regular expression matching correct class names. Overrides class-naming-
-# style.
+# style. If left empty, class names will be checked with the set naming style.
 #class-rgx=
 
 # Naming style matching correct constant names.
 const-naming-style=UPPER_CASE
 
 # Regular expression matching correct constant names. Overrides const-naming-
+# style. If left empty, constant names will be checked with the set naming
 # style.
 #const-rgx=
 
@@ -297,7 +109,8 @@ docstring-min-length=-1
 function-naming-style=snake_case
 
 # Regular expression matching correct function names. Overrides function-
-# naming-style.
+# naming-style. If left empty, function names will be checked with the set
+# naming style.
 #function-rgx=
 
 # Good variable names which should always be accepted, separated by a comma.
@@ -315,21 +128,22 @@ include-naming-hint=no
 inlinevar-naming-style=any
 
 # Regular expression matching correct inline iteration names. Overrides
-# inlinevar-naming-style.
+# inlinevar-naming-style. If left empty, inline iteration names will be checked
+# with the set naming style.
 #inlinevar-rgx=
 
 # Naming style matching correct method names.
 method-naming-style=snake_case
 
 # Regular expression matching correct method names. Overrides method-naming-
-# style.
+# style. If left empty, method names will be checked with the set naming style.
 #method-rgx=
 
 # Naming style matching correct module names.
 module-naming-style=snake_case
 
 # Regular expression matching correct module names. Overrides module-naming-
-# style.
+# style. If left empty, module names will be checked with the set naming style.
 #module-rgx=
 
 # Colon-delimited sets of names that determine each other's naming style when
@@ -349,10 +163,75 @@ property-classes=abc.abstractproperty
 variable-naming-style=snake_case
 
 # Regular expression matching correct variable names. Overrides variable-
-# naming-style.
+# naming-style. If left empty, variable names will be checked with the set
+# naming style.
 #variable-rgx=
 
 
+[CLASSES]
+
+# Warn about protected attribute access inside special methods
+check-protected-access-in-special-methods=no
+
+# List of method names used to declare (i.e. assign) instance attributes.
+defining-attr-methods=__init__,
+                      __new__,
+                      setUp
+
+# List of member names, which should be excluded from the protected access
+# warning.
+exclude-protected=_asdict,
+                  _fields,
+                  _replace,
+                  _source,
+                  _make
+
+# List of valid names for the first argument in a class method.
+valid-classmethod-first-arg=cls
+
+# List of valid names for the first argument in a metaclass class method.
+valid-metaclass-classmethod-first-arg=cls
+
+
+[DESIGN]
+
+# Maximum number of arguments for function / method.
+max-args=5
+
+# Maximum number of attributes for a class (see R0902).
+max-attributes=7
+
+# Maximum number of boolean expressions in an if statement (see R0916).
+max-bool-expr=5
+
+# Maximum number of branches for function / method body.
+max-branches=12
+
+# Maximum number of locals for function / method body.
+max-locals=15
+
+# Maximum number of parents for a class (see R0901).
+max-parents=7
+
+# Maximum number of public methods for a class (see R0904).
+max-public-methods=20
+
+# Maximum number of return / yield for function / method body.
+max-returns=6
+
+# Maximum number of statements in function / method body.
+max-statements=50
+
+# Minimum number of public methods for a class (see R0903).
+min-public-methods=2
+
+
+[EXCEPTIONS]
+
+# Exceptions that will emit a warning when caught.
+overgeneral-exceptions=builtins.BaseException,builtins.Exception
+
+
 [FORMAT]
 
 # Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
@@ -374,13 +253,6 @@ max-line-length=100
 # Maximum number of lines in a module.
 max-module-lines=1000
 
-# List of optional constructs for which whitespace checking is disabled. `dict-
-# separator` is used to allow tabulation in dicts, etc.: {1  : 1,\n222: 2}.
-# `trailing-comma` allows a space between comma and closing bracket: (a, ).
-# `empty-line` allows space-only lines.
-no-space-check=trailing-comma,
-               dict-separator
-
 # Allow the body of a class to be on the same line as the declaration if body
 # contains single statement.
 single-line-class-stmt=no
@@ -390,48 +262,25 @@ single-line-class-stmt=no
 single-line-if-stmt=no
 
 
-[LOGGING]
-
-# Format style used to check logging format string. `old` means using %
-# formatting, while `new` is for `{}` formatting.
-logging-format-style=old
-
-# Logging modules to check that the string format arguments are in logging
-# function parameter format.
-logging-modules=logging
-
-
-[STRING]
-
-# This flag controls whether the implicit-str-concat-in-sequence should
-# generate a warning on implicit string concatenation in sequences defined over
-# several lines.
-check-str-concat-over-line-jumps=no
-
-
 [IMPORTS]
 
 # Allow wildcard imports from modules that define __all__.
 allow-wildcard-with-all=no
 
-# Analyse import fallback blocks. This can be used to support both Python 2 and
-# 3 compatible code, which means that the block might have code that exists
-# only in one or another interpreter, leading to false positives when analysed.
-analyse-fallback-blocks=no
-
 # Deprecated modules which should not be used, separated by a comma.
 deprecated-modules=optparse,tkinter.tix
 
-# Create a graph of external dependencies in the given file (report RP0402 must
-# not be disabled).
+# Output a graph (.gv or any supported image format) of external dependencies
+# to the given file (report RP0402 must not be disabled).
 ext-import-graph=
 
-# Create a graph of every (i.e. internal and external) dependencies in the
-# given file (report RP0402 must not be disabled).
+# Output a graph (.gv or any supported image format) of all (i.e. internal and
+# external) dependencies to the given file (report RP0402 must not be
+# disabled).
 import-graph=
 
-# Create a graph of internal dependencies in the given file (report RP0402 must
-# not be disabled).
+# Output a graph (.gv or any supported image format) of internal dependencies
+# to the given file (report RP0402 must not be disabled).
 int-import-graph=
 
 # Force import order to recognize a module as part of the standard
@@ -442,64 +291,236 @@ known-standard-library=
 known-third-party=enchant
 
 
-[CLASSES]
+[LOGGING]
 
-# List of method names used to declare (i.e. assign) instance attributes.
-defining-attr-methods=__init__,
-                      __new__,
-                      setUp
+# The type of string formatting that logging methods do. `old` means using %
+# formatting, `new` is for `{}` formatting.
+logging-format-style=new
 
-# List of member names, which should be excluded from the protected access
-# warning.
-exclude-protected=_asdict,
-                  _fields,
-                  _replace,
-                  _source,
-                  _make
+# Logging modules to check that the string format arguments are in logging
+# function parameter format.
+logging-modules=logging
 
-# List of valid names for the first argument in a class method.
-valid-classmethod-first-arg=cls
 
-# List of valid names for the first argument in a metaclass class method.
-valid-metaclass-classmethod-first-arg=cls
+[MESSAGES CONTROL]
 
+# Only show warnings with the listed confidence levels. Leave empty to show
+# all. Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE,
+# UNDEFINED.
+confidence=
 
-[DESIGN]
+# Disable the message, report, category or checker with the given id(s). You
+# can either give multiple identifiers separated by comma (,) or put this
+# option multiple times (only on the command line, not in the configuration
+# file where it should appear only once). You can also use "--disable=all" to
+# disable everything first and then re-enable specific checks. For example, if
+# you want to run only the similarities checker, you can use "--disable=all
+# --enable=similarities". If you want to run only the classes checker, but have
+# no Warning level messages displayed, use "--disable=all --enable=classes
+# --disable=W".
+disable=all
 
-# Maximum number of arguments for function / method.
-max-args=5
+# Enable the message, report, category or checker with the given id(s). You can
+# either give multiple identifier separated by comma (,) or put this option
+# multiple time (only on the command line, not in the configuration file where
+# it should appear only once). See also the "--disable" option for examples.
+enable=dangerous-default-value, # W0102
+       duplicate-key, # W0109
+       wildcard-import, # W0401
+       assert-on-tuple, # W0199
+       unused-import, # W0611
+       unused-variable, # W0612
+       unused-argument, # W0613
+       unused-wildcard-import, # W0614
+       deprecated-method, # W1505
+       cyclic-import, # R0401
+       trailing-comma-tuple, # R1707
+       bad-classmethod-argument, # C0202
+       undefined-variable, # E0602
 
-# Maximum number of attributes for a class (see R0902).
-max-attributes=7
 
-# Maximum number of boolean expressions in an if statement.
-max-bool-expr=5
+[METHOD_ARGS]
 
-# Maximum number of branch for function / method body.
-max-branches=12
+# List of qualified names (i.e., library.method) which require a timeout
+# parameter e.g. 'requests.api.get,requests.api.post'
+timeout-methods=requests.api.delete,requests.api.get,requests.api.head,requests.api.options,requests.api.patch,requests.api.post,requests.api.put,requests.api.request
 
-# Maximum number of locals for function / method body.
-max-locals=15
 
-# Maximum number of parents for a class (see R0901).
-max-parents=7
+[MISCELLANEOUS]
 
-# Maximum number of public methods for a class (see R0904).
-max-public-methods=20
+# List of note tags to take in consideration, separated by a comma.
+notes=FIXME,
+      XXX,
+      TODO
 
-# Maximum number of return / yield for function / method body.
-max-returns=6
 
-# Maximum number of statements in function / method body.
-max-statements=50
+[REFACTORING]
 
-# Minimum number of public methods for a class (see R0903).
-min-public-methods=2
+# Maximum number of nested blocks for function / method body
+max-nested-blocks=5
 
+# Complete name of functions that never returns. When checking for
+# inconsistent-return-statements if a never returning function is called then
+# it will be considered as an explicit return statement and no message will be
+# printed.
+never-returning-functions=sys.exit
 
-[EXCEPTIONS]
 
-# Exceptions that will emit a warning when being caught. Defaults to
-# "BaseException, Exception".
-overgeneral-exceptions=BaseException,
-                       Exception
+[REPORTS]
+
+# Python expression which should return a score less than or equal to 10. You
+# have access to the variables 'fatal', 'error', 'warning', 'refactor',
+# 'convention', and 'info' which contain the number of messages in each
+# category, as well as 'statement' which is the total number of statements
+# analyzed. This score is used by the global evaluation report (RP0004).
+evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
+
+# Template used to display messages. This is a python new-style format string
+# used to format the message information. See doc for all details.
+msg-template=
+
+# Set the output format. Available formats are: text, parseable, colorized,
+# json2 (improved json format), json (old json format) and msvs (visual
+# studio). You can also give a reporter class, e.g.
+# mypackage.mymodule.MyReporterClass.
+output-format=text
+
+# Tells whether to display a full report or only the messages.
+reports=no
+
+# Activate the evaluation score.
+score=no
+
+
+[SIMILARITIES]
+
+# Comments are removed from the similarity computation
+ignore-comments=yes
+
+# Docstrings are removed from the similarity computation
+ignore-docstrings=yes
+
+# Imports are removed from the similarity computation
+ignore-imports=no
+
+# Signatures are removed from the similarity computation
+ignore-signatures=yes
+
+# Minimum lines number of a similarity.
+min-similarity-lines=4
+
+
+[SPELLING]
+
+# Limits count of emitted suggestions for spelling mistakes.
+max-spelling-suggestions=4
+
+# Spelling dictionary name. No available dictionaries : You need to install
+# both the python package and the system dependency for enchant to work.
+spelling-dict=
+
+# List of comma separated words that should be considered directives if they
+# appear at the beginning of a comment and should not be checked.
+spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy:
+
+# List of comma separated words that should not be checked.
+spelling-ignore-words=
+
+# A path to a file that contains the private dictionary; one word per line.
+spelling-private-dict-file=
+
+# Tells whether to store unknown words to the private dictionary (see the
+# --spelling-private-dict-file option) instead of raising a message.
+spelling-store-unknown-words=no
+
+
+[STRING]
+
+# This flag controls whether inconsistent-quotes generates a warning when the
+# character used as a quote delimiter is used inconsistently within a module.
+check-quote-consistency=no
+
+# This flag controls whether the implicit-str-concat should generate a warning
+# on implicit string concatenation in sequences defined over several lines.
+check-str-concat-over-line-jumps=no
+
+
+[TYPECHECK]
+
+# List of decorators that produce context managers, such as
+# contextlib.contextmanager. Add to this list to register other decorators that
+# produce valid context managers.
+contextmanager-decorators=contextlib.contextmanager
+
+# List of members which are set dynamically and missed by pylint inference
+# system, and so shouldn't trigger E1101 when accessed. Python regular
+# expressions are accepted.
+generated-members=
+
+# Tells whether missing members accessed in mixin class should be ignored. A
+# mixin class is detected if its name ends with "mixin" (case insensitive).
+ignore-mixin-members=yes
+
+# Tells whether to warn about missing members when the owner of the attribute
+# is inferred to be None.
+ignore-none=yes
+
+# This flag controls whether pylint should warn about no-member and similar
+# checks whenever an opaque object is returned when inferring. The inference
+# can return multiple potential results while evaluating a Python object, but
+# some branches might not be evaluated, which results in partial inference. In
+# that case, it might be useful to still emit no-member and other checks for
+# the rest of the inferred objects.
+ignore-on-opaque-inference=yes
+
+# List of class names for which member attributes should not be checked (useful
+# for classes with dynamically set attributes). This supports the use of
+# qualified names.
+ignored-classes=optparse.Values,thread._local,_thread._local
+
+# List of module names for which member attributes should not be checked
+# (useful for modules/projects where namespaces are manipulated during runtime
+# and thus existing member attributes cannot be deduced by static analysis. It
+# supports qualified module names, as well as Unix pattern matching.
+ignored-modules=
+
+# Show a hint with possible names when a member name was not found. The aspect
+# of finding the hint is based on edit distance.
+missing-member-hint=yes
+
+# The minimum edit distance a name should have in order to be considered a
+# similar match for a missing member name.
+missing-member-hint-distance=1
+
+# The total number of similar names that should be taken in consideration when
+# showing a hint for a missing member.
+missing-member-max-choices=1
+
+
+[VARIABLES]
+
+# List of additional names supposed to be defined in builtins. Remember that
+# you should avoid defining new builtins when possible.
+additional-builtins=
+
+# Tells whether unused global variables should be treated as a violation.
+allow-global-unused-variables=yes
+
+# List of strings which can identify a callback function by name. A callback
+# name must start or end with one of those strings.
+callbacks=cb_,
+          _cb
+
+# A regular expression matching the name of dummy variables (i.e. expected to
+# not be used).
+dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
+
+# Argument names that match this expression will be ignored.
+ignored-argument-names=_.*|^ignored_|^unused_
+
+# Tells whether we should check for unused import in __init__ files.
+init-import=no
+
+# List of qualified module names which can have objects that can redefine
+# builtins.
+redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7c40cb72a7c..fd98f91fac4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,7 +19,7 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-cmake_minimum_required(VERSION 3.20)
+cmake_minimum_required(VERSION 3.25.1)
 message(STATUS "CMake version: ${CMAKE_VERSION}")
 if(POLICY CMP0076)
   # make target_sources() convert relative paths to absolute
@@ -34,7 +34,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
 
 # C++ standard
 enable_language(CXX)
-set(CMAKE_CXX_STANDARD 17 CACHE STRING "C++ standard to be used")
+set(CMAKE_CXX_STANDARD 20 CACHE STRING "C++ standard to be used")
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
 
@@ -47,11 +47,11 @@ macro(espresso_minimal_compiler_version)
   endif()
 endmacro()
 
-espresso_minimal_compiler_version("GNU" 8.0.0)
-espresso_minimal_compiler_version("Clang" 9.0.0)
-espresso_minimal_compiler_version("AppleClang" 11.0.0)
-espresso_minimal_compiler_version("Intel" 18.0)
-espresso_minimal_compiler_version("IntelLLVM" 2021.0)
+espresso_minimal_compiler_version("GNU" 10.5.0)
+espresso_minimal_compiler_version("Clang" 14.0.0)
+espresso_minimal_compiler_version("AppleClang" 14.0.0)
+espresso_minimal_compiler_version("Intel" 2021.9)
+espresso_minimal_compiler_version("IntelLLVM" 2023.1)
 
 include(FeatureSummary)
 project(ESPResSo)
@@ -190,46 +190,30 @@ add_library(espresso::avx_flags ALIAS espresso_avx_flags)
 
 # CUDA compiler
 if(ESPRESSO_BUILD_WITH_CUDA)
-  set(ESPRESSO_DEFINE_CUDA_ARCHITECTURES OFF)
-  if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-    set(ESPRESSO_DEFINE_CUDA_ARCHITECTURES ON)
-  endif()
+  cmake_minimum_required(VERSION 3.25.2)
   include(CheckLanguage)
   enable_language(CUDA)
   check_language(CUDA)
-  set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD})
+  espresso_minimal_compiler_version("GNU" 11.4.0)
+  espresso_minimal_compiler_version("Clang" 17.0.0)
+  set(CMAKE_CUDA_STANDARD 20)
   set(CMAKE_CUDA_STANDARD_REQUIRED ON)
-  set(ESPRESSO_MINIMAL_CUDA_VERSION 11.0)
+  set(ESPRESSO_MINIMAL_CUDA_VERSION 12.0)
   find_package(CUDAToolkit ${ESPRESSO_MINIMAL_CUDA_VERSION} REQUIRED)
-  if(ESPRESSO_DEFINE_CUDA_ARCHITECTURES)
-    unset(ESPRESSO_CUDA_ARCHITECTURES)
-    # 1. sm_75: RTX-2000 series (Turing)
-    # 2. sm_61: GTX-1000 series (Pascal)
-    # 3. sm_52: GTX-900  series (Maxwell)
-    if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
-      list(APPEND ESPRESSO_CUDA_ARCHITECTURES 75)
-      list(APPEND ESPRESSO_CUDA_ARCHITECTURES 61)
-    elseif(CMAKE_CUDA_COMPILER_ID STREQUAL "Clang")
-      # GTX-900 series (Maxwell)
-      if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12)
-        list(APPEND ESPRESSO_CUDA_ARCHITECTURES 52)
-      endif()
-      if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 10)
-        list(APPEND ESPRESSO_CUDA_ARCHITECTURES 61)
-        # With Clang 14+, architectures sm_70+ are only supported with Thrust
-        # 1.11+ from CUDA 11.3+, for details see
-        # https://github.com/NVIDIA/cub/pull/170
-        if((CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 14)
-           OR (CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.3.0))
-          list(APPEND ESPRESSO_CUDA_ARCHITECTURES 75)
-        endif()
-      endif()
-    endif()
-    # only override CMAKE_CUDA_ARCHITECTURES when dependencies are satisfied
-    if(DEFINED ESPRESSO_CUDA_ARCHITECTURES)
-      set(CMAKE_CUDA_ARCHITECTURES ${ESPRESSO_CUDA_ARCHITECTURES})
+  if(NOT DEFINED ESPRESSO_CMAKE_CUDA_ARCHITECTURES)
+    if("$ENV{CUDAARCHS}" STREQUAL "")
+      # 1. sm_61: GTX-1000 series (Pascal)
+      # 2. sm_75: RTX-2000 series (Turing)
+      # 3. sm_86: RTX-3000 series (Ampere)
+      # 4. sm_89: RTX-4000 series (Ada)
+      set(ESPRESSO_CUDA_ARCHITECTURES "61;75")
+    else()
+      set(ESPRESSO_CUDA_ARCHITECTURES "$ENV{CUDAARCHS}")
     endif()
+    set(ESPRESSO_CMAKE_CUDA_ARCHITECTURES "${ESPRESSO_CUDA_ARCHITECTURES}"
+        CACHE INTERNAL "")
   endif()
+  set(CMAKE_CUDA_ARCHITECTURES "${ESPRESSO_CMAKE_CUDA_ARCHITECTURES}")
   if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
     find_package(CUDACompilerNVCC ${ESPRESSO_MINIMAL_CUDA_VERSION} REQUIRED)
   elseif(CMAKE_CUDA_COMPILER_ID STREQUAL "Clang")
@@ -238,7 +222,7 @@ if(ESPRESSO_BUILD_WITH_CUDA)
         FATAL_ERROR
           "Cannot enable code coverage with Clang as the CUDA compiler")
     endif()
-    find_package(CUDACompilerClang 9.0 REQUIRED)
+    find_package(CUDACompilerClang 17.0 REQUIRED)
   else()
     message(FATAL_ERROR "Unknown CUDA compiler '${CMAKE_CUDA_COMPILER_ID}'")
   endif()
@@ -246,8 +230,8 @@ endif()
 
 # Python interpreter and Cython interface library
 if(ESPRESSO_BUILD_WITH_PYTHON)
-  find_package(Python 3.9 REQUIRED COMPONENTS Interpreter Development NumPy)
-  find_package(Cython 0.29.21...<3.0.8 REQUIRED)
+  find_package(Python 3.10 REQUIRED COMPONENTS Interpreter Development NumPy)
+  find_package(Cython 0.29.28...<3.0.10 REQUIRED)
   find_program(IPYTHON_EXECUTABLE NAMES jupyter ipython3 ipython)
 endif()
 
@@ -457,9 +441,8 @@ if(ESPRESSO_BUILD_WITH_COVERAGE)
       espresso_coverage_flags INTERFACE -g -fprofile-instr-generate
                                         -fcoverage-mapping)
   else()
-    target_compile_options(
-      espresso_coverage_flags INTERFACE -g --coverage -fprofile-arcs
-                                        -ftest-coverage)
+    target_compile_options(espresso_coverage_flags INTERFACE -g --coverage
+                                                             -fprofile-abs-path)
     target_link_libraries(espresso_coverage_flags INTERFACE gcov)
   endif()
 endif()
@@ -472,7 +455,6 @@ target_compile_options(
     -Wall
     -Wextra
     -pedantic
-    $<$<BOOL:${ESPRESSO_WARNINGS_ARE_ERRORS}>:-Werror>
     # add extra warnings
     $<$<CXX_COMPILER_ID:Clang>:-Wextern-initializer>
     $<$<CXX_COMPILER_ID:Clang>:-Wrange-loop-analysis>
@@ -482,16 +464,35 @@ target_compile_options(
     $<$<CXX_COMPILER_ID:Clang,AppleClang,IntelLLVM>:-Wmissing-variable-declarations>
     $<$<AND:$<CXX_COMPILER_ID:Clang>,$<VERSION_GREATER_EQUAL:$<CXX_COMPILER_VERSION>,11.0.0>>:-Wnon-c-typedef-for-linkage>
     $<$<NOT:$<CXX_COMPILER_ID:Intel>>:-Wdelete-non-virtual-dtor>
-    # disable warnings from -Wextra
+    # disable warnings from -Wall and -Wextra
     -Wno-sign-compare
     -Wno-unused-function
     -Wno-unused-parameter
+    -Wno-array-bounds
+    $<$<CXX_COMPILER_ID:GNU>:-Wno-restrict>
     $<$<CXX_COMPILER_ID:GNU>:-Wno-clobbered>
     $<$<CXX_COMPILER_ID:Intel,IntelLLVM>:-diag-disable=592>
     $<$<CXX_COMPILER_ID:Clang,AppleClang>:-Wno-gnu-zero-variadic-macro-arguments>
     $<$<AND:$<CXX_COMPILER_ID:GNU>,$<VERSION_GREATER_EQUAL:$<CXX_COMPILER_VERSION>,8.1.0>>:-Wno-cast-function-type>
     $<$<NOT:$<CXX_COMPILER_ID:Intel,IntelLLVM>>:-Wno-implicit-fallthrough>
-    $<$<NOT:$<CXX_COMPILER_ID:Intel,IntelLLVM,GNU>>:-Wno-unused-private-field>)
+    $<$<NOT:$<CXX_COMPILER_ID:Intel,IntelLLVM,GNU>>:-Wno-unused-private-field>
+    # warnings are errors
+    $<$<BOOL:${ESPRESSO_WARNINGS_ARE_ERRORS}>:-Werror>)
+
+if(ESPRESSO_BUILD_WITH_CUDA)
+  target_compile_options(
+    espresso_cuda_flags
+    INTERFACE
+      -Wall
+      -Wextra
+      -Wno-sign-compare
+      -Wno-unused-parameter
+      $<$<NOT:$<CXX_COMPILER_ID:Intel,IntelLLVM>>:-Wno-implicit-fallthrough>
+      # warnings are errors
+      $<$<AND:$<BOOL:${ESPRESSO_WARNINGS_ARE_ERRORS}>,$<CUDA_COMPILER_ID:NVIDIA>>:--Werror=all-warnings>
+      $<$<AND:$<BOOL:${ESPRESSO_WARNINGS_ARE_ERRORS}>,$<CUDA_COMPILER_ID:Clang>>:-Werror>
+  )
+endif()
 
 # disable warning from -Wextra on ARM processors
 if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_SYSTEM_PROCESSOR MATCHES
@@ -526,16 +527,25 @@ if(ESPRESSO_BUILD_WITH_ASAN)
   target_compile_options(espresso_cpp_flags INTERFACE -fsanitize=address
                                                       -fno-omit-frame-pointer)
   target_link_libraries(espresso_cpp_flags INTERFACE -fsanitize=address)
+  if(ESPRESSO_BUILD_WITH_CUDA)
+    target_link_libraries(espresso_cuda_flags INTERFACE -fsanitize=address)
+  endif()
 endif()
 if(ESPRESSO_BUILD_WITH_MSAN)
   set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -g -O1")
   target_compile_options(espresso_cpp_flags INTERFACE -fsanitize=memory
                                                       -fno-omit-frame-pointer)
   target_link_libraries(espresso_cpp_flags INTERFACE -fsanitize=memory)
+  if(ESPRESSO_BUILD_WITH_CUDA)
+    target_link_libraries(espresso_cuda_flags INTERFACE -fsanitize=memory)
+  endif()
 endif()
 if(ESPRESSO_BUILD_WITH_UBSAN)
   target_compile_options(espresso_cpp_flags INTERFACE -fsanitize=undefined)
   target_link_libraries(espresso_cpp_flags INTERFACE -fsanitize=undefined)
+  if(ESPRESSO_BUILD_WITH_CUDA)
+    target_link_libraries(espresso_cuda_flags INTERFACE -fsanitize=undefined)
+  endif()
 endif()
 
 target_link_libraries(espresso_cpp_flags INTERFACE espresso::coverage_flags)
@@ -595,7 +605,7 @@ if(ESPRESSO_BUILD_WITH_WALBERLA)
   FetchContent_Declare(
     walberla
     GIT_REPOSITORY https://i10git.cs.fau.de/walberla/walberla.git
-    GIT_TAG        065ce5f311850371a97ac4766f47dbb5ca8424ba
+    GIT_TAG        b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
   )
   # workaround for https://gitlab.kitware.com/cmake/cmake/-/issues/21146
   if(NOT DEFINED walberla_SOURCE_DIR OR NOT EXISTS "${walberla_SOURCE_DIR}")
@@ -614,10 +624,9 @@ if(ESPRESSO_BUILD_WITH_WALBERLA)
   set(CMAKE_POSITION_INDEPENDENT_CODE on CACHE BOOL "")
   if(ESPRESSO_BUILD_WITH_CUDA)
     set(WALBERLA_BUILD_WITH_CUDA "on" CACHE BOOL "")
-    if(CMAKE_VERSION VERSION_LESS 3.25 OR NOT ESPRESSO_CUDA_COMPILER STREQUAL
-                                          "clang")
+    if(NOT ESPRESSO_CUDA_COMPILER STREQUAL "clang")
       if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-        set(CMAKE_CUDA_ARCHITECTURES 75)
+        message(FATAL_ERROR "variable CMAKE_CUDA_ARCHITECTURES is undefined")
       endif()
     endif()
   endif()
@@ -637,7 +646,7 @@ if(ESPRESSO_BUILD_WITH_WALBERLA)
     set(WALBERLA_LIBS ${WALBERLA_LIBS} walberla::fft)
   endif()
   if(ESPRESSO_BUILD_WITH_CUDA AND WALBERLA_BUILD_WITH_CUDA)
-    set(WALBERLA_LIBS ${WALBERLA_LIBS} walberla::cuda)
+    set(WALBERLA_LIBS ${WALBERLA_LIBS} walberla::gpu)
   endif()
   # workaround for https://gitlab.kitware.com/cmake/cmake/-/issues/21283
   foreach(target_w_namespace IN LISTS WALBERLA_LIBS)
@@ -660,7 +669,7 @@ if(ESPRESSO_BUILD_WITH_CALIPER)
   FetchContent_Declare(
     caliper
     GIT_REPOSITORY https://github.com/LLNL/Caliper.git
-    GIT_TAG        v2.9.1
+    GIT_TAG        v2.10.0
   )
   if(NOT DEFINED caliper_SOURCE_DIR OR NOT EXISTS "${caliper_SOURCE_DIR}")
     FetchContent_Populate(caliper)
@@ -668,17 +677,18 @@ if(ESPRESSO_BUILD_WITH_CALIPER)
   # cmake-format: on
   set(CALIPER_OPTION_PREFIX on CACHE BOOL "")
   set(CALIPER_WITH_MPI on CACHE BOOL "")
-  if(ESPRESSO_BUILD_WITH_CUDA)
-    set(CALIPER_WITH_NVTX on CACHE BOOL "")
-    set(CALIPER_WITH_CUPTI on CACHE BOOL "")
-  endif()
+  set(CALIPER_WITH_NVTX off CACHE BOOL "")
+  set(CALIPER_WITH_CUPTI off CACHE BOOL "")
   set(CALIPER_BUILD_SHARED_LIBS on CACHE BOOL "")
   add_subdirectory("${caliper_SOURCE_DIR}")
-  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL
-                                              "GNU")
-    target_compile_options(caliper-services
-                           PRIVATE -Wno-deprecated-declarations)
-  endif()
+  target_compile_options(
+    caliper-services
+    PRIVATE
+      $<$<CXX_COMPILER_ID:GNU,Clang,AppleClang>:-Wno-deprecated-declarations>)
+  target_compile_options(
+    caliper-runtime
+    PRIVATE $<$<CXX_COMPILER_ID:GNU>:-Wno-maybe-uninitialized>
+            $<$<CXX_COMPILER_ID:GNU,Clang,AppleClang>:-Wno-volatile>)
 endif()
 
 #
diff --git a/NEWS b/NEWS
index 2632dadb3d8..a0867e89755 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,129 @@
 = ESPRESSO NEWS =
 =================
 
+ESPResSo 4.2.2
+==============
+
+This release provides a number of corrections for the ESPResSo 4.2 line.
+We recommend that this release be used for all production simulations.
+The interface has not been changed between ESPResSo 4.2.1 and 4.2.2.
+However, some bugs were discovered which can affect simulation results.
+Please find the list of changes below. The numbers in brackets refer to
+ticket numbers on https://github.com/espressomd/espresso
+
+Improved documentation
+----------------------
+
+* Installation instructions now mention the FFTW3 MPI dependency
+  of long-range solvers and provide recommended version numbers
+  for Jupyter Notebook dependencies (#4790).
+
+* Installation instructions now mention Python environments (#4922).
+
+* Observables now properly document return values, array shapes,
+  and use a more consistent mathematical notation (#4898).
+
+Bug fixes
+---------
+
+* Fatal runtime errors due to MPI global variables lifetime
+  were addressed (#4858). Older ESPResSo releases built with
+  Boost 1.84 or later might randomly crash when exiting
+  the Python interpreter.
+
+* Virtual sites no longer contribute to the kinetic energy
+  of the system (#4839). The regression was introduced
+  in April 2021 and affected the 4.2 branch of ESPResSo.
+
+* Inertialess tracers are now integrated along the z-axis (#4714).
+  The regression was introduced in February 2022 and affected
+  the 4.2 branch of ESPResSo.
+
+* Inertialess tracers now throw an exception when attempting to use
+  LB GPU with 2 or more MPI ranks (#4714). Before, tracers on non-root
+  MPI ranks would be silently ignored by the CUDA kernels,
+  and would have a constant velocity, either 0 if the particle never
+  visited the fluid domain on the root rank, or the last known velocity
+  if the particle was once on the root rank. This bug affected all
+  ESPResSo versions.
+
+* Particles close to the faces of the simulation box are now properly
+  coupled to the LB fluid (#4827). Due to numerical instability, it was
+  previously possible for particles to be outside the simulation box by
+  a tiny amount and skip LB particle coupling. The probability of this
+  bug occurring was low, but could be enhanced in simulations that
+  purposefully placed particles near the faces of the simulation box:
+  polymers sheared by Lees-Edwards boundary conditions, raspberry
+  particles (colloids, bacteria, etc.) when crossing a periodic
+  boundary, or cell membranes placed close to a periodic boundary.
+
+* Resizing the box now throws a runtime error if there are constraints
+  present (#4778), since constraint preconditions might no longer be
+  fulfilled. For example, a wall constraint might end up outside the
+  box boundaries when the box shrinks.
+
+* Resizing the box via `system.box_l = new_box_l` now throws
+  a runtime error if there are particles present, because particle
+  position folding cannot be guaranteed to be correct (#4901);
+  use `system.change_volume_and_rescale_particles()` instead,
+  which properly rescales particle positions.
+
+* The velocity Verlet NpT propagator doesn't apply friction and noise
+  on angular velocities. ESPResSo now throws an error when NpT
+  encounters a rotating particle (#4843). This bug affected all
+  ESPResSo versions.
+
+* The Brownian thermostat can no longer be configured with
+  `act_on_virtual=True` due to an unresolved bug (#4295)
+  that will be addressed in the next minor release.
+
+* Restrictions on the number of MPI ranks have been lifted from the
+  checkpointing mechanism (#4724). It is now possible to use
+  checkpointing again in MPI-parallel simulations when the system
+  contains LB boundaries or `Union` shape-based constraints.
+  These restrictions had been introduced in 4.2.0 for technical
+  reasons that have since been resolved.
+
+* When passing an invalid value to a function that expects an input
+  parameter of type `list` of size 3, an exception is now raised (#4911).
+  Previously, some functions would print an error message and continue
+  their execution with uninitialized data.
+
+* The per-`type` and per-`mol_id` contributions from
+  `system.analysis.energy()`, `system.analysis.pressure()`
+  and `system.analysis.pressure_tensor()` now return the correct
+  values (#4788). Older versions of ESPResSo were confusing the
+  particle `mol_id` with the particle `type`. The total pressure
+  was unreliable when `mol_id` properties were set to non-zero values.
+
+* The OpenGL visualizer now extracts the correct non-bonded potential
+  parameter `sigma` when feature `WCA` is compiled in but `LENNARD_JONES`
+  isn't (#4720). The regression was introduced in 4.2.1.
+
+* Method `OifCell.elastic_forces()` no longer throws a `TypeError` (#4813).
+
+* Benchmark scripts were adjusted to support large particle numbers (#4753).
+
+Under the hood changes
+----------------------
+
+* Several Clang 16 and GCC 13 compiler diagnostics have been addressed
+  (#4715).
+
+* A non-critical GCC C++20 deprecation warning in Cython-generated code
+  was disabled (#4725).
+
+* Several deprecation warnings emitted by CMake 3.27 have been silenced
+  (#4792).
+
+* Add support for setuptools version 67.3.0 and above (#4709).
+
+* Add support for Python 3.12 in testsuites run by CTest (#4852).
+
+* Python requirements have been updated (#4924).
+
+* CI pipeline URLs have been fixed (#4736).
+
 ESPResSo 4.2.1
 ==============
 
diff --git a/Readme.md b/Readme.md
index 95048e8c80b..098817a275c 100644
--- a/Readme.md
+++ b/Readme.md
@@ -1,3 +1,9 @@
+# Invitation to the ESPResSo Summer School 2024
+
+[![CECAM Flagship School registration link](https://img.shields.io/badge/CECAM%20Flagship%20School-Register%20Now-blue?style=for-the-badge)](https://www.cecam.org/workshop-details/1324)
+
+The summer school "Simulating soft matter across scales" will take place on October 7-11, 2024, in Stuttgart. Registration is now open on [CECAM](https://www.cecam.org/workshop-details/1324).
+
 # ESPResSo
 
 [![GitLab CI](https://gitlab.icp.uni-stuttgart.de/espressomd/espresso/badges/python/pipeline.svg)](https://gitlab.icp.uni-stuttgart.de/espressomd/espresso/-/commits/python)
@@ -70,7 +76,7 @@ For most users, we recommend downloading the latest release version of ESPResSo.
 can find it in the [release page](https://github.com/espressomd/espresso/releases),
 together with past releases until 4.0. When choosing a release, we recommend that
 you get the latest bugfix release in that line. For example, for 4.2 you would like
-to use 4.2.1.
+to use 4.2.2.
 
 ### Join the community
 
diff --git a/cmake/FindCUDACompilerClang.cmake b/cmake/FindCUDACompilerClang.cmake
index 113eefde460..56f02f1023a 100644
--- a/cmake/FindCUDACompilerClang.cmake
+++ b/cmake/FindCUDACompilerClang.cmake
@@ -95,24 +95,34 @@ target_compile_options(
   $<$<CONFIG:Release>:-O3 -DNDEBUG>
   $<$<CONFIG:MinSizeRel>:-O2 -DNDEBUG>
   $<$<CONFIG:RelWithDebInfo>:-O2 -g -DNDEBUG>
-  $<$<CONFIG:Coverage>:-O3 -g>
+  $<$<CONFIG:Coverage>:-O3 -g -fprofile-instr-generate -fcoverage-mapping>
   $<$<CONFIG:RelWithAssert>:-O3 -g>
 )
 
-function(espresso_add_gpu_library)
-  set(options STATIC SHARED MODULE EXCLUDE_FROM_ALL)
-  set(oneValueArgs)
-  set(multiValueArgs)
-  cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  list(GET ARG_UNPARSED_ARGUMENTS 0 TARGET_NAME)
-  list(REMOVE_AT ARG_UNPARSED_ARGUMENTS 0)
-  set(TARGET_SOURCES ${ARG_UNPARSED_ARGUMENTS})
+function(espresso_setup_gpu_app)
+  cmake_parse_arguments(TARGET "" "NAME" "SOURCES" ${ARGN})
   set_source_files_properties(${TARGET_SOURCES} PROPERTIES LANGUAGE "CUDA")
-  add_library(${ARGV})
   set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE "CXX")
   target_link_libraries(${TARGET_NAME} PRIVATE espresso::cuda_flags)
 endfunction()
 
+function(espresso_add_gpu_library)
+  add_library(${ARGV})
+  cmake_parse_arguments(ARG "STATIC;SHARED;MODULE;EXCLUDE_FROM_ALL" "" "" ${ARGN})
+  list(GET ARGV 0 TARGET_NAME)
+  set(TARGET_SOURCES ${ARG_UNPARSED_ARGUMENTS})
+  list(POP_FRONT TARGET_SOURCES)
+  espresso_setup_gpu_app(NAME ${TARGET_NAME} SOURCES ${TARGET_SOURCES})
+endfunction()
+
+function(espresso_add_gpu_executable)
+  add_executable(${ARGV})
+  list(GET ARGV 0 TARGET_NAME)
+  set(TARGET_SOURCES ${ARGV})
+  list(POP_FRONT TARGET_SOURCES)
+  espresso_setup_gpu_app(NAME ${TARGET_NAME} SOURCES ${TARGET_SOURCES})
+endfunction()
+
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(
   CUDACompilerClang REQUIRED_VARS CMAKE_CUDA_COMPILER VERSION_VAR
diff --git a/cmake/FindCUDACompilerNVCC.cmake b/cmake/FindCUDACompilerNVCC.cmake
index 0ddce340a8c..d9600ecf6d7 100644
--- a/cmake/FindCUDACompilerNVCC.cmake
+++ b/cmake/FindCUDACompilerNVCC.cmake
@@ -23,12 +23,12 @@
 # include the toolkit libraries and declare a custom
 # `add_library()` wrapper function named `espresso_add_gpu_library()`.
 
-get_filename_component(ESPRESO_CUDAToolkit_ROOT_RESOLVED "${CUDAToolkit_ROOT}/bin/nvcc" REALPATH)
-get_filename_component(ESPRESO_CMAKE_CUDA_COMPILER_RESOLVED "${CMAKE_CUDA_COMPILER}" REALPATH)
+file(REAL_PATH "${CUDAToolkit_ROOT}/bin/nvcc" ESPRESO_CUDAToolkit_ROOT_RESOLVED)
+file(REAL_PATH "${CMAKE_CUDA_COMPILER}" ESPRESO_CMAKE_CUDA_COMPILER_RESOLVED)
 if(NOT "${ESPRESO_CUDAToolkit_ROOT_RESOLVED}" STREQUAL "${ESPRESO_CMAKE_CUDA_COMPILER_RESOLVED}"
    AND NOT ESPRESSO_INSIDE_DOCKER)
-  get_filename_component(ESPRESSO_NVCC_EXECUTABLE_DIRNAME "${CMAKE_CUDA_COMPILER}" DIRECTORY)
-  get_filename_component(ESPRESSO_NVCC_EXECUTABLE_DIRNAME "${ESPRESSO_NVCC_EXECUTABLE_DIRNAME}" DIRECTORY)
+  cmake_path(GET CMAKE_CUDA_COMPILER PARENT_PATH ESPRESSO_NVCC_EXECUTABLE_DIRNAME)
+  cmake_path(GET ESPRESSO_NVCC_EXECUTABLE_DIRNAME PARENT_PATH ESPRESSO_NVCC_EXECUTABLE_DIRNAME)
   message(
     WARNING
       "Your nvcc compiler (${CMAKE_CUDA_COMPILER}) does not appear to match your CUDA toolkit installation (${CUDAToolkit_ROOT}). While ESPResSo will still compile, you might get unexpected crashes. Try hinting it with '-D CUDAToolkit_ROOT=\"${ESPRESSO_NVCC_EXECUTABLE_DIRNAME}\"'."
@@ -47,17 +47,25 @@ target_compile_options(
   $<$<CONFIG:Release>:-Xptxas=-O3 -Xcompiler=-O3 -DNDEBUG>
   $<$<CONFIG:MinSizeRel>:-Xptxas=-O2 -Xcompiler=-Os -DNDEBUG>
   $<$<CONFIG:RelWithDebInfo>:-Xptxas=-O2 -Xcompiler=-O2,-g -DNDEBUG>
-  $<$<CONFIG:Coverage>:-Xptxas=-O3 -Xcompiler=-Og,-g>
+  $<$<CONFIG:Coverage>:-Xptxas=-O3 -Xcompiler=-Og,-g,--coverage,-fprofile-abs-path>
   $<$<CONFIG:RelWithAssert>:-Xptxas=-O3 -Xcompiler=-O3,-g>
-  $<$<BOOL:${ESPRESSO_WARNINGS_ARE_ERRORS}>:-Xcompiler=-Werror;-Xptxas=-Werror>
   $<$<BOOL:${CMAKE_OSX_SYSROOT}>:-Xcompiler=-isysroot;-Xcompiler=${CMAKE_OSX_SYSROOT}>
+  # workaround for https://github.com/espressomd/espresso/issues/4943
+  $<$<BOOL:${ESPRESSO_BUILD_WITH_CCACHE}>:$<$<CONFIG:Coverage>:--coverage -fprofile-abs-path>>
 )
 
 function(espresso_add_gpu_library)
   add_library(${ARGV})
   set(TARGET_NAME ${ARGV0})
   set_target_properties(${TARGET_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-  target_link_libraries(${TARGET_NAME} PRIVATE espresso::cuda_flags)
+  target_link_libraries(${TARGET_NAME} PRIVATE espresso::cuda_flags $<$<CONFIG:Coverage>:gcov>)
+endfunction()
+
+function(espresso_add_gpu_executable)
+  # Same signature as add_executable(); the first argument is the target name.
+  add_executable(${ARGV})
+  set_property(TARGET ${ARGV0} PROPERTY CUDA_SEPARABLE_COMPILATION ON)
+  target_link_libraries(${ARGV0} PRIVATE espresso::cuda_flags $<$<CONFIG:Coverage>:gcov>)
 endfunction()
 
 include(FindPackageHandleStandardArgs)
diff --git a/cmake/FindCython.cmake b/cmake/FindCython.cmake
index f0373c6217e..ccfc6903a1f 100644
--- a/cmake/FindCython.cmake
+++ b/cmake/FindCython.cmake
@@ -35,11 +35,11 @@
 # Use the Cython executable that lives next to the Python executable
 # if it is a local installation.
 if(Python_EXECUTABLE)
-  get_filename_component(_python_path ${Python_EXECUTABLE} PATH)
+  cmake_path(GET Python_EXECUTABLE PARENT_PATH _python_path)
 elseif(Python3_EXECUTABLE)
-  get_filename_component(_python_path ${Python3_EXECUTABLE} PATH)
+  cmake_path(GET Python3_EXECUTABLE PARENT_PATH _python_path)
 elseif(DEFINED PYTHON_EXECUTABLE)
-  get_filename_component(_python_path ${PYTHON_EXECUTABLE} PATH)
+  cmake_path(GET PYTHON_EXECUTABLE PARENT_PATH _python_path)
 endif()
 
 if(DEFINED _python_path)
diff --git a/cmake/espresso_resource_files.cmake b/cmake/espresso_resource_files.cmake
new file mode 100644
index 00000000000..6daae13c6c4
--- /dev/null
+++ b/cmake/espresso_resource_files.cmake
@@ -0,0 +1,44 @@
+#
+# Copyright (C) 2024 The ESPResSo project
+#
+# This file is part of ESPResSo.
+#
+# ESPResSo is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ESPResSo is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+define_property(TARGET PROPERTY EspressoResourceFiles  # appended to by espresso_target_resources()
+                BRIEF_DOCS "List of resource files to be deployed with target")
+
+# Register resource files (Python files, text files, etc.) that need to be
+# deployed alongside a target. If the file exists in the current source
+# directory, it is copied to the current binary directory with COPYONLY.
+# If not, it is assumed to be a generated file.
+function(espresso_target_resources)
+  # ARGV = <target> <relative path>...; the target name is split off first.
+  list(POP_FRONT ARGV TARGET_NAME)
+  foreach(RESOURCE_RELPATH ${ARGV})
+    if(IS_ABSOLUTE ${RESOURCE_RELPATH})
+      message(FATAL_ERROR "function espresso_target_resources() only supports relative paths, could not process \"${RESOURCE_RELPATH}\"")
+    endif()
+    # Resolve the relative path against the current source and binary trees.
+    set(RESOURCE_BINARY_PATH "${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_RELPATH}")
+    set(RESOURCE_SOURCE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_RELPATH}")
+    if(EXISTS ${RESOURCE_SOURCE_PATH})
+      # Files found in the source tree are copied as-is at configure time.
+      configure_file(${RESOURCE_SOURCE_PATH} ${RESOURCE_BINARY_PATH} COPYONLY)
+    endif()
+    # The binary-tree path is recorded on the target whether or not a copy was made.
+    set_property(TARGET ${TARGET_NAME} APPEND PROPERTY EspressoResourceFiles "${RESOURCE_BINARY_PATH}")
+  endforeach()
+endfunction()
diff --git a/cmake/unit_test.cmake b/cmake/espresso_unit_test.cmake
similarity index 74%
rename from cmake/unit_test.cmake
rename to cmake/espresso_unit_test.cmake
index 535d5ac80ca..c857d9bed13 100644
--- a/cmake/unit_test.cmake
+++ b/cmake/espresso_unit_test.cmake
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2016-2022 The ESPResSo project
+# Copyright (C) 2016-2024 The ESPResSo project
 #
 # This file is part of ESPResSo.
 #
@@ -17,10 +17,17 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-# unit_test function
-function(UNIT_TEST)
-  cmake_parse_arguments(TEST "" "NAME;NUM_PROC" "SRC;DEPENDS" ${ARGN})
-  add_executable(${TEST_NAME} ${TEST_SRC})
+function(ESPRESSO_UNIT_TEST)
+  cmake_parse_arguments(TEST "" "SRC;NAME;NUM_PROC" "DEPENDS" ${ARGN})
+  if(NOT DEFINED TEST_NAME)
+    cmake_path(GET TEST_SRC STEM TEST_NAME)
+    set(TEST_NAME ${TEST_NAME} PARENT_SCOPE)
+  endif()
+  if(${TEST_SRC} MATCHES ".*\.cu$")
+    espresso_add_gpu_executable(${TEST_NAME} ${TEST_SRC})
+  else()
+    add_executable(${TEST_NAME} ${TEST_SRC})
+  endif()
   # Build tests only when testing
   set_target_properties(${TEST_NAME} PROPERTIES EXCLUDE_FROM_ALL ON)
   set_target_properties(${TEST_NAME} PROPERTIES CXX_CLANG_TIDY "${ESPRESSO_CXX_CLANG_TIDY}")
@@ -29,7 +36,15 @@ function(UNIT_TEST)
     target_link_libraries(${TEST_NAME} PRIVATE ${TEST_DEPENDS})
   endif()
   target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/src/core)
-  target_link_libraries(${TEST_NAME} PRIVATE espresso::config espresso::cpp_flags)
+  if(ESPRESSO_BUILD_WITH_COVERAGE AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    target_compile_options(
+      ${TEST_NAME} PRIVATE -fno-default-inline -fno-elide-constructors)
+  endif()
+  if(${TEST_SRC} MATCHES ".*\.cu$")
+    target_link_libraries(${TEST_NAME} PRIVATE espresso::config CUDA::cuda_driver CUDA::cudart)
+  else()
+    target_link_libraries(${TEST_NAME} PRIVATE espresso::config espresso::cpp_flags)
+  endif()
 
   # If NUM_PROC is given, set up MPI parallel test case
   if(TEST_NUM_PROC)
@@ -60,4 +75,4 @@ function(UNIT_TEST)
     ${TEST_NAME} PROPERTIES ENVIRONMENT "${TEST_ENV_VARIABLES}")
 
   add_dependencies(check_unit_tests ${TEST_NAME})
-endfunction(UNIT_TEST)
+endfunction()
diff --git a/doc/bibliography.bib b/doc/bibliography.bib
index c9423a91904..77c61e8901a 100644
--- a/doc/bibliography.bib
+++ b/doc/bibliography.bib
@@ -239,21 +239,6 @@ @Article{brown95a
   publisher={Taylor \& Francis},
 }
 
-@InCollection{burtscher11a,
-author = {Burtscher, Martin and Pingali, Keshav},
-chapter = {6},
-title = {An efficient {CUDA} implementation of the tree-based {B}arnes {H}ut n-body algorithm},
-editor = {Hwu, Wen-mei W.},
-booktitle = {{GPU} Computing Gems Emerald Edition},
-publisher = {Morgan Kaufmann},
-address = {Boston},
-pages = {75--92},
-year = {2011},
-series = {Applications of GPU Computing Series},
-isbn = {978-0-12-384988-5},
-doi = {10.1016/B978-0-12-384988-5.00006-1},
-}
-
 @Article{cerda08d,
   title                    = {{P3M} algorithm for dipolar interactions},
   author                   = {Cerd\`{a}, Juan J. and Ballenegger, Vincent and Lenz, Olaf and Holm, Christian},
@@ -925,17 +910,6 @@ @Article{plimpton95a
   publisher={Elsevier}
 }
 
-@Article{polyakov13a,
-author = {Polyakov, A. Yu. and Lyutyy, T. V. and Denisov, S. and Reva, V. V. and H\"{a}nggi, P.},
-title = {Large-scale ferrofluid simulations on graphics processing units},
-journal = {Computer Physics Communications},
-year = {2013},
-volume = {184},
-number = {6},
-pages = {1483--1489},
-doi = {10.1016/j.cpc.2013.01.016},
-}
-
 @Book{pottier10a,
   title={Nonequilibrium Statistical Physics},
   subtitle={Linear Irreversible Processes},
diff --git a/doc/doxygen/Doxyfile.in b/doc/doxygen/Doxyfile.in
index e4cc02ded2e..67333f21afd 100644
--- a/doc/doxygen/Doxyfile.in
+++ b/doc/doxygen/Doxyfile.in
@@ -1,4 +1,4 @@
-# Doxyfile 1.8.17
+# Doxyfile 1.9.8
 
 @INCLUDE = "@CMAKE_CURRENT_BINARY_DIR@/doxy-features"
 
@@ -95,14 +95,6 @@ ALLOW_UNICODE_NAMES    = NO
 
 OUTPUT_LANGUAGE        = English
 
-# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all
-# documentation generated by doxygen is written. Doxygen will use this
-# information to generate all generated output in the proper direction.
-# Possible values are: None, LTR, RTL and Context.
-# The default value is: None.
-
-OUTPUT_TEXT_DIRECTION  = None
-
 # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
 # descriptions after the members that are listed in the file and class
 # documentation (similar to Javadoc). Set to NO to disable this.
@@ -424,6 +416,14 @@ TYPEDEF_HIDES_STRUCT   = NO
 
 LOOKUP_CACHE_SIZE      = 0
 
+# If the TIMESTAMP tag is set different from NO then each generated page will
+# contain the date or date and time when the page was generated. Setting this to
+# NO can help when comparing the output of multiple runs.
+# Possible values are: YES, NO, DATETIME and DATE.
+# The default value is: NO.
+
+TIMESTAMP              = YES
+
 #---------------------------------------------------------------------------
 # Build related configuration options
 #---------------------------------------------------------------------------
@@ -1202,15 +1202,6 @@ HTML_COLORSTYLE_SAT    = 100
 
 HTML_COLORSTYLE_GAMMA  = 80
 
-# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
-# page will contain the date and time when the page was generated. Setting this
-# to YES can help to show when doxygen was last run and thus if the
-# documentation is up to date.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_TIMESTAMP         = YES
-
 # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
 # documentation will contain sections that can be hidden and shown after the
 # page has loaded.
@@ -1481,17 +1472,6 @@ EXT_LINKS_IN_WINDOW    = NO
 
 FORMULA_FONTSIZE       = 10
 
-# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
-# generated for formulas are transparent PNGs. Transparent PNGs are not
-# supported properly for IE 6.0, but are supported on all modern browsers.
-#
-# Note that when changing this option you need to delete any form_*.png files in
-# the HTML output directory before the changes have effect.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_TRANSPARENT    = YES
-
 # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
 # https://www.mathjax.org) which uses client side JavaScript for the rendering
 # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
@@ -1774,16 +1754,6 @@ LATEX_BATCHMODE        = NO
 
 LATEX_HIDE_INDICES     = NO
 
-# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
-# code with syntax highlighting in the LaTeX output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_SOURCE_CODE      = NO
-
 # The LATEX_BIB_STYLE tag can be used to specify the style to use for the
 # bibliography, e.g. plainnat, or ieeetr. See
 # https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
@@ -1792,14 +1762,6 @@ LATEX_SOURCE_CODE      = NO
 
 LATEX_BIB_STYLE        = plainnat
 
-# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
-# page will contain the date and time when the page was generated. Setting this
-# to NO can help when comparing the output of multiple runs.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_TIMESTAMP        = NO
-
 #---------------------------------------------------------------------------
 # Configuration options related to the RTF output
 #---------------------------------------------------------------------------
@@ -1856,16 +1818,6 @@ RTF_STYLESHEET_FILE    =
 
 RTF_EXTENSIONS_FILE    =
 
-# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
-# with syntax highlighting in the RTF output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_SOURCE_CODE        = NO
-
 #---------------------------------------------------------------------------
 # Configuration options related to the man page output
 #---------------------------------------------------------------------------
@@ -1955,15 +1907,6 @@ GENERATE_DOCBOOK       = NO
 
 DOCBOOK_OUTPUT         = docbook
 
-# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
-# program listings (including syntax highlighting and cross-referencing
-# information) to the DOCBOOK output. Note that enabling this will significantly
-# increase the size of the DOCBOOK output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
-
-DOCBOOK_PROGRAMLISTING = NO
-
 #---------------------------------------------------------------------------
 # Configuration options for the AutoGen Definitions output
 #---------------------------------------------------------------------------
@@ -2143,25 +2086,9 @@ EXTERNAL_GROUPS        = YES
 EXTERNAL_PAGES         = YES
 
 #---------------------------------------------------------------------------
-# Configuration options related to the dot tool
+# Configuration options related to diagram generator tools
 #---------------------------------------------------------------------------
 
-# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
-# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
-# NO turns the diagrams off. Note that this option also works with HAVE_DOT
-# disabled, but it is recommended to install and use dot, since it yields more
-# powerful graphs.
-# The default value is: YES.
-
-CLASS_DIAGRAMS         = YES
-
-# You can include diagrams made with dia in doxygen documentation. Doxygen will
-# then run dia to produce the diagram and insert it in the documentation. The
-# DIA_PATH tag allows you to specify the directory where the dia binary resides.
-# If left empty dia is assumed to be found in the default search path.
-
-DIA_PATH               =
-
 # If set to YES the inheritance and collaboration graphs will hide inheritance
 # and usage relations if the target is undocumented or is not a class.
 # The default value is: YES.
@@ -2187,23 +2114,6 @@ HAVE_DOT               = $(HAVE_DOT)
 
 DOT_NUM_THREADS        = 0
 
-# When you want a differently looking font in the dot files that doxygen
-# generates you can specify the font name using DOT_FONTNAME. You need to make
-# sure dot is able to find the font, which can be done by putting it in a
-# standard location or by setting the DOTFONTPATH environment variable or by
-# setting DOT_FONTPATH to the directory containing the font.
-# The default value is: Helvetica.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTNAME           = Helvetica
-
-# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
-# dot graphs.
-# Minimum value: 4, maximum value: 24, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTSIZE           = 10
-
 # By default doxygen will tell dot to use the default font as specified with
 # DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
 # the path where dot can find it using this tag.
@@ -2418,18 +2328,6 @@ DOT_GRAPH_MAX_NODES    = 100
 
 MAX_DOT_GRAPH_DEPTH    = 0
 
-# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
-# background. This is disabled by default, because dot on Windows does not seem
-# to support this out of the box.
-#
-# Warning: Depending on the platform used, enabling this option may lead to
-# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
-# read).
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_TRANSPARENT        = YES
-
 # Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
 # files in one run (i.e. multiple -o and -T options on the command line). This
 # makes dot run faster, but since only newer versions of dot (>1.8.10) support
diff --git a/doc/sphinx/CMakeLists.txt b/doc/sphinx/CMakeLists.txt
index 7dc5dc3a0bb..676a0e9dffd 100644
--- a/doc/sphinx/CMakeLists.txt
+++ b/doc/sphinx/CMakeLists.txt
@@ -72,11 +72,11 @@ if(SPHINX_FOUND)
       "${CMAKE_CURRENT_SOURCE_DIR}/bibliography.rst")
 
   foreach(file ${FILE_LIST})
-    get_filename_component(basename ${file} NAME)
+    cmake_path(GET file FILENAME basename)
     configure_file(${file} ${CMAKE_CURRENT_BINARY_DIR}/${basename} COPYONLY)
   endforeach()
-  configure_file(${CMAKE_SOURCE_DIR}/doc/bibliography.bib
-                 ${CMAKE_CURRENT_BINARY_DIR}/bibliography.bib COPYONLY)
+  configure_file("${CMAKE_SOURCE_DIR}/doc/bibliography.bib"
+                 "${CMAKE_CURRENT_BINARY_DIR}/bibliography.bib" COPYONLY)
 
   # Place files to be excluded from SPHINX documentation
   set(EXCLUDE "${SPHINX_PYTHON_DIR}/gen_code_info.py"
diff --git a/doc/sphinx/advanced_methods.rst b/doc/sphinx/advanced_methods.rst
index b18b813281d..a4553a0a138 100644
--- a/doc/sphinx/advanced_methods.rst
+++ b/doc/sphinx/advanced_methods.rst
@@ -126,43 +126,6 @@ Several modes are available for different types of binding.
             part_type_to_be_glued=3,
             part_type_after_glueing=4)
 
-* ``"bind_three_particles"`` allows for the creation of agglomerates which maintain
-  their shape similarly to those create by the mode ``"bind_at_point_of_collision"``.
-  The present approach works without virtual sites. Instead, for each two-particle
-  collision, the surrounding is searched for a third particle. If one is found,
-  angular bonds are placed to maintain the local shape.
-  If all three particles are within the cutoff distance, an angle bond is added
-  on each of the three particles in addition
-  to the distance based bonds between the particle centers.
-  If two particles are within the cutoff of a central particle (e.g., chain of three particles)
-  an angle bond is placed on the central particle.
-  The angular bonds being added are determined from the angle between the particles.
-  This method does not depend on the particles' rotational
-  degrees of freedom being integrated. Virtual sites are not required.
-  The method, along with the corresponding bonds are setup as follows::
-
-        first_angle_bond_id = 0
-        n_angle_bonds = 181  # 0 to 180 degrees in one degree steps
-        for i in range(0, n_angle_bonds, 1):
-            bond_id = first_angle_bond_id + i
-            system.bonded_inter[bond_id] = espressomd.interactions.AngleHarmonic(
-                bend=1., phi0=float(i) / float(n_angle_bonds - 1) * np.pi)
-
-        bond_centers = espressomd.interactions.HarmonicBond(k=1., r_0=0.1, r_cut=0.5)
-        system.bonded_inter.add(bond_centers)
-
-        system.collision_detection.set_params(
-            mode="bind_three_particles",
-            bond_centers=bond_centers,
-            bond_three_particles=first_angle_bond_id,
-            three_particle_binding_angle_resolution=n_angle_bonds,
-            distance=0.1)
-
-  Important: The bonds for the angles are mapped via their numerical bond ids.
-  In this example, ids from 0 to 180 are used. All other bonds required for
-  the simulation need to be added to the system after those bonds. In particular,
-  this applies to the bonded interaction passed via ``bond_centers``
-
 
 The following limitations currently apply for the collision detection:
 
@@ -1301,8 +1264,7 @@ In |es|, the basic ingredients to simulate such a system are split into three bo
 The system-wide thermostat has to be applied to the centre of mass and not to
 the core particle directly. Therefore, the particles have to be excluded from
 global thermostatting.  With ``THERMOSTAT_PER_PARTICLE`` enabled, we set the
-friction coefficient of the Drude complex to zero, which allows
-to still use a global Langevin thermostat for non-polarizable particles.
+friction coefficient of the Drude complex to zero.
 
 As the Drude charge should not alter the *charge* or *mass* of the Drude
 complex, both properties have to be subtracted from the core when adding the
@@ -1313,9 +1275,11 @@ polarizability :math:`\alpha` (in units of inverse volume) with :math:`q_d =
 
 The following helper method takes into account all the preceding considerations
 and can be used to conveniently add a Drude particle to a given core particle.
-It returns an `espressomd.particle_data.ParticleHandle` to the created Drude
+It returns a :class:`~espressomd.particle_data.ParticleHandle` of the created Drude
 particle. Note that as the function also adds the first two bonds between Drude
-and core, these bonds have to be already available.::
+and core, these bonds have to be already available.
+
+.. code-block::
 
     import espressomd.drude_helpers
     dh = espressomd.drude_helpers.DrudeHelpers()
diff --git a/doc/sphinx/electrostatics.rst b/doc/sphinx/electrostatics.rst
index e3a5e39e417..8f6713cbcc8 100644
--- a/doc/sphinx/electrostatics.rst
+++ b/doc/sphinx/electrostatics.rst
@@ -342,10 +342,6 @@ MMM1D
 
 :class:`espressomd.electrostatics.MMM1D`
 
-.. note::
-    Required features: ``ELECTROSTATICS`` for MMM1D, the GPU version
-    additionally needs the features ``CUDA`` and ``MMM1D_GPU``.
-
 Please cite :cite:`arnold05b` when using MMM1D. See :ref:`MMM1D theory` for
 the details.
 
@@ -369,34 +365,6 @@ change the value of the ``timings`` argument of the
 :class:`~espressomd.electrostatics.MMM1D` class,
 which controls the number of test force calculations.
 
-.. _MMM1D on GPU:
-
-MMM1D on GPU
-~~~~~~~~~~~~
-
-:class:`espressomd.electrostatics.MMM1DGPU`
-
-MMM1D is also available in a GPU implementation. Unlike its CPU
-counterpart, it does not need the N-squared cell system.
-
-::
-
-    import espressomd.electrostatics
-    mmm1d = espressomd.electrostatics.MMM1DGPU(prefactor=C, far_switch_radius=fr,
-                                               maxPWerror=err, tune=False, bessel_cutoff=bc)
-    mmm1d = espressomd.electrostatics.MMM1DGPU(prefactor=C, maxPWerror=err)
-
-The first form sets parameters manually. The switch radius determines at which
-xy-distance the force calculation switches from the near to the far
-formula. If the Bessel cutoff is not explicitly given, it is determined
-from the maximal pairwise error, otherwise this error only counts for
-the near formula. The second tuning form just takes the maximal pairwise
-error and tries out a lot of switching radii to find out the fastest one.
-
-For details on the MMM family of algorithms, refer to appendix
-:ref:`The MMM family of algorithms`.
-
-
 .. _ScaFaCoS electrostatics:
 
 ScaFaCoS electrostatics
diff --git a/doc/sphinx/installation.rst b/doc/sphinx/installation.rst
index 46ba6fbe7ee..6b86106b8b2 100644
--- a/doc/sphinx/installation.rst
+++ b/doc/sphinx/installation.rst
@@ -25,7 +25,7 @@ performance of the code. Therefore it is not possible to build a single
 binary that can satisfy all needs. For performance reasons a user
 should always activate only those features that are actually needed.
 This means, however, that learning how to compile is a necessary evil.
-The build system of |es| uses CMake [4]_ to compile
+The build system of |es| uses CMake to compile
 software easily on a wide range of platforms.
 
 Users who only need a "default" installation of |es| and have an account
@@ -45,10 +45,10 @@ are required to be able to compile and use |es|:
 .. glossary::
 
     CMake
-        The build system is based on CMake.
+        The build system is based on CMake version 3 or later [4]_.
 
     C++ compiler
-        The C++ core of |es| needs to be built by a C++17-capable compiler.
+        The C++ core of |es| needs to be built by a C++20-capable compiler.
 
     Boost
         A number of advanced C++ features used by |es| are provided by Boost.
@@ -58,6 +58,11 @@ are required to be able to compile and use |es|:
         For some algorithms like P\ :math:`^3`\ M, |es| needs the FFTW library
         version 3 or later [5]_ for Fourier transforms, including header files.
 
+    CUDA
+        For some algorithms like P\ :math:`^3`\ M,
+        |es| provides GPU-accelerated implementations for NVIDIA GPUs.
+        We strongly recommend CUDA 12.0 or later [6]_.
+
     MPI
         An MPI library that implements the MPI standard version 1.2 is required
         to run simulations in parallel. |es| is currently tested against
@@ -79,22 +84,43 @@ are required to be able to compile and use |es|:
     Python
         |es|'s main user interface relies on Python 3.
 
+        We strongly recommend using Python environments to isolate
+        packages required by |es| from packages installed system-wide.
+        This can be achieved using venv [7]_, conda [8]_, or any similar tool.
+        Inside an environment, commands of the form
+        ``sudo apt install python3-numpy python3-scipy``
+        can be rewritten as ``python3 -m pip install numpy scipy``,
+        and thus do not require root privileges.
+
+        Depending on your needs, you may choose to install all |es|
+        dependencies inside the environment, or only the subset of
+        dependencies not already satisfied by your workstation or cluster.
+        For the exact syntax to create and configure an environment,
+        please refer to the tool documentation.
+
     Cython
         Cython is used for connecting the C++ core to Python.
 
+        Python environment tools may allow you to install a Python executable
+        that is more recent than the system-wide Python executable.
+        Be aware this might lead to compatibility issues if Cython
+        accidentally picks up the system-wide :file:`Python.h` header file.
+        In that scenario, you will have to manually adapt the C++ compiler
+        include paths to find the correct :file:`Python.h` header file.
+
 
 .. _Installing requirements on Ubuntu Linux:
 
 Installing requirements on Ubuntu Linux
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-To compile |es| on Ubuntu 22.04 LTS, install the following dependencies:
+To compile |es| on Ubuntu 24.04 LTS, install the following dependencies:
 
 .. code-block:: bash
 
-    sudo apt install build-essential cmake cython3 python3-pip python3-numpy \
-      libboost-all-dev openmpi-common fftw3-dev libfftw3-mpi-dev libhdf5-dev libhdf5-openmpi-dev \
-      python3-scipy python3-opengl libgsl-dev freeglut3
+    sudo apt install build-essential cmake cython3 python3-dev openmpi-bin \
+      libboost-all-dev fftw3-dev libfftw3-mpi-dev libhdf5-dev libhdf5-openmpi-dev \
+      python3-pip python3-numpy python3-scipy python3-opengl libgsl-dev freeglut3
 
 Optionally the ccmake utility can be installed for easier configuration:
 
@@ -120,7 +146,7 @@ paths before building the project, for example via environment variables:
 
 .. code-block:: bash
 
-    export CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda-11.5"
+    export CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda-12.0"
     export PATH="${CUDA_TOOLKIT_ROOT_DIR}/bin${PATH:+:$PATH}"
     export LD_LIBRARY_PATH="${CUDA_TOOLKIT_ROOT_DIR}/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
 
@@ -130,41 +156,41 @@ to activate CUDA. These commands may need to be adapted depending on which
 operating system and CUDA version you are using.
 
 You can control the list of CUDA architectures to generate device code for.
-For example, ``-D CMAKE_CUDA_ARCHITECTURES=61;75`` will generate device code
-for both sm_61 and sm_75 architectures.
+For example, ``CUDAARCHS="61;75" cmake .. -D ESPRESSO_BUILD_WITH_CUDA=ON``
+will generate device code for both sm_61 and sm_75 architectures.
 
-On Ubuntu 22.04, the default GCC compiler is too recent for nvcc and will fail
-to compile sources that rely on ``std::function``. You can either use GCC 10:
+On Ubuntu 24.04, the default GCC compiler may be too recent for nvcc.
+You can either use GCC 12:
 
 .. code-block:: bash
 
-    CC=gcc-10 CXX=g++-10 CUDACXX=/usr/local/cuda-11.5/bin/nvcc cmake .. \
+    CC=gcc-12 CXX=g++-12 CUDACXX=/usr/local/cuda-12.0/bin/nvcc cmake .. \
       -D ESPRESSO_BUILD_WITH_CUDA=ON \
-      -D CUDAToolkit_ROOT=/usr/local/cuda-11.5 \
-      -D CMAKE_CUDA_FLAGS="--compiler-bindir=/usr/bin/g++-10"
+      -D CUDAToolkit_ROOT=/usr/local/cuda-12.0 \
+      -D CMAKE_CUDA_FLAGS="--compiler-bindir=/usr/bin/g++-12"
 
-or alternatively install Clang 14 as a replacement for nvcc and GCC:
+or alternatively install Clang 18 as a replacement for nvcc and GCC:
 
 .. code-block:: bash
 
-    CC=clang-14 CXX=clang++-14 CUDACXX=clang++-14 cmake .. \
+    CC=clang-18 CXX=clang++-18 CUDACXX=clang++-18 cmake .. \
       -D ESPRESSO_BUILD_WITH_CUDA=ON \
-      -D CUDAToolkit_ROOT=/usr/local/cuda-11.5 \
-      -D CMAKE_CXX_FLAGS="-I/usr/include/x86_64-linux-gnu/c++/10 -I/usr/include/c++/10 --cuda-path=/usr/local/cuda-11.5" \
-      -D CMAKE_CUDA_FLAGS="-I/usr/include/x86_64-linux-gnu/c++/10 -I/usr/include/c++/10 --cuda-path=/usr/local/cuda-11.5"
+      -D CUDAToolkit_ROOT=/usr/local/cuda-12.0 \
+      -D CMAKE_CXX_FLAGS="-I/usr/include/x86_64-linux-gnu/c++/12 -I/usr/include/c++/12 --cuda-path=/usr/local/cuda-12.0" \
+      -D CMAKE_CUDA_FLAGS="-I/usr/include/x86_64-linux-gnu/c++/12 -I/usr/include/c++/12 --cuda-path=/usr/local/cuda-12.0"
 
 Please note that all CMake options and compiler flags that involve
 ``/usr/local/cuda-*`` need to be adapted to your CUDA environment.
 But they are only necessary on systems with multiple CUDA releases installed,
 and can be safely removed if you have only one CUDA release installed.
 
-Please also note that with Clang, you still need the GCC 10 toolchain,
-which can be set up with ``apt install gcc-10 g++-10 libstdc++-10-dev``.
+Please also note that with Clang, you still need the GCC 12 toolchain,
+which can be set up with ``apt install gcc-12 g++-12 libstdc++-12-dev``.
 The extra compiler flags in the Clang CMake command above are needed to pin
 the search paths of Clang. By default, it searches trough the most recent
-GCC version, which is GCC 12 on Ubuntu 22.04. It is not possible to install
-the NVIDIA driver without GCC 12 due to a dependency resolution issue
-(``nvidia-dkms`` depends on ``dkms`` which depends on ``gcc-12``).
+GCC version, which is GCC 13 on Ubuntu 24.04. It is not possible to install
+the NVIDIA driver without GCC 13 due to a dependency resolution issue
+(``nvidia-dkms`` depends on ``dkms`` which depends on ``gcc-13``).
 
 .. _Requirements for building the documentation:
 
@@ -255,11 +281,11 @@ Installing requirements on Windows via WSL
 
 To run |es| on Windows, use the Linux subsystem. For that you need to
 
-* follow `these instructions <https://docs.microsoft.com/en-us/windows/wsl/install-win10>`__ to install Ubuntu
-* start Ubuntu (or open an Ubuntu tab in `Windows Terminal <https://www.microsoft.com/en-us/p/windows-terminal/9n0dx20hk701>`__)
+* follow `these instructions <https://learn.microsoft.com/en-us/windows/wsl/install>`__ to install Ubuntu
+* start Ubuntu (or open an Ubuntu tab in `Windows Terminal <https://apps.microsoft.com/detail/9n0dx20hk701?hl=en-us&gl=US>`__)
 * execute ``sudo apt update`` to prepare the installation of dependencies
 * optional step: If you have a NVIDIA graphics card available and want to make
-  use of |es|'s GPU acceleration, follow `these instructions <https://docs.nvidia.com/cuda/wsl-user-guide/index.html#ch03a-setting-up-cuda>`__
+  use of |es|'s GPU acceleration, follow `these instructions <https://docs.nvidia.com/cuda/wsl-user-guide/index.html>`__
   to set up CUDA.
 * follow the instructions for :ref:`Installing requirements on Ubuntu Linux`
 
@@ -409,10 +435,6 @@ General features
 
    .. seealso:: :ref:`Electrostatics`
 
--  ``MMM1D_GPU``: This enables MMM1D on GPU. It is faster than the CPU version
-   by several orders of magnitude, but has float precision instead of double
-   precision.
-
 -  ``MMM1D_MACHINE_PREC``: This enables high-precision Bessel functions
    for MMM1D on CPU. Comes with a 60% slow-down penalty. The low-precision
    functions are enabled by default and are precise enough for most applications.
@@ -807,12 +829,13 @@ When an option is enabled, additional options may become available.
 For example with ``-D ESPRESSO_BUILD_TESTS=ON``, one can specify
 the CTest parameters with ``-D ESPRESSO_CTEST_ARGS=-j$(nproc)``.
 
-Environment variables can be passed to CMake. For example, to select Clang, use
-``CC=clang CXX=clang++ CUDACXX=clang++ cmake .. -D ESPRESSO_BUILD_WITH_CUDA=ON``.
-If you have multiple versions of the CUDA library installed, you can select the
-correct one with ``CUDA_BIN_PATH=/usr/local/cuda-11.5 cmake .. -D ESPRESSO_BUILD_WITH_CUDA=ON``
-(with Clang as the CUDA compiler, you also need to override its default CUDA
-path with ``-D CMAKE_CUDA_FLAGS=--cuda-path=/usr/local/cuda-11.5``).
+Environment variables can be passed to CMake. For example, to select the Clang
+compiler and specify which GPU architectures to generate device code for, use
+``CC=clang CXX=clang++ CUDACXX=clang++ CUDAARCHS="61;75" cmake .. -D ESPRESSO_BUILD_WITH_CUDA=ON``.
+When multiple versions of the CUDA library are available, the correct one can be
+selected with ``CUDA_BIN_PATH=/usr/local/cuda-12.0 cmake .. -D ESPRESSO_BUILD_WITH_CUDA=ON``
+(with Clang as the CUDA compiler, it is also necessary to override its default
+CUDA path with ``-D CMAKE_CUDA_FLAGS=--cuda-path=/usr/local/cuda-12.0``).
 
 .. _Build types and compiler flags:
 
@@ -990,3 +1013,12 @@ ____
 
 .. [5]
    https://www.fftw.org/
+
+.. [6]
+   https://docs.nvidia.com/cuda/
+
+.. [7]
+   https://docs.python.org/3/library/venv.html
+
+.. [8]
+   https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html
diff --git a/doc/sphinx/integration.rst b/doc/sphinx/integration.rst
index 2c82f184043..6802e59d086 100644
--- a/doc/sphinx/integration.rst
+++ b/doc/sphinx/integration.rst
@@ -9,7 +9,7 @@ Particle integration and propagation
 ------------------------------------
 
 The main integration scheme of |es| is the velocity Verlet algorithm.
-A steepest descent algorithm is used to minimize the system.
+A steepest descent algorithm is used to minimize forces and torques in the system.
 
 Additional integration schemes are available, which can be coupled to
 thermostats to enable Langevin dynamics, Brownian dynamics, Stokesian dynamics,
@@ -21,109 +21,107 @@ Integrators
 -----------
 
 To run the integrator call the method
-:meth:`system.integrate.run() <espressomd.integrate.Integrator.run>`::
+:meth:`system.integrator.run() <espressomd.integrate.Integrator.run>`::
 
     system.integrator.run(number_of_steps, recalc_forces=False, reuse_forces=False)
 
 where ``number_of_steps`` is the number of time steps the integrator should perform.
 
-.. _Velocity Verlet Algorithm:
+The following sections detail the different integrators available.
+
+.. _Velocity Verlet algorithm:
 
 Velocity Verlet algorithm
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 
-:meth:`espressomd.integrate.IntegratorHandle.set_vv`
+The velocity Verlet integrator is active by default.
+If you used a different integrator and want to switch back, use 
+:meth:`system.integrator.set_vv() <espressomd.integrate.IntegratorHandle.set_vv>`.
+
+The velocity Verlet algorithm is used for equations of motion of the general form
 
-The equations of motion for the trajectory of point-like particles read
+.. math::
 
-.. math:: \dot v_i(t) = F_i(\{x_j\},v_i,t)/m_i \\ \dot x_i(t) = v_i(t),
+    \begin{aligned}
+    \dot{\vec{x}}_i(t) &= \vec{v}_i(t), \\
+    \dot{\vec{v}}_i(t) &= \frac{\vec{F}_i(\{ \vec{x}_j \} ,\vec{v}_i,t)}{m_i},
+    \end{aligned}
 
-where :math:`x_i`, :math:`v_i`, :math:`m_i` are position, velocity and mass of
-particle :math:`i` and :math:`F_i(\{x_j\},v_i,t)` the forces acting on it.
-These forces comprise all interactions with other particles and external fields
-as well as non-deterministic contributions described in :ref:`Thermostats`.
+where :math:`\vec{x}_i`, :math:`\vec{v}_i`, :math:`m_i` are position, velocity and mass of
+particle :math:`i` and :math:`\vec{F}_i(\{\vec{x}_j\},\vec{v}_i,t)` the forces acting on it.
+The force :math:`\vec{F}_i` comprises all interactions of particle :math:`i` with other particles :math:`j` and external fields
+as well as contributions from thermostats, see :ref:`Thermostats`.
 
-For numerical integration, this equation is discretized to the following steps (:cite:`rapaport04a` eqs. 3.5.8 - 3.5.10):
+For numerical integration, the equation of motion is discretized to the following steps (:cite:`rapaport04a` eqs. 3.5.8 - 3.5.10):
 
 1. Calculate the velocity at the half step
 
-   .. math:: v(t+dt/2) = v(t) + \frac{F(x(t),v(t-dt/2),t)}{m} dt/2
+   .. math:: \vec{v}(t+dt/2) = \vec{v}(t) + \frac{\vec{F}(\vec{x}(t),\vec{v}(t-dt/2),t)}{m} dt/2
 
 2. Calculate the new position
 
-   .. math:: x(t+dt) = x(t) + v(t+dt/2) dt
+   .. math:: \vec{x}(t+dt) = \vec{x}(t) + \vec{v}(t+dt/2) dt
 
 3. Calculate the force based on the new position
 
-   .. math:: F = F(x(t+dt), v(t+dt/2), t+dt)
+   .. math:: \vec{F} = \vec{F}(\vec{x}(t+dt), \vec{v}(t+dt/2), t+dt)
 
 4. Calculate the new velocity
 
-   .. math:: v(t+dt) = v(t+dt/2) + \frac{F(x(t+dt),t+dt)}{m} dt/2
+   .. math:: \vec{v}(t+dt) = \vec{v}(t+dt/2) + \frac{\vec{F}(\vec{x}(t+dt), \vec{v}(t+dt/2), t+dt)}{m} dt/2
+
+Here, for simplicity, we have omitted the particle index :math:`i`.
+Read, e.g., :math:`\vec{x}` as the position of all particles.
 
 Note that this implementation of the velocity Verlet algorithm reuses
 forces in step 1. That is, they are computed once in step 3,
-but used twice, in step 4 and in step 1 of the next iteration. In the first time
-step after setting up, there are no forces present yet. Therefore, |es| has
+but used twice, in step 4 and in step 1 of the next iteration. 
+The first time the integrator is called, there are no forces present yet. 
+Therefore, |es| has
 to compute them before the first time step. That has two consequences:
-first, random forces are redrawn, resulting in a narrower distribution
-of the random forces, which we compensate by stretching. Second,
-coupling forces of e.g. the lattice-Boltzmann fluid cannot be computed
+first, if thermostats are active, random forces are computed twice during 
+the first time step, resulting in a narrower distribution of the random forces.
+Second,
+coupling forces of, e.g., the lattice-Boltzmann fluid cannot be computed
 and are therefore lacking in the first half time step. In order to
 minimize these effects, |es| has a quite conservative heuristics to decide
-whether a change makes it necessary to recompute forces before the first
-time step. Therefore, calling 100 times
-:meth:`espressomd.integrate.Integrator.run` with ``steps=1`` does the
-same as with ``steps=100``, apart from some small calling overhead.
-
-However, for checkpointing, there is no way for |es| to tell that the forces
-that you read back in actually match the parameters that are set.
-Therefore, |es| would recompute the forces before the first time step, which
-makes it essentially impossible to checkpoint LB simulations, where it
-is vital to keep the coupling forces. To work around this, there is
-an additional parameter ``reuse_forces``, which tells integrate to not recalculate
-the forces for the first time step, but use that the values still stored
-with the particles. Use this only if you are absolutely sure that the
-forces stored match your current setup!
-
-The opposite problem occurs when timing interactions: In this case, one
-would like to recompute the forces, despite the fact that they are
-already correctly calculated. To this aim, the option ``recalc_forces`` can be used to
-enforce force recalculation.
+whether a change makes it necessary to recompute forces before the first time step. 
+Therefore, calling 
+:meth:`espressomd.integrate.Integrator.run` 100 times with ``steps=1`` is equivalent to calling it once with ``steps=100``.
+
+When resuming a simulation, you can either use the forces that are stored on the particles by using the additional parameter ``reuse_forces = True``, or recalculate the forces from the current configuration with ``reuse_forces = False``.
+Setting ``reuse_forces = True`` is useful when restarting a simulation from a checkpoint to obtain exactly the same result as if the integration had continued without interruption.
+You can also use ``recalc_forces = True`` to recalculate forces even if they are already correctly computed.
 
 .. _Isotropic NpT integrator:
 
 Isotropic NpT integrator
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
-:meth:`espressomd.integrate.IntegratorHandle.set_isotropic_npt`
+Simulations in the NpT ensemble are performed with the isotropic NpT integrator :meth:`~espressomd.integrate.IntegratorHandle.set_isotropic_npt`.
+A code snippet would look like::
 
-As the NpT thermostat alters the way the equations of motion are integrated, it is
-discussed here and only a brief summary is given in :ref:`Thermostats`.
+    import espressomd
 
-To activate the NpT integrator, use :meth:`~espressomd.integrate.IntegratorHandle.set_isotropic_npt`
-with parameters:
+    system = espressomd.System(box_l=[1, 1, 1])
+    system.thermostat.set_npt(kT=1.0, gamma0=1.0, gammav=1.0, seed=42)
+    system.integrator.set_isotropic_npt(ext_pressure=1.0, piston=1.0)
+
+The parameters of the integrator are
 
 * ``ext_pressure``: The external pressure
 * ``piston``: The mass of the applied piston
 * ``direction``: Flags to enable/disable box dimensions to be subject to fluctuations. By default, all directions are enabled.
 
-Additionally, a NpT thermostat has to be set by :meth:`~espressomd.thermostat.Thermostat.set_npt()`
+Additionally, an NpT thermostat has to be set by :meth:`~espressomd.thermostat.Thermostat.set_npt()`
 with parameters:
 
 * ``kT``: Thermal energy of the heat bath
 * ``gamma0``: Friction coefficient of the bath
 * ``gammav``: Artificial friction coefficient for the volume fluctuations.
 
-A code snippet would look like::
-
-    import espressomd
-
-    system = espressomd.System(box_l=[1, 1, 1])
-    system.thermostat.set_npt(kT=1.0, gamma0=1.0, gammav=1.0, seed=42)
-    system.integrator.set_isotropic_npt(ext_pressure=1.0, piston=1.0)
-
-The physical meaning of these parameters is described below:
+The physical meaning of these parameters and the equations of motion are described below. 
+We recommend reading :ref:`Langevin thermostat` before continuing.
 
 The relaxation towards a desired pressure :math:`P` (parameter ``ext_pressure``)
 is enabled by treating the box
@@ -138,12 +136,12 @@ associated with the volume is postulated. This results in a "force" on the box s
 
 where
 
-.. math:: \mathcal{P} = \frac{1}{Vd} \sum_{i,j} f_{ij}x_{ij} + \frac{1}{Vd} \sum_i m_i v_i^2
+.. math:: \mathcal{P} = \frac{1}{Vd} \sum_{i,j} \vec{f}_{ij} \cdot \vec{x}_{ij} + \frac{1}{Vd} \sum_i m_i v_i^2 ,
 
-Here :math:`\mathcal{P}` is the instantaneous pressure, :math:`d` the dimension
-of the system (number of flags set by ``direction``), :math:`f_{ij}` the
+is the instantaneous pressure, with :math:`d` the dimension
+of the system (number of flags set by ``direction``), :math:`\vec{f}_{ij}` the
 short range interaction force between particles :math:`i` and :math:`j` and
-:math:`x_{ij}= x_j - x_i`.
+:math:`\vec{x}_{ij}= \vec{x}_j - \vec{x}_i`.
 
 In addition to this deterministic force, a friction :math:`-\frac{\gamma^V}{Q}\Pi(t)`
 and noise :math:`\sqrt{k_B T \gamma^V} \eta(t)` are added for the box
@@ -160,12 +158,12 @@ The discretisation consists of the following steps (see :cite:`kolb99a` for a fu
 
 1. Calculate the particle velocities at the half step
 
-   .. math:: v'(t+dt/2) = v(t) + \frac{F(x(t),v(t-dt/2),t)}{m} dt/2
+   .. math:: \vec{v}'(t+dt/2) = \vec{v}(t) + \frac{\vec{F}(\vec{x}(t),\vec{v}(t-dt/2),t)}{m} dt/2
 
 2. Calculate the instantaneous pressure and "volume momentum"
 
-   .. math:: \mathcal{P} = \mathcal{P}(x(t),V(t),f(x(t)), v'(t+dt/2))
-   .. math:: \Pi(t+dt/2) = \Pi(t) + (\mathcal{P}-P) dt/2 -\frac{\gamma^V}{Q}\Pi(t) dt/2  +  \sqrt{k_B T \gamma^V dt} \overline{\eta}
+   .. math:: \mathcal{P} = \mathcal{P}(\vec{x}(t),V(t),\vec{f}(\vec{x}(t)), \vec{v}'(t+dt/2))
+   .. math:: \Pi(t+dt/2) = \Pi(t) + (\mathcal{P}-P) dt/2 -\frac{\gamma^V}{Q}\Pi(t) dt/2  +  \sqrt{k_B T \gamma^V dt} {\eta_*}
 
 3. Calculate box volume and scaling parameter :math:`L` at half step and full step, scale the simulation box accordingly
 
@@ -176,27 +174,27 @@ The discretisation consists of the following steps (see :cite:`kolb99a` for a fu
 
 4. Update particle positions and scale velocities
 
-   .. math:: x(t+dt) = \frac{L(t+dt)}{L(t)} \left[ x(t) + \frac{L^2(t)}{L^2(t+dt/2)} v(t+dt/2) dt \right]
-   .. math:: v(t+dt/2) = \frac{L(t)}{L(t+dt)} v'(t+dt/2)
+   .. math:: \vec{x}(t+dt) = \frac{L(t+dt)}{L(t)} \left[ \vec{x}(t) + \frac{L^2(t)}{L^2(t+dt/2)} \vec{v}(t+dt/2) dt \right]
+   .. math:: \vec{v}(t+dt/2) = \frac{L(t)}{L(t+dt)} \vec{v}'(t+dt/2)
 
 5. Calculate forces, instantaneous pressure and "volume momentum"
 
-   .. math:: F = F(x(t+dt),v(t+dt/2),t)
-   .. math:: \mathcal{P} = \mathcal{P}(x(t+dt),V(t+dt),f(x(t+dt)), v(t+dt/2))
-   .. math:: \Pi(t+dt) = \Pi(t+dt/2) + (\mathcal{P}-P) dt/2 -\frac{\gamma^V}{Q}\Pi(t+dt/2) dt/2  +  \sqrt{k_B T \gamma^V dt} \overline{\eta}
+   .. math:: \vec{F} = \vec{F}(\vec{x}(t+dt),\vec{v}(t+dt/2),t)
+   .. math:: \mathcal{P} = \mathcal{P}(\vec{x}(t+dt),V(t+dt),\vec{f}(\vec{x}(t+dt)), \vec{v}(t+dt/2))
+   .. math:: \Pi(t+dt) = \Pi(t+dt/2) + (\mathcal{P}-P) dt/2 -\frac{\gamma^V}{Q}\Pi(t+dt/2) dt/2  +  \sqrt{k_B T \gamma^V dt} {\eta_*}
 
-   with uncorrelated numbers :math:`\overline{\eta}` drawn from a random uniform process :math:`\eta(t)`
+   with uncorrelated numbers :math:`{\eta_*}` drawn from a random uniform process.
 
 6. Update the velocities
 
-   .. math:: v(t+dt) = v(t+dt/2) + \frac{F(t+dt)}{m} dt/2
+   .. math:: \vec{v}(t+dt) = \vec{v}(t+dt/2) + \frac{\vec{F}(t+dt)}{m} dt/2
 
 Notes:
 
-* The NpT algorithm is only tested for all 3 directions enabled for scaling. Usage of ``direction`` is considered an experimental feature.
+* The NpT algorithm is only tested for ``direction = 3 * [True]``. Usage of other ``direction`` values is considered an experimental feature.
 * In step 4, only those coordinates are scaled for which ``direction`` is set.
 * For the instantaneous pressure, the same limitations of applicability hold as described in :ref:`Pressure`.
-* The particle forces :math:`F` include interactions as well as a friction (:math:`\gamma^0`) and noise term (:math:`\sqrt{k_B T \gamma^0 dt} \overline{\eta}`) analogous to the terms in the :ref:`Langevin thermostat`.
+* The particle forces :math:`\vec{F}` include interactions as well as a friction (:math:`\gamma^0`) and noise term (:math:`\sqrt{k_B T \gamma^0 dt} {\eta_*}`) analogous to the terms in the :ref:`Langevin thermostat`.
 * The particle forces are only calculated in step 5 and then reused in step 1 of the next iteration. See :ref:`Velocity Verlet Algorithm` for the implications of that.
 * The NpT algorithm doesn't support :ref:`Lees-Edwards boundary conditions`.
 * The NpT algorithm doesn't support propagation of angular velocities.
@@ -205,36 +203,40 @@ Notes:
 
 Steepest descent
 ^^^^^^^^^^^^^^^^
+To activate steepest descent, use :meth:`espressomd.integrate.IntegratorHandle.set_steepest_descent`.
+A code snippet could look like::
 
-:meth:`espressomd.integrate.IntegratorHandle.set_steepest_descent`
+    max_steps = 20 # maximal number of steps
+    system.integrator.set_steepest_descent(
+        f_max=0, gamma=0.1, max_displacement=0.1)
+    system.integrator.run(max_steps)   
+    system.integrator.set_vv()  # to switch back to velocity Verlet
+
+The 'equation of motion' in discretised form reads
+
+.. math:: \vec{x}(t + \Delta t) = \vec{x}(t) + \min\left(|\gamma\vec{F}(t)\Delta t|, r_{\text{max}}\right) \cdot \vec{F}(t)/|\vec{F}(t)|
+
+with :math:`r_{\text{max}}` the maximal displacement, :math:`\gamma`
+the friction coefficient, :math:`\vec{x}` the particle position,
+:math:`\vec{F}` the force on the particle, and :math:`\Delta t` the time step.
 
 This feature is used to propagate each particle by a small distance parallel to the force acting on it.
 When only conservative forces for which a potential exists are in use, this is equivalent to a steepest descent energy minimization.
 A common application is removing overlap between randomly placed particles.
-
 Please note that the behavior is undefined if a thermostat is activated,
-in which case the integrator will generate an error. The integrator runs
-the following steepest descent algorithm:
+in which case the integrator will generate an error. 
 
-.. math:: \vec{r}_{i+1} = \vec{r}_i + \min(\gamma \vec{F}_i, \vec{r}_{\text{max_displacement}}),
-
-while the maximal force/torque is bigger than ``f_max`` or for at most ``steps`` times. The energy
+Steepest descent is applied
+while the maximal force/torque is bigger than ``f_max``, or for at most ``max_steps`` times. The energy
 is relaxed by ``gamma``, while the change per coordinate per step is limited to ``max_displacement``.
 The combination of ``gamma`` and ``max_displacement`` can be used to get a poor man's adaptive update.
 Rotational degrees of freedom are treated similarly: each particle is
 rotated around an axis parallel to the torque acting on the particle,
-with ``max_displacement`` interpreted as the maximal rotation angle.
+with ``max_displacement`` interpreted as the maximal rotation angle in radians.
 Please be aware of the fact that this needs not to converge to a local
 minimum in periodic boundary conditions. Translational and rotational
 coordinates that are fixed using the ``fix`` and ``rotation`` attribute of particles are not altered.
 
-Usage example::
-
-    system.integrator.set_steepest_descent(
-        f_max=0, gamma=0.1, max_displacement=0.1)
-    system.integrator.run(20)   # maximal number of steps
-    system.integrator.set_vv()  # to switch back to velocity Verlet
-
 .. _Using a custom convergence criterion:
 
 Using a custom convergence criterion
@@ -244,23 +246,23 @@ The ``f_max`` parameter can be set to zero to prevent the integrator from
 halting when a specific force/torque is reached. The integration can then
 be carried out in a loop with a custom convergence criterion::
 
-    min_sigma = 1  # size of the smallest particle
-    max_sigma = 5  # size of the largest particle
-    min_dist = 0.0
+    min_dist_target = 1 # minimum distance that all particles should have
+    
     system.integrator.set_steepest_descent(f_max=0, gamma=10,
-                                           max_displacement=min_sigma * 0.01)
-    # gradient descent until particles are separated by at least max_sigma
-    while min_dist < max_sigma:
+                                           max_displacement= 0.01)
+    # gradient descent until particles are separated by at least min_dist_target
+    min_dist = 0.0
+    while min_dist < min_dist_target:
         min_dist = system.analysis.min_dist()
         system.integrator.run(10)
     system.integrator.set_vv()
 
 When writing a custom convergence criterion based on forces or torques, keep
 in mind that particles whose motion and rotation are fixed in space along
-some or all axes with ``fix`` or ``rotation`` need to be filtered from the
-force/torque observable used in the custom convergence criterion. Since these
-two properties can be cast to boolean values, they can be used as masks to
-remove forces/torques that are ignored by the integrator::
+some or all axes with ``fix`` or ``rotation`` still experience forces and torques.
+Therefore, they need to be filtered from the
+force/torque observable used in the custom convergence criterion. A code snippet
+that achieves this filtering could look like::
 
     particles = system.part.all()
     max_force = np.max(np.linalg.norm(particles.f * np.logical_not(particles.fix), axis=1))
@@ -313,8 +315,44 @@ mesh surface deformation.
 Brownian Dynamics
 ^^^^^^^^^^^^^^^^^
 
-Brownian Dynamics integrator :cite:`schlick10a`.
-See details in :ref:`Brownian thermostat`.
+To activate Brownian dynamics, use :meth:`espressomd.integrate.IntegratorHandle.set_brownian_dynamics`.
+A code snippet would look like::
+
+    import espressomd
+    system = espressomd.System(box_l=[1, 1, 1])
+    system.thermostat.set_brownian(kT=1.0, gamma=1.0, seed=41)
+    system.integrator.set_brownian_dynamics()
+
+In addition to the integrator, the corresponding thermostat has to be set.
+The thermostat holds the parameters used in the Brownian equation of motion.
+
+The particle trajectories are governed by
+
+.. math:: \dot{\vec{x}}_i(t) = \gamma^{-1} \vec{F}_i(\{\vec{x}_j\}, \{\vec{v}_j\}, t) + \sqrt{2 k_B T \gamma^{-1}} \vec{\eta}_i(t),
+
+where :math:`\vec{F}_i` are all deterministic forces from interactions and :math:`\vec{\eta}_i` 
+are random forces with zero mean and unit variance.
+This equation of motion follows from Langevin's equation of motion (see :ref:`Langevin thermostat`)
+by setting the mass of the particle to zero.
+
+|es|'s discretisation is based on :cite:`schlick10a`, :cite:`ermak78a`
+and reads
+
+.. math:: \vec{x}(t+dt) = \vec{x}(t) + \gamma^{-1} \vec{F}(\vec{x}(t), \vec{v}(t), t) dt + \sqrt{2 k_B T \gamma^{-1} dt} \vec{\eta}_*(t)
+
+where :math:`\vec{\eta}_*` are pseudo-random numbers with zero mean and unit variance (particle indices are omitted for clarity).
+Velocities are obtained directly from 
+
+.. math:: \vec{v}(t) = \gamma^{-1} \vec{F} + \sqrt{2 k_B T \gamma^{-1} dt^{-1}} \vec{\eta}_{*}(t)
+
+Be aware that the velocity contains random terms and is therefore not continuous in time.
+
+Rotational motion is implemented analogously.
+Note: the rotational Brownian dynamics implementation is only compatible with particles that have
+an isotropic moment of inertia tensor.
+Otherwise, the viscous terminal angular velocity
+is not defined, i.e., it has no constant direction.
+
 
 .. _Stokesian Dynamics:
 
@@ -368,10 +406,7 @@ Note that this setup represents a system at zero temperature. In order to
 thermalize the system, the SD thermostat needs to be activated (see
 :ref:`Stokesian thermostat`).
 
-.. _Important_SD:
-
-Important
-"""""""""
+**Note:**
 
 The particles must be prevented from overlapping. It is mathematically allowed
 for the particles to overlap to a certain degree. However, once the distance
@@ -394,7 +429,7 @@ sphere diameters.
 Thermostats
 -----------
 
-To add a thermostat, call the appropriate setter::
+To add a thermostat, call the appropriate setter, e.g., ::
 
     system.thermostat.set_langevin(kT=1.0, gamma=1.0, seed=41)
 
@@ -403,18 +438,17 @@ subsections.
 
 You may combine different thermostats by turning them on sequentially.
 Not all combinations of thermostats are sensible, though, and some
-integrators only work with a specific thermostat. The list of possible
-combinations of integrators and thermostats is hardcoded and automatically
-check against at the start of integration.
+thermostats only work with specific integrators. 
+The list of possible combinations of integrators and thermostats is hardcoded and automatically
+checked against at the start of integration.
 Note that there is only one temperature for all thermostats.
 The list of active thermostats can be cleared at any time with
 :py:meth:`system.thermostat.turn_off() <espressomd.thermostat.Thermostat.turn_off>`.
 
 Since |es| does not enforce a particular unit system, it cannot know about
-the current value of the Boltzmann constant. Therefore, when specifying
-the temperature of a thermostat, you actually do not define the
-temperature, but the value of the thermal energy :math:`k_B T` in the
-current unit system (see the discussion on units, Section :ref:`On units`).
+the current value of the Boltzmann constant. Therefore, instead of specifying
+the temperature, you have to provide a value for the thermal energy :math:`k_B T` in the
+current unit system (see the discussion on units in :ref:`On units`).
 
 All thermostats have a ``seed`` argument that controls the state of the random
 number generator (Philox Counter-based RNG). This seed is required on first
@@ -437,46 +471,51 @@ Best explained in an example::
     system = espressomd.System(box_l=[1, 1, 1])
     system.thermostat.set_langevin(kT=1.0, gamma=1.0, seed=41)
 
-As explained before the temperature is set as thermal energy :math:`k_\mathrm{B} T`.
+The temperature is set as thermal energy :math:`k_\mathrm{B} T`.
 
 The Langevin thermostat is based on an extension of Newton's equation of motion to
 account for drag and collisions with a fluid:
 
-.. math::  m_i \dot{v}_i(t) = f_i(\{x_j\},v_i,t) - \gamma v_i(t) + \sqrt{2\gamma k_B T} \eta_i(t).
+.. math::  m_i \dot{\vec{v}}_i(t) = \vec{f}_i(\{\vec{x}_j\}, \, \vec{v}_i,t) - \gamma \vec{v}_i(t) + \sqrt{2\gamma k_B T} \vec{\eta}_i(t).
 
-Here, :math:`f_i` are all deterministic forces from interactions,
-:math:`\gamma` the bare friction coefficient and :math:`\eta` a random, "thermal" force.
+Here, :math:`\vec{f}_i` are all deterministic forces from interactions,
+:math:`\gamma` the friction coefficient and :math:`\vec{\eta}` a random, "thermal" force.
 The friction term accounts for dissipation in a surrounding fluid whereas
 the random force  mimics collisions of the particle with solvent molecules
 at temperature :math:`T` and satisfies
 
-.. math:: <\eta(t)> = 0 , <\eta^\alpha_i(t)\eta^\beta_j(t')> = \delta_{\alpha\beta} \delta_{ij}\delta(t-t')
+.. math:: <\vec{\eta}(t)> = \vec{0} , <\eta^\alpha_i(t)\eta^\beta_j(t')> = \delta_{\alpha\beta} \delta_{ij}\delta(t-t')
 
 (:math:`<\cdot>` denotes the ensemble average and :math:`\alpha,\beta` are spatial coordinates).
 
 In the |es| implementation of the Langevin thermostat,
 the additional terms only enter in the force calculation.
-This reduces the accuracy of the velocity Verlet integrator
-by one order in :math:`dt` because forces are now velocity-dependent.
+The general form of the equation of motion is still the same as
+for Newton's equations, therefore the velocity Verlet integrator is 
+used.
+The accuracy of the velocity Verlet integrator is reduced by
+one order in :math:`dt` because forces are now velocity-dependent.
+
+The random process :math:`\vec{\eta}(t)` is discretized by drawing uncorrelated random numbers
+:math:`\vec{\eta}_*` for each particle.
+The distribution of :math:`{\vec{\eta}_*}` is uniform and satisfies
 
-The random process :math:`\eta(t)` is discretized by drawing an uncorrelated random number
-:math:`\overline{\eta}` for each component of all the particle forces.
-The distribution of :math:`\overline{\eta}` is uniform and satisfies
+.. math:: <\vec{\eta}_*> = \vec{0} ,\, <\eta_*^\alpha \eta_*^\beta> =  \frac{\delta_{\alpha,\beta}}{dt},
 
-.. math:: <\overline{\eta}> = 0 , <\overline{\eta}\overline{\eta}> = 1/dt
+approximating the delta-correlation of the continuous equation.
 
 If the feature ``ROTATION`` is compiled in, the rotational degrees of freedom are
 also coupled to the thermostat. If only the first two arguments are
 specified then the friction coefficient for the rotation is set to the
 same value as that for the translation.
 A separate rotational friction coefficient can be set by inputting
-``gamma_rotate``. The two options allow one to switch the translational and rotational
+``gamma_rotation``. The two options allow one to switch the translational and rotational
 thermalization on or off separately, maintaining the frictional behavior. This
 can be useful, for instance, in high Péclet number active matter systems, where
 one wants to thermalize only the rotational degrees of freedom while
 translational degrees of freedom are affected by the self-propulsion.
 
-The keywords ``gamma`` and ``gamma_rotate`` can be specified as a scalar,
+The keywords ``gamma`` and ``gamma_rotation`` can be specified as a scalar,
 or, with feature ``PARTICLE_ANISOTROPY`` compiled in, as the three eigenvalues
 of the respective friction coefficient tensor. This is enables the simulation of
 the anisotropic diffusion of anisotropic colloids (rods, etc.).
@@ -491,74 +530,11 @@ friction coefficient for every particle individually via the feature
 Brownian thermostat
 ^^^^^^^^^^^^^^^^^^^
 
-Brownian thermostat is a formal name of a thermostat enabling the
-Brownian Dynamics feature (see :cite:`schlick10a`) which implies
-a propagation scheme involving systematic and thermal parts of the
-classical Ermak-McCammom's (see :cite:`ermak78a`)
-Brownian Dynamics. Currently it is implemented without
-hydrodynamic interactions, i.e.
-with a diagonal diffusion tensor.
-The hydrodynamic interactions feature will be available later
-as a part of the present Brownian Dynamics or
-implemented separately within the Stokesian Dynamics.
-
 In order to activate the Brownian thermostat, the member function
 :py:attr:`~espressomd.thermostat.Thermostat.set_brownian` of the thermostat
 class :class:`espressomd.thermostat.Thermostat` has to be invoked.
-The system integrator should be also changed.
-Best explained in an example::
-
-    import espressomd
-    system = espressomd.System(box_l=[1, 1, 1])
-    system.thermostat.set_brownian(kT=1.0, gamma=1.0, seed=41)
-    system.integrator.set_brownian_dynamics()
-
-where ``gamma`` (hereinafter :math:`\gamma`) is a viscous friction coefficient.
-In terms of the Python interface and setup, the Brownian thermostat is very
-similar to the :ref:`Langevin thermostat`. The feature
-``THERMOSTAT_PER_PARTICLE`` is used to control the per-particle
-temperature and the friction coefficient setup. The major differences are
-its internal integrator implementation and other temporal constraints.
-The integrator is still a symplectic velocity Verlet-like one.
-It is implemented via a viscous drag part and a random walk of both the position and
-velocity. Due to a nature of the Brownian Dynamics method, its time step :math:`\Delta t`
-should be large enough compared to the relaxation time
-:math:`m/\gamma` where :math:`m` is the particle mass.
-This requirement is just a conceptual one
-without specific implementation technical restrictions.
-Note that with all similarities of
-Langevin and Brownian Dynamics, the Langevin thermostat temporal constraint
-is opposite. A velocity is restarting from zero at every step.
-Formally, the previous step velocity at the beginning of the the :math:`\Delta t` interval
-is dissipated further
-and does not contribute to the end one as well as to the positional random walk.
-Another temporal constraint
-which is valid for both Langevin and Brownian Dynamics: conservative forces
-should not change significantly over the :math:`\Delta t` interval.
-
-The viscous terminal velocity :math:`\Delta v` and corresponding positional
-step :math:`\Delta r` are fully driven by conservative forces :math:`F`:
-
-.. math:: \Delta r = \frac{F \cdot \Delta t}{\gamma}
-
-.. math:: \Delta v = \frac{F}{\gamma}
-
-A positional random walk variance of each coordinate :math:`\sigma_p^2`
-corresponds to a diffusion within the Wiener process:
-
-.. math:: \sigma_p^2 = 2 \frac{kT}{\gamma} \cdot \Delta t
-
-Each velocity component random walk variance :math:`\sigma_v^2` is defined by the heat
-component:
-
-.. math:: \sigma_v^2 = \frac{kT}{m}
-
-Note: the velocity random walk is propagated from zero at each step.
-
-A rotational motion is implemented similarly.
-Note: the rotational Brownian dynamics implementation is compatible with particles which have
-the isotropic moment of inertia tensor only. Otherwise, the viscous terminal angular velocity
-is not defined, i.e. it has no constant direction over the time.
+The system integrator must also be changed.
+For details, see :ref:`Brownian Dynamics`.
 
 .. _Isotropic NpT thermostat:
 
@@ -569,16 +545,7 @@ This feature allows to simulate an (on average) homogeneous and isotropic system
 In order to use this feature, ``NPT`` has to be defined in the :file:`myconfig.hpp`.
 Activate the NpT thermostat with the command :py:meth:`~espressomd.thermostat.Thermostat.set_npt`
 and setup the integrator for the NpT ensemble with :py:meth:`~espressomd.integrate.IntegratorHandle.set_isotropic_npt`.
-
-For example::
-
-    import espressomd
-
-    system = espressomd.System(box_l=[1, 1, 1])
-    system.thermostat.set_npt(kT=1.0, gamma0=1.0, gammav=1.0, seed=41)
-    system.integrator.set_isotropic_npt(ext_pressure=1.0, piston=1.0)
-
-For an explanation of the algorithm involved, see :ref:`Isotropic NpT integrator`.
+For details, see :ref:`Isotropic NpT integrator`.
 
 Be aware that this feature is neither properly examined for all systems
 nor is it maintained regularly. If you use it and notice strange
@@ -595,7 +562,7 @@ are not applied to every particle individually but instead
 encoded in a dissipative interaction between particles :cite:`soddemann03a`.
 
 To realize a complete DPD fluid model in |es|, three parts are needed:
-the DPD thermostat, which controls the temperate, a dissipative interaction
+the DPD thermostat, which controls the temperature, a dissipative interaction
 between the particles that make up the fluid, see :ref:`DPD interaction`,
 and a repulsive conservative force, see :ref:`Hat interaction`.
 
@@ -608,9 +575,7 @@ The friction coefficients and cutoff are controlled via the
 
 The friction (dissipative) and noise (random) term are coupled via the
 fluctuation-dissipation theorem. The friction term is a function of the
-relative velocity of particle pairs. The DPD thermostat is better for
-dynamics than the Langevin thermostat, since it mimics hydrodynamics in
-the system.
+relative velocity of particle pairs. In addition to the physics covered by the Langevin thermostat, the DPD thermostat mimics hydrodynamics in the system.
 
 As a conservative force any interaction potential can be used,
 see :ref:`Isotropic non-bonded interactions`. A common choice is
@@ -638,17 +603,12 @@ Lattice-Boltzmann thermostat
 
 The :ref:`Lattice-Boltzmann` thermostat acts similar to the :ref:`Langevin thermostat` in that the governing equation for particles is
 
-.. math::  m_i \dot{v}_i(t) = f_i(\{x_j\},v_i,t) - \gamma (v_i(t)-u(x_i(t),t)) + \sqrt{2\gamma k_B T} \eta_i(t).
-
-where :math:`u(x,t)` is the fluid velocity at position :math:`x` and time :math:`t`.
-To preserve momentum, an equal and opposite friction force and random force act on the fluid.
+.. math::  m_i \dot{\vec{v}}_i(t) = \vec{f}_i(\{\vec{x}_j\},\vec{v}_i,t) - \gamma (\vec{v}_i(t)-\vec{u}(\vec{x}_i(t),t)) + \sqrt{2\gamma k_B T} \vec{\eta}_i(t).
 
-Numerically the fluid velocity is determined from the lattice-Boltzmann node velocities
-by interpolating as described in :ref:`Interpolating velocities`.
-The backcoupling of friction forces and noise to the fluid is also done by distributing those forces amongst the nearest LB nodes.
-Details for both the interpolation and the force distribution can be found in :cite:`ahlrichs99a` and :cite:`dunweg09a`.
+where :math:`\vec{u}(\vec{x},t)` is the fluid velocity at position :math:`\vec{x}` and time :math:`t`.
+Unlike the Langevin thermostat, the friction here is calculated with respect to a moving fluid.
 
-The LB fluid can be used to thermalize particles, while also including their hydrodynamic interactions.
+An LB fluid must be used to provide the fluid velocity, while also including hydrodynamic interactions between particles.
 The LB thermostat expects an instance of either :class:`espressomd.lb.LBFluidWalberla` or :class:`espressomd.lb.LBFluidWalberlaGPU`.
 Temperature is set via the ``kT`` argument of the LB fluid.
 
@@ -657,21 +617,33 @@ parameter ``gamma``. To enable the LB thermostat, use::
 
     import espressomd
     import espressomd.lb
-    system = espressomd.System(box_l=[1, 1, 1])
-    lbf = espressomd.lb.LBFluidWalberla(agrid=1, density=1, kinematic_viscosity=1, tau=0.01)
-    self.system.lb = lbf
+    system = espressomd.System(box_l=[8., 8., 8.])
+    system.time_step = 0.01
+    system.cell_system.skin = 0.4
+    lbf = espressomd.lb.LBFluidWalberla(agrid=1., tau=0.01, density=1.,
+                                        kinematic_viscosity=1.)
+    system.lb = lbf
     system.thermostat.set_lb(LB_fluid=lbf, seed=123, gamma=1.5)
+    system.part.add(pos=[0., 0., 0.], ext_force=[0., 0., 1.])
+    system.integrator.run(10)
 
-No other thermostatting mechanism is necessary
-then. Please switch off any other thermostat before starting the LB
-thermostatting mechanism.
+Numerically the fluid velocity is determined from the lattice-Boltzmann node velocities
+by interpolating as described in :ref:`Interpolating velocities`.
+To preserve momentum, friction and random forces are also applied to the fluid, with equal magnitude and opposite sign.
+This backcoupling of forces on the fluid is done by distributing the forces amongst the nearest LB nodes.
+Details for both the interpolation and the force distribution can be found in :cite:`ahlrichs99a` and :cite:`dunweg09a`.
 
 The LBM implementation provides a fully thermalized LB fluid, all
 nonconserved modes, including the pressure tensor, fluctuate correctly
 according to the given temperature and the relaxation parameters. All
-fluctuations can be switched off by setting the temperature to 0.
+fluctuations can be switched off by setting the temperature to zero.
+The deterministic part of the hydrodynamic interaction is then still active.
+
+If the LB thermostat is active, no other thermostatting mechanism is necessary. 
+Please switch off any other thermostat before starting the LB
+thermostatting mechanism.
 
-.. note:: Coupling between LB and MD only happens if the LB thermostat is set with a :math:`\gamma \ge 0.0`.
+.. note:: Coupling between LB and MD only happens if the LB thermostat is set with a :math:`\gamma > 0.0`.
 
 .. _Stokesian thermostat:
 
@@ -697,4 +669,4 @@ needs to be activated via::
     system.integrator.run(100)
 
 where ``kT`` denotes the desired temperature of the system, and ``seed`` the
-seed for the random number generator.
+seed for the random number generator. For details, see :ref:`Stokesian Dynamics`.
diff --git a/doc/sphinx/introduction.rst b/doc/sphinx/introduction.rst
index 7ec21ba3f93..ecb50e180f1 100644
--- a/doc/sphinx/introduction.rst
+++ b/doc/sphinx/introduction.rst
@@ -485,8 +485,6 @@ report so to the developers using the instructions in :ref:`Contributing`.
 +--------------------------------+------------------------+------------------+------------+
 | MMM1D                          | Single                 | Good             | No         |
 +--------------------------------+------------------------+------------------+------------+
-| MMM1D on GPU                   | Single                 | Single           | No         |
-+--------------------------------+------------------------+------------------+------------+
 | ELC                            | Good                   | Good             | Yes        |
 +--------------------------------+------------------------+------------------+------------+
 | ICC*                           | Group                  | Group            | Yes        |
@@ -644,7 +642,7 @@ You may also provide the patch level, when relevant. If you developed code
 for |es| and made it available in a publicly accessible repository, you
 should consider providing the corresponding URL, for example in a footnote:
 
-    The method was implemented for ESPResSo 4.2.1[24] and the source code is
+    The method was implemented for ESPResSo 4.2.2[24] and the source code is
     available online\ :superscript:`note 1`.
 
     | ____________
diff --git a/doc/sphinx/io.rst b/doc/sphinx/io.rst
index a8f0a8b7435..1e8a0fbb11a 100644
--- a/doc/sphinx/io.rst
+++ b/doc/sphinx/io.rst
@@ -186,8 +186,7 @@ Be aware of the following limitations:
   several electrostatic and magnetostatic solvers automatically introduce
   a deviation of the order of 1e-7, either due to floating-point rounding
   errors (:class:`~espressomd.electrostatics.P3MGPU`), or due to re-tuning
-  using the most recent system state (:class:`~espressomd.electrostatics.MMM1D`,
-  :class:`~espressomd.electrostatics.MMM1DGPU`).
+  using the most recent system state (:class:`~espressomd.electrostatics.MMM1D`).
   When in doubt, you can easily verify the absence of a "force jump" when
   loading from a checkpoint by replacing the electrostatics actor with your
   combination of features in files :file:`samples/save_checkpoint.py` and
diff --git a/doc/sphinx/lb.rst b/doc/sphinx/lb.rst
index 7ceecf4e1a1..9f1f0189699 100644
--- a/doc/sphinx/lb.rst
+++ b/doc/sphinx/lb.rst
@@ -387,6 +387,16 @@ of the LBM in analogy to the example for the CPU given in section
     system.lb = lbf
     system.integrator.run(100)
 
+The waLBerla library supports multi-GPU simulations.
+Without a suitable CUDA-aware MPI library, multi-GPU simulations are slower
+than single-GPU simulations, and would only be relevant for LB systems that
+are too large to fit in the memory of a single GPU device.
+Multi-GPU support in |es| is an experimental feature whose API may change at any time.
+It can be activated by invoking the following expression before the creation
+of the first LB GPU instance::
+
+    system.cuda_init_handle.call_method("set_device_id_per_rank")
+
 .. _Electrohydrodynamics:
 
 Electrohydrodynamics
diff --git a/doc/sphinx/magnetostatics.rst b/doc/sphinx/magnetostatics.rst
index c8b990c8798..e2b3e923988 100644
--- a/doc/sphinx/magnetostatics.rst
+++ b/doc/sphinx/magnetostatics.rst
@@ -168,31 +168,6 @@ via an observable.
 Both the CPU and GPU implementations support MPI-parallelization.
 
 
-.. _Barnes-Hut octree sum on GPU:
-
-Barnes-Hut octree sum on GPU
-----------------------------
-
-:class:`espressomd.magnetostatics.DipolarBarnesHutGpu`
-
-This interaction calculates energies and forces between dipoles by
-summing over the spatial octree cells (aka ``leaves``).
-Far enough cells are considered as a single dipole with a cumulative
-vector in the cell center of mass. Parameters which determine that the
-cell is far enough are :math:`I_{\mathrm{tol}}^2` and
-:math:`\varepsilon^2` which define a fraction of the cell and
-an additive distance respectively. For the detailed description of the
-Barnes-Hut method application to the dipole-dipole interactions, please
-refer to :cite:`polyakov13a`.
-
-To use the method, create an instance of :class:`~espressomd.magnetostatics.DipolarBarnesHutGpu`
-and attach it to the system::
-
-    import espressomd.magnetostatics
-    bh = espressomd.magnetostatics.DipolarBarnesHutGpu(prefactor=1., epssq=200.0, itolsq=8.0)
-    system.magnetostatics.solver = bh
-
-
 .. _ScaFaCoS magnetostatics:
 
 ScaFaCoS magnetostatics
diff --git a/doc/sphinx/particles.rst b/doc/sphinx/particles.rst
index 0106aa354c6..a6fafbe3f01 100644
--- a/doc/sphinx/particles.rst
+++ b/doc/sphinx/particles.rst
@@ -289,7 +289,6 @@ and :attr:`~espressomd.propagation.Propagation.ROT_VS_RELATIVE`.
    particles you create::
 
        import espressomd
-
        system = espressomd.System(box_l=[10., 10., 10.])
        p1 = system.part.add(pos=[1., 2., 3.])
 
@@ -301,7 +300,9 @@ and :attr:`~espressomd.propagation.Propagation.ROT_VS_RELATIVE`.
        p2.vs_auto_relate_to(p1)
 
    The :meth:`~espressomd.particle_data.ParticleHandle.is_virtual`
-   method on particle ``p2`` will now return ``True``.
+   method of particle ``p2`` will now return ``True``, and its
+   :attr:`~espressomd.particle_data.ParticleHandle.propagation`
+   attribute will return the correct combination of flags.
 
 #. Repeat the previous step with more virtual sites, if desired.
 
@@ -317,7 +318,8 @@ Please note:
    virtual site in the non-virtual particles body-fixed frame. This
    information is saved in the virtual site's
    :attr:`~espressomd.particle_data.ParticleHandle.vs_relative` attribute.
-   Take care, not to overwrite it after using ``vs_auto_relate``.
+   Take care not to overwrite it after using
+   :meth:`~espressomd.particle_data.ParticleHandle.vs_auto_relate_to`.
 
 -  Virtual sites can not be placed relative to other virtual sites, as
    the order in which the positions of virtual sites are updated is not
@@ -326,7 +328,7 @@ Please note:
 
 -  In case you know the correct quaternions, you can also setup a virtual
    site using its :attr:`~espressomd.particle_data.ParticleHandle.vs_relative`
-   and :attr:`~espressomd.particle_data.ParticleHandle.virtual` attributes.
+   and :attr:`~espressomd.particle_data.ParticleHandle.propagation` attributes.
 
 -  In a simulation on more than one CPU, the effective cell size needs
    to be larger than the largest distance between a non-virtual particle
@@ -346,6 +348,14 @@ Please note:
 -  The presence of rigid bodies constructed by means of virtual sites
    adds a contribution to the scalar pressure and pressure tensor.
 
+-  The :meth:`~espressomd.particle_data.ParticleHandle.vs_auto_relate_to`
+   method has additional keyword arguments for controlling whether the virtual site
+   should be coupled to a lattice-Boltzmann fluid (``couple_to_lb=True``) or
+   to the Langevin thermostat (``couple_to_langevin=True``), or both
+   (in that case LB is used for translation and Langevin for rotation);
+   this is achieved internally by adding extra propagation flags.
+
+
 .. _Inertialess lattice-Boltzmann tracers:
 
 Inertialess lattice-Boltzmann tracers
@@ -354,13 +364,92 @@ Inertialess lattice-Boltzmann tracers
 Using the propagation mode :attr:`~espressomd.propagation.Propagation.TRANS_LB_TRACER`,
 the virtual sites follow the motion of a LB fluid. This is achieved by integrating
 their position using the fluid velocity at the virtual sites' position.
-Forces acting on the virtual sites are directly transferred as force density
+Forces acting on the virtual sites are directly transferred as a force density
 onto the lattice-Boltzmann fluid, making the coupling free of inertia.
 Please note that the velocity attribute of the virtual particles
 does not carry valid information for this virtual sites scheme.
 The feature stems from the implementation of the
 :ref:`Immersed Boundary Method for soft elastic objects`, but can be used independently.
 
+In the following example, a particle is advected by a fluid flowing along the x-axis::
+
+    import espressomd
+    import espressomd.lb
+    import espressomd.propagation
+    Propagation = espressomd.propagation.Propagation
+    system = espressomd.System(box_l=[8., 8., 8.])
+    system.time_step = 0.01
+    system.cell_system.skin = 0.
+    lbf = espressomd.lb.LBFluidWalberla(agrid=1., tau=0.01, density=1.,
+                                        kinematic_viscosity=1.)
+    system.lb = lbf
+    system.thermostat.set_lb(LB_fluid=lbf, seed=123, gamma=1.5)
+    lbf[:, :, :].velocity = [0.1, 0., 0.]
+    p = system.part.add(pos=[0., 0., 0.], propagation=Propagation.TRANS_LB_TRACER)
+    system.integrator.run(10)
+    print(p.pos.round(3))
+
+
+.. _Per-particle propagation:
+
+Per-particle propagation
+------------------------
+
+Particle positions, quaternions, velocities and angular velocities are integrated
+according to the main integrator, which may be coupled to a thermostat and a barostat
+(see :ref:`Particle integration and propagation` for more details).
+The default integrator is the :ref:`Velocity Verlet algorithm`.
+
+Which equations of motion are being used can be controlled on a per-particle level.
+This is achieved by setting the particle
+:attr:`~espressomd.particle_data.ParticleHandle.propagation` attribute with a
+combination of propagation flags from :class:`~espressomd.propagation.Propagation`.
+
+Depending on which main integrator is selected, different "secondary" integrators
+become available. The velocity Verlet integrator is available as a secondary
+integrator, using flags :attr:`~espressomd.propagation.Propagation.TRANS_NEWTON`
+for translation following Newton's equations of motion and
+:attr:`~espressomd.propagation.Propagation.ROT_EULER` for rotation
+following Euler's equations of rotation; in this way, selected particles
+can be decoupled from a thermostat.
+:ref:`Virtual sites` also rely on secondary integrators, such as
+:attr:`~espressomd.propagation.Propagation.TRANS_VS_RELATIVE` and
+:attr:`~espressomd.propagation.Propagation.ROT_VS_RELATIVE` for
+:ref:`Rigid arrangements of particles` or
+:attr:`~espressomd.propagation.Propagation.TRANS_LB_TRACER` for
+:ref:`Inertialess lattice-Boltzmann tracers`.
+
+In the following example, particle 1 follows Langevin dynamics (NVT ensemble),
+while particle 2 follows Newtonian dynamics (NVE ensemble)::
+
+    import espressomd
+    import espressomd.propagation
+    Propagation = espressomd.propagation.Propagation
+    system = espressomd.System(box_l=[8., 8., 8.])
+    system.time_step = 0.01
+    system.cell_system.skin = 0.
+    system.thermostat.set_langevin(kT=0.001, gamma=2., seed=42)
+    p1 = system.part.add(pos=[0., 0., 0.], v=[1., 0., 0.],
+                         omega_lab=[1., 0., 0.], rotation=[True, True, True])
+    p2 = system.part.add(pos=[0., 0., 0.], v=[1., 0., 0.],
+                         omega_lab=[1., 0., 0.], rotation=[True, True, True])
+    p1.propagation = Propagation.TRANS_LANGEVIN | Propagation.ROT_LANGEVIN
+    p2.propagation = Propagation.TRANS_NEWTON | Propagation.ROT_EULER
+    system.integrator.run(1)
+
+Not all combinations of propagation flags are allowed!
+
+The friction coefficient of thermostats can be controlled on a per-particle level too.
+Values stored in particle attributes :attr:`~espressomd.particle_data.ParticleHandle.gamma`
+and :attr:`~espressomd.particle_data.ParticleHandle.gamma_rot` will override
+the friction coefficients of most thermostats.
+Requires feature ``THERMOSTAT_PER_PARTICLE``.
+This is used for example to model
+:ref:`particle polarizability with thermalized cold Drude oscillators`.
+These attributes can also be defined as 3D vectors to model particle anisotropy.
+Requires feature ``PARTICLE_ANISOTROPY``.
+
+
 .. _Interacting with groups of particles:
 
 Interacting with groups of particles
diff --git a/doc/sphinx/running.rst b/doc/sphinx/running.rst
index d99937f7ff7..164e102cc71 100644
--- a/doc/sphinx/running.rst
+++ b/doc/sphinx/running.rst
@@ -770,16 +770,16 @@ and long-range forces (FFT summation) contribute equally to the runtime:
 
 .. code-block:: none
 
-    $ CALI_CONFIG_PROFILE=runtime-report ./pypresso ../samples/p3m.py --cpu
-    Path                         Inclusive time Exclusive time    Time %
-    integrate                             14.18           0.01      0.08
-      Integration loop                    13.84           0.43      2.88
-        force_calc                        13.41           0.20      1.35
-          copy_forces_from_GPU             0.01           0.01      0.07
-          short_range_loop                 6.55           6.55     44.02
-          calc_long_range_forces           6.40           6.40     43.00
-          init_forces                      0.24           0.24      1.58
-          copy_particles_to_GPU            0.01           0.01      0.07
+    $ CALI_CONFIG=runtime-report ./pypresso ../samples/p3m.py --cpu
+    Path                          Min time/rank Max time/rank Avg time/rank   Time %
+    integrate                         0.13          0.13          0.13          0.52
+      Integration loop                1.49          1.49          1.49          6.03
+        calculate_forces              1.14          1.14          1.14          4.62
+          copy_particles_to_GPU       0.01          0.01          0.01          0.03
+          init_forces                 0.14          0.14          0.14          0.56
+          calc_long_range_forces      8.78          8.78          8.78         35.66
+          short_range_loop           10.77         10.77         10.77         43.76
+          copy_forces_from_GPU        0.02          0.02          0.02          0.08
 
 For the GPU implementation of the P3M algorithm, the long-range force
 calculation is cheaper, however the transfer of particle data to and from
@@ -787,16 +787,16 @@ the GPU incur additional costs that are not negligible:
 
 .. code-block:: none
 
-    $ CALI_CONFIG_PROFILE=runtime-report ./pypresso ../samples/p3m.py --gpu
-    Path                         Inclusive time Exclusive time    Time %
-    integrate                             14.30           0.03      0.14
-      Integration loop                    13.87           1.76      7.90
-        force_calc                        12.12           0.82      3.68
-          copy_forces_from_GPU             2.09           2.09      9.42
-          short_range_loop                 3.20           3.20     14.38
-          calc_long_range_forces           3.75           3.75     16.87
-          init_forces                      1.25           1.25      5.61
-          copy_particles_to_GPU            1.01           1.01      4.56
+    $ CALI_CONFIG=runtime-report ./pypresso ../samples/p3m.py --gpu
+    Path                          Min time/rank Max time/rank Avg time/rank   Time %
+    integrate                         0.42          0.42          0.42          1.03
+      Integration loop                0.50          0.50          0.50          1.22
+        calculate_forces              0.62          0.62          0.62          1.51
+          copy_particles_to_GPU       0.27          0.27          0.27          0.66
+          init_forces                 0.09          0.09          0.09          0.22
+          calc_long_range_forces      0.60          0.60          0.60          1.46
+          short_range_loop            0.85          0.85          0.85          2.06
+          copy_forces_from_GPU        1.06          1.06          1.06          2.58
 
 For a more fine-grained report on GPU kernels:
 
diff --git a/doc/tutorials/CMakeLists.txt b/doc/tutorials/CMakeLists.txt
index 18798d65b63..2f23dcfad81 100644
--- a/doc/tutorials/CMakeLists.txt
+++ b/doc/tutorials/CMakeLists.txt
@@ -49,13 +49,13 @@ function(NB_EXPORT)
   if(NOT "${NB_EXPORT_SUFFIX}" STREQUAL "")
     set(NB_EXPORT_TARGET "${NB_EXPORT_TARGET}_${NB_EXPORT_SUFFIX}")
   endif()
-  get_filename_component(NB_FILE_BASE ${NB_FILE} NAME_WE)
-  get_filename_component(NB_FILE_EXT ${NB_FILE} EXT)
-  set(HTML_FILE "${NB_FILE_BASE}.html")
-  set(PY_FILE "${NB_FILE_BASE}.py")
+  cmake_path(GET NB_FILE STEM NB_FILE_STEM)
+  cmake_path(GET NB_FILE EXTENSION NB_FILE_EXT)
+  set(HTML_FILE "${NB_FILE_STEM}.html")
+  set(PY_FILE "${NB_FILE_STEM}.py")
 
-  if(${NB_EXPORT_HTML_RUN})
-    set(NB_FILE_RUN "${NB_FILE_BASE}.run${NB_FILE_EXT}")
+  if(NB_EXPORT_HTML_RUN)
+    set(NB_FILE_RUN "${NB_FILE_STEM}.run${NB_FILE_EXT}")
     add_custom_command(
       OUTPUT ${NB_FILE_RUN}
       DEPENDS
diff --git a/doc/tutorials/charged_system/charged_system.ipynb b/doc/tutorials/charged_system/charged_system.ipynb
index 032d0ec7e73..df07a1e4e88 100644
--- a/doc/tutorials/charged_system/charged_system.ipynb
+++ b/doc/tutorials/charged_system/charged_system.ipynb
@@ -103,6 +103,7 @@
     "WCA_EPSILON = 1.0\n",
     "ION_DIAMETER = 1.0\n",
     "ROD_RADIUS = 1.0\n",
+    "MASS=1.0\n",
     "# particle types\n",
     "ROD_TYPE = 1\n",
     "COUNTERION_TYPE = 2"
@@ -308,45 +309,6 @@
     "For this, we use the steepest descent integrator with a relative convergence criterion for forces and energies."
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6ed2ce87",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def remove_overlap(system, sd_params):\n",
-    "    # Removes overlap by steepest descent until forces or energies converge\n",
-    "    # Set up steepest descent integration\n",
-    "    system.integrator.set_steepest_descent(f_max=0,\n",
-    "                                           gamma=sd_params['damping'],\n",
-    "                                           max_displacement=sd_params['max_displacement'])\n",
-    "\n",
-    "    # Initialize integrator to obtain initial forces\n",
-    "    system.integrator.run(0)\n",
-    "    maxforce = np.max(np.linalg.norm(system.part.all().f, axis=1))\n",
-    "    energy = system.analysis.energy()['total']\n",
-    "\n",
-    "    i = 0\n",
-    "    while i < sd_params['max_steps'] // sd_params['emstep']:\n",
-    "        prev_maxforce = maxforce\n",
-    "        prev_energy = energy\n",
-    "        system.integrator.run(sd_params['emstep'])\n",
-    "        maxforce = np.max(np.linalg.norm(system.part.all().f, axis=1))\n",
-    "        relforce = np.abs((maxforce - prev_maxforce) / prev_maxforce)\n",
-    "        energy = system.analysis.energy()['total']\n",
-    "        relener = np.abs((energy - prev_energy) / prev_energy)\n",
-    "        if i > 1 and (i + 1) % 4 == 0:\n",
-    "            print(f\"minimization step: {(i+1)*sd_params['emstep']:4.0f}\"\n",
-    "                  f\"    max. rel. force change:{relforce:+3.3e}\"\n",
-    "                  f\"    rel. energy change:{relener:+3.3e}\")\n",
-    "        if relforce < sd_params['f_tol'] or relener < sd_params['e_tol']:\n",
-    "            break\n",
-    "        i += 1\n",
-    "\n",
-    "    system.integrator.set_vv()"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -354,14 +316,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "STEEPEST_DESCENT_PARAMS = {'f_tol': 1e-2,\n",
-    "                           'e_tol': 1e-5,\n",
-    "                           'damping': 30,\n",
-    "                           'max_steps': 10000,\n",
-    "                           'max_displacement': 0.01,\n",
-    "                           'emstep': 10}\n",
-    "\n",
-    "remove_overlap(system, STEEPEST_DESCENT_PARAMS)"
+    "def remove_overlap(system):\n",
+    "    FMAX = 0.01 * ION_DIAMETER * MASS / system.time_step**2\n",
+    "    system.integrator.set_steepest_descent(\n",
+    "        f_max=FMAX,\n",
+    "        gamma=10,\n",
+    "        max_displacement=0.01)\n",
+    "    system.integrator.run(5000)\n",
+    "    assert np.all(np.abs(system.part.all().f) < FMAX), \"Overlap removal did not converge!\"\n",
+    "    system.integrator.set_vv()\n",
+    "    \n",
+    "remove_overlap(system)"
    ]
   },
   {
@@ -649,7 +614,7 @@
     "        run['params']['rod_charge_dens'], N_rod_beads, ROD_TYPE)\n",
     "    p3m = espressomd.electrostatics.P3M(**p3m_params)\n",
     "    system.electrostatics.solver = p3m\n",
-    "    remove_overlap(system, STEEPEST_DESCENT_PARAMS)\n",
+    "    remove_overlap(system)\n",
     "    system.thermostat.set_langevin(**LANGEVIN_PARAMS)\n",
     "    print('', end='', flush=True)\n",
     "    integrate_system(system, WARMUP_STEPS)\n",
@@ -885,7 +850,7 @@
     "\n",
     "p3m = espressomd.electrostatics.P3M(**p3m_params)\n",
     "system.electrostatics.solver = p3m\n",
-    "remove_overlap(system, STEEPEST_DESCENT_PARAMS)\n",
+    "remove_overlap(system)\n",
     "system.thermostat.set_langevin(**LANGEVIN_PARAMS)\n",
     "print('', end='', flush=True)\n",
     "integrate_system(system, WARMUP_STEPS)\n",
diff --git a/doc/tutorials/constant_pH/constant_pH.ipynb b/doc/tutorials/constant_pH/constant_pH.ipynb
index f5f0577aec3..e147a6c3bb7 100644
--- a/doc/tutorials/constant_pH/constant_pH.ipynb
+++ b/doc/tutorials/constant_pH/constant_pH.ipynb
@@ -145,11 +145,8 @@
     "plt.rcParams.update({'font.size': 18})\n",
     "\n",
     "import numpy as np\n",
-    "import pkg_resources\n",
     "import pint  # module for working with units and dimensions\n",
     "import time\n",
-    "assert pkg_resources.packaging.specifiers.SpecifierSet('>=0.10.1').contains(pint.__version__), \\\n",
-    "  f'pint version {pint.__version__} is too old: several numpy operations can cast away the unit'\n",
     "\n",
     "import espressomd\n",
     "espressomd.assert_features(['WCA', 'ELECTROSTATICS'])\n",
@@ -548,15 +545,27 @@
    "source": [
     "if USE_WCA:\n",
     "    # set the WCA interaction between all particle pairs\n",
+    "    wca_sigma = 1.0\n",
+    "    wca_epsilon = 1.0\n",
     "    for type_1 in TYPES.values():\n",
     "        for type_2 in TYPES.values():\n",
     "            if type_1 >= type_2:\n",
-    "                system.non_bonded_inter[type_1, type_2].wca.set_params(epsilon=1.0, sigma=1.0)\n",
+    "                system.non_bonded_inter[type_1, type_2].wca.set_params(epsilon=wca_epsilon, sigma=wca_sigma)\n",
     "\n",
     "    # relax the overlaps with steepest descent\n",
-    "    system.integrator.set_steepest_descent(f_max=0, gamma=0.1, max_displacement=0.1)\n",
-    "    system.integrator.run(20)\n",
-    "    system.integrator.set_vv()  # to switch back to velocity Verlet\n",
+    "    mass = 1.0\n",
+    "    FMAX = 0.01 * wca_sigma * mass / system.time_step**2\n",
+    "\n",
+    "    system.integrator.set_steepest_descent(\n",
+    "        f_max=FMAX,\n",
+    "        gamma=0.1,\n",
+    "        max_displacement=0.1)\n",
+    "\n",
+    "    system.integrator.run(5000)\n",
+    "    assert np.all(np.abs(system.part.all().f)<FMAX), \"Overlap removal did not converge!\"\n",
+    "\n",
+    "    # to switch back to velocity Verlet\n",
+    "    system.integrator.set_vv()\n",
     "\n",
     "# add thermostat and short integration to let the system relax further\n",
     "system.thermostat.set_langevin(kT=KT_REDUCED, gamma=1.0, seed=7)\n",
diff --git a/doc/tutorials/electrodes/CMakeLists.txt b/doc/tutorials/electrodes/CMakeLists.txt
index 8343dd1fbd4..4ba4352845b 100644
--- a/doc/tutorials/electrodes/CMakeLists.txt
+++ b/doc/tutorials/electrodes/CMakeLists.txt
@@ -22,5 +22,5 @@ configure_tutorial_target(TARGET tutorial_electrodes DEPENDS
 
 nb_export(TARGET tutorial_electrodes SUFFIX "1" FILE "electrodes_part1.ipynb"
           HTML_RUN)
-# TODO: fix time step issues (#4798) before adding HTML_RUN back
-nb_export(TARGET tutorial_electrodes SUFFIX "2" FILE "electrodes_part2.ipynb")
+nb_export(TARGET tutorial_electrodes SUFFIX "2" FILE "electrodes_part2.ipynb"
+          HTML_RUN)
diff --git a/doc/tutorials/electrodes/electrodes_part1.ipynb b/doc/tutorials/electrodes/electrodes_part1.ipynb
index 6af99ed2890..734fe8d7fb3 100644
--- a/doc/tutorials/electrodes/electrodes_part1.ipynb
+++ b/doc/tutorials/electrodes/electrodes_part1.ipynb
@@ -296,7 +296,7 @@
    "id": "20957c03",
    "metadata": {},
    "source": [
-    "### Task\n",
+    "**Task**\n",
     "\n",
     "* Set up [ELC](https://espressomd.github.io/doc/espressomd.html#espressomd.electrostatics.ELC) with ``p3m`` as its actor."
    ]
@@ -345,7 +345,7 @@
    "id": "23398839",
    "metadata": {},
    "source": [
-    "### TASK\n",
+    "**Task**\n",
     "\n",
     "* Using the (area) density of ICC particles defined in the cell above, calculate the x/y positions of the particles for a uniform, quadratic grid. \n",
     "* Add fixed particles on the electrodes. Make sure to use the correct ``type``. Give the top (bottom) plate a total charge of $+1$ ($-1$). \n",
@@ -390,12 +390,12 @@
    "id": "5326e038",
    "metadata": {},
    "source": [
-    "### Task\n",
+    "**Task**\n",
     "\n",
     "* Set ``elc`` as ``system.electrostatics.solver``\n",
     "* Create an [ICC object]((https://espressomd.github.io/doc/espressomd.html#espressomd.electrostatic_extensions.ICC) and set it as ``system.electrostatics.extension``\n",
     "\n",
-    "### Hints\n",
+    "**Hints**\n",
     "\n",
     "* ICC variables are defined in the second code cell from the top.\n",
     "* Make sure to not mark our test particles ``p1`` and ``p2`` (with ids 0 and 1) as ICC particles."
diff --git a/doc/tutorials/electrodes/electrodes_part2.ipynb b/doc/tutorials/electrodes/electrodes_part2.ipynb
index ffe79e19bce..4cae3ebcd54 100644
--- a/doc/tutorials/electrodes/electrodes_part2.ipynb
+++ b/doc/tutorials/electrodes/electrodes_part2.ipynb
@@ -5,7 +5,7 @@
    "id": "357a65e2",
    "metadata": {},
    "source": [
-    "# Basic simulation of electrodes in ESPResSo part II: Electrolyte capacitor and Poisson–Boltzmann theory"
+    "# Basic simulation of electrodes in ESPResSo part II: Electrolytic capacitor and Poisson–Boltzmann theory"
    ]
   },
   {
@@ -306,7 +306,7 @@
    "id": "867de4db",
    "metadata": {},
    "source": [
-    "### Task\n",
+    "**Task**\n",
     "\n",
     "* write a function \n",
     "`get_box_dimension(concentration, distance, n_ionpairs=N_IONPAIRS)`\n",
@@ -413,7 +413,8 @@
    "id": "48abb259",
    "metadata": {},
    "source": [
-    "### Task\n",
+    "**Task**\n",
+    "\n",
     "* add two wall constraints at $z=0$ and $z=L_z$ to stop particles from\n",
     "crossing the boundaries and model the electrodes.\n",
     "Refer to \n",
@@ -458,7 +459,7 @@
    "source": [
     "#### 1.2.2 Add particles for the ions\n",
     "\n",
-    "### Task\n",
+    "**Task**\n",
     "\n",
     "* place ion pairs at random positions between the electrodes.\n",
     "\n",
@@ -507,7 +508,7 @@
    "source": [
     "#### 1.2.3 Add interactions:\n",
     "\n",
-    "### Task\n",
+    "**Task**\n",
     "\n",
     "* For excluded volume interactions, add a WCA potential. \n",
     "\n",
@@ -556,7 +557,7 @@
     "This function will take care of tuning the P3M and ELC parameters.\n",
     "For our purposes, an accuracy of $10^{-3}$ is sufficient.\n",
     "\n",
-    "### Task\n",
+    "**Task**\n",
     "\n",
     "* Write a function `setup_electrostatic_solver(potential_diff)` that\n",
     "returns the ELC instance."
@@ -570,13 +571,16 @@
    "outputs": [],
    "source": [
     "# SOLUTION CELL\n",
+    "\n",
     "def setup_electrostatic_solver(potential_diff):\n",
     "    delta_mid_top = -1.  # (Fully metallic case both -1)                 \n",
     "    delta_mid_bot = -1.\n",
-    "    accuracy = 1e-3\n",
+    "    p3m_accuracy = 1e-3\n",
     "    elc_accuracy = 1e-3\n",
     "    p3m = espressomd.electrostatics.P3M(prefactor=BJERRUM_LENGTH,\n",
-    "                                        accuracy=accuracy,\n",
+    "                                        accuracy=p3m_accuracy,\n",
+    "                                        mesh=[12, 12, 48], # pinned for tuning reproducibility\n",
+    "                                        cao=4, # pinned for tuning reproducibility\n",
     "                                        tune=True,\n",
     "                                        verbose=False)\n",
     "    elc = espressomd.electrostatics.ELC(actor=p3m,\n",
@@ -589,32 +593,6 @@
     "    return elc"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "efbf4cf9",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "id": "03ab39a1",
-   "metadata": {},
-   "source": [
-    "Now add the solver to the system:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "25219528",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "system.electrostatics.solver = setup_electrostatic_solver(POTENTIAL_DIFF)"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "5fed3232",
@@ -625,54 +603,27 @@
     "### 2.1 Steepest descent\n",
     "\n",
     "Before we can start the simulation, we need to remove the overlap between particles.\n",
-    "For this, we use the steepest descent integrator.\n",
-    "Afterwards, we switch to a Velocity Verlet integrator and set up a Langevin thermostat.\n",
-    "Note, that we only analyze static properties, thus the damping and temperature chosen\n",
-    "here only determine the simulation time towards the equilibrium distribution."
+    "For this, we use the steepest descent integrator."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "51a25228",
+   "id": "7da4b3f6",
    "metadata": {},
    "outputs": [],
-   "source": [
-    "# suitable minimization parameters for this system\n",
-    "F_TOL = 1e-2\n",
-    "DAMPING = 30\n",
-    "MAX_STEPS = 10000\n",
-    "MAX_DISPLACEMENT = 0.01 * LJ_SIGMA\n",
-    "EM_STEP = 10"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1a3cacd2",
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
    "source": [
     "# Set up steepest descent integration\n",
-    "system.integrator.set_steepest_descent(f_max=0,  # use a relative convergence criterion only\n",
-    "                                       gamma=DAMPING,\n",
-    "                                       max_displacement=MAX_DISPLACEMENT)\n",
-    "\n",
-    "# Initialize integrator to obtain initial forces\n",
-    "system.integrator.run(0)\n",
-    "old_force = np.max(np.linalg.norm(system.part.all().f, axis=1))\n",
-    "\n",
-    "\n",
-    "while system.time / system.time_step < MAX_STEPS:\n",
-    "    system.integrator.run(EM_STEP)\n",
-    "    force = np.max(np.linalg.norm(system.part.all().f, axis=1))\n",
-    "    rel_force = np.abs((force - old_force) / old_force)\n",
-    "    print(f'rel. force change: {rel_force:.2e}')\n",
-    "    if rel_force < F_TOL:\n",
-    "        break\n",
-    "    old_force = force"
+    "MASS = 1.0\n",
+    "FMAX = 0.01 * LJ_SIGMA * MASS / system.time_step**2\n",
+    "\n",
+    "system.integrator.set_steepest_descent(\n",
+    "        f_max=FMAX,\n",
+    "        gamma=30.,\n",
+    "        max_displacement=0.01)\n",
+    "\n",
+    "system.integrator.run(5000)\n",
+    "assert np.all(np.abs(system.part.all().f)<FMAX), \"Overlap removal did not converge!\""
    ]
   },
   {
@@ -680,56 +631,46 @@
    "id": "abbfc272",
    "metadata": {},
    "source": [
-    "### 2.2 Equilibrate the ion distribution"
+    "### 2.2 Warmup\n",
+    "\n",
+    "We now switch to a Velocity Verlet integrator and set up a Langevin thermostat.\n",
+    "Note that we only analyze static properties, thus the damping and temperature chosen\n",
+    "here only determine the simulation time towards the equilibrium distribution."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "45c444f5",
+   "id": "38137a83",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Switch to velocity verlet integration using a Langevin thermostat\n",
+    "# Equilibration parameters\n",
+    "system.time_step = 0.003 # this time step limits particle movement to ~5% sigma per integration step\n",
     "system.integrator.set_vv()\n",
-    "system.thermostat.set_langevin(kT=1.0, gamma=0.1, seed=42)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e9c7fe2f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Equlibration parameters\n",
-    "STEPS_PER_SAMPLE = 200\n",
-    "N_SAMPLES_EQUIL = 50\n",
-    "N_PART = 2 * N_IONPAIRS\n",
-    "\n",
-    "times = np.zeros(N_SAMPLES_EQUIL)\n",
-    "e_total = np.zeros_like(times)\n",
-    "e_kin = np.zeros_like(times)\n",
+    "system.thermostat.set_langevin(kT=1., gamma=2., seed=42)\n",
+    "system.electrostatics.solver = setup_electrostatic_solver(POTENTIAL_DIFF)\n",
     "\n",
-    "for i in tqdm.trange(N_SAMPLES_EQUIL):\n",
-    "    times[i] = system.time\n",
+    "times = []\n",
+    "e_kin = []\n",
+    "for i in tqdm.trange(100):\n",
     "    energy = system.analysis.energy()\n",
-    "    e_total[i] = energy['total']\n",
-    "    e_kin[i] = energy['kinetic']\n",
-    "    system.integrator.run(STEPS_PER_SAMPLE)"
+    "    e_kin.append(energy['kinetic'])\n",
+    "    times.append(system.time)\n",
+    "    system.integrator.run(10)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "c45afc27",
+   "id": "970825d8-6087-4b38-a39e-f67e718cbd6e",
    "metadata": {},
    "outputs": [],
    "source": [
     "# Plot the convergence of the total energy\n",
     "plt.figure(figsize=(10, 6))\n",
-    "plt.plot(times, e_total, label='total')\n",
-    "plt.plot(times, e_kin, label='kinetic')\n",
+    "plt.plot(times, len(e_kin) * [N_IONPAIRS * 2 * 3. / 2.], label='heat bath')\n",
+    "plt.plot(times, e_kin, label='kinetic energy')\n",
     "plt.xlabel('Simulation time')\n",
     "plt.ylabel('Energy')\n",
     "plt.legend()\n",
@@ -741,7 +682,32 @@
    "id": "1f1f7892",
    "metadata": {},
    "source": [
-    "Convergence after $t\\sim50$ time units."
+    "Convergence after $t \\sim 5$ time units."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9a22f6b5-33f9-4723-8e1c-41717cf6bb03",
+   "metadata": {},
+   "source": [
+    "### 2.3 Equilibrate the ion distribution\n",
+    "\n",
+    "Now we let ions diffuse to the electrodes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e35f031d-4188-4d70-8852-7ff3ce571904",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "N_SAMPLES_EQUIL = 20\n",
+    "STEPS_PER_EQUIL = 200\n",
+    "\n",
+    "system.thermostat.set_langevin(kT=1.0, gamma=0.1, seed=42)\n",
+    "for tm in tqdm.trange(N_SAMPLES_EQUIL):\n",
+    "    system.integrator.run(STEPS_PER_EQUIL)"
    ]
   },
   {
@@ -760,7 +726,7 @@
     "The time average is obtained through a\n",
     "[espressomd.accumulators.MeanVarianceCalculator](espressomd.accumulators.MeanVarianceCalculator).\n",
     "\n",
-    "### Task\n",
+    "**Task**\n",
     "\n",
     "* Write a function `setup_densityprofile_accumulators(bin_width)` that returns the\n",
     "`bin_centers` and the accumulators for both ion species in the $z$-range $[0,d]$.\n",
@@ -831,20 +797,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "N_SAMPLES_PROD = 10\n",
+    "N_SAMPLES_PROD = 20\n",
+    "STEPS_PER_SAMPLE = 200\n",
     "\n",
     "# Add the accumulators\n",
-    "system.auto_update_accumulators.clear()\n",
     "system.auto_update_accumulators.add(density_accumulator_cation)\n",
     "system.auto_update_accumulators.add(density_accumulator_anion)\n",
-    "    \n",
-    "times = []\n",
-    "e_total = []\n",
+    "\n",
     "for tm in tqdm.trange(N_SAMPLES_PROD):\n",
     "    system.integrator.run(STEPS_PER_SAMPLE)\n",
-    "    times.append(system.time)\n",
-    "    energy = system.analysis.energy()\n",
-    "    e_total.append(energy['total'])\n",
     "\n",
     "cation_profile_mean = density_accumulator_cation.mean()[0, 0, :]\n",
     "anion_profile_mean = density_accumulator_anion.mean()[0, 0, :]"
@@ -1061,11 +1022,12 @@
     "MIN_PHI = 0.5\n",
     "MAX_PHI = 10\n",
     "N_PHI = 7\n",
-    "N_SAMPLES_EQUIL_CAP = 5\n",
+    "N_SAMPLES_EQUIL_CAP = 15\n",
     "N_SAMPLES_CAP = 5\n",
     "\n",
     "# sample from high to low potential to improve sampling\n",
     "for potential_diff in tqdm.tqdm(np.linspace(MIN_PHI, MAX_PHI, N_PHI)[::-1]):\n",
+    "    system.auto_update_accumulators.clear()\n",
     "    system.electrostatics.solver = setup_electrostatic_solver(potential_diff)\n",
     "    system.integrator.run(N_SAMPLES_EQUIL_CAP * STEPS_PER_SAMPLE)\n",
     "    sigmas = []\n",
diff --git a/doc/tutorials/error_analysis/error_analysis_part2.ipynb b/doc/tutorials/error_analysis/error_analysis_part2.ipynb
index d2060d7beca..fb220822104 100644
--- a/doc/tutorials/error_analysis/error_analysis_part2.ipynb
+++ b/doc/tutorials/error_analysis/error_analysis_part2.ipynb
@@ -210,7 +210,7 @@
     "fig = plt.figure(figsize=(10, 6))\n",
     "plt.plot(autocov)\n",
     "plt.xlabel(\"lag time $j$\")\n",
-    "plt.ylabel(\"$\\hat{K}^{XX}_j$\")\n",
+    "plt.ylabel(r\"$\\hat{K}^{XX}_j$\")\n",
     "plt.show()"
    ]
   },
@@ -251,7 +251,7 @@
     "plt.gca().axhline(0, color=\"gray\", linewidth=1)\n",
     "plt.plot(autocov)\n",
     "plt.xlabel(\"lag time $j$\")\n",
-    "plt.ylabel(\"$\\hat{K}^{XX}_j$\")\n",
+    "plt.ylabel(r\"$\\hat{K}^{XX}_j$\")\n",
     "plt.show()"
    ]
   },
@@ -304,7 +304,7 @@
     "plt.xlim((1, N_MAX))\n",
     "plt.xscale(\"log\")\n",
     "plt.xlabel(\"lag time $j$\")\n",
-    "plt.ylabel(\"$\\hat{K}^{XX}_j$\")\n",
+    "plt.ylabel(r\"$\\hat{K}^{XX}_j$\")\n",
     "plt.legend()\n",
     "plt.show()\n",
     "\n",
@@ -498,7 +498,7 @@
     "    plt.gca().axhline(0, color=\"gray\",linewidth=1)\n",
     "    plt.plot(acf)\n",
     "    plt.xlabel(\"lag time $j$\")\n",
-    "    plt.ylabel(\"$\\hat{K}^{XX}_j$\")\n",
+    "    plt.ylabel(r\"$\\hat{K}^{XX}_j$\")\n",
     "    plt.show()\n",
     "\n",
     "    # create integrated ACF plot\n",
diff --git a/doc/tutorials/ferrofluid/ferrofluid_part1.ipynb b/doc/tutorials/ferrofluid/ferrofluid_part1.ipynb
index 314c02e45a1..82b8012552a 100644
--- a/doc/tutorials/ferrofluid/ferrofluid_part1.ipynb
+++ b/doc/tutorials/ferrofluid/ferrofluid_part1.ipynb
@@ -290,6 +290,7 @@
     "\n",
     "# Particles\n",
     "N_PART = 1200\n",
+    "MASS=1.0\n",
     "\n",
     "# Area fraction of the mono-layer\n",
     "PHI = 0.1\n",
@@ -494,29 +495,6 @@
     "Now we choose the steepest descent integrator to remove possible overlaps of the particles."
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "cf428a28",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Set integrator to steepest descent method\n",
-    "system.integrator.set_steepest_descent(\n",
-    "    f_max=0, gamma=0.1, max_displacement=0.05)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8499b618",
-   "metadata": {},
-   "source": [
-    "## Exercise:\n",
-    "\n",
-    "Perform a steepest descent energy minimization.\n",
-    "Track the relative energy change $E_{\\text{rel}}$ per minimization loop (where the integrator is run for 10 steps) and terminate once $E_{\\text{rel}} \\le 0.05$, i.e. when there is less than a 5% difference in the relative energy change in between iterations."
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -524,30 +502,18 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# SOLUTION CELL\n",
-    "import sys\n",
+    "# Choice of Fmax: Force acting on a particle moving 0.01*Sigma in a single time step\n",
+    "FMAX = 0.01 * LJ_SIGMA * MASS / system.time_step**2\n",
     "\n",
-    "energy = system.analysis.energy()['total']\n",
-    "relative_energy_change = 1.0\n",
-    "while relative_energy_change > 0.05:\n",
-    "    system.integrator.run(10)\n",
-    "    energy_new = system.analysis.energy()['total']\n",
-    "    # Prevent division by zero errors:\n",
-    "    if energy < sys.float_info.epsilon:\n",
-    "        break\n",
-    "    relative_energy_change = (energy - energy_new) / energy\n",
-    "    print(f'Minimization, relative change in energy: {relative_energy_change:.4f}')\n",
-    "    energy = energy_new"
+    "system.integrator.set_steepest_descent(\n",
+    "        f_max=FMAX,\n",
+    "        gamma=0.1,\n",
+    "        max_displacement=0.05)\n",
+    "\n",
+    "system.integrator.run(5000)\n",
+    "assert np.all(np.abs(system.part.all().f) < FMAX), \"Overlap removal did not converge!\""
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "63fdafa0",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "markdown",
    "id": "7d476671",
diff --git a/doc/tutorials/ferrofluid/ferrofluid_part2.ipynb b/doc/tutorials/ferrofluid/ferrofluid_part2.ipynb
index 4d58268f734..453d5f9e979 100644
--- a/doc/tutorials/ferrofluid/ferrofluid_part2.ipynb
+++ b/doc/tutorials/ferrofluid/ferrofluid_part2.ipynb
@@ -169,11 +169,17 @@
     "particles = system.part.add(pos=pos, rotation=N_PART * [(True, True, True)], dip=dip, fix=N_PART * [(False, False, True)])\n",
     "\n",
     "# Remove overlap between particles by means of the steepest descent method\n",
+    "MASS = 1.0\n",
+    "FMAX = 0.01 * LJ_SIGMA * MASS / system.time_step**2\n",
+    "\n",
     "system.integrator.set_steepest_descent(\n",
-    "    f_max=0, gamma=0.1, max_displacement=0.05)\n",
+    "        f_max=FMAX,\n",
+    "        gamma=0.1,\n",
+    "        max_displacement=0.05)\n",
+    "\n",
+    "system.integrator.run(5000)\n",
+    "assert np.all(np.abs(system.part.all().f) < FMAX), \"Overlap removal did not converge!\"\n",
     "\n",
-    "while system.analysis.energy()[\"total\"] > 5 * KT * N_PART:\n",
-    "    system.integrator.run(20)\n",
     "\n",
     "# Switch to velocity Verlet integrator\n",
     "system.integrator.set_vv()\n",
@@ -217,7 +223,7 @@
    "outputs": [],
    "source": [
     "# magnetic field times dipole moment\n",
-    "H_dipm = ALPHA * KT\n",
+    "H_dipm = ALPHA * KT / MU_0\n",
     "H_field = [H_dipm, 0, 0]"
    ]
   },
diff --git a/doc/tutorials/ferrofluid/ferrofluid_part3.ipynb b/doc/tutorials/ferrofluid/ferrofluid_part3.ipynb
index 5f4843fa35b..b7f896f46a6 100644
--- a/doc/tutorials/ferrofluid/ferrofluid_part3.ipynb
+++ b/doc/tutorials/ferrofluid/ferrofluid_part3.ipynb
@@ -238,11 +238,14 @@
     "particles = system.part.add(pos=pos, rotation=N * [(True, True, True)], dip=dip)\n",
     "\n",
     "# Remove overlap between particles by means of the steepest descent method\n",
+    "mass = 1.0\n",
+    "f_max = 0.01 * lj_sigma * mass / system.time_step**2\n",
     "system.integrator.set_steepest_descent(\n",
-    "    f_max=0, gamma=0.1, max_displacement=0.05)\n",
-    "\n",
-    "while system.analysis.energy()[\"total\"] > 5 * kT * N:\n",
-    "    system.integrator.run(20)\n",
+    "        f_max=f_max,\n",
+    "        gamma=0.1,\n",
+    "        max_displacement=0.05)\n",
+    "system.integrator.run(5000)\n",
+    "assert np.all(np.abs(system.part.all().f) < f_max), \"Overlap removal did not converge!\"\n",
     "\n",
     "# Switch to velocity Verlet integrator\n",
     "system.integrator.set_vv()\n",
@@ -557,7 +560,7 @@
     "    if alpha == 0:\n",
     "        continue\n",
     "    # set magnetic field constraint\n",
-    "    H_dipm = (alpha * kT)\n",
+    "    H_dipm = alpha * kT / mu_0\n",
     "    H_field = [H_dipm, 0, 0]\n",
     "    H_constraint = espressomd.constraints.HomogeneousMagneticField(H=H_field)\n",
     "    system.constraints.add(H_constraint)\n",
diff --git a/doc/tutorials/langevin_dynamics/langevin_dynamics.ipynb b/doc/tutorials/langevin_dynamics/langevin_dynamics.ipynb
index 372ca029cee..804725d1799 100644
--- a/doc/tutorials/langevin_dynamics/langevin_dynamics.ipynb
+++ b/doc/tutorials/langevin_dynamics/langevin_dynamics.ipynb
@@ -489,7 +489,7 @@
     "# SOLUTION CELL\n",
     "plt.figure(figsize=(10, 6))\n",
     "plt.xlabel(r'$\\gamma$')\n",
-    "plt.ylabel('Diffusion coefficient [$\\sigma^2/t$]')\n",
+    "plt.ylabel(r'Diffusion coefficient [$\\sigma^2/t$]')\n",
     "x = np.linspace(0.9 * min(gammas), 1.1 * max(gammas), 50)\n",
     "y = KT / x\n",
     "plt.plot(x, y, '-', label=r'$k_\\mathrm{B}T\\gamma^{-1}$')\n",
diff --git a/doc/tutorials/lennard_jones/lennard_jones.ipynb b/doc/tutorials/lennard_jones/lennard_jones.ipynb
index f6d846e9a44..80f8b9c2943 100644
--- a/doc/tutorials/lennard_jones/lennard_jones.ipynb
+++ b/doc/tutorials/lennard_jones/lennard_jones.ipynb
@@ -120,8 +120,8 @@
     "plt.plot(xs, ys_lj, label='LJ')\n",
     "plt.plot(xs, ys_WCA, label='WCA')\n",
     "plt.axhline(y=0, color='grey')\n",
-    "plt.xlabel(\"$r/\\sigma$\")\n",
-    "plt.ylabel(\"$V(r)/(k_{\\mathrm{B}}T)$\")\n",
+    "plt.xlabel(r\"$r/\\sigma$\")\n",
+    "plt.ylabel(r\"$V(r)/(k_{\\mathrm{B}}T)$\")\n",
     "plt.legend()\n",
     "plt.ylim(-1.5, 2.5)\n",
     "plt.show()"
@@ -539,18 +539,14 @@
    "id": "b560a981",
    "metadata": {},
    "source": [
-    "**Exercise:**\n",
-    "\n",
-    "* Use [<tt>espressomd.integrate.set_steepest_descent</tt>](https://espressomd.github.io/doc/integration.html#steepest-descent) to relax the initial configuration.\n",
-    "  Use a maximal displacement $\\vec{r}_{\\mathrm{max}}$ of <tt>MAX_DISPLACEMENT</tt>.\n",
-    "  The particle displacement is related to the particle force via a damping constant $\\gamma$, such that $\\vec{r}(t + dt) = \\vec{r}(t) + \\min(\\gamma \\vec{F}(t), \\vec{r}_{\\mathrm{max}})$. Use a damping constant  <tt>gamma = DAMPING</tt>.\n",
-    "* Use the relative change of the system maximal force as a convergence criterion.\n",
-    "  Check the documentation [<tt>espressomd.particle_data</tt> module](https://espressomd.github.io/doc/espressomd.html#module-espressomd.particle_data) to obtain the forces.\n",
-    "  The steepest descent has converged if the relative force change between two rounds of minimizations is less than the threshold value <tt>F_TOL</tt>. Note that by default [<tt>espressomd.integrate.set_steepest_descent</tt>](https://espressomd.github.io/doc/espressomd.html#espressomd.integrate.SteepestDescent) will halt when the system maximal force is less than some value <tt>f_max</tt>. When a custom convergence criterion is implemented, as it is the case here, the default convergence criterion needs to be disabled by setting <tt>f_max=0</tt>.\n",
-    "* Break the minimization loop after a maximal number of <tt>MAX_STEPS</tt> steps or if convergence is achieved.\n",
-    "  Check for convergence every <tt>EMSTEP</tt> steps.\n",
+    "We will use [<tt>espressomd.integrate.set_steepest_descent()</tt>](https://espressomd.github.io/doc/integration.html#steepest-descent) to relax the initial configuration.\n",
+    "The particle displacement is related to the particle force via a damping constant $\\gamma$, such that:\n",
+    "$$\\vec{x}_i(t + \\Delta t) = \\vec{x}_i(t) + \\min\\left(|\\gamma\\vec{F}_i(t)\\Delta t|, r_{\\text{max}}\\right) \\cdot \\vec{F}_i(t)/|\\vec{F}_i(t)|$$\n",
     "\n",
-    "***Hint:*** To obtain the initial forces one has to initialize the integrator using <tt>integ_steps=0</tt>, i.e. call <tt>system.integrator.run(0)</tt> before accessing the force array."
+    "with $r_{\\text{max}}$ the maximal displacement, $\\gamma$ the friction coefficient, $\\vec{x}$ the particle position,\n",
+    "$\\vec{F}$ the force on the particle, $\\Delta t$ the time step, and $i$ the vector index.\n",
+    "We will integrate until the largest particle force in the system falls below a specific threshold value `FMAX`,\n",
+    "chosen in such a way that integrating the system with that force would lead to a displacement less than or equal to 1% of the particle diameter."
    ]
   },
   {
@@ -560,35 +556,18 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# SOLUTION CELL\n",
-    "# Set up steepest descent integration\n",
-    "system.integrator.set_steepest_descent(f_max=0,  # use a relative convergence criterion only\n",
-    "                                       gamma=DAMPING,\n",
-    "                                       max_displacement=MAX_DISPLACEMENT)\n",
+    "MASS = 1.0\n",
+    "FMAX = 0.01 * LJ_SIG * MASS / system.time_step**2\n",
     "\n",
-    "# Initialize integrator to obtain initial forces\n",
-    "system.integrator.run(0)\n",
-    "old_force = np.max(np.linalg.norm(system.part.all().f, axis=1))\n",
+    "system.integrator.set_steepest_descent(\n",
+    "        f_max=FMAX,\n",
+    "        gamma=10,\n",
+    "        max_displacement=0.01)\n",
     "\n",
-    "\n",
-    "while system.time / system.time_step < MAX_STEPS:\n",
-    "    system.integrator.run(EM_STEP)\n",
-    "    force = np.max(np.linalg.norm(system.part.all().f, axis=1))\n",
-    "    rel_force = np.abs((force - old_force) / old_force)\n",
-    "    print(f'rel. force change: {rel_force:.2e}')\n",
-    "    if rel_force < F_TOL:\n",
-    "        break\n",
-    "    old_force = force"
+    "system.integrator.run(200)\n",
+    "assert np.all(np.abs(system.part.all().f) < FMAX), \"Overlap removal did not converge!\""
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "62f80d15",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/doc/tutorials/polymers/polymers.ipynb b/doc/tutorials/polymers/polymers.ipynb
index 18320e7aeea..30760f3803f 100644
--- a/doc/tutorials/polymers/polymers.ipynb
+++ b/doc/tutorials/polymers/polymers.ipynb
@@ -147,7 +147,7 @@
     "```\n",
     "\n",
     "creates a Lennard-Jones interaction with $\\varepsilon=1.$, $\\sigma=1.$,\n",
-    "$r_{\\text{cut}} = 1.1225$ and $\\varepsilon_{\\text{shift}}=0.25$ between particles\n",
+    "$r_{\\text{cut}} = 2^{1/6} \\simeq 1.1225$ and $\\varepsilon_{\\text{shift}}=0.25$ between particles\n",
     "of type 0, which is the desired repulsive interaction. The command\n",
     "\n",
     "```python\n",
@@ -349,6 +349,7 @@
     "STEPS = 100\n",
     "KT = 1.0\n",
     "GAMMA = 5.0\n",
+    "MASS = 1.0\n",
     "POLYMER_PARAMS = {'n_polymers': 1, 'bond_length': 1, 'seed': 42, 'min_distance': 0.9}\n",
     "POLYMER_MODEL = 'Rouse'\n",
     "assert POLYMER_MODEL in ('Rouse', 'Zimm')\n",
@@ -358,11 +359,15 @@
     "\n",
     "# System setup\n",
     "system = espressomd.System(box_l=3 * [BOX_L])\n",
+    "system.time_step = TIME_STEP\n",
     "system.cell_system.skin = 0.4\n",
     "\n",
     "# Lennard-Jones interaction\n",
+    "LJ_SIGMA=1.0\n",
+    "LJ_EPSILON=1.0\n",
+    "LJ_CUTOFF=2.0**(1.0 / 6.0)\n",
     "system.non_bonded_inter[0, 0].lennard_jones.set_params(\n",
-    "    epsilon=1.0, sigma=1.0, shift=\"auto\", cutoff=2.0**(1.0 / 6.0))\n",
+    "    epsilon=LJ_EPSILON, sigma=LJ_SIGMA, shift=\"auto\", cutoff=LJ_CUTOFF)\n",
     "\n",
     "# Fene interaction\n",
     "fene = espressomd.interactions.FeneBond(k=7, r_0=1, d_r_max=2)\n",
@@ -377,21 +382,23 @@
     "rh_results = []\n",
     "rf_results = []\n",
     "rg_results = []\n",
-    "for index, N in enumerate(N_MONOMERS):\n",
+    "for N in N_MONOMERS:\n",
     "    logging.info(f\"Polymer size: {N}\")\n",
     "    build_polymer(system, N, POLYMER_PARAMS, fene)\n",
     "\n",
-    "    logging.info(\"Warming up the polymer chain.\")\n",
-    "    system.time_step = 0.002\n",
+    "    logging.info(\"Removing overlaps ...\")\n",
+    "    FMAX = 0.001 * LJ_SIGMA * MASS / system.time_step**2\n",
     "    system.integrator.set_steepest_descent(\n",
-    "        f_max=1.0,\n",
+    "        f_max=FMAX,\n",
     "        gamma=10,\n",
     "        max_displacement=0.01)\n",
-    "    system.integrator.run(2000)\n",
+    "\n",
+    "    system.integrator.run(100)\n",
+    "    assert np.all(np.abs(system.part.all().f) < FMAX), \"Overlap removal did not converge!\"\n",
     "    system.integrator.set_vv()\n",
-    "    logging.info(\"Warmup finished.\")\n",
+    "    logging.info(\"Remove overlap finished.\")\n",
     "\n",
-    "    logging.info(\"Equilibration.\")\n",
+    "    logging.info(\"Equilibration ...\")\n",
     "    system.time_step = TIME_STEP\n",
     "    system.thermostat.set_langevin(kT=1.0, gamma=50, seed=42)\n",
     "    system.integrator.run(2000)\n",
@@ -404,7 +411,7 @@
     "    elif POLYMER_MODEL == 'Zimm':\n",
     "        solvent_lbm(system, KT, GAMMA)\n",
     "\n",
-    "    logging.info(\"Warming up the system with the fluid.\")\n",
+    "    logging.info(\"Warming up the system with the fluid ...\")\n",
     "    system.integrator.run(1000)\n",
     "    logging.info(\"Warming up the system with the fluid finished.\")\n",
     "\n",
@@ -553,8 +560,7 @@
     "        popt, _ = scipy.optimize.curve_fit(\n",
     "            lambda x, a, b: kirkwood_zimm(x, a, b, rh_exponent), n_monomers, diffusion)\n",
     "        y = kirkwood_zimm(x, popt[0], popt[1], rh_exponent)\n",
-    "        label = f'''\\\n",
-    "        $D^{{\\\\mathrm{{fit}}}} = \\\n",
+    "        label = f'''$D^{{\\\\mathrm{{fit}}}} = \\\n",
     "            \\\\frac{{{popt[0]:.2f}}}{{N}} + \\\n",
     "            \\\\frac{{{popt[1] * 6 * np.pi:.3f} }}{{6\\\\pi}} \\\\cdot \\\n",
     "            \\\\frac{{{1}}}{{N^{{{rh_exponent:.2f}}}}}$ \\\n",
@@ -651,7 +657,7 @@
     "             ls='', marker='o', capsize=5, capthick=1,\n",
     "             label=r'$R_g^{\\mathrm{simulation}}$')\n",
     "plt.xlabel('Number of monomers $N$')\n",
-    "plt.ylabel('Radius of gyration [$\\sigma$]')\n",
+    "plt.ylabel(r'Radius of gyration [$\\sigma$]')\n",
     "plt.legend()\n",
     "plt.show()"
    ]
@@ -699,9 +705,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "06b65488",
-   "metadata": {
-    "scrolled": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "rh_summary = standard_error_mean_autocorrelation(rh_results, r'\\operatorname{acf}(R_h)')\n",
@@ -717,7 +721,7 @@
     "             ls='', marker='o', capsize=5, capthick=1,\n",
     "             label=r'$R_h^{\\mathrm{simulation}}$')\n",
     "plt.xlabel('Number of monomers $N$')\n",
-    "plt.ylabel('Hydrodynamic radius [$\\sigma$]')\n",
+    "plt.ylabel(r'Hydrodynamic radius [$\\sigma$]')\n",
     "plt.legend()\n",
     "plt.show()"
    ]
diff --git a/doc/tutorials/widom_insertion/widom_insertion.ipynb b/doc/tutorials/widom_insertion/widom_insertion.ipynb
index 7e3dff1e60c..04e9140afd7 100644
--- a/doc/tutorials/widom_insertion/widom_insertion.ipynb
+++ b/doc/tutorials/widom_insertion/widom_insertion.ipynb
@@ -173,6 +173,7 @@
     "\n",
     "# number of salt ion pairs\n",
     "N_ION_PAIRS = 50\n",
+    "MASS=1.0\n",
     "\n",
     "# particle types and charges\n",
     "types = {\n",
@@ -307,10 +308,15 @@
    "source": [
     "# SOLUTION CELL\n",
     "def warmup():\n",
-    "    system.integrator.set_steepest_descent(f_max=0, gamma=1e-3, max_displacement=0.01)\n",
-    "\n",
-    "    print(\"Removing overlaps...\", flush=True)\n",
-    "    system.integrator.run(10000)\n",
+    "    FMAX = 0.01 * LJ_SIGMA * MASS / system.time_step**2\n",
+    "\n",
+    "    system.integrator.set_steepest_descent(\n",
+    "        f_max=FMAX,\n",
+    "        gamma=1e-3,\n",
+    "        max_displacement=0.01)\n",
+    "    print(\"Remove overlaps...\", flush=True)\n",
+    "    system.integrator.run(5000)\n",
+    "    assert np.all(np.abs(system.part.all().f) < FMAX), \"Overlap removal did not converge!\"\n",
     "\n",
     "    system.integrator.set_vv()\n",
     "    system.thermostat.set_langevin(kT=KT, gamma=GAMMA, seed=42)\n",
diff --git a/maintainer/CI/build_cmake.sh b/maintainer/CI/build_cmake.sh
index 7a95caaae96..4d4ce68d157 100755
--- a/maintainer/CI/build_cmake.sh
+++ b/maintainer/CI/build_cmake.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright (C) 2016-2022 The ESPResSo project
+# Copyright (C) 2016-2024 The ESPResSo project
 # Copyright (C) 2014 Olaf Lenz
 #
 # Copying and distribution of this file, with or without modification,
@@ -120,7 +120,6 @@ set_default_value make_check_benchmarks false
 set_default_value with_fast_math false
 set_default_value with_cuda false
 set_default_value with_cuda_compiler "nvcc"
-set_default_value with_cxx_standard 17
 set_default_value build_type "RelWithAssert"
 set_default_value with_ccache false
 set_default_value with_hdf5 true
@@ -130,7 +129,7 @@ set_default_value with_scafacos false
 set_default_value with_walberla false
 set_default_value with_walberla_avx false
 set_default_value with_stokesian_dynamics false
-set_default_value test_timeout 300
+set_default_value test_timeout 500
 set_default_value hide_gpu false
 set_default_value mpiexec_preflags ""
 
@@ -148,54 +147,22 @@ if [ "${with_fast_math}" = true ]; then
     cmake_param_protected="-DCMAKE_CXX_FLAGS=-ffast-math"
 fi
 
-cmake_params="-D CMAKE_BUILD_TYPE=${build_type} -D CMAKE_CXX_STANDARD=${with_cxx_standard} -D ESPRESSO_WARNINGS_ARE_ERRORS=ON ${cmake_params}"
+cmake_params="-D CMAKE_BUILD_TYPE=${build_type} -D ESPRESSO_WARNINGS_ARE_ERRORS=ON ${cmake_params}"
 cmake_params="${cmake_params} -D CMAKE_INSTALL_PREFIX=/tmp/espresso-unit-tests -D ESPRESSO_INSIDE_DOCKER=ON"
 cmake_params="${cmake_params} -D ESPRESSO_CTEST_ARGS:STRING=-j${check_procs} -D ESPRESSO_TEST_TIMEOUT=${test_timeout}"
 
-if [ "${make_check_benchmarks}" = true ]; then
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_BENCHMARKS=ON"
-fi
-
-if [ "${with_ccache}" = true ]; then
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_CCACHE=ON"
-fi
-
-if [ "${with_caliper}" = true ]; then
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_CALIPER=ON"
-fi
-
-if [ "${with_hdf5}" = true ]; then
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_HDF5=ON"
-else
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_HDF5=OFF"
-fi
-
-if [ "${with_fftw}" = true ]; then
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_FFTW=ON"
-else
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_FFTW=OFF"
-fi
-
-if [ "${with_gsl}" = true ]; then
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_GSL=ON"
-else
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_GSL=OFF"
-fi
-
-if [ "${with_scafacos}" = true ]; then
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_SCAFACOS=ON"
-else
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_SCAFACOS=OFF"
-fi
-
-if [ "${with_stokesian_dynamics}" = true ]; then
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_STOKESIAN_DYNAMICS=ON"
-else
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_STOKESIAN_DYNAMICS=OFF"
-fi
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_BENCHMARKS=${make_check_benchmarks}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_CCACHE=${with_ccache}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_CALIPER=${with_caliper}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_HDF5=${with_hdf5}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_FFTW=${with_fftw}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_GSL=${with_gsl}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_SCAFACOS=${with_scafacos}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_STOKESIAN_DYNAMICS=${with_stokesian_dynamics}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_WALBERLA=${with_walberla}"
 
 if [ "${with_walberla}" = true ]; then
-  cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_WALBERLA=ON -D ESPRESSO_BUILD_WITH_WALBERLA_FFT=ON"
+  cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_WALBERLA_FFT=ON"
   if [ "${with_walberla_avx}" = true ]; then
     cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_WALBERLA_AVX=ON"
   fi
@@ -204,39 +171,18 @@ if [ "${with_walberla}" = true ]; then
   mpiexec_preflags="${mpiexec_preflags:+$mpiexec_preflags;}--bind-to;none"
 fi
 
-if [ "${with_coverage}" = true ]; then
-    cmake_params="-D ESPRESSO_BUILD_WITH_COVERAGE=ON ${cmake_params}"
-fi
-
-if [ "${with_coverage_python}" = true ]; then
-    cmake_params="-D ESPRESSO_BUILD_WITH_COVERAGE_PYTHON=ON ${cmake_params}"
-fi
-
-if [ "${with_asan}" = true ]; then
-    cmake_params="-D ESPRESSO_BUILD_WITH_ASAN=ON ${cmake_params}"
-fi
-
-if [ "${with_ubsan}" = true ]; then
-    cmake_params="-D ESPRESSO_BUILD_WITH_UBSAN=ON ${cmake_params}"
-fi
-
-if [ "${with_static_analysis}" = true ]; then
-    cmake_params="-D ESPRESSO_BUILD_WITH_CLANG_TIDY=ON ${cmake_params}"
-fi
-
-if [ "${run_checks}" = true ]; then
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_TESTS=ON"
-else
-    cmake_params="${cmake_params} -D ESPRESSO_BUILD_TESTS=OFF"
-fi
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_COVERAGE=${with_coverage}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_COVERAGE_PYTHON=${with_coverage_python}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_ASAN=${with_asan}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_UBSAN=${with_ubsan}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_CLANG_TIDY=${with_static_analysis}"
+cmake_params="${cmake_params} -D ESPRESSO_BUILD_WITH_CUDA=${with_cuda}"
 
 if [ "${with_cuda}" = true ]; then
-    cmake_params="-D ESPRESSO_BUILD_WITH_CUDA=ON -D CUDAToolkit_ROOT=/usr/lib/cuda ${cmake_params}"
-    if [ "${CUDACXX}" = "" ]; then
-        cmake_params="-D CMAKE_CUDA_FLAGS='--compiler-bindir=/usr/bin/g++-10' ${cmake_params}"
+    cmake_params="${cmake_params} -D CUDAToolkit_ROOT=/usr/lib/cuda"
+    if [ "${CUDACXX}" = "" ] && [ "${CXX}" != "" ]; then
+        cmake_params="${cmake_params} -D CMAKE_CUDA_FLAGS='--compiler-bindir=$(which "${CXX}")'"
     fi
-else
-    cmake_params="-D ESPRESSO_BUILD_WITH_CUDA=OFF ${cmake_params}"
 fi
 
 command -v nvidia-smi && nvidia-smi || true
@@ -264,19 +210,28 @@ cd "${builddir}"
 
 # load MPI module if necessary
 if [ -f "/etc/os-release" ]; then
-    grep -q suse /etc/os-release && . /etc/profile.d/modules.sh && module load gnu-openmpi
-    grep -q 'rhel\|fedora' /etc/os-release && for f in /etc/profile.d/*module*.sh; do . "${f}"; done && module load mpi
-    grep -q "Ubuntu 22.04" /etc/os-release && export MPIEXEC_PREFLAGS="--mca;btl_vader_single_copy_mechanism;none${mpiexec_preflags:+;$mpiexec_preflags}"
+    grep -q "suse" /etc/os-release && . /etc/profile.d/modules.sh && module load gnu-openmpi
+    grep -q "rhel\|fedora" /etc/os-release && for f in /etc/profile.d/*module*.sh; do . "${f}"; done && module load mpi
+fi
+
+# setup environment
+if grep -q "Ubuntu" /etc/os-release; then
+    default_gcov="$(which "gcov")"
+    custom_gcov="$(which "${GCOV:-gcov}")"
+    if [ ! "${custom_gcov}" = "${default_gcov}" ] && [ -d "${HOME}/.local/var/lib/alternatives" ]; then
+        update-alternatives --altdir "${HOME}/.local/etc/alternatives" \
+                            --admindir "${HOME}/.local/var/lib/alternatives" \
+                            --install "${HOME}/.local/bin/gcov" "gcov" "${custom_gcov}" 10
+    fi
 fi
 
 # CONFIGURE
 start "CONFIGURE"
 
-MYCONFIG_DIR="${srcdir}/maintainer/configs"
 if [ "${myconfig}" = "default" ]; then
     echo "Using default myconfig."
 else
-    myconfig_file="${MYCONFIG_DIR}/${myconfig}.hpp"
+    myconfig_file="${srcdir}/maintainer/configs/${myconfig}.hpp"
     if [ ! -e "${myconfig_file}" ]; then
         echo "${myconfig_file} does not exist!"
         exit 1
@@ -306,9 +261,8 @@ end "BUILD"
 # library. See details in https://github.com/espressomd/espresso/issues/2249
 # Can't do this check on CUDA though because nvcc creates a host function
 # that just calls exit() for each device function, and can't do this with
-# coverage because gcov 9.0 adds code that calls exit(), and can't do this
-# with walberla because the library calls exit() in assertions.
+# walberla because the library calls exit() in assertions.
-if [[ "${with_coverage}" == false && ( "${with_cuda}" == false || "${with_cuda_compiler}" != "nvcc" ) && "${with_walberla}" != "true" ]]; then
+if [[ ( "${with_cuda}" == false || "${with_cuda_compiler}" != "nvcc" ) && "${with_walberla}" != "true" ]]; then
     if nm -o -C $(find . -name '*.so') | grep '[^a-z]exit@@GLIBC'; then
         echo "Found calls to exit() function in shared libraries."
         exit 1
@@ -400,12 +354,7 @@ if [ "${with_coverage}" = true ] || [ "${with_coverage_python}" = true ]; then
     if [ "${with_coverage}" = true ]; then
         echo "Running lcov and gcov..."
         codecov_opts="${codecov_opts} --gcov"
-        lcov --gcov-tool "${GCOV:-gcov}" -q --directory . --ignore-errors graph --capture --output-file coverage.info # capture coverage info
-        lcov --gcov-tool "${GCOV:-gcov}" -q --remove coverage.info '/usr/*' --output-file coverage.info # filter out system
-        lcov --gcov-tool "${GCOV:-gcov}" -q --remove coverage.info '*/doc/*' --output-file coverage.info # filter out docs
-        if [ -d _deps/ ]; then
-          lcov --gcov-tool "${GCOV:-gcov}" -q --remove coverage.info $(realpath _deps/)'/*' --output-file coverage.info # filter out external projects
-        fi
+        "${srcdir}/maintainer/CI/run_lcov.sh" coverage.info
     fi
     if [ "${with_coverage_python}" = true ]; then
         echo "Running python3-coverage..."
diff --git a/maintainer/CI/doc_warnings.sh b/maintainer/CI/doc_warnings.sh
index dc084d1313d..d9f4eac093a 100755
--- a/maintainer/CI/doc_warnings.sh
+++ b/maintainer/CI/doc_warnings.sh
@@ -31,7 +31,7 @@
 # not enclosed within <a href="..."></a> tags. Sphinx doesn't use line
 # wrapping, so these broken links can be found via text search. The first
 # negative lookahead filters out common Python types (for performance reasons).
-regex_sphinx_broken_link='<code class=\"xref py py-[a-z]+ docutils literal notranslate\"><span class=\"pre\">(?!(int|float|complex|bool|str|bytes|array|bytearray|memoryview|object|list|tuple|range|slice|dict|set|frozenset|(?:numpy\.|np\.)?(?:nd)?array)<)[^<>]+?</span></code>(?!</a>)'
+regex_sphinx_broken_link='<code class=\"xref py py-[a-z]+ docutils literal notranslate\"><span class=\"pre\">(?!(int|float|complex|bool|str|bytes|array|bytearray|memoryview|object|list|tuple|range|slice|dict|set|frozenset|(?:numpy\.|np\.)?(?:nd)?array|EnumType|IntEnum|StrEnum|ReprEnum|Enum|IntFlag|Flag)<)[^<>]+?</span></code>(?!</a>)'
 
 if [ ! -f doc/sphinx/html/index.html ]; then
     echo "Please run Sphinx first."
diff --git a/maintainer/CI/jupyter_warnings.py b/maintainer/CI/jupyter_warnings.py
index 520754b73ec..8c7b6ec161a 100755
--- a/maintainer/CI/jupyter_warnings.py
+++ b/maintainer/CI/jupyter_warnings.py
@@ -23,9 +23,11 @@
 """
 
 import sys
+import json
 import pathlib
 
 import lxml.etree
+import jupyter_core.paths
 import nbformat
 import nbconvert
 
@@ -58,7 +60,15 @@ def detect_invalid_urls(nb, build_root='.', html_exporter=None):
     '''
     # convert notebooks to HTML
     if html_exporter is None:
-        html_exporter = nbconvert.HTMLExporter()
+        kwargs = {}
+        for path in jupyter_core.paths.jupyter_config_path():
+            filepath = pathlib.Path(path) / "jupyter_nbconvert_config.json"
+            if filepath.is_file():
+                with open(filepath) as f:
+                    config = json.load(f)
+                kwargs = config.get("HTMLExporter", {})
+                break
+        html_exporter = nbconvert.HTMLExporter(**kwargs)
     html_exporter.template_name = 'classic'
     html_string = html_exporter.from_notebook_node(nb)[0]
     # parse HTML
diff --git a/maintainer/CI/run_lcov.sh b/maintainer/CI/run_lcov.sh
new file mode 100755
index 00000000000..22052acf45b
--- /dev/null
+++ b/maintainer/CI/run_lcov.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env sh
+#
+# Copyright (C) 2017-2024 The ESPResSo project
+#
+# This file is part of ESPResSo.
+#
+# ESPResSo is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ESPResSo is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+set -e
+
+output="${1:-coverage.info}"
+bindir="$(realpath .)"
+srcdir="$(sed -nr "s/^ESPResSo_SOURCE_DIR:STATIC=(.+)/\1/p" "${bindir}/CMakeCache.txt")"
+
+if [ "${srcdir}" = "" ]; then
+  echo "Cannot extract ESPResSo_SOURCE_DIR variable from the CMake cache" >&2
+  exit 2
+fi
+
+lcov --gcov-tool "${GCOV:-gcov}" \
+     --quiet \
+     --ignore-errors graph,mismatch,mismatch,gcov,unused \
+     --directory . \
+     --filter brace,blank,range,region \
+     --capture \
+     --rc lcov_json_module="JSON::XS" \
+     --exclude "/usr/*" \
+     --exclude "*/tmpxft_*cudafe1.stub.*" \
+     --exclude "${bindir}/_deps/*" \
+     --exclude "${bindir}/src/python/espressomd/*" \
+     --exclude "${srcdir}/src/walberla_bridge/src/*/generated_kernels/*" \
+     --exclude "${srcdir}/libs/*" \
+     --output-file "${output}"
diff --git a/maintainer/benchmarks/benchmarks.py b/maintainer/benchmarks/benchmarks.py
index 6182c9eafa1..3d39d2fd1dc 100644
--- a/maintainer/benchmarks/benchmarks.py
+++ b/maintainer/benchmarks/benchmarks.py
@@ -84,7 +84,7 @@ def get_timings(system, n_steps, n_iterations, verbose=True):
             energy = system.analysis.energy()["total"]
             verlet = system.cell_system.get_state()["verlet_reuse"]
             print(
-                f"step {i}, time: {1000*t:.1f} ms, verlet: {verlet:.2f}, energy: {energy:.2e}")
+                f"step {i}, time: {1000 * t:.1f} ms, verlet: {verlet:.2f}, energy: {energy:.2e}")
     return np.array(timings)
 
 
@@ -131,7 +131,7 @@ def write_report(filepath, n_proc, timings, n_steps, label=''):
     cmd = " ".join(x for x in sys.argv[1:] if not x.startswith("--output"))
     avg, ci = get_average_time(timings)
     header = '"script","arguments","cores","mean","ci","nsteps","duration","label"\n'
-    report = f'"{script}","{cmd}",{n_proc},{avg:.3e},{ci:.3e},{n_steps},{np.sum(timings):.1f},"{label}"\n'
+    report = f'"{script}","{cmd}",{n_proc},{avg:.3e},{ci:.3e},{n_steps},{np.sum(timings):.1f},"{label}"\n'  # nopep8
     if pathlib.Path(filepath).is_file():
         header = ''
     with open(filepath, "a") as f:
diff --git a/maintainer/benchmarks/mc_acid_base_reservoir.py b/maintainer/benchmarks/mc_acid_base_reservoir.py
index edecf9bb68f..1eb87804fc2 100644
--- a/maintainer/benchmarks/mc_acid_base_reservoir.py
+++ b/maintainer/benchmarks/mc_acid_base_reservoir.py
@@ -24,7 +24,6 @@
 import espressomd
 import espressomd.electrostatics
 import espressomd.reaction_methods
-import pkg_resources
 import argparse
 
 parser = argparse.ArgumentParser(description="Benchmark MC simulations in the grand-reaction ensemble. "
@@ -45,8 +44,6 @@
 # process and check arguments
 assert args.particles_per_core >= 100, "you need to use at least 100 particles per core to avoid finite-size effects in the simulation"
 espressomd.assert_features(['WCA', 'ELECTROSTATICS'])
-assert pkg_resources.packaging.specifiers.SpecifierSet('>=0.10.1').contains(pint.__version__), \
-    f'pint version {pint.__version__} is too old: several numpy operations can cast away the unit'
 
 
 def calc_ideal_alpha(pH, pKa):
@@ -96,7 +93,7 @@ def calc_donnan_coefficient(c_acid, I_res, charge=-1):
 NUM_SAMPLES = 100
 INTEGRATION_STEPS_PER_SAMPLE = 100
 assert TOTAL_NUM_MC_STEPS % NUM_SAMPLES == 0, \
-    f"Total number of MC steps must be divisible by total number of samples, got {TOTAL_NUM_MC_STEPS} and {NUM_SAMPLES}"
+    f"Total number of MC steps must be divisible by total number of samples, got {TOTAL_NUM_MC_STEPS} and {NUM_SAMPLES}"  # nopep8
 MC_STEPS_PER_SAMPLE = TOTAL_NUM_MC_STEPS // NUM_SAMPLES
 
 # definitions of reduced units
@@ -269,7 +266,8 @@ def report_progress(system, i, next_i):
     n_All = len(system.part)
     if i == next_i:
         print(
-            f"run {i:d} time {system.time:.3g} completed {i / NUM_SAMPLES * 100:.0f}%",
+            f"run {i:d} time {system.time:.3g} completed "
+            f"{i / NUM_SAMPLES * 100:.0f}%",
             f"instantaneous values: All {n_All:d}  Na {n_Na:d}  Cl {n_Cl:d}",
             f"A {n_A:d}  alpha {n_A / N_ACID:.3f}")
         if i == 0:
@@ -311,7 +309,8 @@ def report_progress(system, i, next_i):
         energy = system.analysis.energy()["total"]
         verlet = system.cell_system.get_state()["verlet_reuse"]
         print(
-            f"step {i}, time MD: {t_MD:.2e}, time MC: {t_MC:.2e}, verlet: {verlet:.2f}, energy: {energy:.2e}")
+            f"step {i}, time MD: {t_MD:.2e}, time MC: {t_MC:.2e}, "
+            f"verlet: {verlet:.2f}, energy: {energy:.2e}")
 
     # average time
     avg_MC, ci_MC = benchmarks.get_average_time(timings_MC)
diff --git a/maintainer/check_features.py b/maintainer/check_features.py
index 05a411e97ef..a102c0d1ee6 100755
--- a/maintainer/check_features.py
+++ b/maintainer/check_features.py
@@ -26,7 +26,7 @@
 import featuredefs
 
 if len(sys.argv) != 2:
-    print("Usage: %s FILE" % sys.argv[0])
+    print(f"Usage: {sys.argv[0]} FILE")
     exit(2)
 
 fdefs = featuredefs.defs(sys.argv[1])
diff --git a/maintainer/configs/maxset.hpp b/maintainer/configs/maxset.hpp
index ccce79f02cf..de5775c1bf7 100644
--- a/maintainer/configs/maxset.hpp
+++ b/maintainer/configs/maxset.hpp
@@ -28,9 +28,6 @@
 #define DPD
 
 #define ELECTROSTATICS
-#ifdef CUDA
-#define MMM1D_GPU
-#endif
 #define DIPOLES
 #ifdef SCAFACOS
 #define SCAFACOS_DIPOLES
diff --git a/maintainer/format/autopep8.sh b/maintainer/format/autopep8.sh
index c59ee8d694d..175910909f9 100755
--- a/maintainer/format/autopep8.sh
+++ b/maintainer/format/autopep8.sh
@@ -1,4 +1,5 @@
 #!/bin/sh
+#
 # Copyright (C) 2018-2022 The ESPResSo project
 #
 # This file is part of ESPResSo.
@@ -15,10 +16,11 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
 
 
-AUTOPEP8_VER=1.6.0
-PYCODESTYLE_VER=2.8.0
+AUTOPEP8_VER=2.1.0
+PYCODESTYLE_VER=2.11.1
 
 python3 -m autopep8 --help 2>&1 > /dev/null
 if [ "$?" = "0" ]; then
diff --git a/maintainer/format/clang-format.sh b/maintainer/format/clang-format.sh
index 0ee45b41a6c..347c9cbb879 100755
--- a/maintainer/format/clang-format.sh
+++ b/maintainer/format/clang-format.sh
@@ -1,4 +1,5 @@
 #!/bin/sh
+#
 # Copyright (C) 2018-2022 The ESPResSo project
 #
 # This file is part of ESPResSo.
@@ -15,8 +16,9 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
 
-CLANG_FORMAT_VER=14.0
+CLANG_FORMAT_VER=18.1
 if hash clang-format-${CLANG_FORMAT_VER} 2>/dev/null; then
     CLANGFORMAT="$(which clang-format-${CLANG_FORMAT_VER})"
 elif hash clang-format-${CLANG_FORMAT_VER%.*} 2>/dev/null; then
diff --git a/maintainer/format/cmake-format.sh b/maintainer/format/cmake-format.sh
index edc73cf6795..e00ea203157 100755
--- a/maintainer/format/cmake-format.sh
+++ b/maintainer/format/cmake-format.sh
@@ -1,4 +1,5 @@
 #!/bin/sh
+#
 # Copyright (C) 2018-2022 The ESPResSo project
 #
 # This file is part of ESPResSo.
@@ -15,6 +16,7 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
 
 CMAKE_FORMAT_VER=0.6.13
 python3 -m cmakelang.format 2>&1 > /dev/null
diff --git a/maintainer/format/ex_flag.sh b/maintainer/format/ex_flag.sh
index b14c9356637..6b58e2a4bdc 100755
--- a/maintainer/format/ex_flag.sh
+++ b/maintainer/format/ex_flag.sh
@@ -1,4 +1,5 @@
 #!/bin/sh
+#
 # Copyright (C) 2018-2022 The ESPResSo project
 #
 # This file is part of ESPResSo.
@@ -15,6 +16,6 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
+#
 
 chmod -x "$@"
diff --git a/maintainer/lint/pre_commit.sh b/maintainer/lint/pre_commit.sh
index 96eb4f9ffaf..3d7b2d4d445 100755
--- a/maintainer/lint/pre_commit.sh
+++ b/maintainer/lint/pre_commit.sh
@@ -1,4 +1,5 @@
 #!/bin/sh
+#
 # Copyright (C) 2018-2022 The ESPResSo project
 #
 # This file is part of ESPResSo.
@@ -15,9 +16,9 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
 
-
-python3 -m pre_commit 2>&1 >/dev/null
+pre-commit 2>&1 >/dev/null
 if [ "$?" = "0" ]; then
     precommit="python3 -m pre_commit"
 else
diff --git a/maintainer/lint/pylint.sh b/maintainer/lint/pylint.sh
index fc17a66e654..f8ca595a97c 100755
--- a/maintainer/lint/pylint.sh
+++ b/maintainer/lint/pylint.sh
@@ -1,5 +1,6 @@
 #!/bin/sh
-# Copyright (C) 2018-2022 The ESPResSo project
+#
+# Copyright (C) 2018-2024 The ESPResSo project
 #
 # This file is part of ESPResSo.
 #
@@ -15,6 +16,7 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
 
 
 python3 -m pylint --help 2>&1 > /dev/null
diff --git a/maintainer/walberla_kernels/Readme.md b/maintainer/walberla_kernels/Readme.md
index 37ecd2a7cea..ef7f2fb0c5e 100644
--- a/maintainer/walberla_kernels/Readme.md
+++ b/maintainer/walberla_kernels/Readme.md
@@ -20,7 +20,7 @@ The kernels can be regenerated with this shell script:
 
 ```sh
 # adapt these paths to the build environment
-export VERSION=1.2
+export VERSION=1.3.3
 export DEPS="${HOME}/walberla_deps"
 export PYTHONPATH="${DEPS}/${VERSION}/lbmpy:${DEPS}/${VERSION}/pystencils:${DEPS}/devel/walberla/python/"
 
@@ -34,6 +34,8 @@ function generate_ek_kernels {
 function format_lb_kernels {
   $(git rev-parse --show-toplevel)/maintainer/format/clang-format.sh -i *.h
   $(git rev-parse --show-toplevel)/maintainer/format/clang-format.sh -i *.cpp -style "{Language: Cpp, ColumnLimit: 0}"
+  $(git rev-parse --show-toplevel)/maintainer/format/clang-format.sh -i *.cu  -style "{Language: Cpp, ColumnLimit: 0}"
+  $(git rev-parse --show-toplevel)/maintainer/format/clang-format.sh -i *.cuh -style "{Language: Cpp}"
 }
 function format_ek_kernels {
   $(git rev-parse --show-toplevel)/maintainer/format/clang-format.sh -i *.h
@@ -44,7 +46,10 @@ function format_ek_kernels {
 cd $(git rev-parse --show-toplevel)/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/
 generate_lb_kernels
 generate_lb_kernels --single-precision
+generate_lb_kernels --gpu
+generate_lb_kernels --gpu --single-precision
 format_lb_kernels
+git diff -- Dynamic_UBB_*CUDA*.cu # verify pragmas
 
 # EK kernels
 cd $(git rev-parse --show-toplevel)/src/walberla_bridge/src/electrokinetics/generated_kernels/
@@ -54,6 +59,10 @@ format_ek_kernels
 mv ReactionKernel*.{cpp,h} $(git rev-parse --show-toplevel)/src/walberla_bridge/src/electrokinetics/reactions/generated_kernels/
 ```
 
+The code generation is not deterministic, therefore the list of changes might
+be quite large. If you only adapted a few lines in a specific template file,
+then you only need to commit the corresponding output files.
+
 WARNING: The code generation sorts the arguments alphabetically by symbol name.
 If you rename something, you may have to adapt the order of arguments in the
 calling code!
diff --git a/maintainer/walberla_kernels/code_generation_context.py b/maintainer/walberla_kernels/code_generation_context.py
index 583a3fca578..77f86183e0e 100644
--- a/maintainer/walberla_kernels/code_generation_context.py
+++ b/maintainer/walberla_kernels/code_generation_context.py
@@ -19,7 +19,6 @@
 
 import os
 import re
-import jinja2
 import hashlib
 import lbmpy
 import lbmpy_walberla
@@ -27,48 +26,6 @@
 import pystencils_walberla
 
 
-def adapt_pystencils():
-    """
-    Adapt pystencils to the SFINAE method (add the block offset lambda
-    callback and the time_step increment).
-    """
-    old_add_pystencils_filters_to_jinja_env = pystencils_walberla.codegen.add_pystencils_filters_to_jinja_env
-
-    def new_add_pystencils_filters_to_jinja_env(jinja_env):
-        # save original pystencils to adapt
-        old_add_pystencils_filters_to_jinja_env(jinja_env)
-        old_generate_members = jinja_env.filters["generate_members"]
-        old_generate_refs_for_kernel_parameters = jinja_env.filters[
-            "generate_refs_for_kernel_parameters"]
-
-        @jinja2.pass_context
-        def new_generate_members(*args, **kwargs):
-            output = old_generate_members(*args, **kwargs)
-            token = " block_offset_0_;"
-            if token in output:
-                i = output.index(token)
-                vartype = output[:i].split("\n")[-1].strip()
-                output += f"\nstd::function<void(IBlock *, {vartype}&, {vartype}&, {vartype}&)> block_offset_generator = [](IBlock * const, {vartype}&, {vartype}&, {vartype}&) {{ }};"
-            return output
-
-        def new_generate_refs_for_kernel_parameters(*args, **kwargs):
-            output = old_generate_refs_for_kernel_parameters(*args, **kwargs)
-            if "block_offset_0" in output:
-                old_token = "auto & block_offset_"
-                new_token = "auto block_offset_"
-                assert output.count(old_token) == 3, \
-                    f"could not find '{old_token}' in '''\n{output}\n'''"
-                output = output.replace(old_token, new_token)
-                output += "\nblock_offset_generator(block, block_offset_0, block_offset_1, block_offset_2);"
-            return output
-
-        # replace pystencils
-        jinja_env.filters["generate_members"] = new_generate_members
-        jinja_env.filters["generate_refs_for_kernel_parameters"] = new_generate_refs_for_kernel_parameters
-
-    pystencils_walberla.codegen.add_pystencils_filters_to_jinja_env = new_add_pystencils_filters_to_jinja_env
-
-
 def earmark_generated_kernels():
     """
     Add an earmark at the beginning of generated kernels to document the
@@ -83,12 +40,13 @@ def earmark_generated_kernels():
             walberla_commit = f.read()
     token = "// kernel generated with"
     earmark = (
-        f"{token} pystencils v{pystencils.__version__}, lbmpy v{lbmpy.__version__}, "
-        f"lbmpy_walberla/pystencils_walberla from waLBerla commit {walberla_commit}"
+        f"{token} pystencils v{pystencils.__version__}, "
+        f"lbmpy v{lbmpy.__version__}, "
+        f"lbmpy_walberla/pystencils_walberla from "
+        f"waLBerla commit {walberla_commit}"
     )
     for filename in os.listdir("."):
-        if not filename.endswith(
-                ".tmpl.h") and filename.endswith((".h", ".cpp", ".cu")):
+        if filename.endswith((".h", ".cpp", ".cu", ".cuh")):
             with open(filename, "r+") as f:
                 content = f.read()
                 if token not in content:
@@ -100,7 +58,7 @@ def earmark_generated_kernels():
                         pos = content.find("//=====", 5)
                         pos = content.find("\n", pos) + 1
                     f.seek(pos)
-                    f.write(f"\n{earmark}\n{content[pos:]}")
+                    f.write(f"\n{earmark}\n{content[pos:].rstrip()}\n")
 
 
 def guard_generated_kernels_clang_format():
@@ -117,9 +75,9 @@ def guard_generated_kernels_clang_format():
             if not all_ns:
                 continue
             for ns in all_ns:
-                content = re.sub(rf"(?<=[^a-zA-Z0-9_]){ns}(?=[^a-zA-Z0-9_])",
-                                 f"internal_{hashlib.md5(ns.encode('utf-8')).hexdigest()}",
-                                 content)
+                ns_hash = hashlib.md5(ns.encode('utf-8')).hexdigest()
+                content = re.sub(f"(?<=[^a-zA-Z0-9_]){ns}(?=[^a-zA-Z0-9_])",
+                                 f"internal_{ns_hash}", content)
             with open(filename, "w") as f:
                 f.write(content)
 
@@ -138,7 +96,6 @@ def __init__(self):
         sys.argv = sys.argv[:1]
         super().__init__()
         sys.argv = old_sys_argv
-        adapt_pystencils()
 
     def __exit__(self, *args, **kwargs):
         super().__exit__(*args, **kwargs)
diff --git a/maintainer/walberla_kernels/custom_additional_extensions.py b/maintainer/walberla_kernels/custom_additional_extensions.py
index 3ff0b83cdd2..1b09ea483cf 100644
--- a/maintainer/walberla_kernels/custom_additional_extensions.py
+++ b/maintainer/walberla_kernels/custom_additional_extensions.py
@@ -88,10 +88,8 @@ def __call__(self, field, direction_symbol, index_field, **kwargs):
         conds = [
             sp.Equality(
                 direction_symbol,
-                ps.typing.CastFunc(
-                    d + 1,
-                    np.int32)) for d in range(
-                len(accesses))]
+                ps.typing.CastFunc(d + 1, np.int32))
+            for d in range(len(accesses))]
 
         # use conditional
         conditional = None
@@ -137,8 +135,7 @@ def __init__(self, stencil, boundary_object):
 
     @property
     def constructor_arguments(self):
-        return f", std::function<{self.data_type}(const Cell &, const shared_ptr<StructuredBlockForest>&, IBlock&)>& " \
-               "dirichletCallback "
+        return f", std::function<{self.data_type}(const Cell &, const shared_ptr<StructuredBlockForest>&, IBlock&)>& dirichletCallback "  # nopep8
 
     @property
     def initialiser_list(self):
@@ -153,15 +150,15 @@ def additional_parameters_for_fill_function(self):
         return " const shared_ptr<StructuredBlockForest> &blocks, "
 
     def data_initialisation(self, _):
-        init_list = [f"{self.data_type} InitialisatonAdditionalData = elementInitaliser(Cell(it.x(), it.y(), it.z()), "
-                     "blocks, *block);", "element.value = InitialisatonAdditionalData;"]
+        init_list = [
+            f"{self.data_type} InitialisatonAdditionalData = elementInitaliser(Cell(it.x(), it.y(), it.z()), blocks, *block);",  # nopep8
+            "element.value = InitialisatonAdditionalData;"]
 
         return "\n".join(init_list)
 
     @property
     def additional_member_variable(self):
-        return f"std::function<{self.data_type}(const Cell &, const shared_ptr<StructuredBlockForest>&, IBlock&)> " \
-               "elementInitaliser; "
+        return f"std::function<{self.data_type}(const Cell &, const shared_ptr<StructuredBlockForest>&, IBlock&)> elementInitaliser; "  # nopep8
 
 
 class FluxAdditionalDataHandler(
@@ -174,8 +171,7 @@ def __init__(self, stencil, boundary_object):
 
     @property
     def constructor_arguments(self):
-        return f", std::function<Vector3<{self.data_type}>(const Cell &, const shared_ptr<StructuredBlockForest>&, IBlock&)>& " \
-               "fluxCallback "
+        return f", std::function<Vector3<{self.data_type}>(const Cell &, const shared_ptr<StructuredBlockForest>&, IBlock&)>& fluxCallback "  # nopep8
 
     @property
     def initialiser_list(self):
@@ -193,8 +189,8 @@ def data_initialisation(self, direction):
         dirVec = self.stencil_info[direction][1]
 
         init_list = [
-            f"Vector3<{self.data_type}> InitialisatonAdditionalData = elementInitaliser(Cell(it.x() + {dirVec[0]}, it.y() + {dirVec[1]}, it.z() + {dirVec[2]}), "
-            "blocks, *block);", "element.flux_0 = InitialisatonAdditionalData[0];",
+            f"Vector3<{self.data_type}> InitialisatonAdditionalData = elementInitaliser(Cell(it.x() + {dirVec[0]}, it.y() + {dirVec[1]}, it.z() + {dirVec[2]}), blocks, *block);",  # nopep8
+            "element.flux_0 = InitialisatonAdditionalData[0];",
             "element.flux_1 = InitialisatonAdditionalData[1];"]
         if self._dim == 3:
             init_list.append(
@@ -204,13 +200,11 @@ def data_initialisation(self, direction):
 
     @property
     def additional_member_variable(self):
-        return f"std::function<Vector3<{self.data_type}>(const Cell &, const shared_ptr<StructuredBlockForest>&, IBlock&)> " \
-               "elementInitaliser; "
+        return f"std::function<Vector3<{self.data_type}>(const Cell &, const shared_ptr<StructuredBlockForest>&, IBlock&)> elementInitaliser; "  # nopep8
 
 
-# this custom boundary generator is necessary because our boundary
-# condition writes to several fields at once which is impossible with the
-# shipped one
+# this custom boundary generator is necessary because our boundary condition
+# writes to several fields at once which is impossible with the shipped one
 def generate_boundary(
         generation_context,
         stencil,
@@ -250,9 +244,8 @@ def generate_boundary(
         index_struct_dtype,
         layout=[0],
         shape=(
-            ps.typing.TypedSymbol(
-                "indexVectorSize", ps.typing.BasicType(np.int32)
-            ),
+            ps.typing.TypedSymbol("indexVectorSize",
+                                  ps.typing.BasicType(np.int32)),
             1,
         ),
         strides=(1, 1),
diff --git a/maintainer/walberla_kernels/generate_ek_kernels.py b/maintainer/walberla_kernels/generate_ek_kernels.py
index e54f19023f0..54a20a107e7 100644
--- a/maintainer/walberla_kernels/generate_ek_kernels.py
+++ b/maintainer/walberla_kernels/generate_ek_kernels.py
@@ -22,6 +22,7 @@
 import sympy as sp
 import lbmpy
 import argparse
+import packaging.specifiers
 
 import pystencils_espresso
 import code_generation_context
@@ -35,6 +36,12 @@
                     help='Use single-precision')
 args = parser.parse_args()
 
+# Check dependency versions; NOTE(review): generate_lb_kernels pins ==1.3.3
+for module, requirement in [(ps, "==1.2"), (lbmpy, "==1.2")]:
+    assert packaging.specifiers.SpecifierSet(requirement).contains(module.__version__), \
+        f"{module.__name__} version {module.__version__} " \
+        f"doesn't match requirement {requirement}"
+
 double_precision: bool = not args.single_precision
 
 data_type_cpp = "double" if double_precision else "float"
diff --git a/maintainer/walberla_kernels/generate_lb_kernels.py b/maintainer/walberla_kernels/generate_lb_kernels.py
index dc3083450b4..9afd75925c1 100644
--- a/maintainer/walberla_kernels/generate_lb_kernels.py
+++ b/maintainer/walberla_kernels/generate_lb_kernels.py
@@ -18,9 +18,10 @@
 #
 
 import argparse
-import pkg_resources
+import packaging.specifiers
 
 import sympy as sp
+import numpy as np
 
 import pystencils as ps
 import pystencils_walberla
@@ -52,9 +53,10 @@
     target = ps.Target.CPU
 
 # Make sure we have the correct versions of the required dependencies
-for module, requirement in [(ps, "==1.2"), (lbmpy, "==1.2")]:
-    assert pkg_resources.packaging.specifiers.SpecifierSet(requirement).contains(module.__version__), \
-        f"{module.__name__} version {module.__version__} doesn't match requirement {requirement}"
+for module, requirement in [(ps, "==1.3.3"), (lbmpy, "==1.3.3")]:
+    assert packaging.specifiers.SpecifierSet(requirement).contains(module.__version__), \
+        f"{module.__name__} version {module.__version__} " \
+        f"doesn't match requirement {requirement}"
 
 
 def paramlist(parameters, keys):
@@ -66,6 +68,7 @@ def paramlist(parameters, keys):
 with code_generation_context.CodeGeneration() as ctx:
     ctx.double_accuracy = not args.single_precision
     if target == ps.Target.GPU:
+        ctx.gpu = True
         ctx.cuda = True
 
     # vectorization parameters
@@ -115,7 +118,7 @@ def paramlist(parameters, keys):
 
     # generate initial densities
     for params, target_suffix in paramlist(parameters, (default_key,)):
-        pystencils_walberla.codegen.generate_sweep(
+        pystencils_walberla.generate_sweep(
             ctx,
             f"InitialPDFsSetter{precision_prefix}{target_suffix}",
             pystencils_espresso.generate_setters(ctx, method, params),
@@ -146,38 +149,44 @@ def paramlist(parameters, keys):
             params
         )
 
+    block_offsets = tuple(
+        ps.TypedSymbol(f"block_offset_{i}", np.uint32)
+        for i in range(3))
+
     # generate thermalized LB
     collision_rule_thermalized = lbmpy.creationfunctions.create_lb_collision_rule(
         method,
         zero_centered=False,
         fluctuating={
             "temperature": kT,
-            "block_offsets": "walberla",
+            "block_offsets": block_offsets,
             "rng_node": precision_rng
         },
         optimization={"cse_global": True,
                       "double_precision": ctx.double_accuracy}
     )
     for params, target_suffix in paramlist(parameters, ("GPU", "CPU", "AVX")):
+        stem = f"CollideSweep{precision_prefix}Thermalized{target_suffix}"
         pystencils_espresso.generate_collision_sweep(
             ctx,
             method,
             collision_rule_thermalized,
-            f"CollideSweep{precision_prefix}Thermalized{target_suffix}",
-            params
+            stem,
+            params,
+            block_offset=block_offsets,
         )
 
     # generate accessors
     for _, target_suffix in paramlist(parameters, ("GPU", "CPU")):
-        filename = f"FieldAccessors{precision_prefix}{target_suffix}"
+        stem = f"FieldAccessors{precision_prefix}{target_suffix}"
         if target == ps.Target.GPU:
             templates = {
-                f"{filename}.h": "templates/FieldAccessors.tmpl.cuh",
-                f"{filename}.cu": "templates/FieldAccessors.tmpl.cu",
+                f"{stem}.cuh": "templates/FieldAccessors.tmpl.cuh",
+                f"{stem}.cu": "templates/FieldAccessors.tmpl.cu",
             }
         else:
             templates = {
-                f"{filename}.h": "templates/FieldAccessors.tmpl.h",
+                f"{stem}.h": "templates/FieldAccessors.tmpl.h",
             }
         walberla_lbm_generation.generate_macroscopic_values_accessors(
             ctx, config, method, templates
diff --git a/maintainer/walberla_kernels/lbmpy_espresso.py b/maintainer/walberla_kernels/lbmpy_espresso.py
index 5055fac308c..8a755d347b0 100644
--- a/maintainer/walberla_kernels/lbmpy_espresso.py
+++ b/maintainer/walberla_kernels/lbmpy_espresso.py
@@ -19,8 +19,8 @@
 
 import pystencils as ps
 
-import lbmpy.advanced_streaming.indexing
 import lbmpy.boundaries
+import lbmpy.custom_code_nodes
 
 import lbmpy_walberla.additional_data_handler
 
@@ -39,15 +39,20 @@ def data_initialisation(self, direction):
         This way, the dynamic UBB can be used to implement a LB boundary.
         '''
         code = super().data_initialisation(direction)
-        dirVec = self.stencil_info[direction][1]
-        token = ' = elementInitaliser(Cell(it.x(){}, it.y(){}, it.z(){}),'
-        old_initialiser = token.format('', '', '')
-        assert old_initialiser in code
-        new_initialiser = token.format(
-            '+' + str(dirVec[0]),
-            '+' + str(dirVec[1]),
-            '+' + str(dirVec[2])).replace('+-', '-')
-        return code.replace(old_initialiser, new_initialiser)
+        assert "InitialisationAdditionalData" in code
+        assert "elementInitialiser" in code
+        assert "element.vel_0" in code
+        bb_vec = self.stencil_info[direction][1]
+        cell_args = [f"it.{direction}() + {bb_vec[i]}".replace('+ -', '-')
+                     for i, direction in enumerate("xyz")]
+        code = [
+            "auto const InitialisationAdditionalData = elementInitialiser(",
+            f"Cell({', '.join(cell_args)}), blocks, *block);",
+            "element.vel_0 = InitialisationAdditionalData[0];",
+            "element.vel_1 = InitialisationAdditionalData[1];",
+            "element.vel_2 = InitialisationAdditionalData[2];",
+        ]
+        return "\n".join(code)
 
 
 class UBB(lbmpy.boundaries.UBB):
@@ -71,7 +76,7 @@ def __call__(self, f_out, f_in, dir_symbol,
         if len(assignments) > 1:
             out.extend(assignments[:-1])
 
-        neighbor_offset = lbmpy.advanced_streaming.indexing.NeighbourOffsetArrays.neighbour_offset(
+        neighbor_offset = lbmpy.custom_code_nodes.NeighbourOffsetArrays.neighbour_offset(
             dir_symbol, lb_method.stencil)
 
         assignment = assignments[-1]
diff --git a/maintainer/walberla_kernels/pystencils_espresso.py b/maintainer/walberla_kernels/pystencils_espresso.py
index 1980ba14387..91350eb7248 100644
--- a/maintainer/walberla_kernels/pystencils_espresso.py
+++ b/maintainer/walberla_kernels/pystencils_espresso.py
@@ -23,12 +23,13 @@
 import lbmpy.updatekernels
 import pystencils as ps
 import pystencils_walberla
+import pystencils_walberla.utility
 
 
 def skip_philox_unthermalized(code, result_symbols, rng_name):
     for r in result_symbols:
         statement = f" {r.name};"
-        assert statement in code, f"no declaration for variable '{r.name}' in '{code}'"
+        assert statement in code, f"no declaration for variable '{r.name}' in '{code}'"  # nopep8
         code = code.replace(statement, f" {r.name}{{}};", 1)
     statement = f"{rng_name}("
     assert code.count(statement) == 1, f"need 1 '{rng_name}' call in '{code}'"
@@ -107,11 +108,11 @@ def generate_fields(config, stencil):
 
 
 def generate_config(ctx, params):
-    return pystencils_walberla.codegen.config_from_context(ctx, **params)
+    return pystencils_walberla.utility.config_from_context(ctx, **params)
 
 
 def generate_collision_sweep(
-        ctx, lb_method, collision_rule, class_name, params):
+        ctx, lb_method, collision_rule, class_name, params, **kwargs):
     config = generate_config(ctx, params)
 
     # Symbols for PDF (twice, due to double buffering)
@@ -127,8 +128,8 @@ def generate_collision_sweep(
         collide_update_rule, config=config, **params)
     collide_ast.function_name = 'kernel_collide'
     collide_ast.assumed_inner_stride_one = True
-    pystencils_walberla.codegen.generate_sweep(
-        ctx, class_name, collide_ast, **params)
+    pystencils_walberla.generate_sweep(
+        ctx, class_name, collide_ast, **params, **kwargs)
 
 
 def generate_stream_sweep(ctx, lb_method, class_name, params):
@@ -144,7 +145,7 @@ def generate_stream_sweep(ctx, lb_method, class_name, params):
     stream_ast = ps.create_kernel(stream_update_rule, config=config, **params)
     stream_ast.function_name = 'kernel_stream'
     stream_ast.assumed_inner_stride_one = True
-    pystencils_walberla.codegen.generate_sweep(
+    pystencils_walberla.generate_sweep(
         ctx, class_name, stream_ast,
         field_swaps=[(fields['pdfs'], fields['pdfs_tmp'])], **params)
 
diff --git a/maintainer/walberla_kernels/templates/Boundary.tmpl.h b/maintainer/walberla_kernels/templates/Boundary.tmpl.h
index bdeaf57c06d..a51c9c10b58 100644
--- a/maintainer/walberla_kernels/templates/Boundary.tmpl.h
+++ b/maintainer/walberla_kernels/templates/Boundary.tmpl.h
@@ -31,8 +31,8 @@
 {% if target is equalto 'cpu' -%}
 #include <field/GhostLayerField.h>
 {%- elif target is equalto 'gpu' -%}
-#include <cuda/GPUField.h>
-#include <cuda/FieldCopy.h>
+#include <gpu/GPUField.h>
+#include <gpu/FieldCopy.h>
 {%- endif %}
 #include <domain_decomposition/BlockDataID.h>
 #include <domain_decomposition/IBlock.h>
@@ -49,12 +49,14 @@
 #include {{header}}
 {% endfor %}
 
-#ifdef __GNUC__
-#define RESTRICT __restrict__
-#elif _MSC_VER
-#define RESTRICT __restrict
-#else
-#define RESTRICT
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wunused-variable"
+#pragma clang diagnostic ignored "-Wunused-parameter"
+#elif defined(__GNUC__) or defined(__GNUG__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#pragma GCC diagnostic ignored "-Wunused-parameter"
 #endif
 
 namespace walberla {
@@ -85,7 +87,7 @@ class {{class_name}}
         {% if target == 'gpu' -%}
         ~IndexVectors() {
             for( auto & gpuVec: gpuVectors_)
-                cudaFree( gpuVec );
+                gpuFree( gpuVec );
         }
         {% endif -%}
 
@@ -100,7 +102,7 @@ class {{class_name}}
         {
             {% if target == 'gpu' -%}
             for( auto & gpuVec: gpuVectors_)
-                cudaFree( gpuVec );
+                gpuFree( gpuVec );
             gpuVectors_.resize( cpuVectors_.size() );
 
             WALBERLA_ASSERT_EQUAL(cpuVectors_.size(), NUM_TYPES);
@@ -108,8 +110,8 @@ class {{class_name}}
             {
                 auto & gpuVec = gpuVectors_[i];
                 auto & cpuVec = cpuVectors_[i];
-                cudaMalloc( &gpuVec, sizeof({{StructName}}) * cpuVec.size() );
-                cudaMemcpy( gpuVec, &cpuVec[0], sizeof({{StructName}}) * cpuVec.size(), cudaMemcpyHostToDevice );
+                gpuMalloc( &gpuVec, sizeof({{StructName}}) * cpuVec.size() );
+                gpuMemcpy( gpuVec, &cpuVec[0], sizeof({{StructName}}) * cpuVec.size(), gpuMemcpyHostToDevice );
             }
             {%- endif %}
         }
@@ -136,12 +138,12 @@ class {{class_name}}
     {};
 
     void run (
-        {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}
+        {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}
     );
 
     {% if generate_functor -%}
     void operator() (
-        {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}
+        {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}
     )
     {
         run( {{- ["block", kernel.kernel_selection_parameters, ["stream"] if target == 'gpu' else []] | identifier_list -}} );
@@ -149,28 +151,28 @@ class {{class_name}}
     {%- endif %}
 
     void inner (
-        {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}
+        {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}
     );
 
     void outer (
-        {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}
+        {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}
     );
 
-    std::function<void (IBlock *)> getSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} )
+    std::function<void (IBlock *)> getSweep( {{- [interface_spec.high_level_args, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} )
     {
         return [ {{- ["this", interface_spec.high_level_args, ["stream"] if target == 'gpu' else []] | identifier_list -}} ]
                (IBlock * b)
                { this->run( {{- [ ['b'], interface_spec.mapping_codes, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ); };
     }
 
-    std::function<void (IBlock *)> getInnerSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} )
+    std::function<void (IBlock *)> getInnerSweep( {{- [interface_spec.high_level_args, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} )
     {
         return [ {{- [ ['this'], interface_spec.high_level_args, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ]
                (IBlock * b)
                { this->inner( {{- [ ['b'], interface_spec.mapping_codes, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ); };
     }
 
-    std::function<void (IBlock *)> getOuterSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} )
+    std::function<void (IBlock *)> getOuterSweep( {{- [interface_spec.high_level_args, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} )
     {
         return [ {{- [ ['this'], interface_spec.high_level_args, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ]
                (IBlock * b)
@@ -198,7 +200,7 @@ class {{class_name}}
         auto * flagField = block->getData< FlagField_T > ( flagFieldID );
         {{additional_data_handler.additional_field_data|indent(4)}}
 
-        assert(flagField->flagExists(boundaryFlagUID and
+        assert(flagField->flagExists(boundaryFlagUID) and
                flagField->flagExists(domainFlagUID));
 
         auto boundaryFlag = flagField->getFlag(boundaryFlagUID);
@@ -293,7 +295,7 @@ class {{class_name}}
 private:
     void run_impl(
         {{- ["IBlock * block", "IndexVectors::Type type",
-             kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []]
+             kernel.kernel_selection_parameters, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []]
             | type_identifier_list -}}
    );
 
diff --git a/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.cu b/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.cu
new file mode 100644
index 00000000000..c9f8ae4dc97
--- /dev/null
+++ b/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.cu
@@ -0,0 +1,1001 @@
+/*
+ * Copyright (C) 2023-2024 The ESPResSo project
+ * Copyright (C) 2020 The waLBerla project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ * Lattice field accessors.
+ * Adapted from the waLBerla source file
+ * https://i10git.cs.fau.de/walberla/walberla/-/blob/a16141524c58ab88386e2a0f8fdd7c63c5edd704/python/lbmpy_walberla/templates/LatticeModel.tmpl.h
+ */
+
+#include <core/DataTypes.h>
+#include <core/cell/Cell.h>
+#include <core/cell/CellInterval.h>
+#include <core/math/Matrix{{D}}.h>
+#include <core/math/Vector{{D}}.h>
+
+#include <field/iterators/IteratorMacros.h>
+
+#include <gpu/FieldAccessor.h>
+#include <gpu/FieldIndexing.h>
+#include <gpu/GPUField.h>
+#include <gpu/Kernel.h>
+
+#include <thrust/device_ptr.h>
+#include <thrust/device_vector.h>
+
+#include <array>
+#include <vector>
+
+#if defined(__NVCC__)
+#define RESTRICT __restrict__
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress 177 // unused variable
+#elif defined(__clang__)
+#if defined(__CUDA__) // NOTE(review): RESTRICT is left undefined when clang runs without __CUDA__ - confirm that cannot happen
+#if defined(__CUDA_ARCH__)
+// clang compiling CUDA code in device mode (currently the same settings as host mode)
+#define RESTRICT __restrict__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wunused-variable"
+#else
+// clang compiling CUDA code in host mode (currently the same settings as device mode)
+#define RESTRICT __restrict__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wunused-variable"
+#endif // __CUDA_ARCH__
+#endif // __CUDA__
+#elif defined(__GNUC__) or defined(__GNUG__)
+#define RESTRICT __restrict__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#elif defined(_MSC_VER)
+#define RESTRICT __restrict
+#else
+#define RESTRICT // unknown compiler: expand to nothing
+#endif
+
+/** @brief Get linear index of flattened data with original layout @c fzyx. In the flat buffer, @c f varies fastest and @c x slowest. */
+static __forceinline__ __device__ uint getLinearIndex( uint3 blockIdx, uint3 threadIdx, uint3 gridDim, uint3 blockDim, uint fOffset ) { // blockDim is accepted but not used
+  auto const x = threadIdx.x; // coordinates are taken from the thread and block indices
+  auto const y = blockIdx.x;
+  auto const z = blockIdx.y;
+  auto const f = blockIdx.z;
+  auto const ySize = gridDim.x; // extents are taken from the grid dimensions
+  auto const zSize = gridDim.y;
+  auto const fSize = fOffset; // stride between consecutive z positions in the flat buffer
+  return f                         +
+         z * fSize                 +
+         y * fSize * zSize         +
+         x * fSize * zSize * ySize ;
+}
+
+namespace walberla {
+namespace {{namespace}} {
+namespace accessor {
+
+namespace Population
+{
+// LCOV_EXCL_START
+    __global__ void kernel_get( // GPU kernel: copy the Q populations of the cell addressed by this thread into the flat buffer pop
+        gpu::FieldAccessor< {{dtype}} > pdf,
+        {{dtype}} * RESTRICT pop )
+    {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{Q}}u);
+        pdf.set( blockIdx, threadIdx ); // position the accessor from this thread's block and thread indices
+        pop += offset; // move to this thread's slot in the flat buffer
+        if (pdf.isValidPosition()) {
+            {% for i in range(Q) -%}
+                pop[{{i}}u] = pdf.get({{i}}u);
+            {% endfor -%}
+        }
+    }
+
+    __global__ void kernel_set( // GPU kernel: overwrite the Q populations of the cell addressed by this thread with values from the flat buffer pop
+        gpu::FieldAccessor< {{dtype}} > pdf,
+        {{dtype}} const * RESTRICT pop )
+    {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{Q}}u);
+        pdf.set( blockIdx, threadIdx ); // position the accessor from this thread's block and thread indices
+        pop += offset; // move to this thread's slot in the flat buffer
+        if (pdf.isValidPosition()) {
+            {% for i in range(Q) -%}
+                pdf.get({{i}}u) = pop[{{i}}u];
+            {% endfor -%}
+        }
+    }
+
+    __global__ void kernel_broadcast( // GPU kernel: write the same Q values from pop into every addressed cell
+        gpu::FieldAccessor< {{dtype}} > pdf,
+        {{dtype}} const * RESTRICT pop )
+    {
+        pdf.set( blockIdx, threadIdx ); // no per-thread offset is applied to pop: all threads read the same Q values
+        if (pdf.isValidPosition()) {
+            {% for i in range(Q) -%}
+                pdf.get({{i}}u) = pop[{{i}}u];
+            {% endfor -%}
+        }
+    }
+
+    __global__ void kernel_set_vel( // GPU kernel: store pop in the PDF field and recompute the velocity field of the same cell
+        gpu::FieldAccessor< {{dtype}} > pdf,
+        gpu::FieldAccessor< {{dtype}} > velocity,
+        gpu::FieldAccessor< {{dtype}} > force,
+        {{dtype}} const * RESTRICT pop )
+    {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{Q}}u);
+        pdf.set( blockIdx, threadIdx );
+        velocity.set( blockIdx, threadIdx );
+        force.set( blockIdx, threadIdx ); // presumably read by the generated momentum density code below - confirm against the template filter
+        pop += offset; // move to this thread's slot in the flat buffer
+        if (pdf.isValidPosition()) {
+            {% for i in range(Q) -%}
+                const {{dtype}} f_{{i}} = pdf.get({{i}}u) = pop[{{i}}u];
+            {% endfor -%}
+            {{momentum_density_getter | substitute_force_getter_cu | indent(8) }}
+            const {{dtype}} rho_inv = {{dtype}} {1} / rho; // rho and md_* are expected to be defined by the generated code above
+            {% for i in range(D) -%}
+                velocity.get({{i}}u) = md_{{i}} * rho_inv;
+            {% endfor %}
+        }
+    }
+// LCOV_EXCL_STOP
+
+    std::array<{{dtype}}, {{Q}}u> get( // read the Q populations of a single cell and return them as a host array
+        gpu::GPUField< {{dtype}} > const * pdf_field,
+        Cell const & cell )
+    {
+        CellInterval ci ( cell, cell ); // interval containing only this cell
+        thrust::device_vector< {{dtype}} > dev_data({{Q}}u); // device-side staging buffer
+        auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+        auto kernel = gpu::make_kernel( kernel_get );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
+        kernel.addParam( dev_data_ptr );
+        kernel(); // run kernel_get on the interval
+        std::array<{{dtype}}, {{Q}}u> pop;
+        thrust::copy(dev_data.begin(), dev_data.end(), pop.data()); // copy the staged values into the returned array
+        return pop;
+    }
+
+    void set( // overwrite the Q populations of a single cell
+        gpu::GPUField< {{dtype}} > * pdf_field,
+        std::array< {{dtype}}, {{Q}}u > const & pop,
+        Cell const & cell )
+    {
+        thrust::device_vector< {{dtype}} > dev_data(pop.begin(), pop.end()); // copy pop into a device-side buffer
+        auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+        CellInterval ci ( cell, cell ); // interval containing only this cell
+        auto kernel = gpu::make_kernel( kernel_set );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
+        kernel.addParam( const_cast<const {{dtype}} *>(dev_data_ptr) ); // kernel_set takes a pointer to const
+        kernel();
+    }
+
+    void set( // overwrite the Q populations of a single cell and update the velocity field of that cell via kernel_set_vel
+        gpu::GPUField< {{dtype}} > * pdf_field,
+        gpu::GPUField< {{dtype}} > * velocity_field,
+        gpu::GPUField< {{dtype}} > const * force_field,
+        std::array< {{dtype}}, {{Q}}u > const & pop,
+        Cell const & cell )
+    {
+        thrust::device_vector< {{dtype}} > dev_data(pop.begin(), pop.end()); // copy pop into a device-side buffer
+        auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+        CellInterval ci ( cell, cell ); // interval containing only this cell
+        auto kernel = gpu::make_kernel( kernel_set_vel );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) ); // parameters are added in the order of the kernel_set_vel signature
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *velocity_field, ci ) );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *force_field, ci ) );
+        kernel.addParam( const_cast<const {{dtype}} *>(dev_data_ptr) );
+        kernel();
+    }
+
+    void initialize( // write the same Q populations into every cell of the interval computed below
+        gpu::GPUField< {{dtype}} > * pdf_field,
+        std::array< {{dtype}}, {{Q}}u > const & pop )
+    {
+        CellInterval ci = pdf_field->xyzSizeWithGhostLayer(); // presumably the whole field including ghost layers, going by the method name
+        thrust::device_vector< {{dtype}} > dev_data(pop.begin(), pop.end()); // copy pop into a device-side buffer
+        auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+        auto kernel = gpu::make_kernel( kernel_broadcast );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
+        kernel.addParam( const_cast<const {{dtype}} *>(dev_data_ptr) );
+        kernel();
+   }
+
+    std::vector< {{dtype}} > get( // read the populations of all cells in ci and return them as one flat host vector
+        gpu::GPUField< {{dtype}} > const * pdf_field,
+        CellInterval const & ci )
+    {
+        thrust::device_vector< {{dtype}} > dev_data(ci.numCells() * {{Q}}u); // Q values per cell of the interval
+        auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+        auto kernel = gpu::make_kernel( kernel_get );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
+        kernel.addParam( dev_data_ptr );
+        kernel(); // run kernel_get on the interval
+        std::vector< {{dtype}} > out(ci.numCells() * {{Q}}u);
+        thrust::copy(dev_data.begin(), dev_data.end(), out.data()); // copy the staged values into the returned vector
+        return out;
+    }
+
+    void set( // overwrite the populations of all cells in ci from one flat host vector
+        gpu::GPUField< {{dtype}} > * pdf_field,
+        std::vector< {{dtype}} > const & values, // expected to hold Q entries per cell of ci, ordered as in getLinearIndex; the size is not checked here
+        CellInterval const & ci )
+    {
+        thrust::device_vector< {{dtype}} > dev_data(values.begin(), values.end()); // copy values into a device-side buffer
+        auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+        auto kernel = gpu::make_kernel( kernel_set );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
+        kernel.addParam( const_cast<const {{dtype}} *>(dev_data_ptr) ); // kernel_set takes a pointer to const
+        kernel();
+    }
+
+    void set( // overwrite the populations of all cells in ci and update the velocity field of those cells via kernel_set_vel
+        gpu::GPUField< {{dtype}} > * pdf_field,
+        gpu::GPUField< {{dtype}} > * velocity_field,
+        gpu::GPUField< {{dtype}} > const * force_field,
+        std::vector< {{dtype}} > const & values, // expected to hold Q entries per cell of ci, ordered as in getLinearIndex; the size is not checked here
+        CellInterval const & ci )
+    {
+        thrust::device_vector< {{dtype}} > dev_data(values.begin(), values.end()); // copy values into a device-side buffer
+        auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+        auto kernel = gpu::make_kernel( kernel_set_vel );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) ); // parameters are added in the order of the kernel_set_vel signature
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *velocity_field, ci ) );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *force_field, ci ) );
+        kernel.addParam( const_cast<const {{dtype}} *>(dev_data_ptr) );
+        kernel();
+    }
+} // namespace Population
+
+namespace Vector
+{
+// LCOV_EXCL_START
+    __global__ void kernel_get( // GPU kernel: copy the D components of the cell addressed by this thread into the flat buffer u_out
+        gpu::FieldAccessor< {{dtype}} > vec,
+        {{dtype}} * u_out )
+    {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{D}}u);
+        vec.set( blockIdx, threadIdx ); // position the accessor from this thread's block and thread indices
+        u_out += offset; // move to this thread's slot in the flat buffer
+        if (vec.isValidPosition()) {
+            {% for i in range(D) -%}
+                u_out[{{i}}u] = vec.get({{i}}u);
+            {% endfor %}
+        }
+    }
+
+    __global__ void kernel_set( // GPU kernel: overwrite the D components of the cell addressed by this thread with values from the flat buffer u_in
+        gpu::FieldAccessor< {{dtype}} > vec,
+        {{dtype}} const * RESTRICT u_in )
+    {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{D}}u);
+        vec.set( blockIdx, threadIdx ); // position the accessor from this thread's block and thread indices
+        u_in += offset; // move to this thread's slot in the flat buffer
+        if (vec.isValidPosition()) {
+            {% for i in range(D) -%}
+                vec.get({{i}}u) = u_in[{{i}}u];
+            {% endfor %}
+        }
+    }
+
+    __global__ void kernel_broadcast( // GPU kernel: write the same D values from u_in into every addressed cell
+        gpu::FieldAccessor< {{dtype}} > vec,
+        {{dtype}} const * RESTRICT u_in )
+    {
+        vec.set( blockIdx, threadIdx ); // no per-thread offset is applied to u_in: all threads read the same D values
+        if (vec.isValidPosition()) {
+            {% for i in range(D) -%}
+                vec.get({{i}}u) = u_in[{{i}}u];
+            {% endfor %}
+        }
+    }
+
+    __global__ void kernel_add( // GPU kernel: add the D values found at this thread's slot of u_in to the cell addressed by this thread
+        gpu::FieldAccessor< {{dtype}} > vec,
+        {{dtype}} const * RESTRICT u_in )
+    {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{D}}u);
+        vec.set( blockIdx, threadIdx ); // position the accessor from this thread's block and thread indices
+        u_in += offset; // move to this thread's slot in the flat buffer
+        if (vec.isValidPosition()) {
+            {% for i in range(D) -%}
+                vec.get({{i}}u) += u_in[{{i}}u];
+            {% endfor %}
+        }
+    }
+
+    __global__ void kernel_broadcast_add( // GPU kernel: add the same D values from u_in to every addressed cell
+        gpu::FieldAccessor< {{dtype}} > vec,
+        {{dtype}} const * RESTRICT u_in )
+    {
+        vec.set( blockIdx, threadIdx ); // no per-thread offset is applied to u_in: all threads read the same D values
+        if (vec.isValidPosition()) {
+            {% for i in range(D) -%}
+                vec.get({{i}}u) += u_in[{{i}}u];
+            {% endfor %}
+        }
+    }
+// LCOV_EXCL_STOP
+
+    Vector{{D}}< {{dtype}} > get( // read the D components of a single cell and return them as a host vector
+        gpu::GPUField< {{dtype}} > const * vec_field,
+        Cell const & cell)
+    {
+        CellInterval ci ( cell, cell ); // interval containing only this cell
+        thrust::device_vector< {{dtype}} > dev_data({{D}}u); // device-side staging buffer
+        auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+        auto kernel = gpu::make_kernel( kernel_get );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *vec_field, ci ) );
+        kernel.addParam( dev_data_ptr );
+        kernel(); // run kernel_get on the interval
+        Vector{{D}}< {{dtype}} > vec;
+        thrust::copy(dev_data.begin(), dev_data.end(), vec.data()); // copy the staged values into the returned vector
+        return vec;
+    }
+
+    void set( // overwrite the D components of a single cell
+        gpu::GPUField< {{dtype}} > * vec_field,
+        Vector{{D}}< {{dtype}} > const & vec,
+        Cell const & cell )
+    {
+        CellInterval ci ( cell, cell ); // interval containing only this cell
+        thrust::device_vector< {{dtype}} > dev_data(vec.data(), vec.data() + {{D}}u); // copy the D components into a device-side buffer
+        auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+        auto kernel = gpu::make_kernel( kernel_set );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *vec_field, ci ) );
+        kernel.addParam( const_cast<const {{dtype}} *>(dev_data_ptr) ); // kernel_set takes a pointer to const
+        kernel();
+    }
+
+    void add( // add vec to the D components of a single cell
+        gpu::GPUField< {{dtype}} > * vec_field,
+        Vector{{D}}< {{dtype}} > const & vec,
+        Cell const &cell )
+    {
+        CellInterval ci ( cell, cell ); // interval containing only this cell
+        thrust::device_vector< {{dtype}} > dev_data(vec.data(), vec.data() + {{D}}u); // copy the D components into a device-side buffer
+        auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+        auto kernel = gpu::make_kernel( kernel_add );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *vec_field, ci ) );
+        kernel.addParam( const_cast<const {{dtype}} *>(dev_data_ptr) ); // kernel_add takes a pointer to const
+        kernel();
+    }
+
+    void initialize( // write the same D components into every cell of the interval computed below
+        gpu::GPUField< {{dtype}} > * vec_field,
+        Vector{{D}}< {{dtype}} > const & vec )
+    {
+        CellInterval ci = vec_field->xyzSizeWithGhostLayer(); // presumably the whole field including ghost layers, going by the method name
+        thrust::device_vector< {{dtype}} > dev_data(vec.data(), vec.data() + {{D}}u); // copy the D components into a device-side buffer
+        auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+        auto kernel = gpu::make_kernel( kernel_broadcast );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *vec_field, ci ) );
+        kernel.addParam( const_cast<const {{dtype}} *>(dev_data_ptr) );
+        kernel();
+   }
+
+    void add_to_all( // add the same D components to every cell of the interval computed below
+        gpu::GPUField< {{dtype}} > * vec_field,
+        Vector{{D}}< {{dtype}} > const & vec )
+    {
+        CellInterval ci = vec_field->xyzSizeWithGhostLayer(); // presumably the whole field including ghost layers, going by the method name
+        thrust::device_vector< {{dtype}} > dev_data(vec.data(), vec.data() + {{D}}u); // copy the D components into a device-side buffer
+        auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+        auto kernel = gpu::make_kernel( kernel_broadcast_add );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *vec_field, ci ) );
+        kernel.addParam( const_cast<const {{dtype}} *>(dev_data_ptr) );
+        kernel();
+    }
+
+    /** @brief Read the values of all cells in @p ci into a flat host array ({{D}} entries per cell). */
+    std::vector< {{dtype}} > get( gpu::GPUField< {{dtype}} > const * vec_field,
+                                 CellInterval const & ci)
+    {
+        thrust::device_vector< {{dtype}} > gathered(ci.numCells() * {{D}}u);
+        {{dtype}} * const gathered_ptr = thrust::raw_pointer_cast(gathered.data());
+        auto launcher = gpu::make_kernel( kernel_get );
+        launcher.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *vec_field, ci ) );
+        launcher.addParam( gathered_ptr );
+        launcher();
+        std::vector< {{dtype}} > host_values(ci.numCells() * {{D}}u);
+        thrust::copy(gathered.begin(), gathered.end(), host_values.data()); // device -> host
+        return host_values;
+    }
+
+    /** @brief Write @p values (flat array, {{D}} entries per cell) to the cells of @p ci. */
+    void set( gpu::GPUField< {{dtype}} > * vec_field,
+              std::vector< {{dtype}} > const & values,
+              CellInterval const & ci )
+    {
+        assert(values.size() == ci.numCells() * {{D}}u); // a shorter buffer would be read out of bounds on the device
+        thrust::device_vector< {{dtype}} > dev_data(values.begin(), values.end()); // host -> device
+        auto kernel = gpu::make_kernel( kernel_set );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *vec_field, ci ) );
+        kernel.addParam( const_cast<const {{dtype}} *>(thrust::raw_pointer_cast(dev_data.data())) );
+        kernel();
+    }
+} // namespace Vector
+
+namespace Interpolation
+{
+// LCOV_EXCL_START
+    /** @brief Calculate the lower corner index and the two linear weights per dimension for one position. */
+    static __forceinline__ __device__ void calculate_weights(
+        {{dtype}} const *RESTRICT const pos,   // in: {{D}} coordinates of the position
+        int *RESTRICT const corner,            // out: {{D}} indices of the lower corner
+        {{dtype}} *RESTRICT const weights,     // out: 2 weights per dimension, stored at [dim * 2 + 0] and [dim * 2 + 1]
+        uint gl)                               // offset added to every corner index (presumably the ghost layer count)
+    {
+      #pragma unroll
+      for (int dim = 0; dim < {{D}}; ++dim) {
+        auto const fractional_index = pos[dim] - {{dtype}}{0.5};
+        auto const nmp = floor(fractional_index); // overloaded floor(): floorf() would first round a double to float
+        auto const distance = fractional_index - nmp - {{dtype}}{0.5};
+        corner[dim] = __{{dtype}}2int_rn(nmp) + static_cast<int>(gl);
+        weights[dim * 2 + 0] = {{dtype}}{0.5} - distance; // weight of the lower corner
+        weights[dim * 2 + 1] = {{dtype}}{0.5} + distance; // weight of the upper corner; both sum to 1
+      }
+    }
+
+    __global__ void kernel_get( // gather: weighted sum of the field over the 2x2x2 cells around each position
+        gpu::FieldAccessor< {{dtype}} > vec,
+        {{dtype}} const *RESTRICT const pos, // in: n_pos positions, {{D}} coordinates each
+        {{dtype}} *RESTRICT const vel, // out: {{D}} values per position; accumulated with +=, so it must start zeroed
+        uint n_pos,
+        uint gl) // forwarded to calculate_weights()
+    {
+
+      uint pos_index = blockIdx.y * gridDim.x * blockDim.x +
+                       blockDim.x * blockIdx.x + threadIdx.x; // one thread per position
+
+      vec.set({0u, 0u, 0u}, {0u, 0u, 0u}); // every thread uses the same base cell; corners are addressed via getNeighbor()
+      if (vec.isValidPosition() and pos_index < n_pos) {
+        auto const array_offset = pos_index * uint({{D}}u); // start of this position's entries in pos and vel
+        int corner[{{D}}];
+        {{dtype}} weights[{{D}}][2];
+        calculate_weights(pos + array_offset, corner, &weights[0][0], gl);
+        #pragma unroll
+        for (int i = 0; i < 2; i++) {
+          auto const cx = corner[0] + i;
+          auto const wx = weights[0][i];
+          #pragma unroll
+          for (int j = 0; j < 2; j++) {
+            auto const cy = corner[1] + j;
+            auto const wxy = wx * weights[1][j];
+            #pragma unroll
+            for (int k = 0; k < 2; k++) {
+              auto const cz = corner[2] + k;
+              auto const weight = wxy * weights[2][k]; // product of the three per-dimension weights
+              {% for cf in range(D) -%}
+                vel[array_offset + {{cf}}u] += weight * vec.getNeighbor(cx, cy, cz, {{cf}}u);
+              {% endfor %}
+            }
+          }
+        }
+      }
+    }
+
+    __global__ void kernel_set( // scatter: distribute each force over the 2x2x2 cells around its position
+        gpu::FieldAccessor< {{dtype}} > vec,
+        {{dtype}} const *RESTRICT const pos, // in: n_pos positions, {{D}} coordinates each
+        {{dtype}} const *RESTRICT const forces, // in: {{D}} components per position, indexed like pos
+        uint n_pos,
+        uint gl ) // forwarded to calculate_weights()
+    {
+
+      uint pos_index = blockIdx.y * gridDim.x * blockDim.x +
+                       blockDim.x * blockIdx.x + threadIdx.x; // one thread per position
+
+      vec.set({0u, 0u, 0u}, {0u, 0u, 0u}); // every thread uses the same base cell; corners are addressed via getNeighbor()
+      if (vec.isValidPosition() and pos_index < n_pos) {
+        auto const array_offset = pos_index * uint({{D}}u); // start of this position's entries in pos and forces
+        int corner[{{D}}];
+        {{dtype}} weights[{{D}}][2];
+        calculate_weights(pos + array_offset, corner, &weights[0][0], gl);
+        #pragma unroll
+        for (int i = 0; i < 2; i++) {
+          auto const cx = corner[0] + i;
+          auto const wx = weights[0][i];
+          #pragma unroll
+          for (int j = 0; j < 2; j++) {
+            auto const cy = corner[1] + j;
+            auto const wxy = wx * weights[1][j];
+            #pragma unroll
+            for (int k = 0; k < 2; k++) {
+              auto const cz = corner[2] + k;
+              auto const weight = wxy * weights[2][k]; // product of the three per-dimension weights
+              {% for cf in range(D) -%}
+                atomicAdd(&vec.getNeighbor(cx, cy, cz, {{cf}}u),
+                          weight * forces[array_offset + {{cf}}u]);
+              {% endfor %}
+            }
+          }
+        }
+      }
+    }
+// LCOV_EXCL_STOP
+
+    /** @brief Size a grid of blocks_per_grid_y rows so that it holds at least @p threads_x threads. */
+    static dim3 calculate_dim_grid(uint const threads_x, uint const blocks_per_grid_y,
+                                   uint const threads_per_block) {
+      assert(threads_x >= 1u);
+      assert(blocks_per_grid_y >= 1u);
+      assert(threads_per_block >= 1u);
+      auto const threads_per_column = blocks_per_grid_y * threads_per_block; // threads added per unit of grid x
+      auto const n_blocks_x = (threads_x - 1 + threads_per_column) / threads_per_column; // rounded-up division
+      return make_uint3(n_blocks_x, blocks_per_grid_y, 1);
+    }
+
+    /** @brief Interpolate @p vec_field at the positions in @p pos (flat array, {{D}} coordinates each). */
+    std::vector< {{dtype}} >
+    get( gpu::GPUField< {{dtype}} > const *vec_field,
+         std::vector< {{dtype}} > const &pos,
+         uint gl )
+    {
+      auto const n_pos = static_cast<uint>(pos.size() / {{D}}ul);
+      if (n_pos == 0u) return std::vector< {{dtype}} >(pos.size()); // a grid with zero blocks cannot be launched
+      thrust::device_vector< {{dtype}} > dev_pos(pos.begin(), pos.end());
+      thrust::device_vector< {{dtype}} > dev_vel(pos.size()); // value-initialized; the kernel accumulates into it
+      auto const dev_pos_ptr = thrust::raw_pointer_cast(dev_pos.data());
+      auto const dev_vel_ptr = thrust::raw_pointer_cast(dev_vel.data());
+      auto const threads_per_block = uint(64u);
+      auto const dim_grid = calculate_dim_grid(n_pos, 4u, threads_per_block);
+      kernel_get<<<dim_grid, threads_per_block, 0u, nullptr>>>(
+          gpu::FieldIndexing< {{dtype}} >::withGhostLayerXYZ(*vec_field, gl).gpuAccess(),
+          dev_pos_ptr, dev_vel_ptr, n_pos, gl);
+
+      std::vector< {{dtype}} > out(pos.size());
+      thrust::copy(dev_vel.begin(), dev_vel.end(), out.data()); // device -> host
+      return out;
+    }
+
+    /** @brief Add @p forces ({{D}} components per position) to the cells around each position in @p pos. */
+    void set( gpu::GPUField< {{dtype}} > const *vec_field,
+              std::vector< {{dtype}} > const &pos,
+              std::vector< {{dtype}} > const &forces, uint gl )
+    {
+      auto const n_pos = static_cast<uint>(pos.size() / {{D}}ul);
+      assert(forces.size() / {{D}}ul >= n_pos); // the kernel reads {{D}} force components per position
+      if (n_pos == 0u) return; // a grid with zero blocks cannot be launched
+      thrust::device_vector< {{dtype}} > dev_pos(pos.begin(), pos.end());
+      thrust::device_vector< {{dtype}} > dev_for(forces.begin(), forces.end());
+      auto const dev_pos_ptr = thrust::raw_pointer_cast(dev_pos.data());
+      auto const dev_for_ptr = thrust::raw_pointer_cast(dev_for.data());
+      auto const threads_per_block = uint(64u);
+      auto const dim_grid = calculate_dim_grid(n_pos, 4u, threads_per_block);
+      kernel_set<<<dim_grid, threads_per_block, 0u, nullptr>>>(
+          gpu::FieldIndexing< {{dtype}} >::withGhostLayerXYZ(*vec_field, gl).gpuAccess(),
+          dev_pos_ptr, dev_for_ptr, n_pos, gl);
+    }
+} // namespace Interpolation
+
+namespace Equilibrium
+{
+// LCOV_EXCL_START
+    __device__ void kernel_set_device( // overwrite every population of the accessor's cell with a generated equilibrium term
+        gpu::FieldAccessor< {{dtype}} > pdf,
+        {{dtype}} const * RESTRICT const u, // NOTE(review): presumably read by the generated equilibrium terms - confirm
+        {{dtype}} rho ) // by value: adjusted locally below
+    {
+        {%if not compressible %}
+        rho -= {{dtype}}(1.0); // undoes the "+ 1" that the callers add for incompressible models
+        {%endif %}
+
+        {% for eqTerm in equilibrium -%}
+            pdf.get({{loop.index0 }}u) = {{eqTerm}};
+        {% endfor -%}
+    }
+// LCOV_EXCL_STOP
+} // namespace Equilibrium
+
+namespace Density
+{
+// LCOV_EXCL_START
+    __global__ void kernel_get( // compute the density of each indexed cell and store it in rho_out
+        gpu::FieldAccessor< {{dtype}} > pdf,
+        {{dtype}} * RESTRICT rho_out ) // out: one value per cell
+    {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 1u); // NOTE(review): presumably this thread's slot with 1 value per cell - confirm getLinearIndex
+        pdf.set( blockIdx, threadIdx );
+        rho_out += offset; // rho_out[0] is now this thread's slot
+        if (pdf.isValidPosition()) {
+            {% for i in range(Q) -%}
+                {{dtype}} const f_{{i}} = pdf.get({{i}}u);
+            {% endfor -%}
+            {{density_getters | indent(12)}}
+            rho_out[0u] = rho; // rho is defined by the generated density_getters code above
+        }
+    }
+
+    __global__ void kernel_set( // change the density of each indexed cell while keeping its current velocity
+        gpu::FieldAccessor< {{dtype}} > pdf,
+        {{dtype}} const * RESTRICT rho_in ) // in: one new density per cell
+    {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 1u); // NOTE(review): presumably this thread's slot with 1 value per cell - confirm getLinearIndex
+        pdf.set( blockIdx, threadIdx );
+        rho_in += offset; // rho_in[0] is now this thread's slot
+        if (pdf.isValidPosition()) {
+            {% for i in range(Q) -%}
+                {{dtype}} const f_{{i}} = pdf.get({{i}}u);
+            {% endfor -%}
+            {{unshifted_momentum_density_getter | indent(12)}}
+
+            // calculate current velocity (before density change)
+            {{dtype}} const rho_inv = {{dtype}} {1} / rho; // rho and momdensity_* come from the generated getter above
+            {{dtype}} const u_old[{{D}}] = { {% for i in range(D) %}momdensity_{{i}} * rho_inv{% if not loop.last %}, {% endif %}{% endfor %} };
+
+            Equilibrium::kernel_set_device(pdf, u_old, rho_in[0u] {%if not compressible %} + {{dtype}} {1} {%endif%}); // rewrites all populations of the cell
+        }
+    }
+// LCOV_EXCL_STOP
+
+    /** @brief Compute the density of a single cell on the device and return it. */
+    {{dtype}} get( gpu::GPUField< {{dtype}} > const * pdf_field,
+                   Cell const & cell )
+    {
+        thrust::device_vector< {{dtype}} > result(1u); // one-element device buffer
+        {{dtype}} * const result_ptr = thrust::raw_pointer_cast(result.data());
+        CellInterval const single_cell( cell, cell ); // interval spanning only `cell`
+        auto launcher = gpu::make_kernel( kernel_get );
+        launcher.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, single_cell ) );
+        launcher.addParam( result_ptr );
+        launcher();
+        // converting the device reference to {{dtype}} fetches the value from the device
+        return static_cast< {{dtype}} >(result[0u]);
+    }
+
+    /** @brief Compute the densities of all cells in @p ci, one value per cell. */
+    std::vector< {{dtype}} > get( gpu::GPUField< {{dtype}} > const * pdf_field,
+                                 CellInterval const & ci )
+    {
+        thrust::device_vector< {{dtype}} > densities(ci.numCells());
+        {{dtype}} * const densities_ptr = thrust::raw_pointer_cast(densities.data());
+        auto launcher = gpu::make_kernel( kernel_get );
+        launcher.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
+        launcher.addParam( densities_ptr );
+        launcher();
+        std::vector< {{dtype}} > host_values(densities.size());
+        thrust::copy(densities.begin(), densities.end(), host_values.begin()); // device -> host
+        return host_values;
+    }
+
+    /** @brief Set the density of a single cell; the kernel rewrites the cell's populations. */
+    void set( gpu::GPUField< {{dtype}} > * pdf_field,
+              const {{dtype}} rho,
+              Cell const & cell )
+    {
+        thrust::device_vector< {{dtype}} > staged(1u, rho); // one-element device buffer holding rho
+        {{dtype}} const * const staged_ptr = thrust::raw_pointer_cast(staged.data());
+        CellInterval const single_cell( cell, cell ); // interval spanning only `cell`
+        auto launcher = gpu::make_kernel( kernel_set );
+        launcher.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, single_cell ) );
+        launcher.addParam( staged_ptr );
+        launcher();
+    }
+
+    /** @brief Set the densities of all cells in @p ci from @p values (one entry per cell). */
+    void set( gpu::GPUField< {{dtype}} > * pdf_field,
+              std::vector< {{dtype}} > const & values,
+              CellInterval const & ci )
+    {
+        assert(values.size() == ci.numCells()); // a shorter buffer would be read out of bounds on the device
+        thrust::device_vector< {{dtype}} > dev_data(values.begin(), values.end()); // host -> device
+        auto kernel = gpu::make_kernel( kernel_set );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
+        kernel.addParam( const_cast<const {{dtype}} *>(thrust::raw_pointer_cast(dev_data.data())) );
+        kernel();
+    }
+} // namespace Density
+
+namespace Velocity
+{
+// LCOV_EXCL_START
+    __global__ void kernel_get( // compute the velocity of each indexed cell and store it in u_out
+        gpu::FieldAccessor< {{dtype}} > pdf,
+        gpu::FieldAccessor< {{dtype}} > force,
+        {{dtype}} * RESTRICT u_out ) // out: {{D}} components per cell
+    {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{D}}u); // NOTE(review): presumably this thread's slot with {{D}} values per cell - confirm getLinearIndex
+        pdf.set( blockIdx, threadIdx );
+        force.set( blockIdx, threadIdx );
+        u_out += offset; // u_out[0] is now this thread's first component
+        if (pdf.isValidPosition()) {
+            {% for i in range(Q) -%}
+                {{dtype}} const f_{{i}} = pdf.get({{i}}u);
+            {% endfor -%}
+            {{momentum_density_getter | substitute_force_getter_cu | indent(8) }}
+            auto const rho_inv = {{dtype}} {1} / rho; // rho and md_* come from the generated getter above
+            {% for i in range(D) -%}
+                u_out[{{i}}u] = md_{{i}} * rho_inv;
+            {% endfor %}
+        }
+    }
+
+    __global__ void kernel_set( // set the velocity of each indexed cell and rewrite its populations accordingly
+        gpu::FieldAccessor< {{dtype}} > pdf,
+        gpu::FieldAccessor< {{dtype}} > velocity,
+        gpu::FieldAccessor< {{dtype}} > force,
+        {{dtype}} const * RESTRICT u_in ) // in: {{D}} components per cell
+    {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{D}}u); // NOTE(review): presumably this thread's slot with {{D}} values per cell - confirm getLinearIndex
+        pdf.set( blockIdx, threadIdx );
+        velocity.set( blockIdx, threadIdx );
+        force.set( blockIdx, threadIdx );
+        u_in += offset; // u_in[0] is now this thread's first component
+        if (pdf.isValidPosition()) {
+            {% for i in range(Q) -%}
+                {{dtype}} const f_{{i}} = pdf.get({{i}}u);
+            {% endfor -%}
+            {{dtype}} const * RESTRICT const u = u_in; // NOTE(review): alias presumably referenced by the generated setter code below - confirm
+            {{density_getters | indent(8)}}
+            {{density_velocity_setter_macroscopic_values | substitute_force_getter_cu | indent(8)}}
+            {% for i in range(D) -%}
+                velocity.get({{i}}u) = u_in[{{i}}u];
+            {% endfor %}
+            {{dtype}} u_new[{{D}}] = { {% for i in range(D) %}u_{{i}}{% if not loop.last %}, {% endif %}{% endfor %} }; // u_* are defined by the generated code above
+
+            Equilibrium::kernel_set_device(pdf, u_new, rho {%if not compressible %} + {{dtype}}(1) {%endif%}); // rewrites all populations of the cell
+        }
+    }
+// LCOV_EXCL_STOP
+
+    /** @brief Compute the velocity of a single cell from the PDF and force fields. */
+    Vector{{D}}< {{dtype}} > get( gpu::GPUField< {{dtype}} > const * pdf_field,
+                                 gpu::GPUField< {{dtype}} > const * force_field,
+                                 Cell const & cell )
+    {
+        thrust::device_vector< {{dtype}} > result({{D}}u); // one slot per velocity component
+        {{dtype}} * const result_ptr = thrust::raw_pointer_cast(result.data());
+        CellInterval const single_cell( cell, cell ); // interval spanning only `cell`
+        auto launcher = gpu::make_kernel( kernel_get );
+        launcher.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, single_cell ) );
+        launcher.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *force_field, single_cell ) );
+        launcher.addParam( result_ptr );
+        launcher();
+        Vector{{D}}< {{dtype}} > velocity;
+        thrust::copy(result.begin(), result.end(), velocity.data()); // device -> host
+        return velocity;
+    }
+
+    /** @brief Compute the velocities of all cells in @p ci; returns a flat array with {{D}} entries per cell. */
+    std::vector< {{dtype}} > get( gpu::GPUField< {{dtype}} > const * pdf_field,
+                                 gpu::GPUField< {{dtype}} > const * force_field,
+                                 CellInterval const & ci )
+    {
+        thrust::device_vector< {{dtype}} > dev_data(ci.numCells() * {{D}}u); // the kernel writes {{D}} values for every cell
+        auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+        auto kernel = gpu::make_kernel( kernel_get );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *force_field, ci ) );
+        kernel.addParam( dev_data_ptr );
+        kernel();
+        std::vector< {{dtype}} > out(dev_data.size());
+        thrust::copy(dev_data.begin(), dev_data.end(), out.data()); // device -> host
+        return out;
+    }
+
+    /** @brief Set the velocity of a single cell; the kernel updates the velocity field and the populations. */
+    void set( gpu::GPUField< {{dtype}} > * pdf_field,
+              gpu::GPUField< {{dtype}} > * velocity_field,
+              gpu::GPUField< {{dtype}} > const * force_field,
+              Vector{{D}}< {{dtype}} > const & u,
+              Cell const & cell )
+    {
+        thrust::device_vector< {{dtype}} > staged(u.data(), u.data() + {{D}}u); // device copy of u
+        {{dtype}} const * const staged_ptr = thrust::raw_pointer_cast(staged.data());
+        CellInterval const single_cell( cell, cell ); // interval spanning only `cell`
+        auto launcher = gpu::make_kernel( kernel_set );
+        launcher.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, single_cell ) );
+        launcher.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *velocity_field, single_cell ) );
+        launcher.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *force_field, single_cell ) );
+        launcher.addParam( staged_ptr );
+        launcher();
+    }
+
+    /** @brief Set the velocities of all cells in @p ci from @p values (flat array, {{D}} entries per cell). */
+    void set( gpu::GPUField< {{dtype}} > * pdf_field,
+              gpu::GPUField< {{dtype}} > * velocity_field,
+              gpu::GPUField< {{dtype}} > const * force_field,
+              std::vector< {{dtype}} > const & values,
+              CellInterval const & ci )
+    {
+        assert(values.size() == ci.numCells() * {{D}}u); // a shorter buffer would be read out of bounds on the device
+        thrust::device_vector< {{dtype}} > dev_data(values.begin(), values.end()); // host -> device
+        auto kernel = gpu::make_kernel( kernel_set );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *velocity_field, ci ) );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *force_field, ci ) );
+        kernel.addParam( const_cast<const {{dtype}} *>(thrust::raw_pointer_cast(dev_data.data())) );
+        kernel();
+    }
+} // namespace Velocity
+
+namespace Force {
+// LCOV_EXCL_START
+    __global__ void kernel_set( // store a new force in each indexed cell and refresh the cell's velocity
+        gpu::FieldAccessor< {{dtype}} > pdf,
+        gpu::FieldAccessor< {{dtype}} > velocity,
+        gpu::FieldAccessor< {{dtype}} > force,
+        {{dtype}} const * RESTRICT f_in ) // in: {{D}} force components per cell
+    {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{D}}u); // NOTE(review): presumably this thread's slot with {{D}} values per cell - confirm getLinearIndex
+        pdf.set( blockIdx, threadIdx );
+        velocity.set( blockIdx, threadIdx );
+        force.set( blockIdx, threadIdx );
+        f_in += offset; // f_in[0] is now this thread's first component
+        if (pdf.isValidPosition()) {
+            {% for i in range(Q) -%}
+                {{dtype}} const f_{{i}} = pdf.get({{i}}u);
+            {% endfor -%}
+
+            {{momentum_density_getter | substitute_force_getter_pattern("force->get\(x, ?y, ?z, ?([0-9])u?\)", "f_in[\g<1>u]") | indent(8) }}
+            auto const rho_inv = {{dtype}} {1} / rho; // the getter above was rewritten to read the force from f_in instead of the force field
+
+            {% for i in range(D) -%}
+                force.get({{i}}u) = f_in[{{i}}u];
+            {% endfor %}
+
+            {% for i in range(D) -%}
+                velocity.get({{i}}u) = md_{{i}} * rho_inv;
+            {% endfor %}
+        }
+    }
+// LCOV_EXCL_STOP
+
+    /** @brief Store the vector @p u as the force of a single cell; the kernel also refreshes the cell's velocity. */
+    void set( gpu::GPUField< {{dtype}} > const * pdf_field,
+              gpu::GPUField< {{dtype}} > * velocity_field,
+              gpu::GPUField< {{dtype}} > * force_field,
+              Vector{{D}}< {{dtype}} > const & u,
+              Cell const & cell )
+    {
+        thrust::device_vector< {{dtype}} > staged(u.data(), u.data() + {{D}}u); // device copy of u
+        {{dtype}} const * const staged_ptr = thrust::raw_pointer_cast(staged.data());
+        CellInterval const single_cell( cell, cell ); // interval spanning only `cell`
+        auto launcher = gpu::make_kernel( kernel_set );
+        launcher.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, single_cell ) );
+        launcher.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *velocity_field, single_cell ) );
+        launcher.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *force_field, single_cell ) );
+        launcher.addParam( staged_ptr );
+        launcher();
+    }
+
+    /** @brief Store forces for all cells in @p ci from @p values (flat array, {{D}} entries per cell). */
+    void set( gpu::GPUField< {{dtype}} > const * pdf_field,
+              gpu::GPUField< {{dtype}} > * velocity_field,
+              gpu::GPUField< {{dtype}} > * force_field,
+              std::vector< {{dtype}} > const & values,
+              CellInterval const & ci )
+    {
+        assert(values.size() == ci.numCells() * {{D}}u); // a shorter buffer would be read out of bounds on the device
+        thrust::device_vector< {{dtype}} > dev_data(values.begin(), values.end()); // host -> device
+        auto kernel = gpu::make_kernel( kernel_set );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *velocity_field, ci ) );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *force_field, ci ) );
+        kernel.addParam( const_cast<const {{dtype}} *>(thrust::raw_pointer_cast(dev_data.data())) );
+        kernel();
+    }
+} // namespace Force
+
+namespace MomentumDensity
+{
+// LCOV_EXCL_START
+    __global__ void kernel_sum( // add the momentum density of each indexed cell to out
+        gpu::FieldAccessor< {{dtype}} > pdf,
+        gpu::FieldAccessor< {{dtype}} > force,
+        {{dtype}} * RESTRICT out ) // in/out: accumulator, {{D}} components per cell slot
+    {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{D}}u); // NOTE(review): presumably this thread's slot with {{D}} values per cell - confirm getLinearIndex
+        pdf.set( blockIdx, threadIdx );
+        force.set( blockIdx, threadIdx );
+        out += offset;
+        if (pdf.isValidPosition()) {
+            {% for i in range(Q) -%}
+                {{dtype}} const f_{{i}} = pdf.get({{i}}u);
+            {% endfor -%}
+            {{momentum_density_getter | substitute_force_getter_cu | indent(8) }}
+            {% for i in range(D) -%}
+                out[{{i}}u] += md_{{i}};
+            {% endfor %}
+        }
+    }
+// LCOV_EXCL_STOP
+
+    Vector{{D}}< {{dtype}} > reduce( // sum the momentum density over the cells visited by WALBERLA_FOR_ALL_CELLS_XYZ
+        gpu::GPUField< {{dtype}} > const * pdf_field,
+        gpu::GPUField< {{dtype}} > const * force_field )
+    {
+        thrust::device_vector< {{dtype}} > dev_data({{D}}u, {{dtype}} {0}); // accumulator, starts at zero
+        auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+        WALBERLA_FOR_ALL_CELLS_XYZ(pdf_field, {
+            Cell cell(x, y, z);
+            CellInterval ci ( cell, cell ); // one single-cell launch per cell: every launch adds into the same {{D}} slots
+            auto kernel = gpu::make_kernel( kernel_sum );
+            kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
+            kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *force_field, ci ) );
+            kernel.addParam( dev_data_ptr );
+            kernel();
+        });
+        Vector{{D}}< {{dtype}} > mom({{dtype}} {0});
+        thrust::copy(dev_data.begin(), dev_data.begin() + {{D}}u, mom.data()); // device -> host
+        return mom;
+    }
+} // namespace MomentumDensity
+
+namespace PressureTensor
+{
+// LCOV_EXCL_START
+    __global__ void kernel_get( // compute the {{D**2}} tensor entries of each indexed cell and store them in p_out
+        gpu::FieldAccessor< {{dtype}} > pdf,
+        {{dtype}} * RESTRICT p_out ) // out: {{D**2}} values per cell
+    {
+        auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, {{D**2}}u); // NOTE(review): presumably this thread's slot with {{D**2}} values per cell - confirm getLinearIndex
+        pdf.set( blockIdx, threadIdx );
+        p_out += offset; // p_out[0] is now this thread's first entry
+        if (pdf.isValidPosition()) {
+            {% for i in range(Q) -%}
+                {{dtype}} const f_{{i}} = pdf.get({{i}}u);
+            {% endfor -%}
+            {{second_momentum_getter | indent(12) }}
+            {% for i in range(D**2) -%}
+                p_out[{{i}}u] = p_{{i}};
+            {% endfor %}
+        }
+    }
+// LCOV_EXCL_STOP
+
+    /** @brief Compute the {{D**2}} tensor entries of a single cell and return them as a matrix. */
+    Matrix{{D}}< {{dtype}} > get( gpu::GPUField< {{dtype}} > const * pdf_field,
+                                 Cell const & cell )
+    {
+        thrust::device_vector< {{dtype}} > result({{D**2}}u); // one slot per tensor entry
+        {{dtype}} * const result_ptr = thrust::raw_pointer_cast(result.data());
+        CellInterval const single_cell( cell, cell ); // interval spanning only `cell`
+        auto launcher = gpu::make_kernel( kernel_get );
+        launcher.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, single_cell ) );
+        launcher.addParam( result_ptr );
+        launcher();
+        Matrix{{D}}< {{dtype}} > tensor;
+        thrust::copy(result.begin(), result.end(), tensor.data()); // device -> host
+        return tensor;
+    }
+
+    /** @brief Compute the tensors of all cells in @p ci; returns a flat array with {{D**2}} entries per cell. */
+    std::vector< {{dtype}} > get( gpu::GPUField< {{dtype}} > const * pdf_field,
+                                 CellInterval const & ci )
+    {
+        thrust::device_vector< {{dtype}} > gathered({{D**2}}u * ci.numCells());
+        {{dtype}} * const gathered_ptr = thrust::raw_pointer_cast(gathered.data());
+        auto launcher = gpu::make_kernel( kernel_get );
+        launcher.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
+        launcher.addParam( gathered_ptr );
+        launcher();
+        std::vector< {{dtype}} > host_values(gathered.size());
+        thrust::copy(gathered.begin(), gathered.end(), host_values.data()); // device -> host
+        return host_values;
+    }
+} // namespace PressureTensor
+
+
+} // namespace accessor
+} // namespace {{namespace}}
+} // namespace walberla
diff --git a/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.cuh b/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.cuh
new file mode 100644
index 00000000000..65f776abee3
--- /dev/null
+++ b/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.cuh
@@ -0,0 +1,220 @@
+/*
+ * Copyright (C) 2023-2024 The ESPResSo project
+ * Copyright (C) 2020 The waLBerla project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ * Lattice field accessors.
+ * Adapted from the waLBerla source file
+ * https://i10git.cs.fau.de/walberla/walberla/-/blob/a16141524c58ab88386e2a0f8fdd7c63c5edd704/python/lbmpy_walberla/templates/LatticeModel.tmpl.h
+ */
+
+#pragma once
+
+#include <core/DataTypes.h>
+#include <core/cell/Cell.h>
+#include <core/cell/CellInterval.h>
+#include <core/math/Matrix{{D}}.h>
+#include <core/math/Vector{{D}}.h>
+
+#include <gpu/GPUField.h>
+
+#include <array>
+#include <tuple>
+#include <vector>
+
+namespace walberla {
+namespace {{namespace}} {
+namespace accessor {
+
+namespace Population { // accessors for the PDF field; single-cell overloads exchange all {{Q}} populations at once
+    /** @brief Get populations from a single cell. */
+    std::array<{{dtype}}, {{Q}}u>
+    get( gpu::GPUField< {{dtype}} > const * pdf_field,
+         Cell const & cell );
+    /** @brief Set populations on a single cell. */
+    void
+    set( gpu::GPUField< {{dtype}} > * pdf_field,
+         std::array< {{dtype}}, {{Q}}u > const & pop,
+         Cell const & cell );
+    /** @brief Set populations and recalculate velocities on a single cell. */
+    void
+    set( gpu::GPUField< {{dtype}} > * pdf_field,
+         gpu::GPUField< {{dtype}} > * velocity_field,
+         gpu::GPUField< {{dtype}} > const * force_field,
+         std::array< {{dtype}}, {{Q}}u > const & pop,
+         Cell const & cell );
+    /** @brief Initialize all cells with the same value. */
+    void initialize(
+         gpu::GPUField< {{dtype}} > * pdf_field,
+         std::array< {{dtype}}, {{Q}}u > const & pop );
+    /** @brief Get populations from a cell interval. */
+    std::vector< {{dtype}} > // NOTE(review): presumably a flat array with {{Q}} entries per cell - confirm against the definition
+    get( gpu::GPUField< {{dtype}} > const * pdf_field,
+         CellInterval const & ci );
+    /** @brief Set populations on a cell interval. */
+    void
+    set( gpu::GPUField< {{dtype}} > * pdf_field,
+         std::vector< {{dtype}} > const & values, // NOTE(review): presumably {{Q}} entries per cell - confirm against the definition
+         CellInterval const & ci );
+    /** @brief Set populations and recalculate velocities on a cell interval. */
+    void
+    set( gpu::GPUField< {{dtype}} > * pdf_field,
+         gpu::GPUField< {{dtype}} > * velocity_field,
+         gpu::GPUField< {{dtype}} > const * force_field,
+         std::vector< {{dtype}} > const & values,
+         CellInterval const & ci );
+} // namespace Population
+
+namespace Vector { // accessors for fields storing one {{D}}-component vector per cell
+    /** @brief Get value from a single cell. */
+    Vector{{D}}< {{dtype}} >
+    get( gpu::GPUField< {{dtype}} > const * field,
+         Cell const & cell );
+    /** @brief Set value on a single cell. */
+    void set( gpu::GPUField< {{dtype}} > * field,
+              Vector{{D}}< {{dtype}} > const & vec,
+              Cell const & cell );
+    /** @brief Add value to a single cell. */
+    void add( gpu::GPUField< {{dtype}} > * field,
+              Vector{{D}}< {{dtype}} > const & vec,
+              Cell const & cell );
+    /** @brief Initialize all cells with the same value. */
+    void initialize( gpu::GPUField< {{dtype}} > * field, // the definition covers the interval from xyzSizeWithGhostLayer()
+                    Vector{{D}}< {{dtype}} > const & vec);
+    /** @brief Add value to all cells. */
+    void add_to_all( gpu::GPUField< {{dtype}} > * field, // the definition covers the interval from xyzSizeWithGhostLayer()
+                     Vector{{D}}< {{dtype}} > const & vec);
+    /** @brief Get values from a cell interval. */
+    std::vector< {{dtype}} > // flat array sized ci.numCells() * {{D}}
+    get( gpu::GPUField< {{dtype}} > const * vec_field,
+         CellInterval const & ci);
+    /** @brief Set values on a cell interval. */
+    void
+    set( gpu::GPUField< {{dtype}} > * vec_field,
+         std::vector< {{dtype}} > const & values, // uploaded to the device as-is; no size check in the definition
+         CellInterval const & ci );
+
+} // namespace Vector
+
+namespace Interpolation { // transfer between off-lattice positions and the 2x2x2 cells around them
+    std::vector< {{dtype}} > // interpolated field values; same length as pos
+    get( gpu::GPUField< {{dtype}} > const *vec_field,
+         std::vector< {{dtype}} > const &pos, // flat array, {{D}} coordinates per position
+         uint gl ); // passed to withGhostLayerXYZ() and to the weight calculation
+    void // adds the weighted forces to the field with atomicAdd, despite the const field pointer
+    set( gpu::GPUField< {{dtype}} > const *vec_field,
+         std::vector< {{dtype}} > const &pos, // flat array, {{D}} coordinates per position
+         std::vector< {{dtype}} > const &forces, // flat array, {{D}} components per position
+         uint gl ); // passed to withGhostLayerXYZ() and to the weight calculation
+} // namespace Interpolation
+
+namespace Density { // density accessors operating on the PDF field
+    {{dtype}} // density of a single cell
+    get( gpu::GPUField< {{dtype}} > const * pdf_field,
+         Cell const & cell );
+    void // sets the density of a single cell; its populations are rewritten using the cell's current velocity
+    set( gpu::GPUField< {{dtype}} > * pdf_field,
+         {{dtype}} const rho,
+         Cell const & cell );
+    std::vector< {{dtype}} > // one density per cell of ci
+    get( gpu::GPUField< {{dtype}} > const * pdf_field,
+         CellInterval const & ci );
+    void // sets the densities of all cells in ci
+    set( gpu::GPUField< {{dtype}} > * pdf_field,
+         std::vector< {{dtype}} > const & values, // one density per cell of ci
+         CellInterval const & ci );
+} // namespace Density
+
+namespace Velocity { // velocity accessors operating on the PDF, velocity and force fields
+    Vector{{D}}< {{dtype}} > // velocity of a single cell, computed from the PDF and force fields
+    get( gpu::GPUField< {{dtype}} > const * pdf_field,
+         gpu::GPUField< {{dtype}} > const * force_field,
+         Cell const & cell );
+    std::vector< {{dtype}} > // velocities of the cells in ci as a flat array
+    get( gpu::GPUField< {{dtype}} > const * pdf_field,
+         gpu::GPUField< {{dtype}} > const * force_field,
+         CellInterval const & ci );
+    void // sets the velocity of a single cell; writes the velocity field and rewrites the populations
+    set( gpu::GPUField< {{dtype}} > * pdf_field,
+         gpu::GPUField< {{dtype}} > * velocity_field,
+         gpu::GPUField< {{dtype}} > const * force_field,
+         Vector{{D}}< {{dtype}} > const & u,
+         Cell const & cell );
+    void // sets the velocities of all cells in ci
+    set( gpu::GPUField< {{dtype}} > * pdf_field,
+         gpu::GPUField< {{dtype}} > * velocity_field,
+         gpu::GPUField< {{dtype}} > const * force_field,
+         std::vector< {{dtype}} > const & values, // flat array, {{D}} components per cell
+         CellInterval const & ci );
+} // namespace Velocity
+
+namespace Force { // force setters; each call also refreshes the velocity field of the touched cells
+    void // stores a force on a single cell
+    set( gpu::GPUField< {{dtype}} > const * pdf_field,
+         gpu::GPUField< {{dtype}} > * velocity_field,
+         gpu::GPUField< {{dtype}} > * force_field,
+         Vector{{D}}< {{dtype}} > const & u, // the force vector to store (written to the force field by the definition)
+         Cell const & cell );
+    void // stores forces on all cells of ci
+    set( gpu::GPUField< {{dtype}} > const * pdf_field,
+         gpu::GPUField< {{dtype}} > * velocity_field,
+         gpu::GPUField< {{dtype}} > * force_field,
+         std::vector< {{dtype}} > const & values, // flat array, {{D}} components per cell
+         CellInterval const & ci );
+} // namespace Force
+
+namespace DensityAndVelocity { // combined density/velocity access on a single cell
+    std::tuple< {{dtype}} , Vector{{D}}< {{dtype}} > > // NOTE(review): presumably (density, velocity) - confirm against the definition
+    get( gpu::GPUField< {{dtype}} > const * pdf_field,
+         gpu::GPUField< {{dtype}} > const * force_field,
+         Cell const & cell );
+    void // NOTE(review): presumably sets both density and velocity of the cell - confirm against the definition
+    set( gpu::GPUField< {{dtype}} > * pdf_field,
+         gpu::GPUField< {{dtype}} > * force_field,
+         Vector{{D}}< {{dtype}} > const & u,
+         {{dtype}} const rho,
+         Cell const & cell );
+} // namespace DensityAndVelocity
+
+namespace DensityAndMomentumDensity { // combined density/momentum density access on a single cell
+    std::tuple< {{dtype}} , Vector{{D}}< {{dtype}} > > // NOTE(review): presumably (density, momentum density) - confirm against the definition
+    get( gpu::GPUField< {{dtype}} > const * pdf_field,
+         gpu::GPUField< {{dtype}} > const * force_field,
+         Cell const & cell );
+} // namespace DensityAndMomentumDensity
+
+namespace MomentumDensity { // field-wide momentum density reduction
+    Vector{{D}}< {{dtype}} > // sum of the per-cell momentum densities
+    reduce( gpu::GPUField< {{dtype}} > const * pdf_field,
+            gpu::GPUField< {{dtype}} > const * force_field );
+} // namespace MomentumDensity
+
+namespace PressureTensor { // pressure tensor getters operating on the PDF field
+    Matrix{{D}}< {{dtype}} > // tensor of a single cell
+    get( gpu::GPUField< {{dtype}} > const * pdf_field,
+         Cell const & cell );
+    std::vector< {{dtype}} > // flat array, {{D**2}} entries per cell of ci
+    get( gpu::GPUField< {{dtype}} > const * pdf_field,
+         CellInterval const & ci );
+} // namespace PressureTensor
+
+} // namespace accessor
+} // namespace {{namespace}}
+} // namespace walberla
diff --git a/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.h b/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.h
index 37e1edcf9cd..d443243bbab 100644
--- a/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.h
+++ b/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.h
@@ -37,19 +37,18 @@
 
 #include <array>
 #include <cassert>
+#include <iterator>
 #include <tuple>
 #include <vector>
 
 #ifdef WALBERLA_CXX_COMPILER_IS_GNU
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-variable"
-#pragma GCC diagnostic ignored "-Wunused-parameter"
 #endif
 
 #ifdef WALBERLA_CXX_COMPILER_IS_CLANG
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wunused-variable"
-#pragma clang diagnostic ignored "-Wunused-parameter"
 #endif
 
 namespace walberla {
@@ -58,7 +57,7 @@ namespace accessor {
 
 namespace Population
 {
-    inline std::array<{{dtype}}, {{Q}}u>
+    inline auto
     get( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > const * pdf_field,
          Cell const & cell )
     {
@@ -82,8 +81,30 @@ namespace Population
     }
 
     inline void
-    broadcast( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > * pdf_field,
-               std::array<{{dtype}}, {{Q}}u> const & pop)
+    set( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > * pdf_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > * velocity_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > const * force_field,
+         std::array<{{dtype}}, {{Q}}u> const & pop,
+         Cell const & cell )
+    {
+        auto & xyz0 = pdf_field->get(cell, uint_t{ 0u });
+        {% for i in range(Q) -%}
+            const {{dtype}} f_{{i}} = pdf_field->getF( &xyz0, uint_t{ {{i}}u }) = pop[{{i}}u];
+        {% endfor -%}
+
+        {% for c in "xyz" -%}
+            const auto {{c}} = cell.{{c}}();
+        {% endfor -%}
+        {{momentum_density_getter | substitute_force_getter_cpp | indent(8) }}
+        const auto rho_inv = {{dtype}} {1} / rho;
+        {% for i in range(D) -%}
+            velocity_field->get(cell, uint_t{ {{i}}u }) = md_{{i}} * rho_inv;
+        {% endfor -%}
+    }
+
+    inline void
+    initialize( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > * pdf_field,
+                std::array<{{dtype}}, {{Q}}u> const & pop)
      {
          WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(pdf_field, {
              {{dtype}} & xyz0 = pdf_field->get(x, y, z, uint_t{ 0u });
@@ -93,7 +114,7 @@ namespace Population
          });
      }
 
-    inline std::vector< {{dtype}} >
+    inline auto
     get( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > const * pdf_field,
          CellInterval const & ci )
     {
@@ -118,15 +139,42 @@ namespace Population
          CellInterval const & ci )
     {
         assert(uint_c(values.size()) == ci.numCells() * uint_t({{Q}}u));
-        auto values_ptr = values.data();
+        auto pop = values.data();
+        for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
+            for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+                for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+                    {{dtype}} & xyz0 = pdf_field->get(x, y, z, uint_t{ 0u });
+                    {% for i in range(Q) -%}
+                        pdf_field->getF( &xyz0, uint_t{ {{i}}u }) = pop[{{i}}u];
+                    {% endfor -%}
+                    std::advance(pop, {{Q}});
+                }
+            }
+        }
+    }
+
+    inline void
+    set( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > * pdf_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > * velocity_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > const * force_field,
+         std::vector< {{dtype}} > const & values,
+         CellInterval const & ci )
+    {
+        assert(uint_c(values.size()) == ci.numCells() * uint_t({{Q}}u));
+        auto pop = values.data();
         for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
             for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
                 for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
                     {{dtype}} & xyz0 = pdf_field->get(x, y, z, uint_t{ 0u });
                     {% for i in range(Q) -%}
-                        pdf_field->getF( &xyz0, uint_t{ {{i}}u }) = values_ptr[{{i}}u];
+                        const {{dtype}} f_{{i}} = pdf_field->getF( &xyz0, uint_t{ {{i}}u }) = pop[{{i}}u];
+                    {% endfor -%}
+                    {{momentum_density_getter | substitute_force_getter_cpp | indent(12) }}
+                    const auto rho_inv = {{dtype}} {1} / rho;
+                    {% for i in range(D) -%}
+                        velocity_field->get(x, y, z, uint_t{ {{i}}u }) = md_{{i}} * rho_inv;
                     {% endfor -%}
-                    values_ptr += {{Q}}u;
+                    std::advance(pop, {{Q}});
                 }
             }
         }
@@ -135,7 +183,7 @@ namespace Population
 
 namespace Vector
 {
-    inline Vector{{D}}< {{dtype}} >
+    inline auto
     get( GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > const * vec_field,
          Cell const & cell )
     {
@@ -170,8 +218,8 @@ namespace Vector
     }
 
     inline void
-    broadcast( GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > * vec_field,
-               Vector{{D}}< {{dtype}} > const & vec)
+    initialize( GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > * vec_field,
+                Vector{{D}}< {{dtype}} > const & vec)
      {
          WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(vec_field, {
              {{dtype}} & xyz0 = vec_field->get(x, y, z, uint_t{ 0u });
@@ -193,7 +241,7 @@ namespace Vector
          });
      }
 
-    inline std::vector< {{dtype}} >
+    inline auto
     get( GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > const * vec_field,
          CellInterval const & ci )
     {
@@ -226,7 +274,7 @@ namespace Vector
                     {% for i in range(D) -%}
                         vec_field->getF( &xyz0, uint_t{ {{i}}u }) = values_ptr[{{i}}u];
                     {% endfor -%}
-                    values_ptr += {{D}}u;
+                    std::advance(values_ptr, {{D}});
                 }
             }
         }
@@ -237,11 +285,11 @@ namespace EquilibriumDistribution
 {
     inline {{dtype}}
     get( stencil::Direction const direction,
-         Vector{{D}}< {{dtype}} > const & u = Vector{{D}}< {{dtype}} >( {{dtype}}(0.0) ),
-         {{dtype}} rho = {{dtype}}(1.0) )
+         Vector{{D}}< {{dtype}} > const & u = Vector{{D}}< {{dtype}} >( {{dtype}} {0} ),
+         {{dtype}} rho = {{dtype}} {1} )
     {
         {% if not compressible %}
-        rho -= {{dtype}}(1.0);
+        rho -= {{dtype}} {1};
         {% endif %}
         {{equilibrium_from_direction}}
     }
@@ -256,7 +304,7 @@ namespace Equilibrium
          Cell const & cell )
     {
         {%if not compressible %}
-        rho -= {{dtype}}(1.0);
+        rho -= {{dtype}} {1};
         {%endif %}
 
         {{dtype}} & xyz0 = pdf_field->get(cell, uint_t{ 0u });
@@ -293,13 +341,13 @@ namespace Density
         {{unshifted_momentum_density_getter | indent(8)}}
 
         // calculate current velocity (before density change)
-        const {{dtype}} conversion = {{dtype}}(1) / rho;
+        const {{dtype}} conversion = {{dtype}} {1} / rho;
         Vector{{D}}< {{dtype}} > velocity;
         {% for i in range(D) -%}
             velocity[{{i}}u] = momdensity_{{i}} * conversion;
         {% endfor %}
 
-        Equilibrium::set(pdf_field, velocity, rho_in {%if not compressible %} + {{dtype}}(1) {%endif%}, cell);
+        Equilibrium::set(pdf_field, velocity, rho_in {%if not compressible %} + {{dtype}} {1} {%endif%}, cell);
     }
 
     inline std::vector< {{dtype}} >
@@ -341,13 +389,13 @@ namespace Density
                     {{unshifted_momentum_density_getter | indent(12)}}
 
                     // calculate current velocity (before density change)
-                    const {{dtype}} conversion = {{dtype}}(1) / rho;
+                    const {{dtype}} conversion = {{dtype}} {1} / rho;
                     Vector{{D}}< {{dtype}} > velocity;
                     {% for i in range(D) -%}
                         velocity[{{i}}u] = momdensity_{{i}} * conversion;
                     {% endfor %}
 
-                    Equilibrium::set(pdf_field, velocity, *values_it {%if not compressible %} + {{dtype}}(1) {%endif%}, Cell{x, y, z});
+                    Equilibrium::set(pdf_field, velocity, *values_it {%if not compressible %} + {{dtype}} {1} {%endif%}, Cell{x, y, z});
                     ++values_it;
                 }
             }
@@ -357,8 +405,53 @@ namespace Density
 
 namespace Velocity
 {
+    inline auto
+    get( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > const * pdf_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > const * force_field,
+         Cell const & cell )
+    {
+        const {{dtype}} & xyz0 = pdf_field->get(cell, uint_t{ 0u });
+        {% for i in range(Q) -%}
+            const {{dtype}} f_{{i}} = pdf_field->getF( &xyz0, uint_t{ {{i}}u });
+        {% endfor -%}
+
+        {% for c in "xyz" -%}
+            const auto {{c}} = cell.{{c}}();
+        {% endfor -%}
+        {{momentum_density_getter | substitute_force_getter_cpp | indent(8) }}
+        const {{dtype}} rho_inv = {{dtype}} {1} / rho;
+        // scale every md_i component by 1/rho; one constructor argument per dimension
+        return Vector{{D}}<{{dtype}}>({% for i in range(D) %}md_{{i}} * rho_inv{% if not loop.last %}, {% endif %}{% endfor %});
+    }
+
+    inline auto
+    get( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > const * pdf_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > const * force_field,
+         CellInterval const & ci )
+    {
+        std::vector< {{dtype}} > out;
+        out.reserve(ci.numCells() * uint_t({{D}}u));
+        for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
+            for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+                for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+                    const {{dtype}} & xyz0 = pdf_field->get(x, y, z, uint_t{ 0u });
+                    {% for i in range(Q) -%}
+                        const {{dtype}} f_{{i}} = pdf_field->getF( &xyz0, uint_t{ {{i}}u });
+                    {% endfor -%}
+                    {{momentum_density_getter | substitute_force_getter_cpp | indent(12) }}
+                    const {{dtype}} rho_inv = {{dtype}} {1} / rho;
+                    {% for i in range(D) -%}
+                        out.emplace_back(md_{{i}} * rho_inv);
+                    {% endfor -%}
+                }
+            }
+        }
+        return out;
+    }
+
     inline void
     set( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > * pdf_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > * velocity_field,
          GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > const * force_field,
          Vector{{D}}< {{dtype}} > const & u,
          Cell const & cell )
@@ -373,14 +466,113 @@ namespace Velocity
             const auto {{c}} = cell.{{c}}();
         {% endfor -%}
         {{density_velocity_setter_macroscopic_values | substitute_force_getter_cpp | indent(8)}}
+        {% for i in range(D) -%}
+            velocity_field->get(x, y, z, uint_t{ {{i}}u }) = u[{{i}}u];
+        {% endfor %}
+
+        Equilibrium::set(pdf_field, Vector{{D}}<{{dtype}}>({% for i in range(D) %}u_{{i}}{% if not loop.last %}, {% endif %}{% endfor %}), rho {%if not compressible %} + {{dtype}} {1} {%endif%}, cell);
+    }
+
+    inline void
+    set( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > * pdf_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > * velocity_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > const * force_field,
+         std::vector< {{dtype}} > const & values,
+         CellInterval const & ci )
+    {
+        assert(uint_c(values.size()) == ci.numCells() * uint_t({{D}}u));
+        auto u = values.data();
+        for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
+            for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+                for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+                    {{dtype}} & pdf_xyz0 = pdf_field->get(x, y, z, uint_t{ 0u });
+                    {{dtype}} & vel_xyz0 = velocity_field->get(x, y, z, uint_t{ 0u });
+                    {% for i in range(Q) -%}
+                        const {{dtype}} f_{{i}} = pdf_field->getF( &pdf_xyz0, uint_t{ {{i}}u });
+                    {% endfor -%}
+                    {{density_getters | indent(8)}}
+
+                    {{density_velocity_setter_macroscopic_values | substitute_force_getter_cpp | indent(8)}}
+                    {% for i in range(D) -%}
+                        velocity_field->getF( &vel_xyz0, uint_t{ {{i}}u }) = u[{{i}}u];
+                    {% endfor %}
+                    std::advance(u, {{D}});
 
-        Equilibrium::set(pdf_field, Vector{{D}}<{{dtype}}>({% for i in range(D) %}u_{{i}}{% if not loop.last %}, {% endif %}{% endfor %}), rho {%if not compressible %} + {{dtype}}(1) {%endif%}, cell);
+                    Equilibrium::set(pdf_field, Vector{{D}}<{{dtype}}>({% for i in range(D) %}u_{{i}}{% if not loop.last %}, {% endif %}{% endfor %}), rho {%if not compressible %} + {{dtype}} {1} {%endif%}, Cell{x, y, z});
+                }
+            }
+        }
     }
 } // namespace Velocity
 
+namespace Force
+{
+    inline void
+    set( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > const * pdf_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > * velocity_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > * force_field,
+         Vector{{D}}< {{dtype}} > const & force,
+         Cell const & cell )
+    {
+        {{dtype}} const & pdf_xyz0 = pdf_field->get(cell, uint_t{ 0u });
+        {{dtype}} & vel_xyz0 = velocity_field->get(cell, uint_t{ 0u });
+        {{dtype}} & laf_xyz0 = force_field->get(cell, uint_t{ 0u });
+        {% for i in range(Q) -%}
+            const {{dtype}} f_{{i}} = pdf_field->getF( &pdf_xyz0, uint_t{ {{i}}u });
+        {% endfor -%}
+
+        {{momentum_density_getter | substitute_force_getter_pattern("force->get\(x, ?y, ?z, ?([0-9])u?\)", "force[\g<1>u]") | indent(8) }}
+        auto const rho_inv = {{dtype}} {1} / rho;
+
+        {% for i in range(D) -%}
+            force_field->getF( &laf_xyz0, uint_t{ {{i}}u }) = force[{{i}}u];
+        {% endfor %}
+
+        {% for i in range(D) -%}
+            velocity_field->getF( &vel_xyz0, uint_t{ {{i}}u }) = md_{{i}} * rho_inv;
+        {% endfor %}
+    }
+
+    inline void
+    set( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > const * pdf_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > * velocity_field,
+         GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > * force_field,
+         std::vector< {{dtype}} > const & values,
+         CellInterval const & ci )
+    {
+        assert(uint_c(values.size()) == ci.numCells() * uint_t({{D}}u));
+        auto force = values.data();
+        for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
+            for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+                for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+                    {{dtype}} const & pdf_xyz0 = pdf_field->get(x, y, z, uint_t{ 0u });
+                    {{dtype}} & vel_xyz0 = velocity_field->get(x, y, z, uint_t{ 0u });
+                    {{dtype}} & laf_xyz0 = force_field->get(x, y, z, uint_t{ 0u });
+                    {% for i in range(Q) -%}
+                        const {{dtype}} f_{{i}} = pdf_field->getF( &pdf_xyz0, uint_t{ {{i}}u });
+                    {% endfor -%}
+
+                    {{momentum_density_getter | substitute_force_getter_pattern("force->get\(x, ?y, ?z, ?([0-9])u?\)", "force[\g<1>u]") | indent(12) }}
+                    auto const rho_inv = {{dtype}} {1} / rho;
+
+                    {% for i in range(D) -%}
+                        force_field->getF( &laf_xyz0, uint_t{ {{i}}u }) = force[{{i}}u];
+                    {% endfor %}
+
+                    {% for i in range(D) -%}
+                        velocity_field->getF( &vel_xyz0, uint_t{ {{i}}u }) = md_{{i}} * rho_inv;
+                    {% endfor %}
+
+                    std::advance(force, {{D}});
+                }
+            }
+        }
+    }
+} // namespace Force
+
 namespace MomentumDensity
 {
-    inline Vector{{D}}< {{dtype}} >
+    inline auto
     reduce( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > const * pdf_field,
             GhostLayerField< {{dtype}}, uint_t{ {{D}}u } > const * force_field )
     {
@@ -403,7 +595,7 @@ namespace MomentumDensity
 
 namespace PressureTensor
 {
-    inline Matrix{{D}}< {{dtype}} >
+    inline auto
     get( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > const * pdf_field,
          Cell const & cell )
    {
@@ -422,6 +614,33 @@ namespace PressureTensor
         {% endfor %}
         return pressureTensor;
    }
+
+    inline auto
+    get( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > const * pdf_field,
+         CellInterval const & ci )
+    {
+        std::vector< {{dtype}} > out;
+        out.reserve(ci.numCells() * uint_t({{D**2}}u));
+        for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
+            for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+                for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+                    const {{dtype}} & xyz0 = pdf_field->get(x, y, z, uint_t{ 0u });
+                    {% for i in range(Q) -%}
+                        const {{dtype}} f_{{i}} = pdf_field->getF( &xyz0, uint_t{ {{i}}u });
+                    {% endfor -%}
+
+                    {{second_momentum_getter | indent(12) }}
+
+                    {% for i in range(D) -%}
+                        {% for j in range(D) -%}
+                            out.emplace_back(p_{{i*D+j}});
+                        {% endfor %}
+                    {% endfor %}
+                }
+            }
+        }
+        return out;
+    }
 } // namespace PressureTensor
 
 } // namespace accessor
diff --git a/maintainer/walberla_kernels/walberla_lbm_generation.py b/maintainer/walberla_kernels/walberla_lbm_generation.py
index 72f5ffdfec4..6aec095662d 100644
--- a/maintainer/walberla_kernels/walberla_lbm_generation.py
+++ b/maintainer/walberla_kernels/walberla_lbm_generation.py
@@ -19,6 +19,7 @@
 #
 
 import os
+import re
 import sympy as sp
 import pystencils as ps
 import lbmpy_walberla
@@ -104,14 +105,29 @@ def equations_to_code(equations, variable_prefix="",
     return "\n".join(result)
 
 
+def substitute_force_getter_pattern(code, pattern, subst):
+    re_pat = re.compile(pattern)  # compiled once, used for both the check and the substitution
+    assert re_pat.search(code) is not None, f"pattern '{pattern}' not found in '''\n{code}\n'''"  # nopep8
+    return re_pat.sub(subst, code)  # replaces every match; subst may use group references
+
+
 def substitute_force_getter_cpp(code):
     field_getter = "force->"
-    assert field_getter in code is not None, f"pattern '{field_getter} not found in '''\n{code}\n'''"
+    assert field_getter in code, f"pattern '{field_getter}' not found in '''\n{code}\n'''"  # nopep8
     return code.replace(field_getter, "force_field->")
 
 
+def substitute_force_getter_cu(code):
+    field_getter = "force->get(x,y,z,"  # exact spelling expected in the generated code
+    assert field_getter in code, \
+        f"pattern '{field_getter}' not found in '''\n{code}\n'''"
+    return code.replace(field_getter, "force.get(")
+
+
 def add_espresso_filters_to_jinja_env(jinja_env):
     jinja_env.filters["substitute_force_getter_cpp"] = substitute_force_getter_cpp
+    jinja_env.filters["substitute_force_getter_cu"] = substitute_force_getter_cu
+    jinja_env.filters["substitute_force_getter_pattern"] = substitute_force_getter_pattern
 
 
 def generate_macroscopic_values_accessors(ctx, config, lb_method, templates):
diff --git a/requirements.txt b/requirements.txt
index ea77f9d6c09..2465a35b426 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,43 +1,48 @@
 # build system
-cython>=0.29.21,<=3.0.7
-setuptools>=59.6.0
+cython>=0.29.21,<3.0.10
+setuptools>=68.1.2
+packaging>=24.0
 # required scientific packages
-numpy>=1.23
-h5py>=3.6.0
+numpy>=1.26.4,<2.0
+h5py>=3.10.0
 # optional scientific packages
-scipy>=1.8.0
-pint>=0.18
+scipy>=1.11.4
+pandas>=1.3.5
+pint>=0.19.2
+ase>=3.22.1
 # optional packages for graphics and external devices
-matplotlib>=3.5.1
+matplotlib>=3.6.3
 vtk>=9.1.0
 PyOpenGL>=3.1.5
 pygame>=2.1.2
 # waLBerla dependencies
-pystencils==1.2
-lbmpy==1.2
+pystencils==1.3.3
+lbmpy==1.3.3
 sympy==1.9
 islpy==2022.2.1
-jinja2>=3.0.3
+jinja2>=3.1.2
 # CI-related
-requests>=2.25.1
-lxml>=4.8.0
-coverage>=6.2
+requests>=2.32.0
+lxml>=5.1.0
+coverage>=7.4.4
 # sphinx and its dependencies
-sphinx>=4.3.2
-sphinx-toggleprompt==0.4.0
-sphinxcontrib-bibtex>=2.6.1
-numpydoc>=1.5.0
-pybtex>=0.23
+sphinx>=7.2.6
+sphinx-toggleprompt==0.5.2
+sphinxcontrib-bibtex>=2.6.2
+numpydoc>=1.6.0
+pybtex>=0.24.0
 # jupyter dependencies
-jupyterlab>=4.0.8
-nbformat==5.1.3
-nbconvert==6.5.1
-tqdm>=4.57.0
+jupyterlab>=3.5
+ipykernel>=6.29.3
+nbformat==5.9.1
+nbconvert==6.5.3
+tqdm>=4.66.2
 # linters and their dependencies
-autopep8==1.6.0
-pycodestyle==2.8.0
-pylint>=2.12.2
-astroid>=2.9.3
+pep8==1.7.1
+autopep8==2.1.0
+pycodestyle==2.11.1
+pylint>=3.0.3
+astroid>=3.0.2
 isort>=5.6.4
-pre-commit>=2.17.0
+pre-commit>=3.6.2
 cmakelang==0.6.13
diff --git a/samples/h5md_trajectory.py b/samples/h5md_trajectory.py
index 6d5f7b4a846..8cf8ba44ca5 100644
--- a/samples/h5md_trajectory.py
+++ b/samples/h5md_trajectory.py
@@ -66,7 +66,8 @@
 xyz_folded.append(system.part.all().pos_folded[:])
 xyz_unfolded.append(system.part.all().pos[:])
 # resize box (simulates NpT)
-system.box_l = system.box_l + 1.
+for i in range(3):
+    system.change_volume_and_rescale_particles(system.box_l[i] + 1., "xyz"[i])
 system.integrator.run(10)
 h5.write()
 xyz_folded.append(system.part.all().pos_folded[:])
diff --git a/samples/lbf.py b/samples/lbf.py
index b418df1a960..905b219f831 100644
--- a/samples/lbf.py
+++ b/samples/lbf.py
@@ -60,6 +60,7 @@
 
 
 lb_params = {'agrid': 1, 'density': 1, 'kinematic_viscosity': 1, 'tau': 0.01,
+             'single_precision': False,
              'ext_force_density': [0, 0, -1.0 / (box_l**3)]}
 
 if args.gpu:
diff --git a/samples/reaction_ensemble_complex_reaction.py b/samples/reaction_ensemble_complex_reaction.py
index 93c2c2805dd..ab4974d0df0 100644
--- a/samples/reaction_ensemble_complex_reaction.py
+++ b/samples/reaction_ensemble_complex_reaction.py
@@ -148,7 +148,9 @@ def equations(variables):
 
 print("concentrations sampled with the reaction ensemble vs. analytical solutions:")
 for ptype in types:
-    print(f"  type {types_name[ptype]}: {concentrations[ptype]:.4f} +/- {concentrations_95ci[ptype]:.4f} mol/l (95% CI), expected: {concentrations_numerical[ptype]:.4f} mol/l")
+    print(f"  type {types_name[ptype]}: {concentrations[ptype]:.4f} "
+          f"+/- {concentrations_95ci[ptype]:.4f} mol/l (95% CI), "
+          f"expected: {concentrations_numerical[ptype]:.4f} mol/l")
 
 K_sim = ((concentrations[type_C] / c_ref_in_mol_per_l)**nu_C
          * (concentrations[type_D] / c_ref_in_mol_per_l)**nu_D
diff --git a/samples/slice_input.py b/samples/slice_input.py
index 1feb8f0fe9a..255b792e08b 100644
--- a/samples/slice_input.py
+++ b/samples/slice_input.py
@@ -60,30 +60,30 @@
 partcls = system.part.add(id=id_list, pos=pos_list, type=type_list)
 p0p1 = system.part.by_ids([0, 1])
 
-print("TYPE\n%s" % partcls.type)
+print(f"TYPE\n{partcls.type}")
 p0p1.type = [3, 3]
-print("TYPE_NEW\n%s" % partcls.type)
+print(f"TYPE_NEW\n{partcls.type}")
 
-print("POS\n%s" % partcls.pos)
+print(f"POS\n{partcls.pos}")
 system.part.by_ids(range(5)).pos = [[1, 1, 1], [2, 2, 2], [
     3, 3, 3], [4, 4, 4], [5, 5, 5]]
-print("POS_NEW\n%s" % partcls.pos)
+print(f"POS_NEW\n{partcls.pos}")
 
-print("V\n%s" % partcls.v)
+print(f"V\n{partcls.v}")
 p0p1.v = [[1, 2, 3], [2, 3, 4]]
-print("V_NEW\n%s" % partcls.v)
+print(f"V_NEW\n{partcls.v}")
 
-print("F\n%s" % partcls.f)
+print(f"F\n{partcls.f}")
 p0p1.f = [[3, 4, 5], [4, 5, 6]]
-print("F_NEW\n%s" % partcls.f)
+print(f"F_NEW\n{partcls.f}")
 
 if espressomd.has_features(["MASS"]):
-    print("MASS\n%s" % partcls.mass)
+    print(f"MASS\n{partcls.mass}")
     p0p1.mass = [2, 3]
-    print("MASS_NEW\n%s" % partcls.mass)
+    print(f"MASS_NEW\n{partcls.mass}")
 
 if espressomd.has_features(["ELECTROSTATICS"]):
-    print("Q\n%s" % partcls.q)
+    print(f"Q\n{partcls.q}")
     system.part.by_ids(range(0, n_part, 2)).q = np.ones(n_part // 2)
     system.part.by_ids(range(1, n_part, 2)).q = -np.ones(n_part // 2)
-    print("Q_NEW\n%s" % partcls.q)
+    print(f"Q_NEW\n{partcls.q}")
diff --git a/samples/visualization_cellsystem.py b/samples/visualization_cellsystem.py
index 19f4638ddd1..3f38c6f0065 100644
--- a/samples/visualization_cellsystem.py
+++ b/samples/visualization_cellsystem.py
@@ -47,7 +47,7 @@
 system.time_step = 0.0005
 system.cell_system.set_regular_decomposition(use_verlet_lists=True)
 system.cell_system.skin = 0.4
-#system.cell_system.node_grid = [i, j, k]
+# system.cell_system.node_grid = [i, j, k]
 
 for i in range(100):
     system.part.add(pos=box * np.random.random(3))
diff --git a/samples/visualization_ljliquid.py b/samples/visualization_ljliquid.py
index 9934ddc203c..8722ebd6ac0 100644
--- a/samples/visualization_ljliquid.py
+++ b/samples/visualization_ljliquid.py
@@ -93,7 +93,7 @@
     f"Simulate {n_part} particles in a cubic box {box_l} at density {density}.")
 print("Interactions:\n")
 act_min_dist = system.analysis.min_dist()
-print(f"Start with minimal distance {act_min_dist}")
+print(f"Start with minimal distance {act_min_dist:.3f}")
 
 visualizer = espressomd.visualization.openGLLive(system)
 
@@ -127,7 +127,7 @@
 #############################################################
 #      Integration                                          #
 #############################################################
-print("\nStart integration: run %d times %d steps" % (int_n_times, int_steps))
+print(f"\nStart integration: run {int_n_times} times {int_steps} steps")
 
 # print initial energies
 energies = system.analysis.energy()
diff --git a/src/config/features.def b/src/config/features.def
index 59a2583e7ba..ff8eb2a0413 100644
--- a/src/config/features.def
+++ b/src/config/features.def
@@ -54,7 +54,6 @@ ROTATIONAL_INERTIA              implies ROTATION
 /* Electrostatics */
 ELECTROSTATICS
 P3M                             equals ELECTROSTATICS and FFTW
-MMM1D_GPU                       requires CUDA and ELECTROSTATICS
 MMM1D_MACHINE_PREC              requires ELECTROSTATICS
 
 /* Magnetostatics */
@@ -63,8 +62,6 @@ DP3M                            equals DIPOLES and FFTW
 DIPOLAR_DIRECT_SUM              requires CUDA
 DIPOLAR_DIRECT_SUM              equals DIPOLES and ROTATION and CUDA
 DIPOLE_FIELD_TRACKING           implies DIPOLES
-DIPOLAR_BARNES_HUT              requires CUDA
-DIPOLAR_BARNES_HUT              equals DIPOLES and ROTATION and CUDA
 
 /* Virtual sites features */
 VIRTUAL_SITES
diff --git a/src/config/include/config/config.hpp b/src/config/include/config/config.hpp
index f392f9fff80..a3a8d14ea9f 100644
--- a/src/config/include/config/config.hpp
+++ b/src/config/include/config/config.hpp
@@ -54,7 +54,7 @@
 #endif
 
 /** Whether to use the approximation of Abramowitz/Stegun @cite abramowitz65a
- *  @ref AS_erfc_part() for \f$\exp(d^2) \mathrm{erfc}(d)\f$,
+ *  @ref Utils::AS_erfc_part() for \f$\exp(d^2) \mathrm{erfc}(d)\f$,
  *  or the C function <tt>std::erfc()</tt> in P3M and Ewald summation.
  */
 #ifndef USE_ERFC_APPROXIMATION
diff --git a/src/config/myconfig-default.hpp b/src/config/myconfig-default.hpp
index 4e8c8df611a..f385f81ee3e 100644
--- a/src/config/myconfig-default.hpp
+++ b/src/config/myconfig-default.hpp
@@ -40,9 +40,6 @@
 
 // Charges and dipoles
 #define ELECTROSTATICS
-#ifdef CUDA
-#define MMM1D_GPU
-#endif
 #define DIPOLES
 
 // Active matter
diff --git a/src/core/BondList.hpp b/src/core/BondList.hpp
index e3b0eb00aa2..1c077cb3f7f 100644
--- a/src/core/BondList.hpp
+++ b/src/core/BondList.hpp
@@ -19,12 +19,10 @@
 #ifndef ESPRESSO_BONDLIST_HPP
 #define ESPRESSO_BONDLIST_HPP
 
-#include <utils/Span.hpp>
 #include <utils/compact_vector.hpp>
 
 #include <boost/container/vector.hpp>
 #include <boost/iterator/iterator_facade.hpp>
-#include <boost/range/algorithm/copy.hpp>
 #include <boost/serialization/access.hpp>
 #include <boost/serialization/array.hpp>
 #include <boost/version.hpp>
@@ -33,6 +31,7 @@
 #include <cassert>
 #include <cstddef>
 #include <memory>
+#include <span>
 #include <type_traits>
 
 /**
@@ -45,18 +44,18 @@
 class BondView {
   /* Bond id */
   int m_id = -1;
-  Utils::Span<const int> m_partners;
+  std::span<const int> m_partners;
 
 public:
   BondView() = default;
-  BondView(int id, Utils::Span<const int> partners)
+  BondView(int id, std::span<const int> partners)
       : m_id(id), m_partners(partners) {}
 
   int bond_id() const { return m_id; }
-  Utils::Span<const int> const &partner_ids() const { return m_partners; }
+  auto const &partner_ids() const { return m_partners; }
 
   bool operator==(BondView const &rhs) const {
-    return m_id == rhs.m_id and boost::equal(m_partners, rhs.m_partners);
+    return m_id == rhs.m_id and std::ranges::equal(m_partners, rhs.m_partners);
   }
 
   bool operator!=(BondView const &rhs) const { return not(*this == rhs); }
@@ -108,13 +107,13 @@ class BondList {
   template <class Archive> void serialize(Archive &ar, long int /* version */) {
     if (Archive::is_loading::value) {
       std::size_t size{};
-      ar &size;
+      ar & size;
       m_storage.resize(size);
     }
 
     if (Archive::is_saving::value) {
       auto size = m_storage.size();
-      ar &size;
+      ar & size;
     }
 
     ar &boost::serialization::make_array(m_storage.data(), m_storage.size());
@@ -140,8 +139,8 @@ class BondList {
       auto const partners_begin = m_it;
       auto const partners_end = id_pos;
       auto const dist = std::distance(partners_begin, partners_end);
-      return {-(*id_pos) - 1, Utils::make_span(std::addressof(*partners_begin),
-                                               static_cast<size_type>(dist))};
+      return {-(*id_pos) - 1, std::span(std::addressof(*partners_begin),
+                                        static_cast<size_type>(dist))};
     }
   };
 
@@ -188,7 +187,7 @@ class BondList {
    * @param bond Bond to add.
    */
   void insert(BondView const &bond) {
-    boost::copy(bond.partner_ids(), std::back_inserter(m_storage));
+    std::ranges::copy(bond.partner_ids(), std::back_inserter(m_storage));
     assert(bond.bond_id() >= 0);
     m_storage.push_back(-(bond.bond_id() + 1));
   }
diff --git a/src/core/BoxGeometry.hpp b/src/core/BoxGeometry.hpp
index 6ba20e48e86..fdf3f6321a6 100644
--- a/src/core/BoxGeometry.hpp
+++ b/src/core/BoxGeometry.hpp
@@ -99,17 +99,17 @@ class BoxGeometry {
 public:
   BoxGeometry() {
     set_length(Utils::Vector3d{1., 1., 1.});
-    set_periodic(0, true);
-    set_periodic(1, true);
-    set_periodic(2, true);
+    set_periodic(0u, true);
+    set_periodic(1u, true);
+    set_periodic(2u, true);
     set_type(BoxType::CUBOID);
   }
   BoxGeometry(BoxGeometry const &rhs) {
     m_type = rhs.type();
     set_length(rhs.length());
-    set_periodic(0, rhs.periodic(0));
-    set_periodic(1, rhs.periodic(1));
-    set_periodic(2, rhs.periodic(2));
+    set_periodic(0u, rhs.periodic(0u));
+    set_periodic(1u, rhs.periodic(1u));
+    set_periodic(2u, rhs.periodic(2u));
     m_lees_edwards_bc = rhs.m_lees_edwards_bc;
   }
 
@@ -190,7 +190,7 @@ class BoxGeometry {
    *         i.e. <tt>a - b</tt>. Can be negative.
    */
   template <typename T> T inline get_mi_coord(T a, T b, unsigned coord) const {
-    assert(coord <= 2);
+    assert(coord <= 2u);
 
     return detail::get_mi_coord(a, b, m_length[coord], m_length_inv[coord],
                                 m_length_half[coord], m_periodic[coord]);
@@ -210,8 +210,7 @@ class BoxGeometry {
   Utils::Vector<T, 3> get_mi_vector(const Utils::Vector<T, 3> &a,
                                     const Utils::Vector<T, 3> &b) const {
     if (type() == BoxType::LEES_EDWARDS) {
-      auto const shear_plane_normal =
-          static_cast<unsigned int>(lees_edwards_bc().shear_plane_normal);
+      auto const shear_plane_normal = lees_edwards_bc().shear_plane_normal;
       auto a_tmp = a;
       auto b_tmp = b;
       a_tmp[shear_plane_normal] = Algorithm::periodic_fold(
@@ -250,10 +249,8 @@ class BoxGeometry {
     auto ret = u - v;
     if (type() == BoxType::LEES_EDWARDS) {
       auto const &le = m_lees_edwards_bc;
-      auto const shear_plane_normal =
-          static_cast<unsigned int>(le.shear_plane_normal);
-      auto const shear_direction =
-          static_cast<unsigned int>(le.shear_direction);
+      auto const shear_plane_normal = le.shear_plane_normal;
+      auto const shear_direction = le.shear_direction;
       auto const dy = x[shear_plane_normal] - y[shear_plane_normal];
       if (fabs(dy) > 0.5 * length_half()[shear_plane_normal]) {
         ret[shear_direction] -= Utils::sgn(dy) * le.shear_velocity;
@@ -264,11 +261,11 @@ class BoxGeometry {
 
   /** @brief Fold coordinates to primary simulation box in-place.
    *  Lees-Edwards offset is ignored.
-   *  @param[in,out] pos        coordinate to fold
+   *  @param[in,out] pos        coordinates to fold
    *  @param[in,out] image_box  image box offset
    */
   void fold_position(Utils::Vector3d &pos, Utils::Vector3i &image_box) const {
-    for (unsigned int i = 0u; i < 3u; i++) {
+    for (auto i = 0u; i < 3u; i++) {
       if (m_periodic[i]) {
         auto const result =
             Algorithm::periodic_fold(pos[i], image_box[i], m_length[i]);
@@ -284,21 +281,39 @@ class BoxGeometry {
     }
   }
 
-  /** @brief Calculate coordinates folded to primary simulation box.
-   *  @param p    coordinate to fold
-   *  @return Folded coordinates.
+  /**
+   * @brief Calculate coordinates folded to primary simulation box.
+   * @param[in] pos    coordinates to fold
+   * @return Folded coordinates.
    */
-  auto folded_position(Utils::Vector3d const &p) const {
-    Utils::Vector3d p_folded;
+  auto folded_position(Utils::Vector3d const &pos) const {
+    auto pos_folded = pos;
     for (unsigned int i = 0u; i < 3u; i++) {
       if (m_periodic[i]) {
-        p_folded[i] = Algorithm::periodic_fold(p[i], m_length[i]);
-      } else {
-        p_folded[i] = p[i];
+        pos_folded[i] = Algorithm::periodic_fold(pos[i], m_length[i]);
+      }
+    }
+
+    return pos_folded;
+  }
+
+  /**
+   * @brief Calculate image box of coordinates folded to primary simulation box.
+   * @param[in] pos        coordinates
+   * @param[in] image_box  image box to fold
+   * @return Folded image box.
+   */
+  auto folded_image_box(Utils::Vector3d const &pos,
+                        Utils::Vector3i const &image_box) const {
+    auto image_box_folded = image_box;
+    for (auto i = 0u; i < 3u; i++) {
+      if (m_periodic[i]) {
+        image_box_folded[i] =
+            Algorithm::periodic_fold(pos[i], image_box[i], m_length[i]).second;
       }
     }
 
-    return p_folded;
+    return image_box_folded;
   }
 
   /** @brief Calculate image box shift vector */
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 6f93be1162b..0e7b82024d1 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -58,10 +58,9 @@ if(ESPRESSO_BUILD_WITH_CUDA)
   target_sources(espresso_core PRIVATE cuda/init.cpp)
   espresso_add_gpu_library(
     espresso_cuda SHARED cuda/common_cuda.cu cuda/init_cuda.cu
-    cuda/CudaHostAllocator.cu magnetostatics/barnes_hut_gpu_cuda.cu
-    magnetostatics/dipolar_direct_sum_gpu_cuda.cu
-    electrostatics/mmm1d_gpu_cuda.cu electrostatics/p3m_gpu_cuda.cu
-    electrostatics/p3m_gpu_error_cuda.cu system/GpuParticleData_cuda.cu)
+    cuda/CudaHostAllocator.cu magnetostatics/dipolar_direct_sum_gpu_cuda.cu
+    electrostatics/p3m_gpu_cuda.cu electrostatics/p3m_gpu_error_cuda.cu
+    system/GpuParticleData_cuda.cu)
   add_library(espresso::cuda ALIAS espresso_cuda)
   target_link_libraries(
     espresso_cuda PRIVATE CUDA::cuda_driver CUDA::cudart CUDA::cufft
@@ -83,15 +82,21 @@ install(TARGETS espresso_core
 
 target_link_libraries(
   espresso_core PRIVATE espresso::config espresso::utils::mpi espresso::shapes
-                        espresso::profiler espresso::cpp_flags
+                        espresso::cpp_flags
   PUBLIC espresso::utils MPI::MPI_CXX Random123 espresso::particle_observables
-         Boost::serialization Boost::mpi)
+         Boost::serialization Boost::mpi espresso::profiler)
 
 target_include_directories(espresso_core PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 
 if(ESPRESSO_BUILD_WITH_WALBERLA)
-  target_link_libraries(espresso_core PRIVATE espresso::walberla
-                                              ${WALBERLA_LIBS})
+  target_link_libraries(
+    espresso_core
+    PRIVATE espresso::walberla
+            $<$<BOOL:${ESPRESSO_BUILD_WITH_CUDA}>:espresso::walberla_cuda>)
+endif()
+
+if(ESPRESSO_BUILD_WITH_FFTW)
+  add_subdirectory(fft)
 endif()
 
 add_subdirectory(accumulators)
@@ -122,3 +127,12 @@ add_subdirectory(virtual_sites)
 if(ESPRESSO_BUILD_TESTS)
   add_subdirectory(unit_tests)
 endif()
+
+if(ESPRESSO_BUILD_WITH_HDF5 AND ESPRESSO_BUILD_WITH_COVERAGE
+   AND NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+  # NOTE: the DIRECTORY option must precede the PROPERTIES keyword; anything
+  # after PROPERTIES is parsed as <property> <value> pairs.
+  set_source_files_properties(
+    io/writer/h5md_core.cpp DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+    PROPERTIES COMPILE_OPTIONS -felide-constructors)
+endif()
diff --git a/src/core/CellParticleIterator.hpp b/src/core/CellParticleIterator.hpp
index b6a5ee9d26d..1853d828bc9 100644
--- a/src/core/CellParticleIterator.hpp
+++ b/src/core/CellParticleIterator.hpp
@@ -22,4 +22,6 @@
 #include "ParticleIterator.hpp"
 #include "cell_system/Cell.hpp"
 
-using CellParticleIterator = ParticleIterator<Cell *const *>;
+#include <span>
+
+using CellParticleIterator = ParticleIterator<std::span<Cell *const>::iterator>;
diff --git a/src/core/MpiCallbacks.hpp b/src/core/MpiCallbacks.hpp
index 0174076a742..4b5f1a03aa6 100644
--- a/src/core/MpiCallbacks.hpp
+++ b/src/core/MpiCallbacks.hpp
@@ -43,7 +43,6 @@
 #include <boost/mpi/communicator.hpp>
 #include <boost/mpi/environment.hpp>
 #include <boost/mpi/packed_iarchive.hpp>
-#include <boost/range/algorithm/remove_if.hpp>
 
 #include <cassert>
 #include <memory>
@@ -271,7 +270,7 @@ class MpiCallbacks {
     if (m_comm.rank() == 0) {
       try {
         abort_loop();
-      } catch (...) {
+      } catch (...) { // NOLINT(bugprone-empty-catch)
       }
     }
   }
@@ -331,12 +330,9 @@ class MpiCallbacks {
    * @param id Identifier of the callback to remove.
    */
   void remove(int id) {
-    m_callbacks.erase(
-        boost::remove_if(m_callbacks,
-                         [ptr = m_callback_map[id]](auto const &e) {
-                           return e.get() == ptr;
-                         }),
-        m_callbacks.end());
+    std::erase_if(m_callbacks, [ptr = m_callback_map[id]](auto const &e) {
+      return e.get() == ptr;
+    });
     m_callback_map.remove(id);
   }
 
diff --git a/src/core/Observable_stat.cpp b/src/core/Observable_stat.cpp
index d50d367f368..f13704af9d1 100644
--- a/src/core/Observable_stat.cpp
+++ b/src/core/Observable_stat.cpp
@@ -25,7 +25,6 @@
 
 #include "communication.hpp"
 
-#include <utils/Span.hpp>
 #include <utils/index.hpp>
 
 #include <boost/mpi/collectives/reduce.hpp>
@@ -33,6 +32,7 @@
 #include <cassert>
 #include <cstddef>
 #include <functional>
+#include <span>
 #include <vector>
 
 Observable_stat::Observable_stat(std::size_t chunk_size, std::size_t n_bonded,
@@ -46,9 +46,7 @@ Observable_stat::Observable_stat(std::size_t chunk_size, std::size_t n_bonded,
 #else
   constexpr std::size_t n_vs = 0;
 #endif
-  auto const n_non_bonded =
-      static_cast<std::size_t>(Utils::lower_triangular(max_type, max_type)) +
-      1ul;
+  auto const n_non_bonded = get_non_bonded_offset(max_type, max_type) + 1ul;
   constexpr std::size_t n_ext_fields = 1; // reduction over all fields
   constexpr std::size_t n_kinetic = 1; // linear+angular kinetic contributions
 
@@ -57,26 +55,26 @@ Observable_stat::Observable_stat(std::size_t chunk_size, std::size_t n_bonded,
   m_data = std::vector<double>(m_chunk_size * n_elements);
 
   // spans for the different contributions
-  kinetic = Utils::Span<double>(m_data.data(), m_chunk_size);
-  bonded = Utils::Span<double>(kinetic.end(), n_bonded * m_chunk_size);
-  coulomb = Utils::Span<double>(bonded.end(), n_coulomb * m_chunk_size);
-  dipolar = Utils::Span<double>(coulomb.end(), n_dipolar * m_chunk_size);
-  virtual_sites = Utils::Span<double>(dipolar.end(), n_vs * m_chunk_size);
+  kinetic = std::span<double>(m_data.data(), m_chunk_size);
+  bonded = std::span<double>(kinetic.end(), n_bonded * m_chunk_size);
+  coulomb = std::span<double>(bonded.end(), n_coulomb * m_chunk_size);
+  dipolar = std::span<double>(coulomb.end(), n_dipolar * m_chunk_size);
+  virtual_sites = std::span<double>(dipolar.end(), n_vs * m_chunk_size);
   external_fields =
-      Utils::Span<double>(virtual_sites.end(), n_ext_fields * m_chunk_size);
+      std::span<double>(virtual_sites.end(), n_ext_fields * m_chunk_size);
   non_bonded_intra =
-      Utils::Span<double>(external_fields.end(), n_non_bonded * m_chunk_size);
+      std::span<double>(external_fields.end(), n_non_bonded * m_chunk_size);
   non_bonded_inter =
-      Utils::Span<double>(non_bonded_intra.end(), n_non_bonded * m_chunk_size);
-  assert(non_bonded_inter.end() == (m_data.data() + m_data.size()));
+      std::span<double>(non_bonded_intra.end(), n_non_bonded * m_chunk_size);
+  // the last view must end exactly where the buffer ends; compare raw pointers
+  // since dereferencing a past-the-end iterator is undefined behavior
+  assert(non_bonded_inter.data() + non_bonded_inter.size() ==
+         m_data.data() + m_data.size());
 }
 
-Utils::Span<double>
-Observable_stat::get_non_bonded_contribution(Utils::Span<double> base_pointer,
-                                             int type1, int type2) const {
-  auto const offset = static_cast<std::size_t>(
+std::size_t Observable_stat::get_non_bonded_offset(int type1, int type2) const {
+  return static_cast<std::size_t>(
       Utils::lower_triangular(std::max(type1, type2), std::min(type1, type2)));
-  return {base_pointer.begin() + offset * m_chunk_size, m_chunk_size};
 }
 
 void Observable_stat::mpi_reduce() {
diff --git a/src/core/Observable_stat.hpp b/src/core/Observable_stat.hpp
index a5e40db0d56..5a04649a63b 100644
--- a/src/core/Observable_stat.hpp
+++ b/src/core/Observable_stat.hpp
@@ -19,15 +19,12 @@
 
 #pragma once
 
-#include <boost/range/algorithm/transform.hpp>
-#include <boost/range/numeric.hpp>
-
-#include <utils/Span.hpp>
-
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
 #include <functional>
+#include <numeric>
+#include <span>
 #include <vector>
 
 /** Observable for the pressure and energy. */
@@ -37,10 +34,14 @@ class Observable_stat {
   /** Number of doubles per data item */
   std::size_t m_chunk_size;
 
+  std::size_t get_non_bonded_offset(int type1, int type2) const;
+
   /** Get contribution from a non-bonded interaction */
-  Utils::Span<double>
-  get_non_bonded_contribution(Utils::Span<double> base_pointer, int type1,
-                              int type2) const;
+  auto get_non_bonded_contribution(std::span<double> view, int type1,
+                                   int type2) const {
+    auto const offset = get_non_bonded_offset(type1, type2);
+    return view.subspan(offset * m_chunk_size, m_chunk_size);
+  }
 
 public:
   Observable_stat(std::size_t chunk_size, std::size_t n_bonded, int max_type);
@@ -52,10 +53,10 @@ class Observable_stat {
    *  @param column Which column to sum up (only relevant for multi-dimensional
    *                observables).
    */
-  double accumulate(double acc = 0.0, std::size_t column = 0) const {
+  double accumulate(double acc = 0.0, std::size_t column = 0ul) const {
     assert(column < m_chunk_size);
-    if (m_chunk_size == 1)
-      return boost::accumulate(m_data, acc);
+    if (m_chunk_size == 1ul)
+      return std::accumulate(m_data.begin(), m_data.end(), acc);
 
     for (auto it = m_data.begin() + static_cast<std::ptrdiff_t>(column);
          it < m_data.end(); it += static_cast<std::ptrdiff_t>(m_chunk_size))
@@ -65,40 +66,40 @@ class Observable_stat {
 
   /** Rescale values */
   void rescale(double volume) {
-    auto const fac = 1. / volume;
-    boost::transform(m_data, m_data.begin(), [fac](auto e) { return e * fac; });
+    std::ranges::transform(m_data, m_data.begin(),
+                           std::bind_front(std::multiplies{}, 1. / volume));
   }
 
   /** Contribution from linear and angular kinetic energy (accumulated). */
-  Utils::Span<double> kinetic;
+  std::span<double> kinetic;
   /** Contribution(s) from bonded interactions. */
-  Utils::Span<double> bonded;
+  std::span<double> bonded;
   /** Contribution(s) from Coulomb interactions. */
-  Utils::Span<double> coulomb;
+  std::span<double> coulomb;
   /** Contribution(s) from dipolar interactions. */
-  Utils::Span<double> dipolar;
+  std::span<double> dipolar;
   /** Contribution from virtual sites (accumulated). */
-  Utils::Span<double> virtual_sites;
+  std::span<double> virtual_sites;
   /** Contribution from external fields (accumulated). */
-  Utils::Span<double> external_fields;
+  std::span<double> external_fields;
   /** Contribution(s) from non-bonded intramolecular interactions. */
-  Utils::Span<double> non_bonded_intra;
+  std::span<double> non_bonded_intra;
   /** Contribution(s) from non-bonded intermolecular interactions. */
-  Utils::Span<double> non_bonded_inter;
+  std::span<double> non_bonded_inter;
 
   /** Get contribution from a bonded interaction */
-  Utils::Span<double> bonded_contribution(int bond_id) const {
+  std::span<double> bonded_contribution(int bond_id) const {
     auto const offset = m_chunk_size * static_cast<std::size_t>(bond_id);
     return {bonded.data() + offset, m_chunk_size};
   }
 
   void add_non_bonded_contribution(int type1, int type2, int molid1, int molid2,
-                                   Utils::Span<const double> data) {
+                                   std::span<const double> data) {
     assert(data.size() == m_chunk_size);
-    auto const span = (molid1 == molid2) ? non_bonded_intra : non_bonded_inter;
-    auto const dest = get_non_bonded_contribution(span, type1, type2);
+    auto const view = (molid1 == molid2) ? non_bonded_intra : non_bonded_inter;
+    auto const dest = get_non_bonded_contribution(view, type1, type2);
 
-    boost::transform(dest, data, dest.begin(), std::plus<>{});
+    std::ranges::transform(dest, data, dest.begin(), std::plus{});
   }
 
   void add_non_bonded_contribution(int type1, int type2, int molid1, int molid2,
@@ -107,14 +108,12 @@ class Observable_stat {
   }
 
   /** Get contribution from a non-bonded intramolecular interaction */
-  Utils::Span<double> non_bonded_intra_contribution(int type1,
-                                                    int type2) const {
+  auto non_bonded_intra_contribution(int type1, int type2) const {
     return get_non_bonded_contribution(non_bonded_intra, type1, type2);
   }
 
   /** Get contribution from a non-bonded intermolecular interaction */
-  Utils::Span<double> non_bonded_inter_contribution(int type1,
-                                                    int type2) const {
+  auto non_bonded_inter_contribution(int type1, int type2) const {
     return get_non_bonded_contribution(non_bonded_inter, type1, type2);
   }
 
diff --git a/src/core/PartCfg.cpp b/src/core/PartCfg.cpp
index 21c330d3ce3..222aff96a9c 100644
--- a/src/core/PartCfg.cpp
+++ b/src/core/PartCfg.cpp
@@ -23,10 +23,9 @@
 #include "particle_node.hpp"
 #include "system/System.hpp"
 
-#include <utils/Span.hpp>
-
 #include <algorithm>
 #include <cstddef>
+#include <span>
 
 void PartCfg::update() {
   m_parts.clear();
@@ -37,8 +36,7 @@ void PartCfg::update() {
   for (std::size_t offset = 0; offset < ids.size();) {
     auto const this_size = std::clamp(chunk_size, std::size_t{0},
                                       std::size_t{ids.size() - offset});
-    auto const chunk_ids =
-        Utils::make_const_span(ids.data() + offset, this_size);
+    auto const chunk_ids = std::span(ids.data() + offset, this_size);
 
     prefetch_particle_data(chunk_ids);
 
diff --git a/src/core/Particle.hpp b/src/core/Particle.hpp
index 46b9085e043..2e125ae3abb 100644
--- a/src/core/Particle.hpp
+++ b/src/core/Particle.hpp
@@ -35,12 +35,11 @@
 #include <boost/serialization/vector.hpp>
 
 #include <algorithm>
+#include <cassert>
 #include <cstdint>
 #include <vector>
 
 namespace detail {
-inline void check_axis_idx_valid(unsigned int const axis) { assert(axis <= 2); }
-
 inline bool get_nth_bit(uint8_t const bitfield, unsigned int const bit_idx) {
   return bitfield & (1u << bit_idx);
 }
@@ -57,7 +56,7 @@ struct ParticleParametersSwimming {
   bool is_engine_force_on_fluid = false;
 
   template <class Archive> void serialize(Archive &ar, long int /* version */) {
-    ar &f_swim &swimming &is_engine_force_on_fluid;
+    ar & f_swim & swimming & is_engine_force_on_fluid;
   }
 };
 #endif
@@ -160,10 +159,10 @@ struct ParticleProperties {
     Utils::Quaternion<double> quat = Utils::Quaternion<double>::identity();
 
     template <class Archive> void serialize(Archive &ar, long int) {
-      ar &to_particle_id;
-      ar &distance;
-      ar &rel_orientation;
-      ar &quat;
+      ar & to_particle_id;
+      ar & distance;
+      ar & rel_orientation;
+      ar & quat;
     }
   } vs_relative;
 #endif // VIRTUAL_SITES_RELATIVE
@@ -199,53 +198,53 @@ struct ParticleProperties {
 #endif
 
   template <class Archive> void serialize(Archive &ar, long int /* version */) {
-    ar &identity;
-    ar &mol_id;
-    ar &type;
-    ar &propagation;
+    ar & identity;
+    ar & mol_id;
+    ar & type;
+    ar & propagation;
 
 #ifdef MASS
-    ar &mass;
+    ar & mass;
 #endif
 #ifdef ROTATIONAL_INERTIA
-    ar &rinertia;
+    ar & rinertia;
 #endif
 #ifdef ROTATION
-    ar &rotation;
+    ar & rotation;
 #endif
 #ifdef ELECTROSTATICS
-    ar &q;
+    ar & q;
 #endif
 
 #ifdef LB_ELECTROHYDRODYNAMICS
-    ar &mu_E;
+    ar & mu_E;
 #endif
 #ifdef DIPOLES
-    ar &dipm;
+    ar & dipm;
 #endif
 #ifdef DIPOLE_FIELD_TRACKING
-    ar &dip_fld;
+    ar & dip_fld;
 #endif
 #ifdef VIRTUAL_SITES_RELATIVE
-    ar &vs_relative;
+    ar & vs_relative;
 #endif
 
 #ifdef THERMOSTAT_PER_PARTICLE
-    ar &gamma;
+    ar & gamma;
 #ifdef ROTATION
-    ar &gamma_rot;
+    ar & gamma_rot;
 #endif
 #endif // THERMOSTAT_PER_PARTICLE
 #ifdef EXTERNAL_FORCES
-    ar &ext_flag;
-    ar &ext_force;
+    ar & ext_flag;
+    ar & ext_force;
 #ifdef ROTATION
-    ar &ext_torque;
+    ar & ext_torque;
 #endif
 #endif // EXTERNAL_FORCES
 
 #ifdef ENGINE
-    ar &swim;
+    ar & swim;
 #endif
   }
 };
@@ -274,13 +273,13 @@ struct ParticlePosition {
 #endif
 
   template <class Archive> void serialize(Archive &ar, long int /* version */) {
-    ar &p;
-    ar &i;
+    ar & p;
+    ar & i;
 #ifdef ROTATION
-    ar &quat;
+    ar & quat;
 #endif
 #ifdef BOND_CONSTRAINT
-    ar &p_last_timestep;
+    ar & p_last_timestep;
 #endif
   }
 };
@@ -300,15 +299,17 @@ struct ParticleForce {
 
   friend ParticleForce operator+(ParticleForce const &lhs,
                                  ParticleForce const &rhs) {
-#ifdef ROTATION
-    return {lhs.f + rhs.f, lhs.torque + rhs.torque};
-#else
-    return lhs.f + rhs.f;
-#endif
+    ParticleForce result = lhs;
+    result += rhs;
+    return result;
   }
 
   ParticleForce &operator+=(ParticleForce const &rhs) {
-    return *this = *this + rhs;
+    f += rhs.f;
+#ifdef ROTATION
+    torque += rhs.torque;
+#endif
+    return *this;
   }
 
   /** force. */
@@ -320,9 +321,9 @@ struct ParticleForce {
 #endif
 
   template <class Archive> void serialize(Archive &ar, long int /* version */) {
-    ar &f;
+    ar & f;
 #ifdef ROTATION
-    ar &torque;
+    ar & torque;
 #endif
   }
 };
@@ -343,9 +344,9 @@ struct ParticleMomentum {
 #endif
 
   template <class Archive> void serialize(Archive &ar, long int /* version */) {
-    ar &v;
+    ar & v;
 #ifdef ROTATION
-    ar &omega;
+    ar & omega;
 #endif
   }
 };
@@ -363,10 +364,10 @@ struct ParticleLocal {
   double lees_edwards_offset = 0.;
 
   template <class Archive> void serialize(Archive &ar, long int /* version */) {
-    ar &ghost;
-    ar &lees_edwards_flag;
-    ar &p_old;
-    ar &lees_edwards_offset;
+    ar & ghost;
+    ar & lees_edwards_flag;
+    ar & p_old;
+    ar & lees_edwards_offset;
   }
 };
 
@@ -385,7 +386,7 @@ struct ParticleRattle {
   }
 
   template <class Archive> void serialize(Archive &ar, long int /* version */) {
-    ar &correction;
+    ar & correction;
   }
 };
 #endif
@@ -458,11 +459,11 @@ struct Particle { // NOLINT(bugprone-exception-escape)
   auto &rotation() { return p.rotation; }
   bool can_rotate() const { return static_cast<bool>(p.rotation); }
   bool can_rotate_around(unsigned int const axis) const {
-    detail::check_axis_idx_valid(axis);
+    assert(axis <= 2u);
     return detail::get_nth_bit(p.rotation, axis);
   }
   void set_can_rotate_around(unsigned int const axis, bool const rot_flag) {
-    detail::check_axis_idx_valid(axis);
+    assert(axis <= 2u);
     if (rot_flag) {
       p.rotation |= static_cast<uint8_t>(1u << axis);
     } else {
@@ -539,7 +540,7 @@ struct Particle { // NOLINT(bugprone-exception-escape)
   auto &fixed() { return p.ext_flag; }
   bool has_fixed_coordinates() const { return static_cast<bool>(p.ext_flag); }
   bool is_fixed_along(unsigned int const axis) const {
-    detail::check_axis_idx_valid(axis);
+    assert(axis <= 2u);
     return detail::get_nth_bit(p.ext_flag, axis);
   }
   void set_fixed_along(int const axis, bool const fixed_flag) {
@@ -580,14 +581,14 @@ struct Particle { // NOLINT(bugprone-exception-escape)
 private:
   friend boost::serialization::access;
   template <class Archive> void serialize(Archive &ar, long int /* version */) {
-    ar &p;
-    ar &r;
-    ar &m;
-    ar &f;
-    ar &l;
-    ar &bl;
+    ar & p;
+    ar & r;
+    ar & m;
+    ar & f;
+    ar & l;
+    ar & bl;
 #ifdef EXCLUSIONS
-    ar &el;
+    ar & el;
 #endif
   }
 };
diff --git a/src/core/ParticleIterator.hpp b/src/core/ParticleIterator.hpp
index 1db965e4e4b..03d837f91b0 100644
--- a/src/core/ParticleIterator.hpp
+++ b/src/core/ParticleIterator.hpp
@@ -74,7 +74,7 @@ struct ParticleIterator
     auto it = std::next(begin.m_cell);
 
     while (it != end.m_cell) {
-      dist += (*it)->particles().size();
+      dist += static_cast<long>((*it)->particles().size());
       ++it;
     }
 
diff --git a/src/core/accumulators.cpp b/src/core/accumulators.cpp
index bc2b8306033..db1bcee9a5f 100644
--- a/src/core/accumulators.cpp
+++ b/src/core/accumulators.cpp
@@ -18,7 +18,6 @@
  */
 #include "accumulators.hpp"
 
-#include <boost/range/algorithm/remove_if.hpp>
 #include <boost/range/numeric.hpp>
 
 #include <algorithm>
@@ -76,17 +75,13 @@ void auto_update_add(AccumulatorBase *acc) {
 
 void auto_update_remove(AccumulatorBase *acc) {
   assert(auto_update_contains(acc));
-  auto const beg = auto_update_accumulators.begin();
-  auto const end = auto_update_accumulators.end();
-  auto_update_accumulators.erase(
-      std::remove_if(beg, end, detail::MatchPredicate{acc}), end);
+  std::erase_if(auto_update_accumulators, detail::MatchPredicate{acc});
 }
 
 bool auto_update_contains(AccumulatorBase const *acc) noexcept {
   assert(acc);
-  auto const beg = auto_update_accumulators.begin();
-  auto const end = auto_update_accumulators.end();
-  return std::find_if(beg, end, detail::MatchPredicate{acc}) != end;
+  return std::ranges::any_of(auto_update_accumulators,
+                             detail::MatchPredicate{acc});
 }
 
 } // namespace Accumulators
diff --git a/src/core/accumulators/Correlator.cpp b/src/core/accumulators/Correlator.cpp
index fccadbd62cd..430b8656805 100644
--- a/src/core/accumulators/Correlator.cpp
+++ b/src/core/accumulators/Correlator.cpp
@@ -26,7 +26,6 @@
 #include <boost/archive/binary_oarchive.hpp>
 #include <boost/iostreams/device/array.hpp>
 #include <boost/iostreams/stream.hpp>
-#include <boost/range/algorithm/transform.hpp>
 #include <boost/serialization/string.hpp>
 #include <boost/serialization/vector.hpp>
 
@@ -197,10 +196,10 @@ void Correlator::initialize() {
   dim_A = A_obs->n_values();
   dim_B = B_obs->n_values();
 
-  if (dim_A == 0) {
+  if (dim_A == 0u) {
     throw std::runtime_error("dimension of first observable has to be >= 1");
   }
-  if (dim_B == 0) {
+  if (dim_B == 0u) {
     throw std::runtime_error("dimension of second observable has to be >= 1");
   }
 
@@ -212,7 +211,9 @@ void Correlator::initialize() {
     m_correlation_args = Utils::Vector3d{0, 0, 0};
   } else if (corr_operation_name == "tensor_product") {
     m_dim_corr = dim_A * dim_B;
-    m_shape = {dim_A, dim_B};
+    m_shape.clear();
+    m_shape.emplace_back(dim_A);
+    m_shape.emplace_back(dim_B);
     corr_operation = &tensor_product;
     m_correlation_args = Utils::Vector3d{0, 0, 0};
   } else if (corr_operation_name == "square_distance_componentwise") {
@@ -229,18 +230,19 @@ void Correlator::initialize() {
     }
     m_correlation_args =
         Utils::hadamard_product(m_correlation_args, m_correlation_args);
-    if (dim_A % 3)
+    if (dim_A % 3u)
       throw std::runtime_error("dimA must be divisible by 3 for fcs_acf");
-    m_dim_corr = dim_A / 3;
+    m_dim_corr = dim_A / 3u;
     m_shape = A_obs->shape();
-    if (m_shape.back() != 3)
+    if (m_shape.back() != 3u)
       throw std::runtime_error(
           "the last dimension of dimA must be 3 for fcs_acf");
     m_shape.pop_back();
     corr_operation = &fcs_acf;
   } else if (corr_operation_name == "scalar_product") {
-    m_dim_corr = 1;
-    m_shape = {1};
+    m_dim_corr = 1u;
+    m_shape.clear();
+    m_shape.emplace_back(1u);
     corr_operation = &scalar_product;
     m_correlation_args = Utils::Vector3d{0, 0, 0};
   } else {
@@ -524,8 +526,8 @@ std::vector<double> Correlator::get_correlation() {
 
 std::vector<double> Correlator::get_lag_times() const {
   std::vector<double> res(n_values());
-  boost::transform(tau, res.begin(),
-                   [dt = m_dt](auto const &a) { return a * dt; });
+  std::ranges::transform(tau, res.begin(),
+                         [dt = m_dt](auto const &a) { return a * dt; });
   return res;
 }
 
diff --git a/src/core/algorithm/periodic_fold.hpp b/src/core/algorithm/periodic_fold.hpp
index 6e00e42a420..ccdf8f0929e 100644
--- a/src/core/algorithm/periodic_fold.hpp
+++ b/src/core/algorithm/periodic_fold.hpp
@@ -20,9 +20,18 @@
 #define CORE_ALGORITHM_PERIODIC_FOLD_HPP
 
 #include <cmath>
+#include <concepts>
 #include <limits>
+#include <type_traits>
 #include <utility>
 
+// Concepts satisfied when T is an integral (resp. floating-point) type or a
+// reference to one; the reference is stripped before the check.
+template <typename T>
+concept IntegralOrRef = std::integral<std::remove_reference_t<T>>;
+template <typename T>
+concept FloatingPointOrRef = std::floating_point<std::remove_reference_t<T>>;
+
 namespace Algorithm {
 /**
  * @brief Fold value into primary interval.
@@ -32,7 +41,7 @@ namespace Algorithm {
  * @param l Length of primary interval
  * @return x folded into [0, l) and number of folds.
  */
-template <typename T, typename I>
+template <FloatingPointOrRef T, IntegralOrRef I>
 std::pair<T, I> periodic_fold(T x, I i, T const l) {
   using limits = std::numeric_limits<I>;
 
@@ -56,7 +65,7 @@ std::pair<T, I> periodic_fold(T x, I i, T const l) {
  * @param l Length of primary interval
  * @return x folded into [0, l).
  */
-template <typename T> T periodic_fold(T x, T const l) {
+template <FloatingPointOrRef T> T periodic_fold(T x, T const l) {
 #ifndef __FAST_MATH__
   /* Can't fold if either x or l is nan or inf. */
   if (std::isnan(x) or std::isnan(l) or std::isinf(x) or (l == 0)) {
diff --git a/src/core/analysis/statistics.cpp b/src/core/analysis/statistics.cpp
index 66a71a7cb20..a2f09a64dca 100644
--- a/src/core/analysis/statistics.cpp
+++ b/src/core/analysis/statistics.cpp
@@ -34,7 +34,6 @@
 #include "system/System.hpp"
 
 #include <utils/Vector.hpp>
-#include <utils/constants.hpp>
 #include <utils/contains.hpp>
 #include <utils/math/sqr.hpp>
 #include <utils/mpi/gather_buffer.hpp>
@@ -47,6 +46,7 @@
 #include <cstdlib>
 #include <functional>
 #include <limits>
+#include <numbers>
 #include <stdexcept>
 #include <tuple>
 #include <utility>
@@ -366,7 +366,7 @@ structure_factor(System::System const &system, std::vector<int> const &p_types,
   };
   auto const buf_pos = gather_traits_for_types(system, p_types, trait_pos);
   auto const order_sq = Utils::sqr(static_cast<std::size_t>(order));
-  auto const twoPI_L = 2. * Utils::pi() * system.box_geo->length_inv()[0];
+  auto const twoPI_L = 2. * std::numbers::pi * system.box_geo->length_inv()[0];
   std::vector<double> ff(2ul * order_sq + 1ul);
   std::vector<double> wavevectors;
   std::vector<double> intensities;
diff --git a/src/core/bond_breakage/bond_breakage.hpp b/src/core/bond_breakage/bond_breakage.hpp
index 604fb6c8d35..6725ba8103b 100644
--- a/src/core/bond_breakage/bond_breakage.hpp
+++ b/src/core/bond_breakage/bond_breakage.hpp
@@ -54,10 +54,10 @@ struct QueueEntry {
   // Serialization for synchronization across mpi ranks
   friend class boost::serialization::access;
   template <typename Archive>
-  void serialize(Archive &ar, const unsigned int version) {
-    ar &particle_id;
-    ar &bond_partners;
-    ar &bond_type;
+  void serialize(Archive &ar, unsigned int const /* version */) {
+    ar & particle_id;
+    ar & bond_partners;
+    ar & bond_type;
   }
 };
 
diff --git a/src/core/bond_error.cpp b/src/core/bond_error.cpp
index 45fb29fcbe7..874ab9a1953 100644
--- a/src/core/bond_error.cpp
+++ b/src/core/bond_error.cpp
@@ -23,9 +23,9 @@
 #include "error_handling/RuntimeErrorStream.hpp"
 #include "errorhandling.hpp"
 
-#include <utils/Span.hpp>
+#include <span>
 
-void bond_broken_error(int id, Utils::Span<const int> partner_ids) {
+void bond_broken_error(int id, std::span<const int> partner_ids) {
   auto error_msg = runtimeErrorMsg();
 
   error_msg << "bond broken between particles " << id;
diff --git a/src/core/bond_error.hpp b/src/core/bond_error.hpp
index de2d0a72fe3..f5a00260781 100644
--- a/src/core/bond_error.hpp
+++ b/src/core/bond_error.hpp
@@ -21,11 +21,10 @@
 #ifndef ESPRESSO_BOND_ERROR_HPP
 #define ESPRESSO_BOND_ERROR_HPP
 
-#include <utils/Span.hpp>
-
 #include <exception>
+#include <span>
 
-void bond_broken_error(int id, Utils::Span<const int> partner_ids);
+void bond_broken_error(int id, std::span<const int> partner_ids);
 
 /**
  * Exception indicating that a particle id
diff --git a/src/core/bonded_interactions/angle_cosine.hpp b/src/core/bonded_interactions/angle_cosine.hpp
index c67e76ee9c4..681e7388a94 100644
--- a/src/core/bonded_interactions/angle_cosine.hpp
+++ b/src/core/bonded_interactions/angle_cosine.hpp
@@ -66,10 +66,10 @@ struct AngleCosineBond {
   friend boost::serialization::access;
   template <typename Archive>
   void serialize(Archive &ar, long int /* version */) {
-    ar &bend;
-    ar &phi0;
-    ar &cos_phi0;
-    ar &sin_phi0;
+    ar & bend;
+    ar & phi0;
+    ar & cos_phi0;
+    ar & sin_phi0;
   }
 };
 
diff --git a/src/core/bonded_interactions/angle_cossquare.hpp b/src/core/bonded_interactions/angle_cossquare.hpp
index 1b59aa53acc..0b1a45136d4 100644
--- a/src/core/bonded_interactions/angle_cossquare.hpp
+++ b/src/core/bonded_interactions/angle_cossquare.hpp
@@ -62,9 +62,9 @@ struct AngleCossquareBond {
   friend boost::serialization::access;
   template <typename Archive>
   void serialize(Archive &ar, long int /* version */) {
-    ar &bend;
-    ar &phi0;
-    ar &cos_phi0;
+    ar & bend;
+    ar & phi0;
+    ar & cos_phi0;
   }
 };
 
diff --git a/src/core/bonded_interactions/angle_harmonic.hpp b/src/core/bonded_interactions/angle_harmonic.hpp
index 407209e525c..5aa1d77e04d 100644
--- a/src/core/bonded_interactions/angle_harmonic.hpp
+++ b/src/core/bonded_interactions/angle_harmonic.hpp
@@ -59,8 +59,8 @@ struct AngleHarmonicBond {
   friend boost::serialization::access;
   template <typename Archive>
   void serialize(Archive &ar, long int /* version */) {
-    ar &bend;
-    ar &phi0;
+    ar & bend;
+    ar & phi0;
   }
 };
 
diff --git a/src/core/bonded_interactions/bonded_coulomb.hpp b/src/core/bonded_interactions/bonded_coulomb.hpp
index d34496047a3..bb51b8483c1 100644
--- a/src/core/bonded_interactions/bonded_coulomb.hpp
+++ b/src/core/bonded_interactions/bonded_coulomb.hpp
@@ -30,9 +30,8 @@
 
 #include <utils/Vector.hpp>
 
-#include <boost/optional.hpp>
-
 #include <cmath>
+#include <optional>
 
 /** Parameters for Coulomb bond Potential */
 struct BondedCoulomb {
@@ -45,15 +44,15 @@ struct BondedCoulomb {
 
   BondedCoulomb(double prefactor) { this->prefactor = prefactor; }
 
-  boost::optional<Utils::Vector3d> force(double q1q2,
-                                         Utils::Vector3d const &dx) const;
-  boost::optional<double> energy(double q1q2, Utils::Vector3d const &dx) const;
+  std::optional<Utils::Vector3d> force(double q1q2,
+                                       Utils::Vector3d const &dx) const;
+  std::optional<double> energy(double q1q2, Utils::Vector3d const &dx) const;
 
 private:
   friend boost::serialization::access;
   template <typename Archive>
   void serialize(Archive &ar, long int /* version */) {
-    ar &prefactor;
+    ar & prefactor;
   }
 };
 
@@ -61,7 +60,7 @@ struct BondedCoulomb {
  *  @param[in]  q1q2      Product of the particle charges.
  *  @param[in]  dx        Distance between the particles.
  */
-inline boost::optional<Utils::Vector3d>
+inline std::optional<Utils::Vector3d>
 BondedCoulomb::force(double const q1q2, Utils::Vector3d const &dx) const {
 #ifdef ELECTROSTATICS
   auto const dist2 = dx.norm2();
@@ -77,7 +76,7 @@ BondedCoulomb::force(double const q1q2, Utils::Vector3d const &dx) const {
  *  @param[in]  q1q2      Product of the particle charges.
  *  @param[in]  dx        Distance between the particles.
  */
-inline boost::optional<double>
+inline std::optional<double>
 BondedCoulomb::energy(double const q1q2, Utils::Vector3d const &dx) const {
 #ifdef ELECTROSTATICS
   auto const dist = dx.norm();
diff --git a/src/core/bonded_interactions/bonded_coulomb_sr.hpp b/src/core/bonded_interactions/bonded_coulomb_sr.hpp
index 0e4023b8c17..fa7fbb8e3b2 100644
--- a/src/core/bonded_interactions/bonded_coulomb_sr.hpp
+++ b/src/core/bonded_interactions/bonded_coulomb_sr.hpp
@@ -33,10 +33,9 @@
 
 #include <utils/Vector.hpp>
 
-#include <boost/optional.hpp>
-
 #include <cmath>
 #include <functional>
+#include <optional>
 
 /** Parameters for Coulomb bond short-range Potential */
 struct BondedCoulombSR {
@@ -49,11 +48,11 @@ struct BondedCoulombSR {
 
   BondedCoulombSR(double q1q2) { this->q1q2 = q1q2; }
 
-  boost::optional<Utils::Vector3d>
+  std::optional<Utils::Vector3d>
   force(Utils::Vector3d const &dx,
         std::function<Utils::Vector3d(double, Utils::Vector3d const &,
                                       double)> const &kernel) const;
-  boost::optional<double>
+  std::optional<double>
   energy(Particle const &p1, Particle const &p2, Utils::Vector3d const &dx,
          std::function<double(Particle const &, Particle const &, double,
                               Utils::Vector3d const &, double)> const &kernel)
@@ -63,7 +62,7 @@ struct BondedCoulombSR {
   friend boost::serialization::access;
   template <typename Archive>
   void serialize(Archive &ar, long int /* version */) {
-    ar &q1q2;
+    ar & q1q2;
   }
 };
 
@@ -71,7 +70,7 @@ struct BondedCoulombSR {
  *  @param[in]  dx        Distance between the particles.
  *  @param[in]  kernel    Coulomb force kernel.
  */
-inline boost::optional<Utils::Vector3d> BondedCoulombSR::force(
+inline std::optional<Utils::Vector3d> BondedCoulombSR::force(
     Utils::Vector3d const &dx,
     std::function<Utils::Vector3d(double, Utils::Vector3d const &,
                                   double)> const &kernel) const {
@@ -88,7 +87,7 @@ inline boost::optional<Utils::Vector3d> BondedCoulombSR::force(
  *  @param[in]  dx        Distance between the particles.
  *  @param[in]  kernel    Coulomb energy kernel.
  */
-inline boost::optional<double> BondedCoulombSR::energy(
+inline std::optional<double> BondedCoulombSR::energy(
     Particle const &p1, Particle const &p2, Utils::Vector3d const &dx,
     std::function<double(Particle const &, Particle const &, double,
                          Utils::Vector3d const &, double)> const &kernel)
diff --git a/src/core/bonded_interactions/bonded_interaction_data.cpp b/src/core/bonded_interactions/bonded_interaction_data.cpp
index ab4750e9b57..cbb4441f4d9 100644
--- a/src/core/bonded_interactions/bonded_interaction_data.cpp
+++ b/src/core/bonded_interactions/bonded_interaction_data.cpp
@@ -21,11 +21,8 @@
 #include "system/System.hpp"
 #include "thermalized_bond.hpp"
 
-#include <boost/range/numeric.hpp>
 #include <boost/variant.hpp>
 
-#include <utils/constants.hpp>
-
 #include <algorithm>
 #include <cstddef>
 #include <vector>
@@ -41,8 +38,8 @@ class BondCutoff : public boost::static_visitor<double> {
 };
 
 double maximal_cutoff_bonded() {
-  auto const max_cut_bonded = boost::accumulate(
-      bonded_ia_params, BONDED_INACTIVE_CUTOFF,
+  auto const max_cut_bonded = std::accumulate(
+      bonded_ia_params.begin(), bonded_ia_params.end(), BONDED_INACTIVE_CUTOFF,
       [](auto max_cut, auto const &kv) {
         return std::max(max_cut,
                         boost::apply_visitor(BondCutoff(), *kv.second));
diff --git a/src/core/bonded_interactions/bonded_interactions.dox b/src/core/bonded_interactions/bonded_interactions.dox
index a7d454ce1cf..251d7711e2f 100644
--- a/src/core/bonded_interactions/bonded_interactions.dox
+++ b/src/core/bonded_interactions/bonded_interactions.dox
@@ -67,12 +67,12 @@
  *    is @ref BONDED_INACTIVE_CUTOFF to ensure that it is always skipped by
  *    the short-range loop.
  *  * @code{.cpp}
- *    boost::optional<Utils::Vector3d> force(Utils::Vector3d const &dx) const;
+ *    std::optional<Utils::Vector3d> force(Utils::Vector3d const &dx) const;
  *    @endcode
  *    This function returns the bond force. If it is a bond involving three
  *    or four particles, a @c std::tuple with three or four force vectors
  *    has to be returned, respectively.
- *    - The returned value is in a @c boost::optional container if the bond is
+ *    - The returned value is in a @c std::optional container if the bond is
  *      breakable. If the bond is broken, the returned object is empty; this
  *      will stop the integrator with a runtime error.
  *    - The function can make use of a pre-calculated distance vector (\p dx)
@@ -83,7 +83,7 @@
  *      a look at @ref forces_inline.hpp to see where this function will be
  *      called and which other variables may be available for your calculation.
  *  * @code{.cpp}
- *    boost::optional<double> energy(Utils::Vector3d const &dx) const;
+ *    std::optional<double> energy(Utils::Vector3d const &dx) const;
  *    @endcode
  *    This function returns the bond energy. The same information as given for
  *    the force calculation above applies here. This function will be called
diff --git a/src/core/bonded_interactions/bonded_tab.hpp b/src/core/bonded_interactions/bonded_tab.hpp
index 130461b5d1c..346f004dd9d 100644
--- a/src/core/bonded_interactions/bonded_tab.hpp
+++ b/src/core/bonded_interactions/bonded_tab.hpp
@@ -33,15 +33,15 @@
 #include "bonded_interactions/dihedral.hpp"
 
 #include <utils/Vector.hpp>
-#include <utils/constants.hpp>
 #include <utils/math/sqr.hpp>
 
-#include <boost/optional.hpp>
 #include <boost/serialization/shared_ptr.hpp>
 
 #include <cassert>
 #include <cmath>
 #include <memory>
+#include <numbers>
+#include <optional>
 #include <tuple>
 #include <vector>
 
@@ -66,7 +66,7 @@ struct TabulatedBond {
   friend boost::serialization::access;
   template <typename Archive>
   void serialize(Archive &ar, long int /* version */) {
-    ar &pot;
+    ar & pot;
   }
 };
 
@@ -84,8 +84,8 @@ struct TabulatedDistanceBond : public TabulatedBond {
     this->pot->maxval = max;
   }
 
-  boost::optional<Utils::Vector3d> force(Utils::Vector3d const &dx) const;
-  boost::optional<double> energy(Utils::Vector3d const &dx) const;
+  std::optional<Utils::Vector3d> force(Utils::Vector3d const &dx) const;
+  std::optional<double> energy(Utils::Vector3d const &dx) const;
 };
 
 /** Parameters for 3-body tabulated potential. */
@@ -98,7 +98,7 @@ struct TabulatedAngleBond : public TabulatedBond {
                      std::vector<double> const &force)
       : TabulatedBond(min, max, energy, force) {
     this->pot->minval = 0.;
-    this->pot->maxval = Utils::pi() + ROUND_ERROR_PREC;
+    this->pot->maxval = std::numbers::pi + ROUND_ERROR_PREC;
   }
 
   std::tuple<Utils::Vector3d, Utils::Vector3d, Utils::Vector3d>
@@ -117,16 +117,16 @@ struct TabulatedDihedralBond : public TabulatedBond {
                         std::vector<double> const &force)
       : TabulatedBond(min, max, energy, force) {
     this->pot->minval = 0.;
-    this->pot->maxval = 2. * Utils::pi() + ROUND_ERROR_PREC;
+    this->pot->maxval = 2. * std::numbers::pi + ROUND_ERROR_PREC;
   }
 
-  boost::optional<std::tuple<Utils::Vector3d, Utils::Vector3d, Utils::Vector3d,
-                             Utils::Vector3d>>
+  std::optional<std::tuple<Utils::Vector3d, Utils::Vector3d, Utils::Vector3d,
+                           Utils::Vector3d>>
   forces(Utils::Vector3d const &v12, Utils::Vector3d const &v23,
          Utils::Vector3d const &v34) const;
-  boost::optional<double> energy(Utils::Vector3d const &v12,
-                                 Utils::Vector3d const &v23,
-                                 Utils::Vector3d const &v34) const;
+  std::optional<double> energy(Utils::Vector3d const &v12,
+                               Utils::Vector3d const &v23,
+                               Utils::Vector3d const &v34) const;
 };
 
 /** Compute a tabulated bond length force.
@@ -137,7 +137,7 @@ struct TabulatedDihedralBond : public TabulatedBond {
  *
  *  @param[in]  dx        Distance between the particles.
  */
-inline boost::optional<Utils::Vector3d>
+inline std::optional<Utils::Vector3d>
 TabulatedDistanceBond::force(Utils::Vector3d const &dx) const {
   auto const dist = dx.norm();
 
@@ -156,7 +156,7 @@ TabulatedDistanceBond::force(Utils::Vector3d const &dx) const {
  *
  *  @param[in]  dx        Distance between the particles.
  */
-inline boost::optional<double>
+inline std::optional<double>
 TabulatedDistanceBond::energy(Utils::Vector3d const &dx) const {
   auto const dist = dx.norm();
 
@@ -218,8 +218,8 @@ inline double TabulatedAngleBond::energy(Utils::Vector3d const &vec1,
  *  @param[in] v34  Vector from @p p3 to @p p4
  *  @return the forces on @p p2, @p p1, @p p3
  */
-inline boost::optional<std::tuple<Utils::Vector3d, Utils::Vector3d,
-                                  Utils::Vector3d, Utils::Vector3d>>
+inline std::optional<std::tuple<Utils::Vector3d, Utils::Vector3d,
+                                Utils::Vector3d, Utils::Vector3d>>
 TabulatedDihedralBond::forces(Utils::Vector3d const &v12,
                               Utils::Vector3d const &v23,
                               Utils::Vector3d const &v34) const {
@@ -263,7 +263,7 @@ TabulatedDihedralBond::forces(Utils::Vector3d const &v12,
  *  @param[in] v23  Vector from @p p2 to @p p3
  *  @param[in] v34  Vector from @p p3 to @p p4
  */
-inline boost::optional<double>
+inline std::optional<double>
 TabulatedDihedralBond::energy(Utils::Vector3d const &v12,
                               Utils::Vector3d const &v23,
                               Utils::Vector3d const &v34) const {
diff --git a/src/core/bonded_interactions/dihedral.hpp b/src/core/bonded_interactions/dihedral.hpp
index 64fa9fde841..ff57332c912 100644
--- a/src/core/bonded_interactions/dihedral.hpp
+++ b/src/core/bonded_interactions/dihedral.hpp
@@ -31,11 +31,10 @@
 #include "config/config.hpp"
 
 #include <utils/Vector.hpp>
-#include <utils/constants.hpp>
-
-#include <boost/optional.hpp>
 
 #include <cmath>
+#include <numbers>
+#include <optional>
 #include <tuple>
 
 /** Parameters for four-body angular potential (dihedral-angle potentials). */
@@ -54,22 +53,22 @@ struct DihedralBond {
     this->phase = phase;
   }
 
-  boost::optional<std::tuple<Utils::Vector3d, Utils::Vector3d, Utils::Vector3d,
-                             Utils::Vector3d>>
+  std::optional<std::tuple<Utils::Vector3d, Utils::Vector3d, Utils::Vector3d,
+                           Utils::Vector3d>>
   forces(Utils::Vector3d const &v12, Utils::Vector3d const &v23,
          Utils::Vector3d const &v34) const;
 
-  boost::optional<double> energy(Utils::Vector3d const &v12,
-                                 Utils::Vector3d const &v23,
-                                 Utils::Vector3d const &v34) const;
+  std::optional<double> energy(Utils::Vector3d const &v12,
+                               Utils::Vector3d const &v23,
+                               Utils::Vector3d const &v34) const;
 
 private:
   friend boost::serialization::access;
   template <typename Archive>
   void serialize(Archive &ar, long int /* version */) {
-    ar &mult;
-    ar &bend;
-    ar &phase;
+    ar & mult;
+    ar & bend;
+    ar & phase;
   }
 };
 
@@ -110,8 +109,8 @@ inline bool calc_dihedral_angle(Utils::Vector3d const &a,
 
   /* catch case of undefined dihedral angle */
   if (l_aXb <= TINY_LENGTH_VALUE || l_bXc <= TINY_LENGTH_VALUE) {
-    phi = -1.0;
-    cosphi = 0.0;
+    phi = -1.;
+    cosphi = 0.;
     return true;
   }
 
@@ -120,13 +119,13 @@ inline bool calc_dihedral_angle(Utils::Vector3d const &a,
 
   cosphi = aXb * bXc;
 
-  if (fabs(fabs(cosphi) - 1) < TINY_SIN_VALUE)
+  if (fabs(fabs(cosphi) - 1.) < TINY_SIN_VALUE)
     cosphi = std::round(cosphi);
 
   /* Calculate dihedral angle */
   phi = acos(cosphi);
-  if ((aXb * c) < 0.0)
-    phi = (2.0 * Utils::pi()) - phi;
+  if ((aXb * c) < 0.)
+    phi = 2. * std::numbers::pi - phi;
   return false;
 }
 
@@ -139,8 +138,8 @@ inline bool calc_dihedral_angle(Utils::Vector3d const &a,
  *  @param[in] v34  Vector from @p p3 to @p p4
  *  @return the forces on @p p2, @p p1, @p p3
  */
-inline boost::optional<std::tuple<Utils::Vector3d, Utils::Vector3d,
-                                  Utils::Vector3d, Utils::Vector3d>>
+inline std::optional<std::tuple<Utils::Vector3d, Utils::Vector3d,
+                                Utils::Vector3d, Utils::Vector3d>>
 DihedralBond::forces(Utils::Vector3d const &v12, Utils::Vector3d const &v23,
                      Utils::Vector3d const &v34) const {
   /* vectors for dihedral angle calculation */
@@ -193,7 +192,7 @@ DihedralBond::forces(Utils::Vector3d const &v12, Utils::Vector3d const &v23,
  *  @param[in] v23  Vector from @p p2 to @p p3
  *  @param[in] v34  Vector from @p p3 to @p p4
  */
-inline boost::optional<double>
+inline std::optional<double>
 DihedralBond::energy(Utils::Vector3d const &v12, Utils::Vector3d const &v23,
                      Utils::Vector3d const &v34) const {
   /* vectors for dihedral calculations. */
diff --git a/src/core/bonded_interactions/fene.hpp b/src/core/bonded_interactions/fene.hpp
index 0866f9ea663..82d2ec8c43c 100644
--- a/src/core/bonded_interactions/fene.hpp
+++ b/src/core/bonded_interactions/fene.hpp
@@ -31,9 +31,8 @@
 #include <utils/Vector.hpp>
 #include <utils/math/sqr.hpp>
 
-#include <boost/optional.hpp>
-
 #include <cmath>
+#include <optional>
 
 /** Parameters for FENE bond Potential. */
 struct FeneBond {
@@ -61,25 +60,25 @@ struct FeneBond {
     this->drmax2i = 1. / this->drmax2;
   }
 
-  boost::optional<Utils::Vector3d> force(Utils::Vector3d const &dx) const;
-  boost::optional<double> energy(Utils::Vector3d const &dx) const;
+  std::optional<Utils::Vector3d> force(Utils::Vector3d const &dx) const;
+  std::optional<double> energy(Utils::Vector3d const &dx) const;
 
 private:
   friend boost::serialization::access;
   template <typename Archive>
   void serialize(Archive &ar, long int /* version */) {
-    ar &k;
-    ar &drmax;
-    ar &r0;
-    ar &drmax2;
-    ar &drmax2i;
+    ar & k;
+    ar & drmax;
+    ar & r0;
+    ar & drmax2;
+    ar & drmax2i;
   }
 };
 
 /** Compute the FENE bond force.
  *  @param[in]  dx        Distance between the particles.
  */
-inline boost::optional<Utils::Vector3d>
+inline std::optional<Utils::Vector3d>
 FeneBond::force(Utils::Vector3d const &dx) const {
   auto const len = dx.norm();
   auto const dr = len - r0;
@@ -104,8 +103,7 @@ FeneBond::force(Utils::Vector3d const &dx) const {
 /** Compute the FENE bond energy.
  *  @param[in]  dx        Distance between the particles.
  */
-inline boost::optional<double>
-FeneBond::energy(Utils::Vector3d const &dx) const {
+inline std::optional<double> FeneBond::energy(Utils::Vector3d const &dx) const {
   /* compute bond stretching (r-r0) */
   double const dr = dx.norm() - r0;
 
diff --git a/src/core/bonded_interactions/harmonic.hpp b/src/core/bonded_interactions/harmonic.hpp
index ea091a3a559..1c9cde00bf8 100644
--- a/src/core/bonded_interactions/harmonic.hpp
+++ b/src/core/bonded_interactions/harmonic.hpp
@@ -31,7 +31,7 @@
 #include <utils/Vector.hpp>
 #include <utils/math/sqr.hpp>
 
-#include <boost/optional.hpp>
+#include <optional>
 
 /** Parameters for harmonic bond Potential */
 struct HarmonicBond {
@@ -52,23 +52,23 @@ struct HarmonicBond {
     this->r_cut = r_cut;
   }
 
-  boost::optional<Utils::Vector3d> force(Utils::Vector3d const &dx) const;
-  boost::optional<double> energy(Utils::Vector3d const &dx) const;
+  std::optional<Utils::Vector3d> force(Utils::Vector3d const &dx) const;
+  std::optional<double> energy(Utils::Vector3d const &dx) const;
 
 private:
   friend boost::serialization::access;
   template <typename Archive>
   void serialize(Archive &ar, long int /* version */) {
-    ar &k;
-    ar &r;
-    ar &r_cut;
+    ar & k;
+    ar & r;
+    ar & r_cut;
   }
 };
 
 /** Compute the harmonic bond force.
  *  @param[in]  dx        Distance between the particles.
  */
-inline boost::optional<Utils::Vector3d>
+inline std::optional<Utils::Vector3d>
 HarmonicBond::force(Utils::Vector3d const &dx) const {
   auto const dist = dx.norm();
 
@@ -92,7 +92,7 @@ HarmonicBond::force(Utils::Vector3d const &dx) const {
 /** Compute the harmonic bond energy.
  *  @param[in]  dx        Distance between the particles.
  */
-inline boost::optional<double>
+inline std::optional<double>
 HarmonicBond::energy(Utils::Vector3d const &dx) const {
   auto const dist = dx.norm();
 
diff --git a/src/core/bonded_interactions/quartic.hpp b/src/core/bonded_interactions/quartic.hpp
index 9d724c58d40..eb37ccce7ad 100644
--- a/src/core/bonded_interactions/quartic.hpp
+++ b/src/core/bonded_interactions/quartic.hpp
@@ -30,7 +30,7 @@
 #include <utils/math/int_pow.hpp>
 #include <utils/math/sqr.hpp>
 
-#include <boost/optional.hpp>
+#include <optional>
 
 /** Parameters for quartic bond Potential */
 struct QuarticBond {
@@ -49,24 +49,24 @@ struct QuarticBond {
     this->r_cut = r_cut;
   }
 
-  boost::optional<Utils::Vector3d> force(Utils::Vector3d const &dx) const;
-  boost::optional<double> energy(Utils::Vector3d const &dx) const;
+  std::optional<Utils::Vector3d> force(Utils::Vector3d const &dx) const;
+  std::optional<double> energy(Utils::Vector3d const &dx) const;
 
 private:
   friend boost::serialization::access;
   template <typename Archive>
   void serialize(Archive &ar, long int /* version */) {
-    ar &k0;
-    ar &k1;
-    ar &r;
-    ar &r_cut;
+    ar & k0;
+    ar & k1;
+    ar & r;
+    ar & r_cut;
   }
 };
 
 /** Compute the quartic bond force.
  *  @param[in]  dx        Distance between the particles.
  */
-inline boost::optional<Utils::Vector3d>
+inline std::optional<Utils::Vector3d>
 QuarticBond::force(Utils::Vector3d const &dx) const {
   auto const dist = dx.norm();
 
@@ -91,7 +91,7 @@ QuarticBond::force(Utils::Vector3d const &dx) const {
 /** Compute the quartic bond energy.
  *  @param[in]  dx        Distance between the particles.
  */
-inline boost::optional<double>
+inline std::optional<double>
 QuarticBond::energy(Utils::Vector3d const &dx) const {
   auto const dist = dx.norm();
 
diff --git a/src/core/bonded_interactions/rigid_bond.hpp b/src/core/bonded_interactions/rigid_bond.hpp
index 9767def295d..3b0fbb7d4cc 100644
--- a/src/core/bonded_interactions/rigid_bond.hpp
+++ b/src/core/bonded_interactions/rigid_bond.hpp
@@ -56,8 +56,8 @@ struct RigidBond {
   friend boost::serialization::access;
   template <typename Archive>
   void serialize(Archive &ar, long int /* version */) {
-    ar &d2;
-    ar &p_tol;
-    ar &v_tol;
+    ar & d2;
+    ar & p_tol;
+    ar & v_tol;
   }
 };
diff --git a/src/core/bonded_interactions/thermalized_bond.hpp b/src/core/bonded_interactions/thermalized_bond.hpp
index d315fce101e..7cf9e9a0bf2 100644
--- a/src/core/bonded_interactions/thermalized_bond.hpp
+++ b/src/core/bonded_interactions/thermalized_bond.hpp
@@ -29,9 +29,8 @@
 
 #include <utils/Vector.hpp>
 
-#include <boost/optional.hpp>
-
 #include <cmath>
+#include <optional>
 #include <tuple>
 
 /** Parameters for Thermalized bond */
@@ -71,7 +70,7 @@ struct ThermalizedBond {
     pref2_dist = std::sqrt(24. * gamma_distance / time_step * temp_distance);
   }
 
-  boost::optional<std::tuple<Utils::Vector3d, Utils::Vector3d>>
+  std::optional<std::tuple<Utils::Vector3d, Utils::Vector3d>>
   forces(Particle const &p1, Particle const &p2,
          Utils::Vector3d const &dx) const;
 
@@ -79,14 +78,14 @@ struct ThermalizedBond {
   friend boost::serialization::access;
   template <typename Archive>
   void serialize(Archive &ar, long int /* version */) {
-    ar &temp_com;
-    ar &gamma_com;
-    ar &temp_distance;
-    ar &gamma_distance;
-    ar &r_cut;
-    ar &pref1_com;
-    ar &pref2_com;
-    ar &pref1_dist;
-    ar &pref2_dist;
+    ar & temp_com;
+    ar & gamma_com;
+    ar & temp_distance;
+    ar & gamma_distance;
+    ar & r_cut;
+    ar & pref1_com;
+    ar & pref2_com;
+    ar & pref1_dist;
+    ar & pref2_dist;
   }
 };
diff --git a/src/core/bonded_interactions/thermalized_bond_kernel.hpp b/src/core/bonded_interactions/thermalized_bond_kernel.hpp
index 6f0dc132f2a..2b13b08a8d6 100644
--- a/src/core/bonded_interactions/thermalized_bond_kernel.hpp
+++ b/src/core/bonded_interactions/thermalized_bond_kernel.hpp
@@ -30,9 +30,8 @@
 
 #include <utils/Vector.hpp>
 
-#include <boost/optional.hpp>
-
 #include <cmath>
+#include <optional>
 #include <tuple>
 
 /** Separately thermalizes the com and distance of a particle pair.
@@ -41,7 +40,7 @@
  *  @param[in]  dx        Distance between the particles.
  *  @return the forces on @p p1 and @p p2
  */
-inline boost::optional<std::tuple<Utils::Vector3d, Utils::Vector3d>>
+inline std::optional<std::tuple<Utils::Vector3d, Utils::Vector3d>>
 ThermalizedBond::forces(Particle const &p1, Particle const &p2,
                         Utils::Vector3d const &dx) const {
   // Bond broke?
diff --git a/src/core/cell_system/AtomDecomposition.cpp b/src/core/cell_system/AtomDecomposition.cpp
index bd538039767..d0f7266bd2e 100644
--- a/src/core/cell_system/AtomDecomposition.cpp
+++ b/src/core/cell_system/AtomDecomposition.cpp
@@ -51,6 +51,7 @@ void AtomDecomposition::configure_neighbors() {
 
   local().m_neighbors = Neighbors<Cell *>(red_neighbors, black_neighbors);
 }
+
 GhostCommunicator AtomDecomposition::prepare_comm() {
   /* no need for comm for only 1 node */
   if (m_comm.size() == 1) {
@@ -68,6 +69,7 @@ GhostCommunicator AtomDecomposition::prepare_comm() {
 
   return ghost_comm;
 }
+
 void AtomDecomposition::configure_comms() {
   m_exchange_ghosts_comm = prepare_comm();
   m_collect_ghost_force_comm = prepare_comm();
@@ -90,6 +92,7 @@ void AtomDecomposition::configure_comms() {
     }
   }
 }
+
 void AtomDecomposition::mark_cells() {
   m_local_cells.resize(1, std::addressof(local()));
   m_ghost_cells.clear();
@@ -99,6 +102,7 @@ void AtomDecomposition::mark_cells() {
     }
   }
 }
+
 void AtomDecomposition::resort(bool global_flag,
                                std::vector<ParticleChange> &diff) {
   for (auto &p : local().particles()) {
diff --git a/src/core/cell_system/AtomDecomposition.hpp b/src/core/cell_system/AtomDecomposition.hpp
index 60284018f34..4b897629db7 100644
--- a/src/core/cell_system/AtomDecomposition.hpp
+++ b/src/core/cell_system/AtomDecomposition.hpp
@@ -29,12 +29,12 @@
 #include "Particle.hpp"
 #include "ghosts.hpp"
 
-#include <utils/Span.hpp>
 #include <utils/Vector.hpp>
 
 #include <boost/mpi/communicator.hpp>
-#include <boost/optional.hpp>
 
+#include <optional>
+#include <span>
 #include <utility>
 #include <vector>
 
@@ -75,12 +75,8 @@ class AtomDecomposition : public ParticleDecomposition {
     return m_collect_ghost_force_comm;
   }
 
-  Utils::Span<Cell *const> local_cells() const override {
-    return Utils::make_span(m_local_cells);
-  }
-  Utils::Span<Cell *const> ghost_cells() const override {
-    return Utils::make_span(m_ghost_cells);
-  }
+  std::span<Cell *const> local_cells() const override { return m_local_cells; }
+  std::span<Cell *const> ghost_cells() const override { return m_ghost_cells; }
 
   /* Getter needed for HybridDecomposition */
   auto const &get_local_cells() const { return m_local_cells; }
@@ -106,7 +102,7 @@ class AtomDecomposition : public ParticleDecomposition {
 
   /* Return true if minimum image convention is
    * needed for distance calculation. */
-  boost::optional<BoxGeometry> minimum_image_distance() const override {
+  std::optional<BoxGeometry> minimum_image_distance() const override {
     return m_box;
   }
 
diff --git a/src/core/cell_system/Cell.hpp b/src/core/cell_system/Cell.hpp
index c15d1a4a3db..944642d8d6b 100644
--- a/src/core/cell_system/Cell.hpp
+++ b/src/core/cell_system/Cell.hpp
@@ -22,11 +22,10 @@
 #include "Particle.hpp"
 #include "ParticleList.hpp"
 
-#include <utils/Span.hpp>
-
 #include <boost/range/iterator_range.hpp>
 
 #include <algorithm>
+#include <span>
 #include <utility>
 #include <vector>
 
@@ -56,8 +55,8 @@ template <class CellRef> class Neighbors {
     return *this;
   }
 
-  Neighbors(Utils::Span<const CellRef> red_neighbors,
-            Utils::Span<const CellRef> black_neighbors) {
+  Neighbors(std::span<const CellRef> red_neighbors,
+            std::span<const CellRef> black_neighbors) {
     m_neighbors.resize(red_neighbors.size() + black_neighbors.size());
     m_red_black_divider = std::copy(red_neighbors.begin(), red_neighbors.end(),
                                     m_neighbors.begin());
diff --git a/src/core/cell_system/CellStructure.cpp b/src/core/cell_system/CellStructure.cpp
index 8913d29376a..91c5bee4a7f 100644
--- a/src/core/cell_system/CellStructure.cpp
+++ b/src/core/cell_system/CellStructure.cpp
@@ -37,13 +37,14 @@
 #include <utils/contains.hpp>
 
 #include <boost/mpi/collectives/all_reduce.hpp>
-#include <boost/range/algorithm/min_element.hpp>
 #include <boost/variant.hpp>
 
 #include <algorithm>
 #include <cassert>
+#include <cstddef>
 #include <iterator>
 #include <memory>
+#include <optional>
 #include <set>
 #include <stdexcept>
 #include <string>
@@ -69,7 +70,7 @@ void CellStructure::check_particle_index() const {
   }
 
   /* checks: local particle id */
-  int local_part_cnt = 0;
+  std::size_t local_part_cnt = 0u;
   for (int n = 0; n < get_max_local_particle_id() + 1; n++) {
     if (get_local_particle(n) != nullptr) {
       local_part_cnt++;
@@ -248,12 +249,13 @@ void CellStructure::set_atom_decomposition() {
   system.on_cell_structure_change();
 }
 
-void CellStructure::set_regular_decomposition(double range) {
+void CellStructure::set_regular_decomposition(
+    double range, std::optional<std::pair<int, int>> fully_connected_boundary) {
   auto &system = get_system();
   auto &local_geo = *system.local_geo;
   auto const &box_geo = *system.box_geo;
   set_particle_decomposition(std::make_unique<RegularDecomposition>(
-      ::comm_cart, range, box_geo, local_geo));
+      ::comm_cart, range, box_geo, local_geo, fully_connected_boundary));
   m_type = CellStructureType::REGULAR;
   local_geo.set_cell_structure_type(m_type);
   system.on_cell_structure_change();
@@ -289,7 +291,7 @@ void CellStructure::set_verlet_skin_heuristic() {
   }
   /* maximal skin that can be used without resorting is the maximal
    * range of the cell system minus what is needed for interactions. */
-  auto const max_range = *boost::min_element(max_cutoff());
+  auto const max_range = std::ranges::min(max_cutoff());
   auto const new_skin = std::min(0.4 * max_cut, max_range - max_cut);
   set_verlet_skin(new_skin);
 }
diff --git a/src/core/cell_system/CellStructure.hpp b/src/core/cell_system/CellStructure.hpp
index 107629e447e..9993fffc3dd 100644
--- a/src/core/cell_system/CellStructure.hpp
+++ b/src/core/cell_system/CellStructure.hpp
@@ -46,7 +46,9 @@
 #include <cassert>
 #include <iterator>
 #include <memory>
+#include <optional>
 #include <set>
+#include <span>
 #include <stdexcept>
 #include <utility>
 #include <vector>
@@ -84,11 +86,10 @@ enum DataPart : unsigned {
 unsigned map_data_parts(unsigned data_parts);
 
 namespace Cells {
-inline ParticleRange particles(Utils::Span<Cell *const> cells) {
+inline ParticleRange particles(std::span<Cell *const> cells) {
   /* Find first non-empty cell */
-  auto first_non_empty =
-      std::find_if(cells.begin(), cells.end(),
-                   [](const Cell *c) { return not c->particles().empty(); });
+  auto first_non_empty = std::ranges::find_if(
+      cells, [](const Cell *c) { return not c->particles().empty(); });
 
   return {CellParticleIterator(first_non_empty, cells.end()),
           CellParticleIterator(cells.end())};
@@ -254,8 +255,8 @@ struct CellStructure : public System::Leaf<CellStructure> {
 
   template <class InputRange, class OutputIterator>
   void get_local_particles(InputRange ids, OutputIterator out) {
-    boost::transform(ids, out,
-                     [this](int id) { return get_local_particle(id); });
+    std::ranges::transform(ids, out,
+                           [this](int id) { return get_local_particle(id); });
   }
 
   CellStructureType decomposition_type() const { return m_type; }
@@ -479,13 +480,12 @@ struct CellStructure : public System::Leaf<CellStructure> {
    * @param partner_ids Ids to resolve.
    * @return Vector of Particle pointers.
    */
-  auto resolve_bond_partners(Utils::Span<const int> partner_ids) {
+  auto resolve_bond_partners(std::span<const int> partner_ids) {
     boost::container::static_vector<Particle *, 4> partners;
     get_local_particles(partner_ids, std::back_inserter(partners));
 
     /* Check if id resolution failed for any partner */
-    if (std::any_of(partners.begin(), partners.end(),
-                    [](Particle const *const p) { return p == nullptr; })) {
+    if (std::ranges::find(partners, nullptr) != partners.end()) {
       throw BondResolutionError{};
     }
 
@@ -495,7 +495,7 @@ struct CellStructure : public System::Leaf<CellStructure> {
   /**
    * @brief Execute kernel for every bond on particle.
    * @tparam Handler Callable, which can be invoked with
-   *                 (Particle, int, Utils::Span<Particle *>),
+   *                 (Particle, int, std::span<Particle *>),
    *                 returning a bool.
    * @param p Particles for whom the bonds are evaluated.
    * @param handler is called for every bond, and handed
@@ -510,10 +510,8 @@ struct CellStructure : public System::Leaf<CellStructure> {
 
       try {
         auto partners = resolve_bond_partners(partner_ids);
-
-        auto const bond_broken =
-            handler(p, bond.bond_id(), Utils::make_span(partners));
-
+        auto const partners_span = std::span(partners.data(), partners.size());
+        auto const bond_broken = handler(p, bond.bond_id(), partners_span);
         if (bond_broken) {
           bond_broken_error(p.id(), partner_ids);
         }
@@ -560,7 +558,9 @@ struct CellStructure : public System::Leaf<CellStructure> {
    *
    * @param range Interaction range.
    */
-  void set_regular_decomposition(double range);
+  void set_regular_decomposition(
+      double range,
+      std::optional<std::pair<int, int>> fully_connected_boundary);
 
   /**
    * @brief Set the particle decomposition to @ref HybridDecomposition.
diff --git a/src/core/cell_system/HybridDecomposition.cpp b/src/core/cell_system/HybridDecomposition.cpp
index a54002f8df2..4c21fe64756 100644
--- a/src/core/cell_system/HybridDecomposition.cpp
+++ b/src/core/cell_system/HybridDecomposition.cpp
@@ -37,6 +37,7 @@
 #include <cstddef>
 #include <functional>
 #include <iterator>
+#include <optional>
 #include <set>
 #include <utility>
 
@@ -48,7 +49,7 @@ HybridDecomposition::HybridDecomposition(boost::mpi::communicator comm,
                                          std::set<int> n_square_types)
     : m_comm(std::move(comm)), m_box(box_geo), m_cutoff_regular(cutoff_regular),
       m_regular_decomposition(RegularDecomposition(
-          m_comm, cutoff_regular + skin, m_box, local_box)),
+          m_comm, cutoff_regular + skin, m_box, local_box, std::nullopt)),
       m_n_square(AtomDecomposition(m_comm, m_box)),
       m_n_square_types(std::move(n_square_types)),
       m_get_global_ghost_flags(std::move(get_ghost_flags)) {
diff --git a/src/core/cell_system/HybridDecomposition.hpp b/src/core/cell_system/HybridDecomposition.hpp
index ce73c8cd7f5..fa695c4f818 100644
--- a/src/core/cell_system/HybridDecomposition.hpp
+++ b/src/core/cell_system/HybridDecomposition.hpp
@@ -32,15 +32,15 @@
 #include "Particle.hpp"
 #include "ghosts.hpp"
 
-#include <utils/Span.hpp>
 #include <utils/Vector.hpp>
 
 #include <boost/mpi/communicator.hpp>
-#include <boost/optional.hpp>
 
 #include <cstddef>
 #include <functional>
+#include <optional>
 #include <set>
+#include <span>
 #include <utility>
 #include <vector>
 
@@ -100,13 +100,8 @@ class HybridDecomposition : public ParticleDecomposition {
     return m_collect_ghost_force_comm;
   }
 
-  Utils::Span<Cell *const> local_cells() const override {
-    return Utils::make_span(m_local_cells);
-  }
-
-  Utils::Span<Cell *const> ghost_cells() const override {
-    return Utils::make_span(m_ghost_cells);
-  }
+  std::span<Cell *const> local_cells() const override { return m_local_cells; }
+  std::span<Cell *const> ghost_cells() const override { return m_ghost_cells; }
 
   Cell *particle_to_cell(Particle const &p) override {
     if (is_n_square_type(p.type())) {
@@ -125,9 +120,10 @@ class HybridDecomposition : public ParticleDecomposition {
   Utils::Vector3d max_cutoff() const override {
     return m_n_square.max_cutoff();
   }
+
   Utils::Vector3d max_range() const override { return m_n_square.max_range(); }
 
-  boost::optional<BoxGeometry> minimum_image_distance() const override {
+  std::optional<BoxGeometry> minimum_image_distance() const override {
     return m_box;
   }
 
diff --git a/src/core/cell_system/ParticleDecomposition.hpp b/src/core/cell_system/ParticleDecomposition.hpp
index 53fb4223399..b2f50e253e1 100644
--- a/src/core/cell_system/ParticleDecomposition.hpp
+++ b/src/core/cell_system/ParticleDecomposition.hpp
@@ -24,12 +24,12 @@
 #include "BoxGeometry.hpp"
 #include "ghosts.hpp"
 
-#include <utils/Span.hpp>
 #include <utils/Vector.hpp>
 
-#include <boost/optional.hpp>
 #include <boost/variant.hpp>
 
+#include <optional>
+#include <span>
 #include <vector>
 
 struct RemovedParticle {
@@ -91,7 +91,7 @@ class ParticleDecomposition {
    *
    * @return List of local cells.
    */
-  virtual Utils::Span<Cell *const> local_cells() const = 0;
+  virtual std::span<Cell *const> local_cells() const = 0;
 
   /**
    * @brief Get pointer to local cells.
@@ -102,7 +102,7 @@ class ParticleDecomposition {
    *
    * @return List of ghost cells.
    */
-  virtual Utils::Span<Cell *const> ghost_cells() const = 0;
+  virtual std::span<Cell *const> ghost_cells() const = 0;
 
   /**
    * @brief Determine which cell a particle id belongs to.
@@ -128,7 +128,7 @@ class ParticleDecomposition {
    *        if minimum image convention should be used needed for
    *        distance calculation.
    */
-  virtual boost::optional<BoxGeometry> minimum_image_distance() const = 0;
+  virtual std::optional<BoxGeometry> minimum_image_distance() const = 0;
 
   virtual BoxGeometry const &box() const = 0;
 
diff --git a/src/core/cell_system/RegularDecomposition.cpp b/src/core/cell_system/RegularDecomposition.cpp
index 397a45998e2..df48efd35ce 100644
--- a/src/core/cell_system/RegularDecomposition.cpp
+++ b/src/core/cell_system/RegularDecomposition.cpp
@@ -52,7 +52,7 @@ int RegularDecomposition::position_to_cell_index(
     Utils::Vector3d const &pos) const {
   Utils::Vector3i cpos;
 
-  for (unsigned int i = 0u; i < 3u; i++) {
+  for (auto i = 0u; i < 3u; i++) {
     cpos[i] = static_cast<int>(std::floor(pos[i] * inv_cell_size[i])) + 1 -
               cell_offset[i];
 
@@ -306,7 +306,7 @@ void RegularDecomposition::create_cell_grid(double range) {
     auto const volume = Utils::product(local_box_l);
     auto const scale = std::cbrt(RegularDecomposition::max_num_cells / volume);
 
-    for (unsigned int i = 0; i < 3; i++) {
+    for (auto i = 0u; i < 3u; i++) {
       /* this is at least 1 */
       cell_grid[i] = static_cast<int>(std::ceil(local_box_l[i] * scale));
       cell_range[i] = local_box_l[i] / static_cast<double>(cell_grid[i]);
@@ -336,11 +336,11 @@ void RegularDecomposition::create_cell_grid(double range) {
         break;
 
       /* find coordinate with the smallest cell range */
-      int min_ind = 0;
-      double min_size = cell_range[0];
+      auto min_ind = 0u;
+      auto min_size = cell_range[0];
 
-      for (int i = 1; i < 3; i++) {
-        if (cell_grid[i] > 1 && cell_range[i] < min_size) {
+      for (auto i = 1u; i < 3u; ++i) {
+        if (cell_grid[i] > 1 and cell_range[i] < min_size) {
           min_ind = i;
           min_size = cell_range[i];
         }
@@ -369,7 +369,7 @@ void RegularDecomposition::create_cell_grid(double range) {
 
   /* now set all dependent variables */
   int new_cells = 1;
-  for (unsigned int i = 0; i < 3; i++) {
+  for (auto i = 0u; i < 3u; i++) {
     ghost_cell_grid[i] = cell_grid[i] + 2;
     new_cells *= ghost_cell_grid[i];
     cell_size[i] = m_local_box.length()[i] / static_cast<double>(cell_grid[i]);
@@ -397,6 +397,27 @@ void RegularDecomposition::init_cell_interactions() {
   auto const &node_grid = ::communicator.node_grid;
   auto const global_halo_offset = hadamard_product(node_pos, cell_grid) - halo;
   auto const global_size = hadamard_product(node_grid, cell_grid);
+  auto const at_boundary = [&global_size](int coord, Utils::Vector3i cell_idx) {
+    return (cell_idx[coord] == 0 or cell_idx[coord] == global_size[coord]);
+  };
+
+  // The fully connected feature (linking cells that don't share at least a
+  // corner) only applies if one cell is a ghost cell (i.e. for connections
+  // across the periodic boundary).
+  auto const fcb_is_inner_connection = [&global_size, this](Utils::Vector3i a,
+                                                            Utils::Vector3i b) {
+    if (fully_connected_boundary()) {
+      auto const [fc_normal, fc_dir] = *fully_connected_boundary();
+      auto const involves_ghost_cell =
+          (a[fc_normal] == -1 or a[fc_normal] == global_size[fc_normal] or
+           b[fc_normal] == -1 or b[fc_normal] == global_size[fc_normal]);
+      if (not involves_ghost_cell) {
+        // check if cells do not share at least a corner
+        return std::abs((a - b)[fc_dir]) > 1;
+      }
+    }
+    return false;
+  };
 
   /* Translate a node local index (relative to the origin of the local grid)
    * to a global index. */
@@ -418,6 +439,19 @@ void RegularDecomposition::init_cell_interactions() {
     return (global_index - global_halo_offset);
   };
 
+  // sanity checks
+  if (fully_connected_boundary()) {
+    auto const [fc_normal, fc_dir] = *fully_connected_boundary();
+    if (fc_normal == fc_dir) {
+      throw std::domain_error("fully_connected_boundary normal and connection "
+                              "coordinates need to differ.");
+    }
+    if (node_grid[fc_dir] != 1) {
+      throw std::runtime_error(
+          "The MPI nodegrid must be 1 in the fully connected direction.");
+    }
+  }
+
   /* We only consider local cells (e.g. not halo cells), which
    * span the range [(1,1,1), cell_grid) in local coordinates. */
   auto const start = global_index(Utils::Vector3i{1, 1, 1});
@@ -431,24 +465,22 @@ void RegularDecomposition::init_cell_interactions() {
         Utils::Vector3i lower_index = {m - 1, n - 1, o - 1};
         Utils::Vector3i upper_index = {m + 1, n + 1, o + 1};
 
-        //        /* In the fully connected case, we consider all cells
-        //         * in the direction as neighbors, not only the nearest ones.
+        /* In the fully connected case, we consider all cells
+         * in the direction as neighbors, not only the nearest ones.
-        //         */
+         */
-        //        for (int i = 0; i < 3; i++) {
-        //          if (dd.fully_connected[i]) {
-        //            // Fully connected is only needed at the box surface
-        //            if (i==0 and (n!=start[1] or n!=end[1]-1) and (o!=start[2]
-        //            or o!=end[2]-1)) continue; if (i==1 and (m!=start[0] or
-        //            m!=end[0]-1) and (o!=start[2] or o!=end[2]-1)) continue;
-        //            if (i==2 and (m!=start[0] or m!=end[0]-1) and (n!=start[1]
-        //            or n!=end[1]-1)) continue; lower_index[i] = 0;
-        //            upper_index[i] = global_size[i] - 1;
-        //          }
-        //        }
+        if (fully_connected_boundary()) {
+          auto const [fc_boundary, fc_direction] = *fully_connected_boundary();
+          // Fully connected is only needed at the box surface: there, span
+          // the whole fc_direction axis, including both ghost layers.
+          if (at_boundary(fc_boundary, {m, n, o})) {
+            lower_index[fc_direction] = -1;
+            upper_index[fc_direction] = global_size[fc_direction];
+          }
+        }
 
         /* In non-periodic directions, the halo needs not
          * be considered. */
-        for (int i = 0; i < 3; i++) {
+        for (auto i = 0u; i < 3u; i++) {
           if (not m_box.periodic(i)) {
             lower_index[i] = std::max(0, lower_index[i]);
             upper_index[i] = std::min(global_size[i] - 1, upper_index[i]);
@@ -466,6 +498,12 @@ void RegularDecomposition::init_cell_interactions() {
         for (int p = lower_index[2]; p <= upper_index[2]; p++)
           for (int q = lower_index[1]; q <= upper_index[1]; q++)
             for (int r = lower_index[0]; r <= upper_index[0]; r++) {
+              if (fully_connected_boundary()) {
+                // Avoid fully connecting the boundary layer and the
+                // next INNER layer
+                if (fcb_is_inner_connection({m, n, o}, {r, q, p}))
+                  continue;
+              }
               neighbors.insert(Utils::Vector3i{r, q, p});
             }
 
@@ -629,11 +667,13 @@ GhostCommunicator RegularDecomposition::prepare_comm() {
   return ghost_comm;
 }
 
-RegularDecomposition::RegularDecomposition(boost::mpi::communicator comm,
-                                           double range,
-                                           BoxGeometry const &box_geo,
-                                           LocalBox const &local_geo)
-    : m_comm(std::move(comm)), m_box(box_geo), m_local_box(local_geo) {
+RegularDecomposition::RegularDecomposition(
+    boost::mpi::communicator comm, double range, BoxGeometry const &box_geo,
+    LocalBox const &local_geo,
+    std::optional<std::pair<int, int>> fully_connected)
+    : m_comm(std::move(comm)), m_box(box_geo), m_local_box(local_geo),
+      m_fully_connected_boundary(std::move(fully_connected)) {
+
   /* set up new regular decomposition cell structure */
   create_cell_grid(range);
 
diff --git a/src/core/cell_system/RegularDecomposition.hpp b/src/core/cell_system/RegularDecomposition.hpp
index 79bcaad74b4..f79a71ff101 100644
--- a/src/core/cell_system/RegularDecomposition.hpp
+++ b/src/core/cell_system/RegularDecomposition.hpp
@@ -31,12 +31,12 @@
 #include "ParticleList.hpp"
 #include "ghosts.hpp"
 
-#include <utils/Span.hpp>
 #include <utils/Vector.hpp>
 
 #include <boost/mpi/communicator.hpp>
-#include <boost/optional.hpp>
 
+#include <optional>
+#include <span>
 #include <vector>
 
 /**
@@ -79,6 +79,7 @@ struct RegularDecomposition : public ParticleDecomposition {
   boost::mpi::communicator m_comm;
   BoxGeometry const &m_box;
   LocalBox m_local_box;
+  std::optional<std::pair<int, int>> m_fully_connected_boundary = {};
   std::vector<Cell> cells;
   std::vector<Cell *> m_local_cells;
   std::vector<Cell *> m_ghost_cells;
@@ -87,7 +88,8 @@ struct RegularDecomposition : public ParticleDecomposition {
 
 public:
   RegularDecomposition(boost::mpi::communicator comm, double range,
-                       BoxGeometry const &box_geo, LocalBox const &local_geo);
+                       BoxGeometry const &box_geo, LocalBox const &local_geo,
+                       std::optional<std::pair<int, int>> fully_connected);
 
   GhostCommunicator const &exchange_ghosts_comm() const override {
     return m_exchange_ghosts_comm;
@@ -96,12 +98,8 @@ struct RegularDecomposition : public ParticleDecomposition {
     return m_collect_ghost_force_comm;
   }
 
-  Utils::Span<Cell *const> local_cells() const override {
-    return Utils::make_span(m_local_cells);
-  }
-  Utils::Span<Cell *const> ghost_cells() const override {
-    return Utils::make_span(m_ghost_cells);
-  }
+  std::span<Cell *const> local_cells() const override { return m_local_cells; }
+  std::span<Cell *const> ghost_cells() const override { return m_ghost_cells; }
 
   /* Getter needed for HybridDecomposition */
   auto const &get_local_cells() const { return m_local_cells; }
@@ -119,7 +117,9 @@ struct RegularDecomposition : public ParticleDecomposition {
   Utils::Vector3d max_cutoff() const override;
   Utils::Vector3d max_range() const override;
 
-  boost::optional<BoxGeometry> minimum_image_distance() const override {
+  auto fully_connected_boundary() const { return m_fully_connected_boundary; }
+
+  std::optional<BoxGeometry> minimum_image_distance() const override {
     return {m_box};
   }
 
@@ -154,11 +154,11 @@ struct RegularDecomposition : public ParticleDecomposition {
    */
   Cell *position_to_cell(Utils::Vector3d const &pos) {
     auto const index = position_to_cell_index(pos);
-    return (index < 0) ? nullptr : &(cells.at(index));
+    return (index < 0) ? nullptr : &(cells.at(static_cast<std::size_t>(index)));
   }
   Cell const *position_to_cell(Utils::Vector3d const &pos) const {
     auto const index = position_to_cell_index(pos);
-    return (index < 0) ? nullptr : &(cells.at(index));
+    return (index < 0) ? nullptr : &(cells.at(static_cast<std::size_t>(index)));
   }
 
   /**
diff --git a/src/core/cells.cpp b/src/core/cells.cpp
index a0f95342f86..8b05fd1734b 100644
--- a/src/core/cells.cpp
+++ b/src/core/cells.cpp
@@ -41,11 +41,11 @@
 #include <utils/Vector.hpp>
 #include <utils/math/sqr.hpp>
 
-#include <boost/range/algorithm/min_element.hpp>
 #include <boost/serialization/set.hpp>
 
 #include <algorithm>
 #include <functional>
+#include <optional>
 #include <stdexcept>
 #include <utility>
 #include <vector>
@@ -85,7 +85,7 @@ static auto get_pairs_filtered(System::System const &system,
 namespace detail {
 static auto get_max_neighbor_search_range(System::System const &system) {
   auto const &cell_structure = *system.cell_structure;
-  return *boost::min_element(cell_structure.max_range());
+  return std::ranges::min(cell_structure.max_range());
 }
 static void search_distance_sanity_check_max_range(System::System const &system,
                                                    double const distance) {
@@ -101,7 +101,7 @@ static void search_distance_sanity_check_max_range(System::System const &system,
 }
 static void
 search_distance_sanity_check_cell_structure(System::System const &system,
-                                            double const distance) {
+                                            double const) {
   auto const &cell_structure = *system.cell_structure;
   if (cell_structure.decomposition_type() == CellStructureType::HYBRID) {
     throw std::runtime_error("Cannot search for neighbors in the hybrid "
@@ -115,7 +115,7 @@ static void search_neighbors_sanity_checks(System::System const &system,
 }
 } // namespace detail
 
-boost::optional<std::vector<int>>
+std::optional<std::vector<int>>
 get_short_range_neighbors(System::System const &system, int const pid,
                           double const distance) {
   detail::search_neighbors_sanity_checks(system, distance);
@@ -142,7 +142,7 @@ get_short_range_neighbors(System::System const &system, int const pid,
 static auto get_interacting_neighbors(System::System const &system,
                                       Particle const &p) {
   auto &cell_structure = *system.cell_structure;
-  auto const distance = *boost::min_element(cell_structure.max_range());
+  auto const distance = std::ranges::min(cell_structure.max_range());
   detail::search_neighbors_sanity_checks(system, distance);
   std::vector<Particle const *> ret;
   auto const cutoff2 = Utils::sqr(distance);
diff --git a/src/core/cells.hpp b/src/core/cells.hpp
index 20b7f2729b9..4b811783699 100644
--- a/src/core/cells.hpp
+++ b/src/core/cells.hpp
@@ -59,8 +59,7 @@
 
 #include <utils/Vector.hpp>
 
-#include <boost/optional.hpp>
-
+#include <optional>
 #include <utility>
 #include <vector>
 
@@ -85,7 +84,7 @@ get_pairs_of_types(System::System const &system, double distance,
  * @brief Get ids of particles that are within a certain distance
  * of another particle.
  */
-boost::optional<std::vector<int>>
+std::optional<std::vector<int>>
 get_short_range_neighbors(System::System const &system, int pid,
                           double distance);
 
@@ -102,8 +101,8 @@ namespace boost {
 namespace serialization {
 template <class Archive>
 void serialize(Archive &ar, NeighborPIDs &n, unsigned int const /* version */) {
-  ar &n.pid;
-  ar &n.neighbor_pids;
+  ar & n.pid;
+  ar & n.neighbor_pids;
 }
 } // namespace serialization
 } // namespace boost
@@ -132,12 +131,12 @@ namespace boost {
 namespace serialization {
 template <class Archive>
 void serialize(Archive &ar, PairInfo &p, unsigned int const /* version */) {
-  ar &p.id1;
-  ar &p.id2;
-  ar &p.pos1;
-  ar &p.pos2;
-  ar &p.vec21;
-  ar &p.node;
+  ar & p.id1;
+  ar & p.id2;
+  ar & p.pos1;
+  ar & p.pos2;
+  ar & p.vec21;
+  ar & p.node;
 }
 } // namespace serialization
 } // namespace boost
diff --git a/src/core/collision.cpp b/src/core/collision.cpp
index 51e5cc1e83c..62b4d6b3a4f 100644
--- a/src/core/collision.cpp
+++ b/src/core/collision.cpp
@@ -32,7 +32,6 @@
 #include "virtual_sites.hpp"
 
 #include <utils/Vector.hpp>
-#include <utils/constants.hpp>
 #include <utils/math/sqr.hpp>
 #include <utils/mpi/all_compare.hpp>
 #include <utils/mpi/gather_buffer.hpp>
@@ -43,6 +42,7 @@
 #include <algorithm>
 #include <array>
 #include <cmath>
+#include <numbers>
 #include <stdexcept>
 #include <string>
 #include <utility>
@@ -58,8 +58,8 @@ namespace boost {
 namespace serialization {
 template <typename Archive>
 void serialize(Archive &ar, CollisionPair &c, const unsigned int) {
-  ar &c.pp1;
-  ar &c.pp2;
+  ar & c.pp1;
+  ar & c.pp2;
 }
 } // namespace serialization
 } // namespace boost
@@ -167,26 +167,7 @@ void Collision_parameters::initialize() {
       (get_bond_num_partners(collision_params.bond_vs) != 1 and
        get_bond_num_partners(collision_params.bond_vs) != 2)) {
     throw std::runtime_error("The bond type to be used for binding virtual "
-                             "sites needs to be a pair or three-particle bond");
-  }
-
-  if (collision_params.mode == CollisionModeType::BIND_THREE_PARTICLES) {
-    if (collision_params.bond_three_particles +
-            collision_params.three_particle_angle_resolution >
-        bonded_ia_params.size()) {
-      throw std::runtime_error(
-          "Insufficient bonds defined for three particle binding");
-    }
-
-    for (int i = collision_params.bond_three_particles;
-         i < collision_params.bond_three_particles +
-                 collision_params.three_particle_angle_resolution;
-         i++) {
-      if (get_bond_num_partners(i) != 2) {
-        throw std::runtime_error(
-            "The bonds for three particle binding need to be angle bonds.");
-      }
-    }
+                             "sites needs to be a pair bond");
   }
 
   // Create particle types
@@ -275,74 +256,6 @@ static void bind_at_point_of_collision_calc_vs_pos(Particle const &p1,
   pos2 = p1.pos() - vec21 * (1. - collision_params.vs_placement);
 }
 
-// Considers three particles for three_particle_binding and performs
-// the binding if the criteria are met
-static void coldet_do_three_particle_bond(Particle &p, Particle const &p1,
-                                          Particle const &p2,
-                                          BoxGeometry const &box_geo) {
-  // If p1 and p2 are not closer or equal to the cutoff distance, skip
-  // p1:
-  if (box_geo.get_mi_vector(p.pos(), p1.pos()).norm() >
-      collision_params.distance)
-    return;
-  // p2:
-  if (box_geo.get_mi_vector(p.pos(), p2.pos()).norm() >
-      collision_params.distance)
-    return;
-
-  // Check, if there already is a three-particle bond centered on p
-  // with p1 and p2 as partners. If so, skip this triplet.
-  // Note that the bond partners can appear in any order.
-
-  /* Check if a bond is a bond placed by the collision detection */
-  auto is_collision_bond = [](BondView const &bond) {
-    return (bond.bond_id() >= collision_params.bond_three_particles) and
-           (bond.bond_id() <=
-            collision_params.bond_three_particles +
-                collision_params.three_particle_angle_resolution);
-  };
-  /* Check if the bond is between the particles we are currently considering */
-  auto has_same_partners = [id1 = p1.id(),
-                            id2 = p2.id()](BondView const &bond) {
-    auto const partner_ids = bond.partner_ids();
-
-    return ((partner_ids[0] == id1) and (partner_ids[1] == id2)) or
-           ((partner_ids[0] == id2) and (partner_ids[1] == id1));
-  };
-
-  auto const &bonds = p.bonds();
-  if (std::any_of(bonds.begin(), bonds.end(), [=](auto const &bond) {
-        return is_collision_bond(bond) and has_same_partners(bond);
-      })) {
-    return;
-  }
-
-  // If we are still here, we need to create angular bond
-  // First, find the angle between the particle p, p1 and p2
-
-  /* vector from p to p1 */
-  auto const vec1 = box_geo.get_mi_vector(p.pos(), p1.pos()).normalize();
-  /* vector from p to p2 */
-  auto const vec2 = box_geo.get_mi_vector(p.pos(), p2.pos()).normalize();
-
-  auto const cosine = std::clamp(vec1 * vec2, -TINY_COS_VALUE, TINY_COS_VALUE);
-
-  // Bond angle
-  auto const phi = acos(cosine);
-
-  // We find the bond id by dividing the range from 0 to pi in
-  // three_particle_angle_resolution steps and by adding the id
-  // of the bond for zero degrees.
-  auto const bond_id = static_cast<int>(
-      floor(0.5 + phi / Utils::pi() *
-                      (collision_params.three_particle_angle_resolution - 1)) +
-      collision_params.bond_three_particles);
-
-  // Create the bond
-  const std::array<int, 2> bondT = {{p1.id(), p2.id()}};
-  p.bonds().insert({bond_id, bondT});
-}
-
 #ifdef VIRTUAL_SITES_RELATIVE
 static void place_vs_and_relate_to_particle(CellStructure &cell_structure,
                                             BoxGeometry const &box_geo,
@@ -414,74 +327,8 @@ std::vector<CollisionPair> gather_global_collision_queue() {
   return res;
 }
 
-static void three_particle_binding_do_search(Cell *basecell, Particle &p1,
-                                             Particle &p2,
-                                             BoxGeometry const &box_geo) {
-  auto handle_cell = [&p1, &p2, &box_geo](Cell *c) {
-    for (auto &p : c->particles()) {
-      // Skip collided particles themselves
-      if ((p.id() == p1.id()) or (p.id() == p2.id())) {
-        continue;
-      }
-
-      // We need all cyclical permutations, here (bond is placed on 1st
-      // particle, order of bond partners does not matter, so we don't need
-      // non-cyclic permutations).
-      // @ref coldet_do_three_particle_bond checks the bonding criterion and if
-      // the involved particles are not already bonded before it binds them.
-      if (!p.is_ghost()) {
-        coldet_do_three_particle_bond(p, p1, p2, box_geo);
-      }
-
-      if (!p1.is_ghost()) {
-        coldet_do_three_particle_bond(p1, p, p2, box_geo);
-      }
-
-      if (!p2.is_ghost()) {
-        coldet_do_three_particle_bond(p2, p, p1, box_geo);
-      }
-    }
-  };
-
-  /* Search the base cell ... */
-  handle_cell(basecell);
-
-  /* ... and all the neighbors. */
-  for (auto &n : basecell->neighbors().all()) {
-    handle_cell(n);
-  }
-}
-
-// Goes through the collision queue and for each pair in it
-// looks for a third particle by using the domain decomposition
-// cell system. If found, it performs three particle binding
-static void three_particle_binding_domain_decomposition(
-    CellStructure &cell_structure, BoxGeometry const &box_geo,
-    std::vector<CollisionPair> const &gathered_queue) {
-
-  for (auto &c : gathered_queue) {
-    // If we have both particles, at least as ghosts, Get the corresponding cell
-    // indices
-    if (cell_structure.get_local_particle(c.pp1) and
-        cell_structure.get_local_particle(c.pp2)) {
-      Particle &p1 = *cell_structure.get_local_particle(c.pp1);
-      Particle &p2 = *cell_structure.get_local_particle(c.pp2);
-      auto cell1 = cell_structure.find_current_cell(p1);
-      auto cell2 = cell_structure.find_current_cell(p2);
-
-      if (cell1)
-        three_particle_binding_do_search(cell1, p1, p2, box_geo);
-      if (cell2 and cell1 != cell2)
-        three_particle_binding_do_search(cell2, p1, p2, box_geo);
-
-    } // If local particles exist
-  }   // Loop over total collisions
-}
-
 // Handle the collisions stored in the queue
 void handle_collisions(CellStructure &cell_structure) {
-  auto &system = System::get_system();
-  auto const &box_geo = *system.box_geo;
   // Note that the glue to surface mode adds bonds between the centers
   // but does so later in the process. This is needed to guarantee that
   // a particle can only be glued once, even if queued twice in a single
@@ -503,6 +350,8 @@ void handle_collisions(CellStructure &cell_structure) {
 
 // Virtual sites based collision schemes
 #ifdef VIRTUAL_SITES_RELATIVE
+  auto &system = System::get_system();
+  auto const &box_geo = *system.box_geo;
   auto const min_global_cut = system.get_min_global_cut();
   if ((collision_params.mode == CollisionModeType::BIND_VS) ||
       (collision_params.mode == CollisionModeType::GLUE_TO_SURF)) {
@@ -643,7 +492,7 @@ void handle_collisions(CellStructure &cell_structure) {
                                           cell_structure);
         }
       } // we considered the pair
-    }   // Loop over all collisions in the queue
+    } // Loop over all collisions in the queue
 #ifdef ADDITIONAL_CHECKS
     if (!Utils::Mpi::all_compare(comm_cart, current_vs_pid)) {
       throw std::runtime_error("Nodes disagree about current_vs_pid");
@@ -657,16 +506,9 @@ void handle_collisions(CellStructure &cell_structure) {
           Cells::DATA_PART_PROPERTIES | Cells::DATA_PART_BONDS);
     }
     system.update_used_propagations();
-  }    // are we in one of the vs_based methods
+  } // are we in one of the vs_based methods
 #endif // defined VIRTUAL_SITES_RELATIVE
 
-  // three-particle-binding part
-  if (collision_params.mode == CollisionModeType::BIND_THREE_PARTICLES) {
-    auto gathered_queue = gather_global_collision_queue();
-    three_particle_binding_domain_decomposition(cell_structure, box_geo,
-                                                gathered_queue);
-  } // if TPB
-
   local_collision_queue.clear();
 }
 
diff --git a/src/core/collision.hpp b/src/core/collision.hpp
index 339e47f81a8..29c0bfe9bd1 100644
--- a/src/core/collision.hpp
+++ b/src/core/collision.hpp
@@ -41,15 +41,13 @@ enum class CollisionModeType : int {
   BIND_VS = 2,
   /** @brief Glue a particle to a specific spot on another particle. */
   GLUE_TO_SURF = 3,
-  /** @brief Three particle binding mode. */
-  BIND_THREE_PARTICLES = 4
 };
 
 class Collision_parameters {
 public:
   Collision_parameters()
       : mode(CollisionModeType::OFF), distance(0.), distance2(0.),
-        bond_centers(-1), bond_vs(-1), bond_three_particles(-1) {}
+        bond_centers(-1), bond_vs(-1) {}
 
   /// collision protocol
   CollisionModeType mode;
@@ -75,13 +73,6 @@ class Collision_parameters {
   int part_type_to_attach_vs_to;
   /// Particle type to which the newly glued particle is converted
   int part_type_after_glueing;
-  /// First bond type (for zero degrees) used for the three-particle bond
-  /// (angle potential)
-  int bond_three_particles;
-  /// Number of angle bonds to use (angular resolution)
-  /// different angle bonds with different equilibrium angles
-  /// Are expected to have ids immediately following to bond_three_particles
-  int three_particle_angle_resolution;
   /** Placement of virtual sites for MODE_VS.
    *  0=on same particle as related to,
    *  1=on collision partner,
diff --git a/src/core/constraints/Constraints.hpp b/src/core/constraints/Constraints.hpp
index a966038dcd8..89076a26b68 100644
--- a/src/core/constraints/Constraints.hpp
+++ b/src/core/constraints/Constraints.hpp
@@ -64,7 +64,7 @@ template <class ParticleRange, class Constraint> class Constraints {
   void remove(std::shared_ptr<Constraint> const &constraint) {
     auto &system = System::get_system();
     assert(contains(constraint));
-    m_constraints.erase(std::remove(begin(), end(), constraint), end());
+    std::erase(m_constraints, constraint);
     system.on_constraint_change();
   }
 
@@ -105,12 +105,14 @@ template <class ParticleRange, class Constraint> class Constraints {
     }
   }
 
-  void on_boxl_change() const {
+  void veto_boxl_change() const {
     if (not m_constraints.empty()) {
       throw std::runtime_error("The box size can not be changed because there "
                                "are active constraints.");
     }
   }
+
+  void on_boxl_change() const { veto_boxl_change(); }
 };
 } // namespace Constraints
 
diff --git a/src/core/constraints/ShapeBasedConstraint.cpp b/src/core/constraints/ShapeBasedConstraint.cpp
index 555a22fc0dd..ad25f7b5924 100644
--- a/src/core/constraints/ShapeBasedConstraint.cpp
+++ b/src/core/constraints/ShapeBasedConstraint.cpp
@@ -95,7 +95,7 @@ ParticleForce ShapeBasedConstraint::force(Particle const &p,
 
     if (dist > 0) {
       outer_normal_vec = -dist_vec / dist;
-      pf = calc_central_radial_force(p, part_rep, ia_params, dist_vec, dist) +
+      pf = calc_central_radial_force(ia_params, dist_vec, dist) +
            calc_central_radial_charge_force(p, part_rep, ia_params, dist_vec,
                                             dist, get_ptr(coulomb_kernel)) +
            calc_non_central_force(p, part_rep, ia_params, dist_vec, dist);
@@ -111,11 +111,10 @@ ParticleForce ShapeBasedConstraint::force(Particle const &p,
 #endif
     } else if (m_penetrable && (dist <= 0)) {
       if ((!m_only_positive) && (dist < 0)) {
-        pf =
-            calc_central_radial_force(p, part_rep, ia_params, dist_vec, -dist) +
-            calc_central_radial_charge_force(p, part_rep, ia_params, dist_vec,
-                                             -dist, get_ptr(coulomb_kernel)) +
-            calc_non_central_force(p, part_rep, ia_params, dist_vec, -dist);
+        pf = calc_central_radial_force(ia_params, dist_vec, -dist) +
+             calc_central_radial_charge_force(p, part_rep, ia_params, dist_vec,
+                                              -dist, get_ptr(coulomb_kernel)) +
+             calc_non_central_force(p, part_rep, ia_params, dist_vec, -dist);
 
 #ifdef DPD
         if (m_system.thermostat->thermo_switch & THERMO_DPD) {
diff --git a/src/core/cuda/common_cuda.cu b/src/core/cuda/common_cuda.cu
index 3f7e136369a..9fe7aa8ed80 100644
--- a/src/core/cuda/common_cuda.cu
+++ b/src/core/cuda/common_cuda.cu
@@ -20,47 +20,71 @@
 #include "errorhandling.hpp"
 
 #include "utils.cuh"
+#include "utils.hpp"
+
+#include <cuda.h>
+#include <cuda_runtime.h>
 
 #include <cstdio>
+#include <cstdlib>
+#include <sstream>
+#include <string>
+#include <utility>
 
 cudaStream_t stream[1];
 
+/** @brief Write the components of a @c dim3 to a stream as "<x,y,z>". */
+static std::basic_ostream<char> &operator<<(std::basic_ostream<char> &os,
+                                            const dim3 &dim) {
+  return os << "<" << dim.x << "," << dim.y << "," << dim.z << ">";
+}
+
+/** @brief Write the description of a CUDA error code to a stream. */
+static std::basic_ostream<char> &operator<<(std::basic_ostream<char> &os,
+                                            cudaError_t CU_err) {
+  return os << "CUDA error: \"" << cudaGetErrorString(CU_err) << "\"";
+}
+
 void cuda_check_errors_exit(const dim3 &block, const dim3 &grid,
                             const char *function, const char *file,
                             unsigned int line) {
   cudaError_t CU_err = cudaGetLastError();
   if (CU_err != cudaSuccess) {
-    fprintf(stderr,
-            "error \"%s\" calling %s with dim %d %d %d, grid %d %d "
-            "%d in %s:%u\n",
-            cudaGetErrorString(CU_err), function, block.x, block.y, block.z,
-            grid.x, grid.y, grid.z, file, line);
-    errexit();
+    std::stringstream message;
+    message << CU_err << " while calling " << function
+            << " with block: " << block << ", grid: " << grid << " in " << file
+            << ":" << line;
+    throw cuda_fatal_error(message.str());
   }
 }
 
 void cuda_safe_mem_exit(cudaError_t CU_err, const char *file,
                         unsigned int line) {
   if (CU_err != cudaSuccess) {
-    fprintf(stderr, "CUDA Memory error at %s:%u.\n", file, line);
-    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(CU_err));
+    std::stringstream message;
+    message << CU_err << " during memory operation in " << file << ":" << line;
     if (CU_err == cudaErrorInvalidValue)
-      fprintf(stderr, "You may have tried to allocate zero memory at %s:%u.\n",
-              file, line);
-    errexit();
-  } else {
+      message << ". You may have tried to allocate zero memory";
+    throw cuda_fatal_error(message.str());
+  }
+  {
     CU_err = cudaGetLastError();
     if (CU_err != cudaSuccess) {
-      fprintf(stderr,
-              "Error found during memory operation. Possibly however "
-              "from a failed operation before. %s:%u.\n",
-              file, line);
-      printf("CUDA error: %s\n", cudaGetErrorString(CU_err));
-      if (CU_err == cudaErrorInvalidValue)
-        fprintf(stderr,
-                "You may have tried to allocate zero memory before %s:%u.\n",
-                file, line);
-      errexit();
+      std::stringstream message;
+      message << CU_err << " in " << file << ":" << line << ". Error found "
+              << "during memory operation. Possibly however from a failed "
+                 "operation before the memory operation";
+      throw cuda_fatal_error(message.str());
     }
   }
 }
+
+cuda_fatal_error::cuda_fatal_error(std::string msg)
+    : m_msg(std::move(msg)), m_terminate_handler(&errexit) {}
+
+void cuda_fatal_error::terminate() noexcept {
+  // print the message only for a null handler or the default one (errexit)
+  if (m_terminate_handler == nullptr or m_terminate_handler == errexit)
+    fprintf(stderr, "%s\n", what());
+  (m_terminate_handler ? m_terminate_handler : &std::abort)();
+}
diff --git a/src/core/cuda/init.hpp b/src/core/cuda/init.hpp
index 224006249b5..429bab2e561 100644
--- a/src/core/cuda/init.hpp
+++ b/src/core/cuda/init.hpp
@@ -36,7 +36,7 @@ struct EspressoGpuDevice {
   /** Local CUDA device id */
   int id;
   /** Local CUDA device name */
-  char name[64];
+  char name[256];
   /** Node identification */
   char proc_name[64];
   /** MPI process identification */
@@ -66,16 +66,16 @@ int cuda_get_n_gpus();
  *  \ref computeCapabilityMinMajor . \ref computeCapabilityMinMinor .
  *
  *  @param dev CUDA device number
- *  @return \ref ES_OK if the GPU meets the requirements, else \ref ES_ERROR.
+ *  @return @c false if the GPU meets the requirements, else @c true.
  */
-int cuda_check_gpu_compute_capability(int dev);
+bool cuda_check_gpu_compute_capability(int dev);
 
 /** Get the name of a CUDA device.
  *
  *  @param[in]  dev the CUDA device number to ask the name for
- *  @param[out] name a buffer to write the name to, at least 64 characters
+ *  @param[out] name a buffer to write the name to, at least 256 characters
  */
-void cuda_get_gpu_name(int dev, char name[64]);
+void cuda_get_gpu_name(int dev, char *name);
 
 /** Choose a device for future CUDA computations.
  *
@@ -89,10 +89,10 @@ void cuda_set_device(int dev);
  */
 int cuda_get_device();
 
-/** Test if actual CUDA device works.
- *  @return \ref ES_OK on success, \ref ES_ERROR else.
+/** Test if communication to the CUDA device works.
+ *  @return @c false on success, else @c true.
  */
-int cuda_test_device_access();
+bool cuda_test_device_access();
 
 /**
  * Check that a device is available, that its compute capability
diff --git a/src/core/cuda/init_cuda.cu b/src/core/cuda/init_cuda.cu
index 048e62cf7ed..18aceaea986 100644
--- a/src/core/cuda/init_cuda.cu
+++ b/src/core/cuda/init_cuda.cu
@@ -17,12 +17,11 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include <cuda.h>
-
 #include "init.hpp"
 #include "utils.cuh"
 
-#include <utils/constants.hpp>
+#include <cuda.h>
+#include <cuda_runtime.h>
 
 #include <cstring>
 #include <string>
@@ -47,22 +46,28 @@ int cuda_get_n_gpus() {
   return deviceCount;
 }
 
-int cuda_check_gpu_compute_capability(int dev) {
+bool cuda_check_gpu_compute_capability(int dev) {
   cudaDeviceProp deviceProp;
   CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev))
-  if (deviceProp.major < computeCapabilityMinMajor ||
-      (deviceProp.major == computeCapabilityMinMajor &&
-       deviceProp.minor < computeCapabilityMinMinor)) {
-    return ES_ERROR;
-  }
-  return ES_OK;
+  return (deviceProp.major < computeCapabilityMinMajor or
+          (deviceProp.major == computeCapabilityMinMajor and
+           deviceProp.minor < computeCapabilityMinMinor));
 }
 
-void cuda_get_gpu_name(int dev, char name[64]) {
+/** @brief Copy the device name to @p name (256 bytes) and pad it with nulls.
+ *  The terminator is written last: @c strncpy doesn't add one on truncation.
+ */
+static void cuda_copy_gpu_name(char *const name, cudaDeviceProp const &prop) {
+  char buffer[256] = {'\0'};
+  std::strncpy(buffer, prop.name, 256);
+  std::strncpy(name, buffer, 256);
+  name[255] = '\0';
+}
+
+void cuda_get_gpu_name(int dev, char *const name) {
   cudaDeviceProp deviceProp;
   CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev))
-  std::strncpy(name, deviceProp.name, 63);
-  name[63] = 0;
+  cuda_copy_gpu_name(name, deviceProp);
 }
 
 EspressoGpuDevice cuda_get_device_props(const int dev) {
@@ -76,8 +81,7 @@ EspressoGpuDevice cuda_get_device_props(const int dev) {
                            deviceProp.minor,
                            deviceProp.totalGlobalMem,
                            deviceProp.multiProcessorCount};
-  std::strncpy(device.name, deviceProp.name, 64);
-  device.name[63] = '\0';
+  cuda_copy_gpu_name(device.name, deviceProp);
   return device;
 }
 
@@ -93,7 +97,7 @@ int cuda_get_device() {
   return dev;
 }
 
-int cuda_test_device_access() {
+bool cuda_test_device_access() {
   int *d = nullptr;
   int h = 42;
   cudaError_t err;
@@ -113,10 +117,7 @@ int cuda_test_device_access() {
   if (err != cudaSuccess) {
     throw cuda_runtime_error_cuda(err);
   }
-  if (h != 42) {
-    return ES_ERROR;
-  }
-  return ES_OK;
+  return h != 42;
 }
 
 void cuda_check_device() {
@@ -124,9 +125,9 @@ void cuda_check_device() {
     throw cuda_runtime_error("No GPU was found.");
   }
   auto const devID = cuda_get_device();
-  auto const compute_capability = cuda_check_gpu_compute_capability(devID);
-  auto const communication_test = cuda_test_device_access();
-  if (compute_capability != ES_OK or communication_test != ES_OK) {
+  auto const incompatible = cuda_check_gpu_compute_capability(devID);
+  auto const communication_failure = cuda_test_device_access();
+  if (incompatible or communication_failure) {
     throw cuda_runtime_error("CUDA device " + std::to_string(devID) +
                              " is not capable of running ESPResSo.");
   }
diff --git a/src/core/cuda/utils.cuh b/src/core/cuda/utils.cuh
index d699a6fe13c..1e9b48bd62a 100644
--- a/src/core/cuda/utils.cuh
+++ b/src/core/cuda/utils.cuh
@@ -26,6 +26,7 @@
 #include "utils.hpp"
 
 #include <cuda.h>
+#include <cuda_runtime.h>
 
 #include <string>
 
@@ -46,6 +47,7 @@ private:
   {                                                                            \
     cudaError_t const error_code = (statement);                                \
     if (error_code != cudaSuccess) {                                           \
+      static_cast<void>(cudaGetLastError()); /* clear non-sticky errors */     \
       throw cuda_runtime_error_cuda(error_code);                               \
     }                                                                          \
   }
diff --git a/src/core/cuda/utils.hpp b/src/core/cuda/utils.hpp
index 65b92d1fbf4..70d223e07f3 100644
--- a/src/core/cuda/utils.hpp
+++ b/src/core/cuda/utils.hpp
@@ -23,14 +23,47 @@
 
 #ifdef CUDA
 
+#include <exception>
 #include <stdexcept>
 #include <string>
 
+/**
+ * @brief Wrapper for CUDA runtime exceptions.
+ * When the exception cannot be recovered from,
+ * prefer using @ref cuda_fatal_error instead.
+ */
 class cuda_runtime_error : public std::runtime_error {
 public:
   cuda_runtime_error(std::string const &msg) : std::runtime_error(msg) {}
 };
 
+/**
+ * @brief Fatal CUDA exception.
+ * Best course of action is to terminate the program immediately.
+ * Destroying an instance calls @ref terminate.
+ */
+class cuda_fatal_error {
+  std::string m_msg;
+  std::terminate_handler m_terminate_handler;
+
+public:
+  explicit cuda_fatal_error(std::string msg);
+  /** @brief Invoke @ref terminate. */
+  ~cuda_fatal_error() { terminate(); }
+  /** @brief Get the handler currently used by @ref terminate. */
+  auto get_terminate() noexcept { return m_terminate_handler; }
+  /** @brief Replace the terminate handler. @return The previous handler. */
+  auto set_terminate(std::terminate_handler callback) noexcept {
+    auto old_handler = m_terminate_handler;
+    m_terminate_handler = callback;
+    return old_handler;
+  }
+  /** @brief Call the terminate handler, or @c std::abort if it is null. */
+  void terminate() noexcept;
+  /** @brief Get the error message. */
+  char const *what() const noexcept { return m_msg.c_str(); }
+};
+
 /**
  * @brief Invoke a function and silently ignore any thrown
  * @ref cuda_runtime_error error.
@@ -42,7 +75,7 @@ template <class F, class... Args>
 void invoke_skip_cuda_exceptions(F &&f, Args &&...args) {
   try {
     return f(args...);
-  } catch (cuda_runtime_error const &) {
+  } catch (cuda_runtime_error const &) { // NOLINT(bugprone-empty-catch)
     // pass
   }
 }
diff --git a/src/core/dpd.cpp b/src/core/dpd.cpp
index 5885eeb53cd..dd30549ff48 100644
--- a/src/core/dpd.cpp
+++ b/src/core/dpd.cpp
@@ -36,7 +36,6 @@
 #include "thermostat.hpp"
 
 #include <utils/Vector.hpp>
-#include <utils/constants.hpp>
 #include <utils/math/sqr.hpp>
 #include <utils/math/tensor_product.hpp>
 #include <utils/matrix.hpp>
diff --git a/src/core/ek/EKNone.hpp b/src/core/ek/EKNone.hpp
index 33a4cd67068..05174e9bc51 100644
--- a/src/core/ek/EKNone.hpp
+++ b/src/core/ek/EKNone.hpp
@@ -34,6 +34,7 @@ struct EKNone {
   void veto_time_step(double) const { throw NoEKActive{}; }
   void veto_kT(double) const { throw NoEKActive{}; }
   void sanity_checks(System::System const &) const { throw NoEKActive{}; }
+  void veto_boxl_change() const { throw NoEKActive{}; }
   void on_cell_structure_change() const { throw NoEKActive{}; }
   void on_boxl_change() const { throw NoEKActive{}; }
   void on_node_grid_change() const { throw NoEKActive{}; }
diff --git a/src/core/ek/EKReactions.hpp b/src/core/ek/EKReactions.hpp
index 12d8869ca50..a74efe458e6 100644
--- a/src/core/ek/EKReactions.hpp
+++ b/src/core/ek/EKReactions.hpp
@@ -47,7 +47,7 @@ template <class EKReaction> class EKReactions {
   }
   void remove(std::shared_ptr<EKReaction> const &ek_reaction) {
     assert(contains(ek_reaction));
-    m_ekreactions.erase(std::remove(begin(), end(), ek_reaction), end());
+    std::erase(m_ekreactions, ek_reaction);
   }
 
   iterator begin() { return m_ekreactions.begin(); }
diff --git a/src/core/ek/EKWalberla.hpp b/src/core/ek/EKWalberla.hpp
index 32243869ba0..fa9c91aa63f 100644
--- a/src/core/ek/EKWalberla.hpp
+++ b/src/core/ek/EKWalberla.hpp
@@ -62,9 +62,10 @@ struct EKWalberla {
   void perform_reactions();
 
   void on_cell_structure_change() const {}
-  void on_boxl_change() const {
+  void veto_boxl_change() const {
     throw std::runtime_error("MD cell geometry change not supported by EK");
   }
+  void on_boxl_change() const { veto_boxl_change(); }
   void on_node_grid_change() const {
     throw std::runtime_error("MPI topology change not supported by EK");
   }
diff --git a/src/core/ek/Solver.cpp b/src/core/ek/Solver.cpp
index 7190ce57d9d..155909c9e09 100644
--- a/src/core/ek/Solver.cpp
+++ b/src/core/ek/Solver.cpp
@@ -90,6 +90,12 @@ void Solver::veto_kT(double kT) const {
   }
 }
 
+/** Forward to the active solver's @c veto_boxl_change(), if one is set. */
+void Solver::veto_boxl_change() const {
+  if (impl->solver)
+    std::visit([](auto const &ptr) { ptr->veto_boxl_change(); }, *impl->solver);
+}
+
 void Solver::on_cell_structure_change() {
   if (impl->solver) {
     auto &solver = *impl->solver;
diff --git a/src/core/ek/Solver.hpp b/src/core/ek/Solver.hpp
index d5cc229d9d1..2d6a90d1654 100644
--- a/src/core/ek/Solver.hpp
+++ b/src/core/ek/Solver.hpp
@@ -95,6 +95,7 @@ struct Solver : public System::Leaf<Solver> {
   void on_cell_structure_change();
   void on_timestep_change();
   void on_temperature_change();
+  void veto_boxl_change() const;
 
 private:
   /** @brief Pointer-to-implementation. */
diff --git a/src/core/electrostatics/CMakeLists.txt b/src/core/electrostatics/CMakeLists.txt
index 6ac70cecf30..e188a1d1065 100644
--- a/src/core/electrostatics/CMakeLists.txt
+++ b/src/core/electrostatics/CMakeLists.txt
@@ -21,7 +21,6 @@ target_sources(
   PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/coulomb.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/elc.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/icc.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/mmm1d_gpu.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/mmm1d.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/mmm-modpsi.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/p3m.cpp
diff --git a/src/core/electrostatics/coulomb.cpp b/src/core/electrostatics/coulomb.cpp
index 52850937530..b7bc1eac555 100644
--- a/src/core/electrostatics/coulomb.cpp
+++ b/src/core/electrostatics/coulomb.cpp
@@ -35,8 +35,6 @@
 #include "system/System.hpp"
 
 #include <utils/Vector.hpp>
-#include <utils/checks/charge_neutrality.hpp>
-#include <utils/constants.hpp>
 #include <utils/demangle.hpp>
 
 #include <boost/accumulators/accumulators.hpp>
@@ -116,11 +114,11 @@ struct LongRangePressure {
   }
 #endif // P3M
 
-  auto operator()(std::shared_ptr<DebyeHueckel> const &actor) const {
+  auto operator()(std::shared_ptr<DebyeHueckel> const &) const {
     return Utils::Vector9d{};
   }
 
-  auto operator()(std::shared_ptr<ReactionField> const &actor) const {
+  auto operator()(std::shared_ptr<ReactionField> const &) const {
     return Utils::Vector9d{};
   }
 
@@ -155,12 +153,7 @@ struct ShortRangeCutoff {
                     std::visit(*this, actor->base_solver));
   }
 #endif // P3M
-#ifdef MMM1D_GPU
-  auto operator()(std::shared_ptr<CoulombMMM1DGpu> const &actor) const {
-    return std::numeric_limits<double>::infinity();
-  }
-#endif // MMM1D_GPU
-  auto operator()(std::shared_ptr<CoulombMMM1D> const &actor) const {
+  auto operator()(std::shared_ptr<CoulombMMM1D> const &) const {
     return std::numeric_limits<double>::infinity();
   }
 #ifdef SCAFACOS
@@ -224,11 +217,6 @@ struct LongRangeForce {
     actor->add_long_range_forces(m_particles);
   }
 #endif // P3M
-#ifdef MMM1D_GPU
-  void operator()(std::shared_ptr<CoulombMMM1DGpu> const &actor) const {
-    actor->add_long_range_forces();
-  }
-#endif
 #ifdef SCAFACOS
   void operator()(std::shared_ptr<CoulombScafacos> const &actor) const {
     actor->add_long_range_forces();
@@ -256,12 +244,6 @@ struct LongRangeEnergy {
     return actor->long_range_energy(m_particles);
   }
 #endif // P3M
-#ifdef MMM1D_GPU
-  auto operator()(std::shared_ptr<CoulombMMM1DGpu> const &actor) const {
-    actor->add_long_range_energy();
-    return 0.;
-  }
-#endif // MMM1D_GPU
 #ifdef SCAFACOS
   auto operator()(std::shared_ptr<CoulombScafacos> const &actor) const {
     return actor->long_range_energy();
diff --git a/src/core/electrostatics/coulomb.hpp b/src/core/electrostatics/coulomb.hpp
index 0d58ae6d13b..93eb9302706 100644
--- a/src/core/electrostatics/coulomb.hpp
+++ b/src/core/electrostatics/coulomb.hpp
@@ -31,7 +31,6 @@
 #include "electrostatics/elc.hpp"
 #include "electrostatics/icc.hpp"
 #include "electrostatics/mmm1d.hpp"
-#include "electrostatics/mmm1d_gpu.hpp"
 #include "electrostatics/p3m.hpp"
 #include "electrostatics/p3m_gpu.hpp"
 #include "electrostatics/reaction_field.hpp"
@@ -56,9 +55,6 @@ using ElectrostaticsActor =
                  std::shared_ptr<ElectrostaticLayerCorrection>,
 #endif // P3M
                  std::shared_ptr<CoulombMMM1D>,
-#ifdef MMM1D_GPU
-                 std::shared_ptr<CoulombMMM1DGpu>,
-#endif // MMM1D_GPU
 #ifdef SCAFACOS
                  std::shared_ptr<CoulombScafacos>,
 #endif // SCAFACOS
@@ -98,9 +94,6 @@ template <class T> struct has_pressure : std::true_type {};
 template <>
 struct has_pressure<ElectrostaticLayerCorrection> : std::false_type {};
 #endif // P3M
-#ifdef MMM1D_GPU
-template <> struct has_pressure<CoulombMMM1DGpu> : std::false_type {};
-#endif // MMM1D_GPU
 #ifdef SCAFACOS
 template <> struct has_pressure<CoulombScafacos> : std::false_type {};
 #endif // SCAFACOS
diff --git a/src/core/electrostatics/coulomb_inline.hpp b/src/core/electrostatics/coulomb_inline.hpp
index b75a33b29b8..b8a72a363f3 100644
--- a/src/core/electrostatics/coulomb_inline.hpp
+++ b/src/core/electrostatics/coulomb_inline.hpp
@@ -61,12 +61,6 @@ struct ShortRangeForceKernel {
     return std::visit(*this, ptr->base_solver);
   }
 #endif // P3M
-
-#ifdef MMM1D_GPU
-  result_type operator()(std::shared_ptr<CoulombMMM1DGpu> const &) const {
-    return {};
-  }
-#endif // MMM1D_GPU
 #endif // ELECTROSTATICS
 };
 
@@ -145,11 +139,6 @@ struct ShortRangeEnergyKernel {
     }};
   }
 #endif // P3M
-#ifdef MMM1D_GPU
-  result_type operator()(std::shared_ptr<CoulombMMM1DGpu> const &) const {
-    return {};
-  }
-#endif // MMM1D_GPU
   result_type operator()(std::shared_ptr<CoulombMMM1D> const &actor) const {
     return kernel_type{[&actor](Particle const &, Particle const &, double q1q2,
                                 Utils::Vector3d const &d, double dist) {
@@ -162,45 +151,45 @@ struct ShortRangeEnergyKernel {
 inline std::optional<Solver::ShortRangeForceKernel>
 Solver::pair_force_kernel() const {
 #ifdef ELECTROSTATICS
-  if (impl->solver) {
+  if (auto &solver = impl->solver; solver.has_value()) {
     auto const visitor = Coulomb::ShortRangeForceKernel();
-    return std::visit(visitor, *impl->solver);
+    return std::visit(visitor, *solver);
   }
 #endif // ELECTROSTATICS
-  return {};
+  return std::nullopt;
 }
 
 inline std::optional<Solver::ShortRangeForceCorrectionsKernel>
 Solver::pair_force_elc_kernel() const {
 #ifdef ELECTROSTATICS
-  if (impl->solver) {
+  if (auto &solver = impl->solver; solver.has_value()) {
     auto const visitor = Coulomb::ShortRangeForceCorrectionsKernel();
-    return std::visit(visitor, *impl->solver);
+    return std::visit(visitor, *solver);
   }
 #endif // ELECTROSTATICS
-  return {};
+  return std::nullopt;
 }
 
 inline std::optional<Solver::ShortRangePressureKernel>
 Solver::pair_pressure_kernel() const {
 #ifdef ELECTROSTATICS
-  if (impl->solver) {
+  if (auto &solver = impl->solver; solver.has_value()) {
     auto const visitor = Coulomb::ShortRangePressureKernel();
-    return std::visit(visitor, *impl->solver);
+    return std::visit(visitor, *solver);
   }
 #endif // ELECTROSTATICS
-  return {};
+  return std::nullopt;
 }
 
 inline std::optional<Solver::ShortRangeEnergyKernel>
 Solver::pair_energy_kernel() const {
 #ifdef ELECTROSTATICS
-  if (impl->solver) {
+  if (auto &solver = impl->solver; solver.has_value()) {
     auto const visitor = Coulomb::ShortRangeEnergyKernel();
-    return std::visit(visitor, *impl->solver);
+    return std::visit(visitor, *solver);
   }
 #endif // ELECTROSTATICS
-  return {};
+  return std::nullopt;
 }
 
 } // namespace Coulomb
diff --git a/src/core/electrostatics/elc.cpp b/src/core/electrostatics/elc.cpp
index 69be07a7ebb..6c2c3f05313 100644
--- a/src/core/electrostatics/elc.cpp
+++ b/src/core/electrostatics/elc.cpp
@@ -40,7 +40,6 @@
 #include "system/System.hpp"
 
 #include <utils/Vector.hpp>
-#include <utils/constants.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <boost/mpi/collectives/all_reduce.hpp>
@@ -51,6 +50,7 @@
 #include <cmath>
 #include <cstddef>
 #include <functional>
+#include <numbers>
 #include <variant>
 #include <vector>
 
@@ -108,7 +108,7 @@ static std::vector<SCCache> scycache;
 template <std::size_t dir>
 static std::vector<SCCache> calc_sc_cache(ParticleRange const &particles,
                                           std::size_t n_freq, double u) {
-  auto constexpr c_2pi = 2. * Utils::pi();
+  auto constexpr c_2pi = 2. * std::numbers::pi;
   auto const n_part = particles.size();
   std::vector<SCCache> ret(n_freq * n_part);
 
@@ -204,7 +204,7 @@ void ElectrostaticLayerCorrection::add_dipole_force(
     ParticleRange const &particles) const {
   constexpr std::size_t size = 3;
   auto const &box_geo = *get_system().box_geo;
-  auto const pref = prefactor * 4. * Utils::pi() / box_geo.volume();
+  auto const pref = prefactor * 4. * std::numbers::pi / box_geo.volume();
 
   /* for non-neutral systems, this shift gives the background contribution
    * (rsp. for this shift, the DM of the background is zero) */
@@ -270,7 +270,7 @@ double ElectrostaticLayerCorrection::dipole_energy(
     ParticleRange const &particles) const {
   constexpr std::size_t size = 7;
   auto const &box_geo = *get_system().box_geo;
-  auto const pref = prefactor * 2. * Utils::pi() / box_geo.volume();
+  auto const pref = prefactor * 2. * std::numbers::pi / box_geo.volume();
   auto const lz = box_geo.length()[2];
   /* for nonneutral systems, this shift gives the background contribution
      (rsp. for this shift, the DM of the background is zero) */
@@ -369,7 +369,7 @@ ElectrostaticLayerCorrection::z_energy(ParticleRange const &particles) const {
   constexpr std::size_t size = 4;
   auto const &box_geo = *get_system().box_geo;
   auto const xy_area_inv = box_geo.length_inv()[0] * box_geo.length_inv()[1];
-  auto const pref = prefactor * 2. * Utils::pi() * xy_area_inv;
+  auto const pref = prefactor * 2. * std::numbers::pi * xy_area_inv;
   auto const delta = elc.delta_mid_top * elc.delta_mid_bot;
   auto const fac_delta_mid_bot = elc.delta_mid_bot / (1. - delta);
   auto const fac_delta_mid_top = elc.delta_mid_top / (1. - delta);
@@ -445,7 +445,7 @@ void ElectrostaticLayerCorrection::add_z_force(
   constexpr std::size_t size = 1;
   auto const &box_geo = *get_system().box_geo;
   auto const xy_area_inv = box_geo.length_inv()[0] * box_geo.length_inv()[1];
-  auto const pref = prefactor * 2. * Utils::pi() * xy_area_inv;
+  auto const pref = prefactor * 2. * std::numbers::pi * xy_area_inv;
   auto const delta = elc.delta_mid_top * elc.delta_mid_bot;
   auto const fac_delta_mid_bot = elc.delta_mid_bot / (1. - delta);
   auto const fac_delta_mid_top = elc.delta_mid_top / (1. - delta);
@@ -506,7 +506,7 @@ void setup_PoQ(elc_data const &elc, double prefactor, std::size_t index,
   assert(index >= 1);
   constexpr std::size_t size = 4;
   auto const xy_area_inv = box_geo.length_inv()[0] * box_geo.length_inv()[1];
-  auto const pref_di = prefactor * 4. * Utils::pi() * xy_area_inv;
+  auto const pref_di = prefactor * 4. * std::numbers::pi * xy_area_inv;
   auto const pref = -pref_di / expm1(omega * box_geo.length()[2]);
   double lclimgebot[4], lclimgetop[4], lclimge[4];
   double fac_delta_mid_bot = 1., fac_delta_mid_top = 1., fac_delta = 1.;
@@ -648,10 +648,10 @@ static void setup_PQ(elc_data const &elc, double prefactor, std::size_t index_p,
   assert(index_q >= 1);
   constexpr std::size_t size = 8;
   auto const xy_area_inv = box_geo.length_inv()[0] * box_geo.length_inv()[1];
-  auto const pref_di = prefactor * 8 * Utils::pi() * xy_area_inv;
+  auto const pref_di = prefactor * 8. * std::numbers::pi * xy_area_inv;
   auto const pref = -pref_di / expm1(omega * box_geo.length()[2]);
   double lclimgebot[8], lclimgetop[8], lclimge[8];
-  double fac_delta_mid_bot = 1, fac_delta_mid_top = 1, fac_delta = 1;
+  double fac_delta_mid_bot = 1., fac_delta_mid_top = 1., fac_delta = 1.;
   if (elc.dielectric_contrast_on) {
     auto const delta = elc.delta_mid_top * elc.delta_mid_bot;
     auto const fac_elc = 1. / (1. - delta * exp(-omega * 2. * elc.box_h));
@@ -773,7 +773,7 @@ static void setup_PQ(elc_data const &elc, double prefactor, std::size_t index_p,
 static void add_PQ_force(std::size_t index_p, std::size_t index_q, double omega,
                          ParticleRange const &particles,
                          BoxGeometry const &box_geo) {
-  auto constexpr c_2pi = 2. * Utils::pi();
+  auto constexpr c_2pi = 2. * std::numbers::pi;
   auto const pref_x =
       c_2pi * box_geo.length_inv()[0] * static_cast<double>(index_p) / omega;
   auto const pref_y =
@@ -831,7 +831,7 @@ static double PQ_energy(double omega, std::size_t n_part) {
 
 void ElectrostaticLayerCorrection::add_force(
     ParticleRange const &particles) const {
-  auto constexpr c_2pi = 2. * Utils::pi();
+  auto constexpr c_2pi = 2. * std::numbers::pi;
   auto const &box_geo = *get_system().box_geo;
   auto const n_freqs = prepare_sc_cache(particles, box_geo, elc.far_cut);
   auto const n_scxcache = std::get<0>(n_freqs);
@@ -886,7 +886,7 @@ void ElectrostaticLayerCorrection::add_force(
 
 double ElectrostaticLayerCorrection::calc_energy(
     ParticleRange const &particles) const {
-  auto constexpr c_2pi = 2. * Utils::pi();
+  auto constexpr c_2pi = 2. * std::numbers::pi;
   auto const &box_geo = *get_system().box_geo;
   auto energy = dipole_energy(particles) + z_energy(particles);
   auto const n_freqs = prepare_sc_cache(particles, box_geo, elc.far_cut);
@@ -956,7 +956,7 @@ double ElectrostaticLayerCorrection::tune_far_cut() const {
   auto tuned_far_cut = min_inv_boxl;
   double err;
   do {
-    auto const pref = 2. * Utils::pi() * tuned_far_cut;
+    auto const pref = 2. * std::numbers::pi * tuned_far_cut;
     auto const sum = pref + 2. * (box_l_x_inv + box_l_y_inv);
     auto const den = -expm1(-pref * lz);
     auto const num1 = exp(pref * (elc.box_h - lz));
@@ -1062,8 +1062,8 @@ elc_data::elc_data(double maxPWerror, double gap_size, double far_cut,
       dielectric_contrast_on{delta_top != 0. or delta_bot != 0.},
       const_pot{with_const_pot and dielectric_contrast_on},
       neutralize{neutralize and !dielectric_contrast_on},
-      delta_mid_top{std::clamp(delta_top, -1., +1.)}, delta_mid_bot{std::clamp(
-                                                          delta_bot, -1., +1.)},
+      delta_mid_top{std::clamp(delta_top, -1., +1.)},
+      delta_mid_bot{std::clamp(delta_bot, -1., +1.)},
       pot_diff{(with_const_pot) ? potential_diff : 0.},
       // initial setup of parameters, may change later when P3M is finally tuned
       // set the space_layer to be 1/3 of the gap size, so that box = layer
@@ -1128,7 +1128,7 @@ void charge_assign(elc_data const &elc, CoulombP3M &solver,
   }
   /* prepare local FFT mesh */
   for (int i = 0; i < solver.p3m.local_mesh.size; i++)
-    solver.p3m.rs_mesh[i] = 0.;
+    solver.p3m.mesh.rs_scalar[i] = 0.;
 
   for (auto zipped : p_q_pos_range) {
     auto const p_q = boost::get<0>(zipped);
diff --git a/src/core/electrostatics/elc.hpp b/src/core/electrostatics/elc.hpp
index 31914893c5a..b6e8c606114 100644
--- a/src/core/electrostatics/elc.hpp
+++ b/src/core/electrostatics/elc.hpp
@@ -117,7 +117,7 @@ struct elc_data {
 
   /// pairwise contributions from the lowest and top layers
   template <typename Kernel>
-  void dielectric_layers_contribution(CoulombP3M const &p3m,
+  void dielectric_layers_contribution(CoulombP3M const &,
                                       BoxGeometry const &box_geo,
                                       Utils::Vector3d const &pos1,
                                       Utils::Vector3d const &pos2, double q1q2,
diff --git a/src/core/electrostatics/icc.cpp b/src/core/electrostatics/icc.cpp
index cb99d9d2aa3..83aa02ea25e 100644
--- a/src/core/electrostatics/icc.cpp
+++ b/src/core/electrostatics/icc.cpp
@@ -44,14 +44,14 @@
 #include "integrators/Propagation.hpp"
 #include "system/System.hpp"
 
-#include <utils/constants.hpp>
-
 #include <boost/mpi/collectives/all_reduce.hpp>
 #include <boost/mpi/operations.hpp>
 
 #include <algorithm>
 #include <cmath>
+#include <cstddef>
 #include <limits>
+#include <numbers>
 #include <stdexcept>
 #include <variant>
 #include <vector>
@@ -110,7 +110,7 @@ void ICCStar::iteration(CellStructure &cell_structure,
   auto const &coulomb = system.coulomb;
   auto const prefactor = std::visit(
       [](auto const &ptr) { return ptr->prefactor; }, *coulomb.impl->solver);
-  auto const pref = 1. / (prefactor * 2. * Utils::pi());
+  auto const pref = 1. / (prefactor * 2. * std::numbers::pi);
   auto const kernel = coulomb.pair_force_kernel();
   auto const elc_kernel = coulomb.pair_force_elc_kernel();
   icc_cfg.citeration = 0;
@@ -223,13 +223,13 @@ void icc_data::sanity_checks() const {
     throw std::domain_error("Parameter 'first_id' must be >= 0");
   if (eps_out <= 0.)
     throw std::domain_error("Parameter 'eps_out' must be > 0");
-  if (areas.size() != n_icc)
+  if (areas.size() != static_cast<std::size_t>(n_icc))
     throw std::invalid_argument("Parameter 'areas' has incorrect shape");
-  if (epsilons.size() != n_icc)
+  if (epsilons.size() != static_cast<std::size_t>(n_icc))
     throw std::invalid_argument("Parameter 'epsilons' has incorrect shape");
-  if (sigmas.size() != n_icc)
+  if (sigmas.size() != static_cast<std::size_t>(n_icc))
     throw std::invalid_argument("Parameter 'sigmas' has incorrect shape");
-  if (normals.size() != n_icc)
+  if (normals.size() != static_cast<std::size_t>(n_icc))
     throw std::invalid_argument("Parameter 'normals' has incorrect shape");
 }
 
@@ -245,12 +245,10 @@ void ICCStar::on_activation() const {
 }
 
 struct SanityChecksICC {
-  template <typename T>
-  void operator()(std::shared_ptr<T> const &actor) const {}
+  template <typename T> void operator()(std::shared_ptr<T> const &) const {}
 #ifdef P3M
 #ifdef CUDA
-  [[noreturn]] void
-  operator()(std::shared_ptr<CoulombP3MGPU> const &actor) const {
+  [[noreturn]] void operator()(std::shared_ptr<CoulombP3MGPU> const &) const {
     throw std::runtime_error("ICC does not work with P3MGPU");
   }
 #endif // CUDA
diff --git a/src/core/electrostatics/mmm-modpsi.cpp b/src/core/electrostatics/mmm-modpsi.cpp
index 4fb556e686b..c19bc985cf4 100644
--- a/src/core/electrostatics/mmm-modpsi.cpp
+++ b/src/core/electrostatics/mmm-modpsi.cpp
@@ -24,9 +24,8 @@
 #include "mmm-modpsi.hpp"
 #include "specfunc.hpp"
 
-#include <utils/constants.hpp>
-
 #include <cmath>
+#include <numbers>
 #include <vector>
 
 std::vector<std::vector<double>> modPsi;
@@ -40,7 +39,7 @@ static void preparePolygammaEven(int n, double binom,
     // psi^0 has a slightly different series expansion
     double maxx = 0.25;
     series.resize(1);
-    series[0] = 2 * (1 - Utils::gamma());
+    series[0] = 2. * (1. - std::numbers::egamma);
     for (int order = 1;; order += 1) {
       auto const x_order = static_cast<double>(2 * order);
       auto const coeff = -2 * hzeta(x_order + 1, 2);
diff --git a/src/core/electrostatics/mmm1d.cpp b/src/core/electrostatics/mmm1d.cpp
index eefc8ca2c8d..18a5ace1ebc 100644
--- a/src/core/electrostatics/mmm1d.cpp
+++ b/src/core/electrostatics/mmm1d.cpp
@@ -38,13 +38,13 @@
 #include "tuning.hpp"
 
 #include <utils/Vector.hpp>
-#include <utils/constants.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <algorithm>
 #include <cmath>
 #include <cstdio>
 #include <limits>
+#include <numbers>
 #include <vector>
 
 /* if you define this feature, the Bessel functions are calculated up
@@ -56,7 +56,7 @@
 #endif
 
 static auto far_error(int P, double minrad, Utils::Vector3d const &box_l_inv) {
-  auto const wavenumber = 2. * Utils::pi() * box_l_inv[2];
+  auto const wavenumber = 2. * std::numbers::pi * box_l_inv[2];
   // this uses an upper bound to all force components and the potential
   auto const rhores = wavenumber * minrad;
   auto const pref = 4. * box_l_inv[2] * std::max(1., wavenumber);
@@ -172,7 +172,7 @@ void CoulombMMM1D::recalc_boxl_parameters() {
 
 Utils::Vector3d CoulombMMM1D::pair_force(double q1q2, Utils::Vector3d const &d,
                                          double dist) const {
-  auto constexpr c_2pi = 2. * Utils::pi();
+  auto constexpr c_2pi = 2. * std::numbers::pi;
   auto const &box_geo = *get_system().box_geo;
   auto const n_modPsi = static_cast<int>(modPsi.size()) >> 1;
   auto const rxy2 = d[0] * d[0] + d[1] * d[1];
@@ -266,7 +266,7 @@ double CoulombMMM1D::pair_energy(double const q1q2, Utils::Vector3d const &d,
   if (q1q2 == 0.)
     return 0.;
 
-  auto constexpr c_2pi = 2. * Utils::pi();
+  auto constexpr c_2pi = 2. * std::numbers::pi;
   auto const &box_geo = *get_system().box_geo;
   auto const n_modPsi = static_cast<int>(modPsi.size()) >> 1;
   auto const rxy2 = d[0] * d[0] + d[1] * d[1];
@@ -276,10 +276,10 @@ double CoulombMMM1D::pair_energy(double const q1q2, Utils::Vector3d const &d,
 
   if (rxy2 <= far_switch_radius_sq) {
     /* near range formula */
-    energy = -2. * Utils::gamma();
+    energy = -2. * std::numbers::egamma;
 
     /* polygamma summation */
-    double r2n = 1.0;
+    double r2n = 1.;
     for (int n = 0; n < n_modPsi; n++) {
       auto const add = mod_psi_even(n, z_d) * r2n;
       energy -= add;
@@ -310,7 +310,8 @@ double CoulombMMM1D::pair_energy(double const q1q2, Utils::Vector3d const &d,
     auto const rxy_d = rxy * box_geo.length_inv()[2];
     /* The first Bessel term will compensate a little bit the
        log term, so add them close together */
-    energy = -0.25 * log(rxy2_d) + 0.5 * (Utils::ln_2() - Utils::gamma());
+    energy =
+        -0.25 * log(rxy2_d) + 0.5 * (std::numbers::ln2 - std::numbers::egamma);
     for (int bp = 1; bp < MAXIMAL_B_CUT; bp++) {
       if (bessel_radii[bp - 1] < rxy)
         break;
diff --git a/src/core/electrostatics/mmm1d_gpu.cpp b/src/core/electrostatics/mmm1d_gpu.cpp
deleted file mode 100644
index 92ff79233d4..00000000000
--- a/src/core/electrostatics/mmm1d_gpu.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (C) 2010-2022 The ESPResSo project
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "config/config.hpp"
-
-#ifdef MMM1D_GPU
-
-#include "electrostatics/mmm1d_gpu.hpp"
-
-#include "BoxGeometry.hpp"
-#include "LocalBox.hpp"
-#include "cell_system/CellStructureType.hpp"
-#include "communication.hpp"
-#include "system/GpuParticleData.hpp"
-#include "system/System.hpp"
-
-#include <stdexcept>
-
-CoulombMMM1DGpu::CoulombMMM1DGpu(double prefactor, double maxPWerror,
-                                 double far_switch_radius, int bessel_cutoff)
-    : maxPWerror{maxPWerror}, far_switch_radius{far_switch_radius},
-      far_switch_radius_sq{-1.}, bessel_cutoff{bessel_cutoff} {
-  set_prefactor(prefactor);
-  if (maxPWerror <= 0.) {
-    throw std::domain_error("Parameter 'maxPWerror' must be > 0");
-  }
-  if (far_switch_radius <= 0. and far_switch_radius != -1.) {
-    throw std::domain_error("Parameter 'far_switch_radius' must be > 0");
-  }
-  if (bessel_cutoff < 0 and bessel_cutoff != -1) {
-    throw std::domain_error("Parameter 'bessel_cutoff' must be > 0");
-  }
-  if (this_node == 0) {
-    modpsi_init();
-  }
-}
-
-void CoulombMMM1DGpu::setup_dependent_properties() {
-  auto &system = get_system();
-  auto const &box_geo = *system.box_geo;
-  if (far_switch_radius > 0. and far_switch_radius > box_geo.length()[2]) {
-    throw std::domain_error(
-        "Parameter 'far_switch_radius' must not be larger than box length");
-  }
-  auto &gpu_particle_data = system.gpu;
-  gpu_particle_data.enable_property(GpuParticleData::prop::force);
-  gpu_particle_data.enable_property(GpuParticleData::prop::pos);
-  gpu_particle_data.enable_property(GpuParticleData::prop::q);
-}
-
-void CoulombMMM1DGpu::sanity_checks_periodicity() const {
-  auto const &box_geo = *get_system().box_geo;
-  if (box_geo.periodic(0) || box_geo.periodic(1) || !box_geo.periodic(2)) {
-    throw std::runtime_error("MMM1D requires periodicity (False, False, True)");
-  }
-}
-
-void CoulombMMM1DGpu::sanity_checks_cell_structure() const {
-  auto const &local_geo = *get_system().local_geo;
-  if (local_geo.cell_structure_type() != CellStructureType::NSQUARE) {
-    throw std::runtime_error("MMM1D requires the N-square cellsystem");
-  }
-}
-
-void CoulombMMM1DGpu::tune() {
-  get_system().gpu.update();
-  if (this_node == 0) {
-    setup();
-    tune(maxPWerror, far_switch_radius, bessel_cutoff);
-  }
-  m_is_tuned = true;
-}
-
-#endif // MMM1D_GPU
diff --git a/src/core/electrostatics/mmm1d_gpu.hpp b/src/core/electrostatics/mmm1d_gpu.hpp
deleted file mode 100644
index 98e5a46e784..00000000000
--- a/src/core/electrostatics/mmm1d_gpu.hpp
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (C) 2014-2022 The ESPResSo project
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-/**
- * @file
- * MMM1D algorithm for long-range Coulomb interactions on the GPU.
- * Implementation of the MMM1D method for the calculation of the electrostatic
- * interaction in one-dimensionally periodic systems. For details on the
- * method see MMM in general. The MMM1D method works only with the N-squared
- * cell system since neither the near nor far formula can be decomposed.
- */
-
-#include "config/config.hpp"
-
-#ifdef MMM1D_GPU
-
-#include "electrostatics/actor.hpp"
-
-class CoulombMMM1DGpu : public Coulomb::Actor<CoulombMMM1DGpu> {
-public:
-  double maxPWerror;
-  double far_switch_radius;
-  double far_switch_radius_sq;
-  int bessel_cutoff;
-
-  CoulombMMM1DGpu(double prefactor, double maxPWerror, double far_switch_radius,
-                  int bessel_cutoff);
-  ~CoulombMMM1DGpu();
-
-  // interface methods
-  void add_long_range_forces();
-  void add_long_range_energy();
-
-  void on_activation() {
-    setup_dependent_properties();
-    sanity_checks();
-    tune();
-  }
-  void on_boxl_change() { setup(); }
-  void on_node_grid_change() const {}
-  void on_periodicity_change() const { sanity_checks_periodicity(); }
-  void on_cell_structure_change() { sanity_checks_cell_structure(); }
-  void init() const {}
-
-  void sanity_checks() const {
-    sanity_checks_periodicity();
-    sanity_checks_cell_structure();
-    sanity_checks_charge_neutrality();
-  }
-
-  void tune();
-  bool is_tuned() const { return m_is_tuned; }
-
-private:
-  bool m_is_tuned = false;
-
-  // the box length currently set on the GPU
-  // Needed to make sure it hasn't been modified after inter coulomb was used.
-  float host_boxz = 0.f;
-  // the number of particles we had during the last run. Needed to check if we
-  // have to realloc dev_forcePairs
-  unsigned int host_npart = 0u;
-
-  // pairs==-1: un-initialized device memory
-  // pairs==0: return forces using atomicAdd
-  // pairs==2: return forces using a global memory reduction
-  int pairs = -1;
-  // variables for forces and energies calculated pre-reduction
-  float *dev_forcePairs = nullptr;
-  float *dev_energyBlocks = nullptr;
-
-  // run a single force calculation and return the time it takes using
-  // high-precision CUDA timers
-  float force_benchmark();
-
-  void setup();
-  void setup_dependent_properties();
-  void modpsi_init();
-  void set_params(double boxz, double prefactor, double maxPWerror,
-                  double far_switch_radius, int bessel_cutoff);
-  void tune(double maxPWerror, double far_switch_radius, int bessel_cutoff);
-  void sanity_checks_periodicity() const;
-  void sanity_checks_cell_structure() const;
-};
-
-#endif // MMM1D_GPU
diff --git a/src/core/electrostatics/mmm1d_gpu_cuda.cu b/src/core/electrostatics/mmm1d_gpu_cuda.cu
deleted file mode 100644
index dc3271ce52c..00000000000
--- a/src/core/electrostatics/mmm1d_gpu_cuda.cu
+++ /dev/null
@@ -1,607 +0,0 @@
-/*
- * Copyright (C) 2014-2022 The ESPResSo project
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/** @file
- *  This file contains the code for the polygamma expansions used for the
- *  near formulas of MMM1D on GPU, as well as the force kernels.
- */
-
-#include "config/config.hpp"
-
-#ifdef MMM1D_GPU
-
-#include "electrostatics/mmm-modpsi.hpp"
-#include "electrostatics/mmm1d_gpu.hpp"
-#include "electrostatics/specfunc.cuh"
-
-#include "BoxGeometry.hpp"
-#include "cuda/utils.cuh"
-#include "system/System.hpp"
-
-#include <utils/constants.hpp>
-#include <utils/math/sqr.hpp>
-
-#include <cuda.h>
-
-#include <cmath>
-#include <cstddef>
-#include <cstdio>
-#include <stdexcept>
-#include <vector>
-
-#if defined(OMPI_MPI_H) || defined(_MPI_H)
-#error CU-file includes mpi.h! This should not happen!
-#endif
-
-// the code is mostly multi-GPU capable, but ESPResSo is not yet
-static constexpr int deviceCount = 1;
-static constexpr unsigned int numThreads = 64u;
-
-#undef cudaSetDevice
-#define cudaSetDevice(d)
-
-__constant__ float far_switch_radius_sq[1] = {0.05f * 0.05f};
-__constant__ float boxz[1];
-__constant__ float uz[1];
-__constant__ float coulomb_prefactor[1] = {1.0f};
-__constant__ int bessel_cutoff[1] = {5};
-__constant__ float maxPWerror[1] = {1e-5f};
-
-// As the coefficients are stored in __constant__ memory, the array needs to be
-// sized in advance. We don't know exactly how many coefficients per order, so
-// we size plentiful.
-constexpr int modpsi_order = 30;
-constexpr int modpsi_constant_size = modpsi_order * modpsi_order * 2;
-
-// linearized array on device
-__constant__ int device_n_modPsi[1] = {0};
-__constant__ unsigned int device_linModPsi_offsets[2 * modpsi_order];
-__constant__ unsigned int device_linModPsi_lengths[2 * modpsi_order];
-__constant__ float device_linModPsi[modpsi_constant_size];
-
-__device__ float dev_mod_psi_even(int n, float x) {
-  return evaluateAsTaylorSeriesAt(
-      &device_linModPsi[device_linModPsi_offsets[2 * n]],
-      static_cast<int>(device_linModPsi_lengths[2 * n]), x * x);
-}
-
-__device__ float dev_mod_psi_odd(int n, float x) {
-  return x * evaluateAsTaylorSeriesAt(
-                 &device_linModPsi[device_linModPsi_offsets[2 * n + 1]],
-                 static_cast<int>(device_linModPsi_lengths[2 * n + 1]), x * x);
-}
-
-void CoulombMMM1DGpu::modpsi_init() {
-  create_mod_psi_up_to(modpsi_order);
-
-  // linearized array on host
-  std::vector<unsigned int> linModPsi_offsets(modPsi.size());
-  std::vector<unsigned int> linModPsi_lengths(modPsi.size());
-  for (std::size_t i = 0; i < modPsi.size(); i++) {
-    if (i)
-      linModPsi_offsets[i] =
-          linModPsi_offsets[i - 1] + linModPsi_lengths[i - 1];
-    linModPsi_lengths[i] = static_cast<unsigned int>(modPsi[i].size());
-  }
-
-  // linearize the coefficients array
-  std::vector<float> linModPsi(linModPsi_offsets[modPsi.size() - 1] +
-                               linModPsi_lengths[modPsi.size() - 1]);
-  for (std::size_t i = 0; i < modPsi.size(); i++) {
-    for (std::size_t j = 0; j < modPsi[i].size(); j++) {
-      linModPsi[linModPsi_offsets[i] + j] = static_cast<float>(modPsi[i][j]);
-    }
-  }
-
-  for (int d = 0; d < deviceCount; d++) {
-    cudaSetDevice(d);
-
-    // copy to GPU
-    auto const linModPsiSize = linModPsi_offsets[modPsi.size() - 1] +
-                               linModPsi_lengths[modPsi.size() - 1];
-    if (linModPsiSize > static_cast<unsigned int>(modpsi_constant_size)) {
-      throw std::runtime_error(
-          "__constant__ device_linModPsi[] is not large enough");
-    }
-    cuda_safe_mem(cudaMemcpyToSymbol(device_linModPsi_offsets,
-                                     linModPsi_offsets.data(),
-                                     modPsi.size() * sizeof(int)));
-    cuda_safe_mem(cudaMemcpyToSymbol(device_linModPsi_lengths,
-                                     linModPsi_lengths.data(),
-                                     modPsi.size() * sizeof(int)));
-    cuda_safe_mem(cudaMemcpyToSymbol(device_linModPsi, linModPsi.data(),
-                                     linModPsiSize * sizeof(float)));
-    auto const n_modPsi = static_cast<int>(modPsi.size() >> 1);
-    cuda_safe_mem(cudaMemcpyToSymbol(device_n_modPsi, &n_modPsi, sizeof(int)));
-  }
-}
-
-/** @brief Get number of blocks for a given number of operations. */
-static auto numBlocksOps(std::size_t n_ops) {
-  auto const n_ops_per_thread = n_ops / static_cast<std::size_t>(numThreads);
-  return 1u + static_cast<unsigned int>(n_ops_per_thread);
-}
-
-/** @brief Get number of blocks for a given number of particles. */
-static auto numBlocks(std::size_t n_part) {
-  auto b = numBlocksOps(Utils::sqr(n_part)); // algorithm is N-square
-  if (b > 65535u)
-    b = 65535u;
-  return b;
-}
-
-void CoulombMMM1DGpu::setup() {
-  auto &system = get_system();
-  auto const box_z = static_cast<float>(system.box_geo->length()[2]);
-  auto const n_part = system.gpu.n_particles();
-  if (not m_is_tuned and n_part != 0) {
-    set_params(box_z, prefactor, maxPWerror, far_switch_radius, bessel_cutoff);
-    tune(maxPWerror, far_switch_radius, bessel_cutoff);
-  }
-  if (box_z != host_boxz) {
-    set_params(box_z, 0, -1, -1, -1);
-  }
-  // skip device memory reallocation if device memory is already
-  // allocated with the correct vector lengths
-  if (n_part == host_npart and pairs != -1) {
-    return;
-  }
-  // For all but the largest systems, it is faster to store force pairs
-  // and then sum them up. Atomics are slow, so we only use them when
-  // we're limited by device memory, do the latter.
-  auto const part_mem_size = 3ul * Utils::sqr(n_part) * sizeof(float);
-  pairs = 2;
-  for (int d = 0; d < deviceCount; d++) {
-    cudaSetDevice(d);
-
-    std::size_t freeMem, totalMem;
-    cudaMemGetInfo(&freeMem, &totalMem);
-    if (freeMem / 2 < part_mem_size) {
-      // don't use more than half the device's memory
-      fprintf(stderr, "Switching to atomicAdd due to memory constraints.\n");
-      pairs = 0;
-      break;
-    }
-  }
-  if (dev_forcePairs) {
-    cuda_safe_mem(cudaFree(dev_forcePairs));
-  }
-  if (pairs) {
-    // we need memory to store force pairs
-    cuda_safe_mem(cudaMalloc((void **)&dev_forcePairs, part_mem_size));
-  }
-  if (dev_energyBlocks) {
-    cuda_safe_mem(cudaFree(dev_energyBlocks));
-  }
-  cuda_safe_mem(cudaMalloc((void **)&dev_energyBlocks,
-                           numBlocks(n_part) * sizeof(float)));
-  host_npart = static_cast<unsigned int>(n_part);
-}
-
-CoulombMMM1DGpu::~CoulombMMM1DGpu() {
-  if (dev_forcePairs) {
-    cuda_safe_mem(cudaFree(dev_forcePairs));
-  }
-  if (dev_energyBlocks) {
-    cuda_safe_mem(cudaFree(dev_energyBlocks));
-  }
-}
-
-__forceinline__ __device__ float sqpow(float x) { return x * x; }
-__forceinline__ __device__ float cbpow(float x) { return x * x * x; }
-
-__device__ void sumReduction(float *input, float *sum) {
-  auto const tid = threadIdx.x;
-  for (auto i = blockDim.x / 2; i > 0; i /= 2) {
-    __syncthreads();
-    if (tid < i)
-      input[tid] += input[i + tid];
-  }
-  __syncthreads();
-  if (tid == 0)
-    sum[0] = input[0];
-}
-
-__global__ void sumKernel(float *data, std::size_t N) {
-  extern __shared__ float partialsums[];
-  if (blockIdx.x != 0)
-    return;
-  std::size_t const tid = threadIdx.x;
-  auto result = 0.f;
-
-  for (std::size_t i = 0; i < N; i += blockDim.x) {
-    if (i + tid >= N)
-      partialsums[tid] = 0.f;
-    else
-      partialsums[tid] = data[i + tid];
-
-    sumReduction(partialsums, &result);
-    if (tid == 0) {
-      if (i == 0)
-        data[0] = 0.f;
-      data[0] += result;
-    }
-  }
-}
-
-__global__ void besselTuneKernel(int *result, float far_switch_radius,
-                                 int maxCut) {
-  constexpr auto c_2pif = 2 * Utils::pi<float>();
-  auto const arg = c_2pif * *uz * far_switch_radius;
-  auto const pref = 4 * *uz * max(1.0f, c_2pif * *uz);
-  float err;
-  int P = 1;
-  do {
-    err = pref * dev_K1(arg * static_cast<float>(P)) * exp(arg) / arg *
-          (static_cast<float>(P) - 1 + 1 / arg);
-    P++;
-  } while (err > *maxPWerror && P <= maxCut);
-  P--;
-
-  result[0] = P;
-}
-
-void CoulombMMM1DGpu::tune(double maxPWerror, double far_switch_radius,
-                           int bessel_cutoff) {
-
-  if (far_switch_radius < 0.0 && bessel_cutoff < 0) {
-    // autodetermine switching radius and Bessel cutoff
-    auto const maxrad = host_boxz;
-    auto bestrad = 0.0;
-    float besttime = INFINITY;
-    auto radius = 0.05 * maxrad;
-    while (radius < maxrad) {
-      set_params(0, 0, maxPWerror, radius, bessel_cutoff);
-      tune(maxPWerror, radius, -2); // tune Bessel cutoff
-      auto const runtime = force_benchmark();
-      if (runtime < besttime) {
-        besttime = runtime;
-        bestrad = radius;
-      }
-      radius += 0.05 * maxrad;
-    }
-    set_params(0, 0, maxPWerror, bestrad, bessel_cutoff);
-    tune(maxPWerror, bestrad, -2); // tune Bessel cutoff
-  } else if (bessel_cutoff < 0) {
-    // autodetermine Bessel cutoff
-    auto const far_switch_radius_f = static_cast<float>(far_switch_radius);
-    int *dev_cutoff;
-    constexpr auto maxCut = 30;
-    cuda_safe_mem(cudaMalloc((void **)&dev_cutoff, sizeof(int)));
-    besselTuneKernel<<<dim3(1), dim3(1), 0, nullptr>>>(
-        dev_cutoff, far_switch_radius_f, maxCut);
-    int best_cutoff = 0;
-    cuda_safe_mem(cudaMemcpy(&best_cutoff, dev_cutoff, sizeof(int),
-                             cudaMemcpyDeviceToHost));
-    cuda_safe_mem(cudaFree(dev_cutoff));
-    if (bessel_cutoff != -2 && best_cutoff >= maxCut) {
-      // we already had our switching radius and only needed to
-      // determine the cutoff, i.e. this was the final tuning round
-      throw std::runtime_error(
-          "No reasonable Bessel cutoff could be determined.");
-    }
-
-    set_params(0, 0, maxPWerror, far_switch_radius, best_cutoff);
-  }
-}
-
-void CoulombMMM1DGpu::set_params(double boxz, double prefactor,
-                                 double maxPWerror, double far_switch_radius,
-                                 int bessel_cutoff) {
-  if (boxz > 0.0 && far_switch_radius > boxz) {
-    throw std::runtime_error(
-        "switching radius must not be larger than box length");
-  }
-
-  for (int d = 0; d < deviceCount; d++) {
-    cudaSetDevice(d);
-    if (far_switch_radius >= 0.0) {
-      this->far_switch_radius = far_switch_radius;
-      far_switch_radius_sq = Utils::sqr(far_switch_radius);
-      auto const far_switch_radius_sq_f =
-          static_cast<float>(far_switch_radius_sq);
-      cuda_safe_mem(cudaMemcpyToSymbol(::far_switch_radius_sq,
-                                       &far_switch_radius_sq_f, sizeof(float)));
-    }
-    if (boxz > 0.0) {
-      host_boxz = static_cast<float>(boxz);
-      auto const uz = 1.0f / host_boxz;
-      cuda_safe_mem(cudaMemcpyToSymbol(::boxz, &host_boxz, sizeof(float)));
-      cuda_safe_mem(cudaMemcpyToSymbol(::uz, &uz, sizeof(float)));
-    }
-    if (prefactor != 0.0) {
-      this->prefactor = prefactor;
-      auto const prefactor_f = static_cast<float>(prefactor);
-      cuda_safe_mem(
-          cudaMemcpyToSymbol(::coulomb_prefactor, &prefactor_f, sizeof(float)));
-    }
-    if (bessel_cutoff > 0) {
-      this->bessel_cutoff = bessel_cutoff;
-      cuda_safe_mem(
-          cudaMemcpyToSymbol(::bessel_cutoff, &bessel_cutoff, sizeof(int)));
-    }
-    if (maxPWerror > 0.0) {
-      this->maxPWerror = maxPWerror;
-      auto const maxPWerror_f = static_cast<float>(maxPWerror);
-      cuda_safe_mem(
-          cudaMemcpyToSymbol(::maxPWerror, &maxPWerror_f, sizeof(float)));
-    }
-  }
-  m_is_tuned = false;
-}
-
-__global__ void forcesKernel(float const *const __restrict__ r,
-                             float const *const __restrict__ q,
-                             float *const __restrict__ force, std::size_t N,
-                             int pairs) {
-
-  constexpr auto c_2pif = 2.f * Utils::pi<float>();
-  auto const tStop = Utils::sqr(N);
-
-  for (std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < tStop;
-       tid += blockDim.x * gridDim.x) {
-    auto const p1 = tid % N, p2 = tid / N;
-    auto x = r[3 * p2 + 0] - r[3 * p1 + 0];
-    auto y = r[3 * p2 + 1] - r[3 * p1 + 1];
-    auto z = r[3 * p2 + 2] - r[3 * p1 + 2];
-    auto const rxy2 = sqpow(x) + sqpow(y);
-    auto rxy = sqrt(rxy2);
-    auto sum_r = 0.f;
-    auto sum_z = 0.f;
-
-    while (fabs(z) > *boxz / 2.f) // make sure we take the shortest distance
-      z -= (z > 0.f ? 1.f : -1.f) * *boxz;
-
-    if (p1 == p2) {
-      // particle exerts no force on itself
-      rxy = 1.f; // so the division at the end doesn't fail with NaN
-                 // (sum_r is 0 anyway)
-    } else if (rxy2 <= *far_switch_radius_sq) {
-      // near formula
-      auto const uzz = *uz * z;
-      auto const uzr = *uz * rxy;
-      sum_z = dev_mod_psi_odd(0, uzz);
-      auto uzrpow = uzr;
-      for (int n = 1; n < *device_n_modPsi; n++) {
-        auto const sum_r_old = sum_r;
-        auto const mpe = dev_mod_psi_even(n, uzz);
-        auto const mpo = dev_mod_psi_odd(n, uzz);
-
-        sum_r += 2 * static_cast<float>(n) * mpe * uzrpow;
-        uzrpow *= uzr;
-        sum_z += mpo * uzrpow;
-        uzrpow *= uzr;
-
-        if (fabs(sum_r_old - sum_r) < *maxPWerror)
-          break;
-      }
-
-      sum_r *= sqpow(*uz);
-      sum_z *= sqpow(*uz);
-
-      sum_r += rxy * cbpow(rsqrt(rxy2 + sqpow(z)));
-      sum_r += rxy * cbpow(rsqrt(rxy2 + sqpow(z + *boxz)));
-      sum_r += rxy * cbpow(rsqrt(rxy2 + sqpow(z - *boxz)));
-
-      sum_z += z * cbpow(rsqrt(rxy2 + sqpow(z)));
-      sum_z += (z + *boxz) * cbpow(rsqrt(rxy2 + sqpow(z + *boxz)));
-      sum_z += (z - *boxz) * cbpow(rsqrt(rxy2 + sqpow(z - *boxz)));
-
-      if (rxy == 0.f) {
-        // particles at the same radial position only exert a force
-        // in z direction
-        rxy = 1.f; // so the division at the end doesn't fail with NaN
-                   // (sum_r is 0 anyway)
-      }
-    } else {
-      // far formula
-      for (int p = 1; p < *bessel_cutoff; p++) {
-        float arg = c_2pif * *uz * static_cast<float>(p);
-        sum_r += static_cast<float>(p) * dev_K1(arg * rxy) * cos(arg * z);
-        sum_z += static_cast<float>(p) * dev_K0(arg * rxy) * sin(arg * z);
-      }
-      sum_r *= sqpow(*uz) * 4.f * c_2pif;
-      sum_z *= sqpow(*uz) * 4.f * c_2pif;
-      sum_r += 2.f * *uz / rxy;
-    }
-
-    auto const pref = *coulomb_prefactor * q[p1] * q[p2];
-    if (pairs) {
-      force[3 * (p1 + p2 * N) + 0] = pref * sum_r / rxy * x;
-      force[3 * (p1 + p2 * N) + 1] = pref * sum_r / rxy * y;
-      force[3 * (p1 + p2 * N) + 2] = pref * sum_z;
-    } else {
-      atomicAdd(&force[3 * p2 + 0], pref * sum_r / rxy * x);
-      atomicAdd(&force[3 * p2 + 1], pref * sum_r / rxy * y);
-      atomicAdd(&force[3 * p2 + 2], pref * sum_z);
-    }
-  }
-}
-
-__global__ void energiesKernel(float const *const __restrict__ r,
-                               float const *const __restrict__ q,
-                               float *const __restrict__ energy, std::size_t N,
-                               int pairs) {
-
-  constexpr auto c_2pif = 2.f * Utils::pi<float>();
-  constexpr auto c_gammaf = Utils::gamma<float>();
-  auto const tStop = Utils::sqr(N);
-
-  extern __shared__ float partialsums[];
-  if (!pairs) {
-    partialsums[threadIdx.x] = 0;
-    __syncthreads();
-  }
-  for (std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < tStop;
-       tid += blockDim.x * gridDim.x) {
-    auto const p1 = tid % N, p2 = tid / N;
-    auto z = r[3 * p2 + 2] - r[3 * p1 + 2];
-    auto const rxy2 = sqpow(r[3 * p2 + 0] - r[3 * p1 + 0]) +
-                      sqpow(r[3 * p2 + 1] - r[3 * p1 + 1]);
-    auto rxy = sqrt(rxy2);
-    auto sum_e = 0.f;
-
-    while (fabs(z) > *boxz / 2.f) // make sure we take the shortest distance
-      z -= (z > 0.f ? 1.f : -1.f) * *boxz;
-
-    if (p1 == p2) // particle exerts no force on itself
-    {
-    } else if (rxy2 <= *far_switch_radius_sq) // near formula
-    {
-      auto const uzz = *uz * z;
-      auto const uzr2 = sqpow(*uz * rxy);
-      auto uzrpow = uzr2;
-      sum_e = dev_mod_psi_even(0, uzz);
-      for (int n = 1; n < *device_n_modPsi; n++) {
-        auto const sum_e_old = sum_e;
-        auto const mpe = dev_mod_psi_even(n, uzz);
-        sum_e += mpe * uzrpow;
-        uzrpow *= uzr2;
-
-        if (fabs(sum_e_old - sum_e) < *maxPWerror)
-          break;
-      }
-
-      sum_e *= -1.f * *uz;
-      sum_e -= 2.f * *uz * c_gammaf;
-      sum_e += rsqrt(rxy2 + sqpow(z));
-      sum_e += rsqrt(rxy2 + sqpow(z + *boxz));
-      sum_e += rsqrt(rxy2 + sqpow(z - *boxz));
-    } else // far formula
-    {
-      sum_e = -(log(rxy * *uz / 2.f) + c_gammaf) / 2.f;
-      for (int p = 1; p < *bessel_cutoff; p++) {
-        auto const arg = c_2pif * *uz * static_cast<float>(p);
-        sum_e += dev_K0(arg * rxy) * cos(arg * z);
-      }
-      sum_e *= *uz * 4.f;
-    }
-
-    if (pairs) {
-      energy[p1 + p2 * N] = *coulomb_prefactor * q[p1] * q[p2] * sum_e;
-    } else {
-      partialsums[threadIdx.x] += *coulomb_prefactor * q[p1] * q[p2] * sum_e;
-    }
-  }
-  if (!pairs) {
-    sumReduction(partialsums, &energy[blockIdx.x]);
-  }
-}
-
-__global__ void vectorReductionKernel(float const *src, float *dst,
-                                      std::size_t N) {
-
-  auto const tStop = Utils::sqr(N);
-
-  for (std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < N;
-       tid += blockDim.x * gridDim.x) {
-    auto const offset = tid % N;
-    for (std::size_t i = 0; tid + i * N < tStop; i++) {
-#pragma unroll 3
-      for (std::size_t d = 0; d < 3; d++) {
-        dst[3 * offset + d] -= src[3 * (tid + i * N) + d];
-      }
-    }
-  }
-}
-
-void CoulombMMM1DGpu::add_long_range_forces() {
-  setup();
-
-  if (pairs < 0) {
-    throw std::runtime_error("MMM1D was not initialized correctly");
-  }
-
-  auto &gpu = get_system().gpu;
-  auto const positions_device = gpu.get_particle_positions_device();
-  auto const charges_device = gpu.get_particle_charges_device();
-  auto const forces_device = gpu.get_particle_forces_device();
-  auto const n_part = gpu.n_particles();
-  if (pairs) {
-    // if we calculate force pairs, we need to reduce them to forces
-    auto const numBlocksRed = numBlocksOps(n_part);
-    KERNELCALL(forcesKernel, numBlocks(n_part), numThreads, positions_device,
-               charges_device, dev_forcePairs, n_part, pairs)
-    KERNELCALL(vectorReductionKernel, numBlocksRed, numThreads, dev_forcePairs,
-               forces_device, n_part)
-  } else {
-    KERNELCALL(forcesKernel, numBlocks(n_part), numThreads, positions_device,
-               charges_device, forces_device, n_part, pairs)
-  }
-}
-
-__global__ void scaleAndAddKernel(float *const dst, float const *const src,
-                                  std::size_t N, float factor) {
-  for (std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < N;
-       tid += blockDim.x * gridDim.x) {
-    dst[tid] += src[tid] * factor;
-  }
-}
-
-void CoulombMMM1DGpu::add_long_range_energy() {
-  setup();
-
-  if (pairs < 0) {
-    throw std::runtime_error("MMM1D was not initialized correctly");
-  }
-
-  auto &gpu = get_system().gpu;
-  auto const positions_device = gpu.get_particle_positions_device();
-  auto const charges_device = gpu.get_particle_charges_device();
-  auto const energy_device = gpu.get_energy_device();
-  auto const n_part = gpu.n_particles();
-  auto const shared = numThreads * static_cast<unsigned>(sizeof(float));
-  KERNELCALL_shared(energiesKernel, numBlocks(n_part), numThreads, shared,
-                    positions_device, charges_device, dev_energyBlocks, n_part,
-                    0);
-  KERNELCALL_shared(sumKernel, 1, numThreads, shared, dev_energyBlocks,
-                    numBlocks(n_part));
-  // we count every interaction twice, so halve the total energy
-  auto constexpr factor = 0.5f;
-  KERNELCALL(scaleAndAddKernel, 1, 1, &(energy_device->coulomb),
-             dev_energyBlocks, 1, factor);
-}
-
-float CoulombMMM1DGpu::force_benchmark() {
-  cudaEvent_t eventStart, eventStop;
-  float elapsedTime;
-  float *dev_f_benchmark;
-
-  auto &gpu = get_system().gpu;
-  auto const positions_device = gpu.get_particle_positions_device();
-  auto const charges_device = gpu.get_particle_charges_device();
-  auto const n_part = gpu.n_particles();
-  cuda_safe_mem(
-      cudaMalloc((void **)&dev_f_benchmark, 3ul * n_part * sizeof(float)));
-  cuda_safe_mem(cudaEventCreate(&eventStart));
-  cuda_safe_mem(cudaEventCreate(&eventStop));
-  cuda_safe_mem(cudaEventRecord(eventStart, stream[0]));
-  KERNELCALL(forcesKernel, numBlocks(n_part), numThreads, positions_device,
-             charges_device, dev_f_benchmark, n_part, 0)
-  cuda_safe_mem(cudaEventRecord(eventStop, stream[0]));
-  cuda_safe_mem(cudaEventSynchronize(eventStop));
-  cuda_safe_mem(cudaEventElapsedTime(&elapsedTime, eventStart, eventStop));
-  cuda_safe_mem(cudaEventDestroy(eventStart));
-  cuda_safe_mem(cudaEventDestroy(eventStop));
-  cuda_safe_mem(cudaFree(dev_f_benchmark));
-
-  return elapsedTime;
-}
-
-#endif // MMM1D_GPU
diff --git a/src/core/electrostatics/p3m.cpp b/src/core/electrostatics/p3m.cpp
index d8ec633abdb..5e604a4fd51 100644
--- a/src/core/electrostatics/p3m.cpp
+++ b/src/core/electrostatics/p3m.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 The ESPResSo project
+ * Copyright (C) 2010-2024 The ESPResSo project
  * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
  *   Max-Planck-Institute for Polymer Research, Theory Group
  *
@@ -33,10 +33,12 @@
 #include "electrostatics/p3m_gpu.hpp"
 #include "electrostatics/p3m_gpu_error.hpp"
 
+#include "fft/fft.hpp"
 #include "p3m/TuningAlgorithm.hpp"
 #include "p3m/TuningLogger.hpp"
-#include "p3m/fft.hpp"
+#include "p3m/for_each_3d.hpp"
 #include "p3m/influence_function.hpp"
+#include "p3m/math.hpp"
 
 #include "BoxGeometry.hpp"
 #include "LocalBox.hpp"
@@ -54,12 +56,9 @@
 #include "system/System.hpp"
 #include "tuning.hpp"
 
-#include <utils/Span.hpp>
 #include <utils/Vector.hpp>
-#include <utils/constants.hpp>
 #include <utils/integral_parameter.hpp>
 #include <utils/math/int_pow.hpp>
-#include <utils/math/sinc.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <boost/mpi/collectives/all_reduce.hpp>
@@ -75,10 +74,14 @@
 #include <complex>
 #include <cstddef>
 #include <functional>
+#include <initializer_list>
+#include <numbers>
 #include <optional>
+#include <span>
 #include <sstream>
 #include <stdexcept>
 #include <string>
+#include <tuple>
 #include <utility>
 
 void CoulombP3M::count_charged_particles() {
@@ -103,64 +106,58 @@ void CoulombP3M::count_charged_particles() {
 /** Calculate the optimal influence function of @cite hockney88a.
  *  (optimised for force calculations)
  *
- *  Each node calculates only the values for its domain in k-space
- *  (see fft.plan[3].mesh and fft.plan[3].start).
+ *  Each node calculates only the values for its domain in k-space.
  *
  *  See also: @cite hockney88a eq. 8-22 (p. 275). Note the somewhat
  *  different convention for the prefactors, which is described in
  *  @cite deserno98a @cite deserno98b.
  */
 void CoulombP3M::calc_influence_function_force() {
-  auto const start = Utils::Vector3i{p3m.fft.plan[3].start};
-  auto const size = Utils::Vector3i{p3m.fft.plan[3].new_mesh};
-
-  p3m.g_force = grid_influence_function<1>(p3m.params, start, start + size,
-                                           get_system().box_geo->length());
+  auto const [KX, KY, KZ] = p3m.fft->get_permutations();
+  p3m.g_force =
+      grid_influence_function<1>(p3m.params, p3m.mesh.start, p3m.mesh.stop, KX,
+                                 KY, KZ, get_system().box_geo->length_inv());
 }
 
 /** Calculate the influence function optimized for the energy and the
  *  self energy correction.
  */
 void CoulombP3M::calc_influence_function_energy() {
-  auto const start = Utils::Vector3i{p3m.fft.plan[3].start};
-  auto const size = Utils::Vector3i{p3m.fft.plan[3].new_mesh};
-
-  p3m.g_energy = grid_influence_function<0>(p3m.params, start, start + size,
-                                            get_system().box_geo->length());
+  auto const [KX, KY, KZ] = p3m.fft->get_permutations();
+  p3m.g_energy =
+      grid_influence_function<0>(p3m.params, p3m.mesh.start, p3m.mesh.stop, KX,
+                                 KY, KZ, get_system().box_geo->length_inv());
 }
 
 /** Aliasing sum used by @ref p3m_k_space_error. */
-static auto p3m_tune_aliasing_sums(int nx, int ny, int nz,
+static auto p3m_tune_aliasing_sums(Utils::Vector3i const &shift,
                                    Utils::Vector3i const &mesh,
                                    Utils::Vector3d const &mesh_i, int cao,
                                    double alpha_L_i) {
-  using Utils::sinc;
-
-  auto const factor1 = Utils::sqr(Utils::pi() * alpha_L_i);
 
+  auto constexpr mesh_start = Utils::Vector3i::broadcast(-P3M_BRILLOUIN);
+  auto constexpr mesh_stop = Utils::Vector3i::broadcast(P3M_BRILLOUIN + 1);
+  auto const factor1 = Utils::sqr(std::numbers::pi * alpha_L_i);
   auto alias1 = 0.;
   auto alias2 = 0.;
-  for (int mx = -P3M_BRILLOUIN; mx <= P3M_BRILLOUIN; mx++) {
-    auto const nmx = nx + mx * mesh[0];
-    auto const fnmx = mesh_i[0] * nmx;
-    for (int my = -P3M_BRILLOUIN; my <= P3M_BRILLOUIN; my++) {
-      auto const nmy = ny + my * mesh[1];
-      auto const fnmy = mesh_i[1] * nmy;
-      for (int mz = -P3M_BRILLOUIN; mz <= P3M_BRILLOUIN; mz++) {
-        auto const nmz = nz + mz * mesh[2];
-        auto const fnmz = mesh_i[2] * nmz;
-
-        auto const nm2 = Utils::sqr(nmx) + Utils::sqr(nmy) + Utils::sqr(nmz);
-        auto const ex = exp(-factor1 * nm2);
-        auto const ex2 = Utils::sqr(ex);
-
-        auto const U2 = pow(sinc(fnmx) * sinc(fnmy) * sinc(fnmz), 2.0 * cao);
-
-        alias1 += ex2 / nm2;
-        alias2 += U2 * ex * (nx * nmx + ny * nmy + nz * nmz) / nm2;
-      }
-    }
-  }
+
+  Utils::Vector3i indices{};
+  Utils::Vector3i nm{};
+  Utils::Vector3d fnm{};
+  for_each_3d(
+      mesh_start, mesh_stop, indices,
+      [&]() {
+        auto const norm_sq = nm.norm2();
+        auto const ex = exp(-factor1 * norm_sq);
+        auto const energy = std::pow(Utils::product(fnm), 2 * cao);
+        alias1 += Utils::sqr(ex) / norm_sq;
+        alias2 += energy * ex * (shift * nm) / norm_sq;
+      },
+      [&](unsigned dim, int n) {
+        nm[dim] = shift[dim] + n * mesh[dim];
+        fnm[dim] = math::sinc(nm[dim] * mesh_i[dim]);
+      });
+
   return std::make_pair(alias1, alias2);
 }
 
@@ -198,51 +195,40 @@ static double p3m_real_space_error(double pref, double r_cut_iL, int n_c_part,
 static double p3m_k_space_error(double pref, Utils::Vector3i const &mesh,
                                 int cao, int n_c_part, double sum_q2,
                                 double alpha_L, Utils::Vector3d const &box_l) {
-  auto const mesh_i =
-      Utils::hadamard_division(Utils::Vector3d::broadcast(1.), mesh);
+
+  auto const cotangent_sum = math::get_analytic_cotangent_sum_kernel(cao);
+  auto const mesh_i = 1. / Utils::Vector3d(mesh);
   auto const alpha_L_i = 1. / alpha_L;
+  auto const mesh_stop = mesh / 2;
+  auto const mesh_start = -mesh_stop;
+  auto indices = Utils::Vector3i{};
+  auto values = Utils::Vector3d{};
   auto he_q = 0.;
 
-  for (int nx = -mesh[0] / 2; nx < mesh[0] / 2; nx++) {
-    auto const ctan_x = p3m_analytic_cotangent_sum(nx, mesh_i[0], cao);
-    for (int ny = -mesh[1] / 2; ny < mesh[1] / 2; ny++) {
-      auto const ctan_y =
-          ctan_x * p3m_analytic_cotangent_sum(ny, mesh_i[1], cao);
-      for (int nz = -mesh[2] / 2; nz < mesh[2] / 2; nz++) {
-        if ((nx != 0) || (ny != 0) || (nz != 0)) {
-          auto const n2 = Utils::sqr(nx) + Utils::sqr(ny) + Utils::sqr(nz);
-          auto const cs =
-              p3m_analytic_cotangent_sum(nz, mesh_i[2], cao) * ctan_y;
+  for_each_3d(
+      mesh_start, mesh_stop, indices,
+      [&]() {
+        if ((indices[0] != 0) or (indices[1] != 0) or (indices[2] != 0)) {
+          auto const n2 = indices.norm2();
+          auto const cs = Utils::product(values);
           auto const [alias1, alias2] =
-              p3m_tune_aliasing_sums(nx, ny, nz, mesh, mesh_i, cao, alpha_L_i);
+              p3m_tune_aliasing_sums(indices, mesh, mesh_i, cao, alpha_L_i);
           auto const d = alias1 - Utils::sqr(alias2 / cs) / n2;
           /* at high precision, d can become negative due to extinction;
              also, don't take values that have no significant digits left*/
-          if (d > 0 && (fabs(d / alias1) > ROUND_ERROR_PREC))
+          if (d > 0. and std::fabs(d / alias1) > ROUND_ERROR_PREC) {
             he_q += d;
+          }
         }
-      }
-    }
-  }
+      },
+      [&values, &mesh_i, cotangent_sum](unsigned dim, int n) {
+        values[dim] = cotangent_sum(n, mesh_i[dim]);
+      });
+
   return 2. * pref * sum_q2 * sqrt(he_q / static_cast<double>(n_c_part)) /
          (box_l[1] * box_l[2]);
 }
 
-#ifdef CUDA
-static double p3mgpu_k_space_error(double prefactor,
-                                   Utils::Vector3i const &mesh, int cao,
-                                   int npart, double sum_q2, double alpha_L,
-                                   Utils::Vector3d const &box_l) {
-  auto ks_err = 0.;
-  if (this_node == 0) {
-    ks_err = p3m_k_space_error_gpu(prefactor, mesh.data(), cao, npart, sum_q2,
-                                   alpha_L, box_l.data());
-  }
-  boost::mpi::broadcast(comm_cart, ks_err, 0);
-  return ks_err;
-}
-#endif
-
 void CoulombP3M::init() {
   assert(p3m.params.mesh >= Utils::Vector3i::broadcast(1));
   assert(p3m.params.cao >= 1 and p3m.params.cao <= 7);
@@ -264,18 +250,9 @@ void CoulombP3M::init() {
     elc_layer = actor->elc.space_layer;
   }
 
+  assert(p3m.fft);
   p3m.local_mesh.calc_local_ca_mesh(p3m.params, local_geo, skin, elc_layer);
-  p3m.sm.resize(comm_cart, p3m.local_mesh);
-
-  int ca_mesh_size = fft_init(p3m.local_mesh.dim, p3m.local_mesh.margin,
-                              p3m.params.mesh, p3m.params.mesh_off, p3m.ks_pnum,
-                              p3m.fft, ::communicator.node_grid, comm_cart);
-  p3m.rs_mesh.resize(ca_mesh_size);
-
-  for (auto &e : p3m.E_mesh) {
-    e.resize(ca_mesh_size);
-  }
-
+  p3m.fft->init_fft();
   p3m.calc_differential_operator();
 
   /* fix box length dependent constants */
@@ -288,8 +265,8 @@ CoulombP3M::CoulombP3M(P3MParameters &&parameters, double prefactor,
                        int tune_timings, bool tune_verbose,
                        bool check_complex_residuals)
     : p3m{std::move(parameters)}, tune_timings{tune_timings},
-      tune_verbose{tune_verbose}, check_complex_residuals{
-                                      check_complex_residuals} {
+      tune_verbose{tune_verbose},
+      check_complex_residuals{check_complex_residuals} {
 
   if (tune_timings <= 0) {
     throw std::domain_error("Parameter 'timings' must be > 0");
@@ -301,7 +278,7 @@ CoulombP3M::CoulombP3M(P3MParameters &&parameters, double prefactor,
 
 namespace {
 template <int cao> struct AssignCharge {
-  void operator()(p3m_data_struct &p3m, double q,
+  void operator()(decltype(CoulombP3M::p3m) &p3m, double q,
                   Utils::Vector3d const &real_pos,
                   p3m_interpolation_cache &inter_weights) {
     auto const w = p3m_calculate_interpolation_weights<cao>(
@@ -310,21 +287,22 @@ template <int cao> struct AssignCharge {
     inter_weights.store(w);
 
     p3m_interpolate(p3m.local_mesh, w, [q, &p3m](int ind, double w) {
-      p3m.rs_mesh[ind] += w * q;
+      p3m.mesh.rs_scalar[ind] += w * q;
     });
   }
 
-  void operator()(p3m_data_struct &p3m, double q,
+  void operator()(decltype(CoulombP3M::p3m) &p3m, double q,
                   Utils::Vector3d const &real_pos) {
     p3m_interpolate(
         p3m.local_mesh,
         p3m_calculate_interpolation_weights<cao>(real_pos, p3m.params.ai,
                                                  p3m.local_mesh),
-        [q, &p3m](int ind, double w) { p3m.rs_mesh[ind] += w * q; });
+        [q, &p3m](int ind, double w) { p3m.mesh.rs_scalar[ind] += w * q; });
   }
 
   template <typename combined_ranges>
-  void operator()(p3m_data_struct &p3m, combined_ranges const &p_q_pos_range) {
+  void operator()(decltype(CoulombP3M::p3m) &p3m,
+                  combined_ranges const &p_q_pos_range) {
     for (auto zipped : p_q_pos_range) {
       auto const p_q = boost::get<0>(zipped);
       auto const &p_pos = boost::get<1>(zipped);
@@ -341,7 +319,7 @@ void CoulombP3M::charge_assign(ParticleRange const &particles) {
 
   /* prepare local FFT mesh */
   for (int i = 0; i < p3m.local_mesh.size; i++)
-    p3m.rs_mesh[i] = 0.0;
+    p3m.mesh.rs_scalar[i] = 0.0;
 
   auto p_q_range = ParticlePropertyRange::charge_range(particles);
   auto p_pos_range = ParticlePropertyRange::pos_range(particles);
@@ -363,11 +341,8 @@ void CoulombP3M::assign_charge(double q, Utils::Vector3d const &real_pos) {
 
 template <int cao> struct AssignForces {
   template <typename combined_ranges>
-  void operator()(p3m_data_struct &p3m, double force_prefac,
+  void operator()(decltype(CoulombP3M::p3m) &p3m, double force_prefac,
                   combined_ranges const &p_q_force_range) const {
-    using Utils::make_const_span;
-    using Utils::Span;
-    using Utils::Vector;
 
     assert(cao == p3m.inter_weights.cao());
 
@@ -383,8 +358,9 @@ template <int cao> struct AssignForces {
 
         Utils::Vector3d force{};
         p3m_interpolate(p3m.local_mesh, w, [&force, &p3m](int ind, double w) {
-          force += w * Utils::Vector3d{p3m.E_mesh[0][ind], p3m.E_mesh[1][ind],
-                                       p3m.E_mesh[2][ind]};
+          force += w * Utils::Vector3d{p3m.mesh.rs_fields[0u][ind],
+                                       p3m.mesh.rs_fields[1u][ind],
+                                       p3m.mesh.rs_fields[2u][ind]};
         });
 
         p_force -= pref * force;
@@ -414,59 +390,54 @@ static auto calc_dipole_moment(boost::mpi::communicator const &comm,
  */
 Utils::Vector9d
 CoulombP3M::long_range_pressure(ParticleRange const &particles) {
-  using namespace detail::FFT_indexing;
-
   auto const &box_geo = *get_system().box_geo;
-
   Utils::Vector9d node_k_space_pressure_tensor{};
 
   if (p3m.sum_q2 > 0.) {
     charge_assign(particles);
-    p3m.sm.gather_grid(p3m.rs_mesh.data(), comm_cart, p3m.local_mesh.dim);
-    fft_perform_forw(p3m.rs_mesh.data(), p3m.fft, comm_cart);
+    p3m.fft->perform_fwd_fft();
 
-    auto diagonal = 0.;
-    int ind = 0;
-    int j[3];
+    auto constexpr mesh_start = Utils::Vector3i::broadcast(0);
+    auto const &mesh_stop = p3m.mesh.size;
+    auto const &offset = p3m.mesh.start;
     auto const half_alpha_inv_sq = Utils::sqr(1. / 2. / p3m.params.alpha);
-    for (j[0] = 0; j[0] < p3m.fft.plan[3].new_mesh[RX]; j[0]++) {
-      for (j[1] = 0; j[1] < p3m.fft.plan[3].new_mesh[RY]; j[1]++) {
-        for (j[2] = 0; j[2] < p3m.fft.plan[3].new_mesh[RZ]; j[2]++) {
-          auto const kx = 2. * Utils::pi() *
-                          p3m.d_op[RX][j[KX] + p3m.fft.plan[3].start[KX]] *
-                          box_geo.length_inv()[RX];
-          auto const ky = 2. * Utils::pi() *
-                          p3m.d_op[RY][j[KY] + p3m.fft.plan[3].start[KY]] *
-                          box_geo.length_inv()[RY];
-          auto const kz = 2. * Utils::pi() *
-                          p3m.d_op[RZ][j[KZ] + p3m.fft.plan[3].start[KZ]] *
-                          box_geo.length_inv()[RZ];
-          auto const sqk = Utils::sqr(kx) + Utils::sqr(ky) + Utils::sqr(kz);
-
-          if (sqk != 0.) {
-            auto const node_k_space_energy =
-                p3m.g_energy[ind] * (Utils::sqr(p3m.rs_mesh[2 * ind]) +
-                                     Utils::sqr(p3m.rs_mesh[2 * ind + 1]));
-            auto const vterm = -2. * (1. / sqk + half_alpha_inv_sq);
-            auto const pref = node_k_space_energy * vterm;
-            node_k_space_pressure_tensor[0] += pref * kx * kx; /* sigma_xx */
-            node_k_space_pressure_tensor[1] += pref * kx * ky; /* sigma_xy */
-            node_k_space_pressure_tensor[2] += pref * kx * kz; /* sigma_xz */
-            node_k_space_pressure_tensor[3] += pref * ky * kx; /* sigma_yx */
-            node_k_space_pressure_tensor[4] += pref * ky * ky; /* sigma_yy */
-            node_k_space_pressure_tensor[5] += pref * ky * kz; /* sigma_yz */
-            node_k_space_pressure_tensor[6] += pref * kz * kx; /* sigma_zx */
-            node_k_space_pressure_tensor[7] += pref * kz * ky; /* sigma_zy */
-            node_k_space_pressure_tensor[8] += pref * kz * kz; /* sigma_zz */
-            diagonal += node_k_space_energy;
-          }
-          ind++;
-        }
+    auto const wavevector = (2. * std::numbers::pi) * box_geo.length_inv();
+    auto const [KX, KY, KZ] = p3m.fft->get_permutations();
+    auto indices = Utils::Vector3i{};
+    auto index = std::size_t(0u);
+    auto diagonal = 0.;
+
+    for_each_3d(mesh_start, mesh_stop, indices, [&]() {
+      auto const shift = indices + offset;
+      auto const kx = p3m.d_op[0u][shift[KX]] * wavevector[0u];
+      auto const ky = p3m.d_op[1u][shift[KY]] * wavevector[1u];
+      auto const kz = p3m.d_op[2u][shift[KZ]] * wavevector[2u];
+      auto const norm_sq = Utils::sqr(kx) + Utils::sqr(ky) + Utils::sqr(kz);
+
+      if (norm_sq != 0.) {
+        auto const node_k_space_energy =
+            p3m.g_energy[index] *
+            (Utils::sqr(p3m.mesh.rs_scalar[2u * index + 0u]) +
+             Utils::sqr(p3m.mesh.rs_scalar[2u * index + 1u]));
+        auto const vterm = -2. * (1. / norm_sq + half_alpha_inv_sq);
+        auto const pref = node_k_space_energy * vterm;
+        node_k_space_pressure_tensor[0u] += pref * kx * kx; /* sigma_xx */
+        node_k_space_pressure_tensor[1u] += pref * kx * ky; /* sigma_xy */
+        node_k_space_pressure_tensor[2u] += pref * kx * kz; /* sigma_xz */
+        node_k_space_pressure_tensor[4u] += pref * ky * ky; /* sigma_yy */
+        node_k_space_pressure_tensor[5u] += pref * ky * kz; /* sigma_yz */
+        node_k_space_pressure_tensor[8u] += pref * kz * kz; /* sigma_zz */
+        diagonal += node_k_space_energy;
       }
-    }
-    node_k_space_pressure_tensor[0] += diagonal;
-    node_k_space_pressure_tensor[4] += diagonal;
-    node_k_space_pressure_tensor[8] += diagonal;
+      ++index;
+    });
+
+    node_k_space_pressure_tensor[0u] += diagonal;
+    node_k_space_pressure_tensor[4u] += diagonal;
+    node_k_space_pressure_tensor[8u] += diagonal;
+    node_k_space_pressure_tensor[3u] = node_k_space_pressure_tensor[1u];
+    node_k_space_pressure_tensor[6u] = node_k_space_pressure_tensor[2u];
+    node_k_space_pressure_tensor[7u] = node_k_space_pressure_tensor[5u];
   }
 
   return node_k_space_pressure_tensor * prefactor / (2. * box_geo.volume());
@@ -488,10 +459,7 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag,
             system.coulomb.impl->solver)) {
       charge_assign(particles);
     }
-    /* Gather information for FFT grid inside the nodes domain (inner local
-     * mesh) and perform forward 3D FFT (Charge Assignment Mesh). */
-    p3m.sm.gather_grid(p3m.rs_mesh.data(), comm_cart, p3m.local_mesh.dim);
-    fft_perform_forw(p3m.rs_mesh.data(), p3m.fft, comm_cart);
+    p3m.fft->perform_fwd_fft();
   }
 
   auto p_q_range = ParticlePropertyRange::charge_range(particles);
@@ -508,51 +476,47 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag,
                 comm_cart, boost::combine(p_q_range, p_unfolded_pos_range)))
           : std::nullopt;
   auto const volume = box_geo.volume();
-  auto const pref = 4. * Utils::pi() / volume / (2. * p3m.params.epsilon + 1.);
+  auto const pref =
+      4. * std::numbers::pi / volume / (2. * p3m.params.epsilon + 1.);
 
   /* === k-space force calculation  === */
   if (force_flag) {
-    /* sqrt(-1)*k differentiation */
-    int j[3];
-    int ind = 0;
+    /* i*k differentiation */
+    auto constexpr mesh_start = Utils::Vector3i::broadcast(0);
+    auto const &mesh_stop = p3m.mesh.size;
+    auto const &offset = p3m.mesh.start;
+    auto const wavevector = (2. * std::numbers::pi) * box_geo.length_inv();
+    auto indices = Utils::Vector3i{};
+    auto index = std::size_t(0u);
+
     /* compute electric field */
     // Eq. (3.49) @cite deserno00b
-    for (j[0] = 0; j[0] < p3m.fft.plan[3].new_mesh[0]; j[0]++) {
-      for (j[1] = 0; j[1] < p3m.fft.plan[3].new_mesh[1]; j[1]++) {
-        for (j[2] = 0; j[2] < p3m.fft.plan[3].new_mesh[2]; j[2]++) {
-          auto const rho_hat = std::complex<double>(p3m.rs_mesh[2 * ind + 0],
-                                                    p3m.rs_mesh[2 * ind + 1]);
-          auto const phi_hat = p3m.g_force[ind] * rho_hat;
-
-          for (int d = 0; d < 3; d++) {
-            /* direction in r-space: */
-            int d_rs = (d + p3m.ks_pnum) % 3;
-            /* directions */
-            auto const k = 2. * Utils::pi() *
-                           p3m.d_op[d_rs][j[d] + p3m.fft.plan[3].start[d]] *
-                           box_geo.length_inv()[d_rs];
-
-            /* i*k*(Re+i*Im) = - Im*k + i*Re*k     (i=sqrt(-1)) */
-            p3m.E_mesh[d_rs][2 * ind + 0] = -k * phi_hat.imag();
-            p3m.E_mesh[d_rs][2 * ind + 1] = +k * phi_hat.real();
-          }
-
-          ind++;
-        }
+    for_each_3d(mesh_start, mesh_stop, indices, [&]() {
+      auto const rho_hat =
+          std::complex<double>(p3m.mesh.rs_scalar[2u * index + 0u],
+                               p3m.mesh.rs_scalar[2u * index + 1u]);
+      auto const phi_hat = p3m.g_force[index] * rho_hat;
+
+      for (int d = 0; d < 3; d++) {
+        /* direction in r-space: */
+        int d_rs = (d + p3m.mesh.ks_pnum) % 3;
+        /* directions */
+        auto const k =
+            p3m.d_op[d_rs][indices[d] + offset[d]] * wavevector[d_rs];
+
+        /* i*k*(Re+i*Im) = - Im*k + i*Re*k     (i=sqrt(-1)) */
+        p3m.mesh.rs_fields[d_rs][2u * index + 0u] = -k * phi_hat.imag();
+        p3m.mesh.rs_fields[d_rs][2u * index + 1u] = +k * phi_hat.real();
       }
-    }
 
-    /* Back FFT force component mesh */
-    auto const check_complex = !p3m.params.tuning and check_complex_residuals;
-    for (int d = 0; d < 3; d++) {
-      fft_perform_back(p3m.E_mesh[d].data(), check_complex, p3m.fft, comm_cart);
-    }
+      ++index;
+    });
 
-    /* redistribute force component mesh */
-    std::array<double *, 3> E_fields = {
-        {p3m.E_mesh[0].data(), p3m.E_mesh[1].data(), p3m.E_mesh[2].data()}};
-    p3m.sm.spread_grid(Utils::make_span(E_fields), comm_cart,
-                       p3m.local_mesh.dim);
+    auto const check_residuals =
+        not p3m.params.tuning and check_complex_residuals;
+    p3m.fft->check_complex_residuals = check_residuals;
+    p3m.fft->perform_field_back_fft();
+    p3m.fft->check_complex_residuals = false;
 
     auto const force_prefac = prefactor / volume;
     Utils::integral_parameter<int, AssignForces, 1, 7>(
@@ -574,11 +538,13 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag,
   /* === k-space energy calculation  === */
   if (energy_flag or npt_flag) {
     auto node_energy = 0.;
-    for (int i = 0; i < p3m.fft.plan[3].new_size; i++) {
+    auto const mesh_length = Utils::product(p3m.mesh.size);
+    for (int i = 0; i < mesh_length; i++) {
       // Use the energy optimized influence function for energy!
       // Eq. (3.40) @cite deserno00b
-      node_energy += p3m.g_energy[i] * (Utils::sqr(p3m.rs_mesh[2 * i]) +
-                                        Utils::sqr(p3m.rs_mesh[2 * i + 1]));
+      node_energy +=
+          p3m.g_energy[i] * (Utils::sqr(p3m.mesh.rs_scalar[2 * i]) +
+                             Utils::sqr(p3m.mesh.rs_scalar[2 * i + 1]));
     }
     node_energy /= 2. * volume;
 
@@ -587,10 +553,10 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag,
     if (this_node == 0) {
       /* self energy correction */
       // Eq. (3.8) @cite deserno00b
-      energy -= p3m.sum_q2 * p3m.params.alpha * Utils::sqrt_pi_i();
+      energy -= p3m.sum_q2 * p3m.params.alpha * std::numbers::inv_sqrtpi;
       /* net charge correction */
       // Eq. (3.11) @cite deserno00b
-      energy -= p3m.square_sum_q * Utils::pi() /
+      energy -= p3m.square_sum_q * std::numbers::pi /
                 (2. * volume * Utils::sqr(p3m.params.alpha));
       /* dipole correction */
       // Eq. (3.9) @cite deserno00b
@@ -614,7 +580,7 @@ double CoulombP3M::long_range_kernel(bool force_flag, bool energy_flag,
 }
 
 class CoulombTuningAlgorithm : public TuningAlgorithm {
-  p3m_data_struct &p3m;
+  decltype(CoulombP3M::p3m) &p3m;
   double m_mesh_density_min = -1., m_mesh_density_max = -1.;
   // indicates if mesh should be tuned
   bool m_tune_mesh = false;
@@ -623,7 +589,7 @@ class CoulombTuningAlgorithm : public TuningAlgorithm {
   P3MParameters &get_params() override { return p3m.params; }
 
 public:
-  CoulombTuningAlgorithm(System::System &system, p3m_data_struct &input_p3m,
+  CoulombTuningAlgorithm(System::System &system, decltype(p3m) &input_p3m,
                          double prefactor, int timings)
       : TuningAlgorithm(system, prefactor, timings), p3m{input_p3m} {}
 
@@ -665,10 +631,10 @@ class CoulombTuningAlgorithm : public TuningAlgorithm {
     rs_err = p3m_real_space_error(m_prefactor, r_cut_iL, p3m.sum_qpart,
                                   p3m.sum_q2, 0., box_geo.length());
 
-    if (Utils::sqrt_2() * rs_err > p3m.params.accuracy) {
+    if (std::numbers::sqrt2 * rs_err > p3m.params.accuracy) {
       /* assume rs_err = ks_err -> rs_err = accuracy/sqrt(2.0) -> alpha_L */
-      alpha_L =
-          sqrt(log(Utils::sqrt_2() * rs_err / p3m.params.accuracy)) / r_cut_iL;
+      alpha_L = sqrt(log(std::numbers::sqrt2 * rs_err / p3m.params.accuracy)) /
+                r_cut_iL;
     } else {
       /* even alpha=0 is ok, however, we cannot choose it since it kills the
          k-space error formula.
@@ -682,8 +648,12 @@ class CoulombTuningAlgorithm : public TuningAlgorithm {
 #ifdef CUDA
     auto const &solver = m_system.coulomb.impl->solver;
     if (has_actor_of_type<CoulombP3MGPU>(solver)) {
-      ks_err = p3mgpu_k_space_error(m_prefactor, mesh, cao, p3m.sum_qpart,
-                                    p3m.sum_q2, alpha_L, box_geo.length());
+      if (this_node == 0) {
+        ks_err =
+            p3m_k_space_error_gpu(m_prefactor, mesh.data(), cao, p3m.sum_qpart,
+                                  p3m.sum_q2, alpha_L, box_geo.length().data());
+      }
+      boost::mpi::broadcast(comm_cart, ks_err, 0);
     } else
 #endif
       ks_err = p3m_k_space_error(m_prefactor, mesh, cao, p3m.sum_qpart,
@@ -714,7 +684,7 @@ class CoulombTuningAlgorithm : public TuningAlgorithm {
       assert(p3m.params.mesh[0] >= 1);
       if (p3m.params.mesh[1] == -1 and p3m.params.mesh[2] == -1) {
         // determine the two missing values by rescaling by the box length
-        for (int i : {1, 2}) {
+        for (auto i : {1, 2}) {
           p3m.params.mesh[i] =
               static_cast<int>(std::round(mesh_density * box_geo.length()[i]));
           // make the mesh even in all directions
@@ -734,7 +704,7 @@ class CoulombTuningAlgorithm : public TuningAlgorithm {
     while (mesh_density <= m_mesh_density_max) {
       auto trial_params = TuningAlgorithm::Parameters{};
       if (m_tune_mesh) {
-        for (int i : {0, 1, 2}) {
+        for (auto i : {0, 1, 2}) {
           trial_params.mesh[i] =
               static_cast<int>(std::round(box_geo.length()[i] * mesh_density));
           // make the mesh even in all directions
@@ -811,7 +781,7 @@ void CoulombP3M::sanity_checks_boxl() const {
   auto const &system = get_system();
   auto const &box_geo = *system.box_geo;
   auto const &local_geo = *system.local_geo;
-  for (unsigned int i = 0u; i < 3u; i++) {
+  for (auto i = 0u; i < 3u; i++) {
     /* check k-space cutoff */
     if (p3m.params.cao_cut[i] >= box_geo.length_half()[i]) {
       std::stringstream msg;
diff --git a/src/core/electrostatics/p3m.hpp b/src/core/electrostatics/p3m.hpp
index 05d7b28d3f4..de7bbdee9fa 100644
--- a/src/core/electrostatics/p3m.hpp
+++ b/src/core/electrostatics/p3m.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 The ESPResSo project
+ * Copyright (C) 2010-2024 The ESPResSo project
  * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
  *   Max-Planck-Institute for Polymer Research, Theory Group
  *
@@ -42,49 +42,37 @@
 
 #include "p3m/common.hpp"
 #include "p3m/data_struct.hpp"
-#include "p3m/fft.hpp"
 #include "p3m/interpolation.hpp"
 #include "p3m/send_mesh.hpp"
 
 #include "ParticleRange.hpp"
 
 #include <utils/Vector.hpp>
-#include <utils/constants.hpp>
 #include <utils/math/AS_erfc_part.hpp>
 
 #include <array>
 #include <cmath>
+#include <numbers>
+#include <utility>
 
-struct p3m_data_struct : public p3m_data_struct_base {
-  explicit p3m_data_struct(P3MParameters &&parameters)
-      : p3m_data_struct_base{std::move(parameters)} {}
-
-  /** local mesh. */
-  P3MLocalMesh local_mesh;
-  /** real space mesh (local) for CA/FFT. */
-  fft_vector<double> rs_mesh;
-  /** mesh (local) for the electric field. */
-  std::array<fft_vector<double>, 3> E_mesh;
-
-  /** number of charged particles (only on head node). */
-  int sum_qpart = 0;
-  /** Sum of square of charges (only on head node). */
-  double sum_q2 = 0.;
-  /** square of sum of charges (only on head node). */
-  double square_sum_q = 0.;
+/** @brief P3M solver. */
+struct CoulombP3M : public Coulomb::Actor<CoulombP3M> {
+  struct p3m_data_struct_impl : public p3m_data_struct {
+    explicit p3m_data_struct_impl(P3MParameters &&parameters)
+        : p3m_data_struct{std::move(parameters)} {}
 
-  p3m_interpolation_cache inter_weights;
+    /** number of charged particles (only on head node). */
+    int sum_qpart = 0;
+    /** Sum of square of charges (only on head node). */
+    double sum_q2 = 0.;
+    /** square of sum of charges (only on head node). */
+    double square_sum_q = 0.;
 
-  /** send/recv mesh sizes */
-  p3m_send_mesh sm;
+    p3m_interpolation_cache inter_weights;
+  };
 
-  fft_data_struct fft;
-};
-
-/** @brief P3M solver. */
-struct CoulombP3M : public Coulomb::Actor<CoulombP3M> {
   /** P3M parameters. */
-  p3m_data_struct p3m;
+  p3m_data_struct_impl p3m;
 
   int tune_timings;
   bool tune_verbose;
@@ -183,10 +171,11 @@ struct CoulombP3M : public Coulomb::Actor<CoulombP3M> {
     if ((q1q2 == 0.) || dist >= p3m.params.r_cut || dist <= 0.) {
       return {};
     }
-    auto const adist = p3m.params.alpha * dist;
+    auto const alpha = p3m.params.alpha;
+    auto const adist = alpha * dist;
     auto const exp_adist_sq = exp(-adist * adist);
     auto const dist_sq = dist * dist;
-    auto const two_a_sqrt_pi_i = 2.0 * p3m.params.alpha * Utils::sqrt_pi_i();
+    auto const two_a_sqrt_pi_i = 2. * alpha * std::numbers::inv_sqrtpi;
 #if USE_ERFC_APPROXIMATION
     auto const erfc_part_ri = Utils::AS_erfc_part(adist) / dist;
     auto const fac = exp_adist_sq * (erfc_part_ri + two_a_sqrt_pi_i) / dist_sq;
diff --git a/src/core/electrostatics/p3m_gpu.cpp b/src/core/electrostatics/p3m_gpu.cpp
index f8cf8abfe88..61b5aaadbe3 100644
--- a/src/core/electrostatics/p3m_gpu.cpp
+++ b/src/core/electrostatics/p3m_gpu.cpp
@@ -37,18 +37,6 @@
 
 #include "communication.hpp"
 
-#include <cassert>
-#include <limits>
-
-static auto get_n_part_safe(GpuParticleData const &gpu) {
-  auto const n_part = gpu.n_particles();
-#ifndef NDEBUG
-  auto constexpr n_part_max = std::numeric_limits<unsigned int>::max();
-  assert(n_part < static_cast<std::size_t>(n_part_max));
-#endif
-  return static_cast<unsigned int>(n_part);
-}
-
 void CoulombP3MGPU::add_long_range_forces(ParticleRange const &particles) {
 #ifdef NPT
   if (get_system().propagation->integ_switch == INTEG_METHOD_NPT_ISO) {
@@ -60,8 +48,7 @@ void CoulombP3MGPU::add_long_range_forces(ParticleRange const &particles) {
 #endif
   if (this_node == 0) {
     auto &gpu = get_system().gpu;
-    p3m_gpu_add_farfield_force(*m_gpu_data, gpu, prefactor,
-                               get_n_part_safe(gpu));
+    p3m_gpu_add_farfield_force(*m_gpu_data, gpu, prefactor, gpu.n_particles());
   }
 }
 
@@ -71,9 +58,8 @@ void CoulombP3MGPU::init() {
           system.coulomb.impl->solver)) {
     init_cpu_kernels();
   }
-  p3m_gpu_init(m_gpu_data, p3m.params.cao, p3m.params.mesh.data(),
-               p3m.params.alpha, system.box_geo->length(),
-               get_n_part_safe(system.gpu));
+  p3m_gpu_init(m_gpu_data, p3m.params.cao, p3m.params.mesh, p3m.params.alpha,
+               system.box_geo->length(), system.gpu.n_particles());
 }
 
 void CoulombP3MGPU::init_cpu_kernels() { CoulombP3M::init(); }
diff --git a/src/core/electrostatics/p3m_gpu_cuda.cu b/src/core/electrostatics/p3m_gpu_cuda.cu
index 949eb791e16..c734ba0cccd 100644
--- a/src/core/electrostatics/p3m_gpu_cuda.cu
+++ b/src/core/electrostatics/p3m_gpu_cuda.cu
@@ -30,7 +30,7 @@
 #ifdef ELECTROSTATICS
 
 #define P3M_GPU_FLOAT
-//#define P3M_GPU_REAL_DOUBLE
+// #define P3M_GPU_REAL_DOUBLE
 
 #ifdef P3M_GPU_FLOAT
 #define REAL_TYPE float
@@ -53,18 +53,21 @@
 #include "electrostatics/p3m_gpu_cuda.cuh"
 
 #include "cuda/utils.cuh"
+#include "p3m/math.hpp"
 #include "system/System.hpp"
 
 #include <utils/math/bspline.hpp>
 #include <utils/math/int_pow.hpp>
-#include <utils/math/sinc.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <cuda.h>
 #include <cufft.h>
 
-#include <cstdio>
-#include <cstdlib>
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <limits>
+#include <numbers>
 #include <stdexcept>
 
 #if defined(OMPI_MPI_H) || defined(_MPI_H)
@@ -90,7 +93,7 @@ struct P3MGpuData {
   /** Ewald parameter */
   REAL_TYPE alpha;
   /** Number of particles */
-  unsigned int n_part;
+  unsigned int n_part; // oddity: size_t causes UB with GCC 11.4 in Debug mode
   /** Box size */
   REAL_TYPE box[3];
   /** Mesh dimensions */
@@ -135,6 +138,31 @@ struct P3MGpuParams {
   }
 };
 
+/** Returns the pair (particles per block, number of blocks, at least 1). */
+static auto p3m_calc_blocks(unsigned int cao, std::size_t n_part) {
+  auto const cao3 = Utils::int_pow<3>(cao);
+  auto parts_per_block = 1u;
+  while ((parts_per_block + 1u) * cao3 <= 1024u)
+    ++parts_per_block;
+  auto const partial = (n_part % parts_per_block) != 0u; // extra block
+  auto const n_blocks =
+      std::max<std::size_t>(1u, n_part / parts_per_block + (partial ? 1u : 0u));
+  assert(n_blocks <= std::numeric_limits<unsigned>::max()); // before narrowing
+  return std::make_pair(parts_per_block, static_cast<unsigned>(n_blocks));
+}
+
+dim3 p3m_make_grid(unsigned int n_blocks) {
+  dim3 grid(n_blocks, 1u, 1u);
+  /* grow y until x = ceil(n_blocks / y) no longer exceeds 65536 */
+  while (grid.x > 65536u) {
+    ++grid.y;
+    auto const quotient = n_blocks / grid.y;
+    auto const is_exact = (n_blocks % grid.y) == 0u;
+    grid.x = is_exact ? std::max(1u, quotient) : quotient + 1u;
+  }
+  return grid;
+}
+
 template <int cao>
 __device__ void static Aliasing_sums_ik(const P3MGpuData p, int NX, int NY,
                                         int NZ, REAL_TYPE *Zaehler,
@@ -157,20 +185,20 @@ __device__ void static Aliasing_sums_ik(const P3MGpuData p, int NX, int NY,
   for (MX = -P3M_BRILLOUIN; MX <= P3M_BRILLOUIN; MX++) {
     NMX = static_cast<REAL_TYPE>(((NX > p.mesh[0] / 2) ? NX - p.mesh[0] : NX) +
                                  p.mesh[0] * MX);
-    S1 = int_pow<2 * cao>(Utils::sinc(Meshi[0] * NMX));
+    S1 = int_pow<2 * cao>(math::sinc(Meshi[0] * NMX));
     for (MY = -P3M_BRILLOUIN; MY <= P3M_BRILLOUIN; MY++) {
       NMY = static_cast<REAL_TYPE>(
           ((NY > p.mesh[1] / 2) ? NY - p.mesh[1] : NY) + p.mesh[1] * MY);
-      S2 = S1 * int_pow<2 * cao>(Utils::sinc(Meshi[1] * NMY));
+      S2 = S1 * int_pow<2 * cao>(math::sinc(Meshi[1] * NMY));
       for (MZ = -P3M_BRILLOUIN; MZ <= P3M_BRILLOUIN; MZ++) {
         NMZ = static_cast<REAL_TYPE>(
             ((NZ > p.mesh[2] / 2) ? NZ - p.mesh[2] : NZ) + p.mesh[2] * MZ);
-        S3 = S2 * int_pow<2 * cao>(Utils::sinc(Meshi[2] * NMZ));
+        S3 = S2 * int_pow<2 * cao>(math::sinc(Meshi[2] * NMZ));
 
         NM2 = sqr(NMX * Leni[0]) + sqr(NMY * Leni[1]) + sqr(NMZ * Leni[2]);
         *Nenner += S3;
 
-        TE = exp(-sqr(Utils::pi<REAL_TYPE>() / (p.alpha)) * NM2);
+        TE = exp(-sqr(std::numbers::pi_v<REAL_TYPE> / (p.alpha)) * NM2);
         zwi = S3 * TE / NM2;
         Zaehler[0] += NMX * zwi * Leni[0];
         Zaehler[1] += NMY * zwi * Leni[1];
@@ -193,7 +221,7 @@ __global__ void calculate_influence_function_device(const P3MGpuData p) {
   auto index = 0;
   REAL_TYPE Leni[3];
   for (int i = 0; i < 3; ++i) {
-    Leni[i] = 1.0f / p.box[i];
+    Leni[i] = REAL_TYPE{1} / p.box[i];
   }
 
   if ((NX >= p.mesh[0]) || (NY >= p.mesh[1]) || (NZ >= (p.mesh[2] / 2 + 1)))
@@ -216,7 +244,7 @@ __global__ void calculate_influence_function_device(const P3MGpuData p) {
           Dnz * Zaehler[2] * Leni[2];
     zwi /= ((sqr(Dnx * Leni[0]) + sqr(Dny * Leni[1]) + sqr(Dnz * Leni[2])) *
             sqr(Nenner));
-    p.G_hat[index] = 2 * zwi / Utils::pi<REAL_TYPE>();
+    p.G_hat[index] = REAL_TYPE{2} * zwi / std::numbers::pi_v<REAL_TYPE>;
   }
 }
 
@@ -260,8 +288,8 @@ __global__ void apply_diff_op(const P3MGpuData p) {
 
   const FFT_TYPE_COMPLEX meshw = p.charge_mesh[linear_index];
   FFT_TYPE_COMPLEX buf;
-  buf.x = -2.0f * Utils::pi<float>() * meshw.y;
-  buf.y = 2.0f * Utils::pi<float>() * meshw.x;
+  buf.x = REAL_TYPE(-2) * std::numbers::pi_v<REAL_TYPE> * meshw.y;
+  buf.y = REAL_TYPE(+2) * std::numbers::pi_v<REAL_TYPE> * meshw.x;
 
   p.force_mesh_x[linear_index].x =
       static_cast<decltype(FFT_TYPE_COMPLEX::x)>(nx) * buf.x / p.box[0];
@@ -363,36 +391,20 @@ void assign_charges(P3MGpuData const &params,
                     float const *const __restrict__ part_pos,
                     float const *const __restrict__ part_q) {
   auto const cao = static_cast<unsigned int>(params.cao);
-  auto const cao3 = int_pow<3>(cao);
-  unsigned int parts_per_block = 1u, n_blocks = 1u;
-
-  while ((parts_per_block + 1u) * cao3 <= 1024u) {
-    parts_per_block++;
-  }
-  if ((params.n_part % parts_per_block) == 0u)
-    n_blocks = std::max<unsigned>(1u, params.n_part / parts_per_block);
-  else
-    n_blocks = params.n_part / parts_per_block + 1u;
-
+  auto const [parts_per_block, n_blocks] = p3m_calc_blocks(cao, params.n_part);
   dim3 block(parts_per_block * cao, cao, cao);
-  dim3 grid(n_blocks, 1u, 1u);
-  while (grid.x > 65536u) {
-    grid.y++;
-    if ((n_blocks % grid.y) == 0u)
-      grid.x = std::max<unsigned>(1u, n_blocks / grid.y);
-    else
-      grid.x = n_blocks / grid.y + 1u;
-  }
+  dim3 grid = p3m_make_grid(n_blocks);
 
-  auto const data_length =
-      3 * static_cast<std::size_t>(parts_per_block * cao) * sizeof(REAL_TYPE);
-  switch (cao) {
+  auto const data_length = std::size_t(3u) *
+                           static_cast<std::size_t>(parts_per_block) *
+                           static_cast<std::size_t>(cao) * sizeof(REAL_TYPE);
+  switch (params.cao) {
   case 1:
-    (assign_charge_kernel<1, false>)<<<grid, block, 0, nullptr>>>(
+    (assign_charge_kernel<1, false>)<<<grid, block, std::size_t(0u), nullptr>>>(
         params, part_pos, part_q, parts_per_block);
     break;
   case 2:
-    (assign_charge_kernel<2, false>)<<<grid, block, 0, nullptr>>>(
+    (assign_charge_kernel<2, false>)<<<grid, block, std::size_t(0u), nullptr>>>(
         params, part_pos, part_q, parts_per_block);
     break;
   case 3:
@@ -434,16 +446,16 @@ __global__ void assign_forces_kernel(P3MGpuData const params,
   /* id of the particle */
   auto const id =
       parts_per_block * (blockIdx.x * gridDim.y + blockIdx.y) + part_in_block;
-  if (id >= params.n_part)
+  if (id >= static_cast<unsigned>(params.n_part))
     return;
   /* position relative to the closest grid point */
   REAL_TYPE m_pos[3];
   /* index of the nearest mesh point */
   int nmp_x, nmp_y, nmp_z;
 
-  m_pos[0] = part_pos[3 * id + 0] * params.hi[0] - params.pos_shift;
-  m_pos[1] = part_pos[3 * id + 1] * params.hi[1] - params.pos_shift;
-  m_pos[2] = part_pos[3 * id + 2] * params.hi[2] - params.pos_shift;
+  m_pos[0] = part_pos[3u * id + 0u] * params.hi[0] - params.pos_shift;
+  m_pos[1] = part_pos[3u * id + 1u] * params.hi[1] - params.pos_shift;
+  m_pos[2] = part_pos[3u * id + 2u] * params.hi[2] - params.pos_shift;
 
   nmp_x = static_cast<int>(floorf(m_pos[0] + REAL_TYPE{0.5}));
   nmp_y = static_cast<int>(floorf(m_pos[1] + REAL_TYPE{0.5}));
@@ -495,44 +507,23 @@ void assign_forces(P3MGpuData const &params,
                    float const *const __restrict__ part_q,
                    float *const __restrict__ part_f,
                    REAL_TYPE const prefactor) {
-  auto const cao = params.cao;
-  auto const cao3 = int_pow<3>(cao);
-  unsigned int parts_per_block = 1u, n_blocks = 1u;
-
-  while ((parts_per_block + 1u) * static_cast<unsigned int>(cao3) <= 1024u) {
-    parts_per_block++;
-  }
-
-  if ((params.n_part % parts_per_block) == 0u)
-    n_blocks = std::max<unsigned>(1u, params.n_part / parts_per_block);
-  else
-    n_blocks = params.n_part / parts_per_block + 1u;
-
-  dim3 block(parts_per_block * static_cast<unsigned int>(cao),
-             static_cast<unsigned int>(cao), static_cast<unsigned int>(cao));
-  dim3 grid(n_blocks, 1u, 1u);
-  while (grid.x > 65536u) {
-    grid.y++;
-    if (n_blocks % grid.y == 0u)
-      grid.x = std::max<unsigned>(1u, n_blocks / grid.y);
-    else
-      grid.x = n_blocks / grid.y + 1u;
-  }
+  auto const cao = static_cast<unsigned int>(params.cao);
+  auto const [parts_per_block, n_blocks] = p3m_calc_blocks(cao, params.n_part);
+  dim3 block(parts_per_block * cao, cao, cao);
+  dim3 grid = p3m_make_grid(n_blocks);
 
   /* Switch for assignment templates, the shared version only is faster for cao
    * > 2 */
-  auto const data_length =
-      3u *
-      static_cast<std::size_t>(parts_per_block *
-                               static_cast<unsigned int>(cao)) *
-      sizeof(float);
-  switch (cao) {
+  auto const data_length = std::size_t(3u) *
+                           static_cast<std::size_t>(parts_per_block) *
+                           static_cast<std::size_t>(cao) * sizeof(float);
+  switch (params.cao) {
   case 1:
-    (assign_forces_kernel<1, false>)<<<grid, block, 0, nullptr>>>(
+    (assign_forces_kernel<1, false>)<<<grid, block, std::size_t(0u), nullptr>>>(
         params, part_pos, part_q, part_f, prefactor, parts_per_block);
     break;
   case 2:
-    (assign_forces_kernel<2, false>)<<<grid, block, 0, nullptr>>>(
+    (assign_forces_kernel<2, false>)<<<grid, block, std::size_t(0u), nullptr>>>(
         params, part_pos, part_q, part_f, prefactor, parts_per_block);
     break;
   case 3:
@@ -561,16 +552,17 @@ void assign_forces(P3MGpuData const &params,
   cuda_check_errors_exit(block, grid, "assign_forces", __FILE__, __LINE__);
 }
 
-/* Init the internal data structures of the P3M GPU.
+/**
+ * @brief Initialize the internal data structure of the P3M GPU.
  * Mainly allocation on the device and influence function calculation.
- * Be advised: this needs mesh^3*5*sizeof(REAL_TYPE) of device memory.
+ * Be advised: this needs `mesh^3*5*sizeof(REAL_TYPE)` of device memory.
  * We use real to complex FFTs, so the size of the reciprocal mesh
- * is (cuFFT convention) Nx x Ny x [ Nz /2 + 1 ].
+ * is (cuFFT convention) `Nx * Ny * ( Nz /2 + 1 )`.
  */
 void p3m_gpu_init(std::shared_ptr<P3MGpuParams> &data, int cao,
-                  const int mesh[3], double alpha, Utils::Vector3d const &box_l,
-                  unsigned n_part) {
-  if (mesh[0] == -1 && mesh[1] == -1 && mesh[2] == -1)
+                  Utils::Vector3i const &mesh, double alpha,
+                  Utils::Vector3d const &box_l, std::size_t n_part) {
+  if (mesh == Utils::Vector3i::broadcast(-1))
     throw std::runtime_error("P3M: invalid mesh size");
 
   if (not data) {
@@ -579,7 +571,8 @@ void p3m_gpu_init(std::shared_ptr<P3MGpuParams> &data, int cao,
 
   auto &p3m_gpu_data = data->p3m_gpu_data;
   bool do_reinit = false, mesh_changed = false;
-  p3m_gpu_data.n_part = n_part;
+  assert(n_part <= std::numeric_limits<unsigned int>::max());
+  p3m_gpu_data.n_part = static_cast<unsigned>(n_part);
 
   if (not data->is_initialized or p3m_gpu_data.alpha != alpha) {
     p3m_gpu_data.alpha = static_cast<REAL_TYPE>(alpha);
@@ -593,15 +586,16 @@ void p3m_gpu_init(std::shared_ptr<P3MGpuParams> &data, int cao,
     do_reinit = true;
   }
 
-  if (not data->is_initialized or (p3m_gpu_data.mesh[0] != mesh[0]) or
-      (p3m_gpu_data.mesh[1] != mesh[1]) or (p3m_gpu_data.mesh[2] != mesh[2])) {
-    std::copy(mesh, mesh + 3, p3m_gpu_data.mesh);
+  if (not data->is_initialized or mesh != Utils::Vector3i(p3m_gpu_data.mesh)) {
+    std::copy(mesh.begin(), mesh.end(), p3m_gpu_data.mesh);
     mesh_changed = true;
     do_reinit = true;
   }
 
-  if (not data->is_initialized or (p3m_gpu_data.box[0] != box_l[0]) or
-      (p3m_gpu_data.box[1] != box_l[1]) or (p3m_gpu_data.box[2] != box_l[2])) {
+  if (auto constexpr eps =
+          static_cast<double>(std::numeric_limits<float>::epsilon());
+      not data->is_initialized or
+      (box_l - Utils::Vector3d(p3m_gpu_data.box)).norm() >= eps) {
     std::copy(box_l.begin(), box_l.end(), p3m_gpu_data.box);
     do_reinit = true;
   }
@@ -609,7 +603,7 @@ void p3m_gpu_init(std::shared_ptr<P3MGpuParams> &data, int cao,
   p3m_gpu_data.mesh_z_padded = (mesh[2] / 2 + 1) * 2;
   p3m_gpu_data.mesh_size = mesh[0] * mesh[1] * p3m_gpu_data.mesh_z_padded;
 
-  for (int i = 0; i < 3; i++) {
+  for (auto i = 0u; i < 3u; ++i) {
     p3m_gpu_data.hi[i] =
         static_cast<REAL_TYPE>(p3m_gpu_data.mesh[i]) / p3m_gpu_data.box[i];
   }
@@ -689,11 +683,11 @@ void p3m_gpu_init(std::shared_ptr<P3MGpuParams> &data, int cao,
  *  \brief The long-range part of the P3M algorithm.
  */
 void p3m_gpu_add_farfield_force(P3MGpuParams &data, GpuParticleData &gpu,
-                                double prefactor, unsigned n_part) {
+                                double prefactor, std::size_t n_part) {
   auto &p3m_gpu_data = data.p3m_gpu_data;
-  p3m_gpu_data.n_part = n_part;
+  p3m_gpu_data.n_part = static_cast<unsigned>(n_part);
 
-  if (p3m_gpu_data.n_part == 0u)
+  if (n_part == 0u)
     return;
 
   auto const positions_device = gpu.get_particle_positions_device();
@@ -719,8 +713,7 @@ void p3m_gpu_add_farfield_force(P3MGpuParams &data, GpuParticleData &gpu,
   if (FFT_FORW_FFT(data.p3m_fft.forw_plan,
                    (REAL_TYPE *)p3m_gpu_data.charge_mesh,
                    p3m_gpu_data.charge_mesh) != CUFFT_SUCCESS) {
-    fprintf(stderr, "CUFFT error: Forward FFT failed\n");
-    return;
+    throw std::runtime_error("CUFFT error: Forward FFT failed");
   }
 
   /* Do convolution */
diff --git a/src/core/electrostatics/p3m_gpu_cuda.cuh b/src/core/electrostatics/p3m_gpu_cuda.cuh
index 6303b62e393..da2a7cd8202 100644
--- a/src/core/electrostatics/p3m_gpu_cuda.cuh
+++ b/src/core/electrostatics/p3m_gpu_cuda.cuh
@@ -23,12 +23,13 @@
 
 #include <utils/Vector.hpp>
 
+#include <cstddef>
 #include <memory>
 
 struct P3MGpuParams;
 
 void p3m_gpu_init(std::shared_ptr<P3MGpuParams> &p3m_gpu_data_ptr, int cao,
-                  const int mesh[3], double alpha, Utils::Vector3d const &box_l,
-                  unsigned n_part);
+                  Utils::Vector3i const &mesh, double alpha,
+                  Utils::Vector3d const &box_l, std::size_t n_part);
 void p3m_gpu_add_farfield_force(P3MGpuParams &data, GpuParticleData &gpu,
-                                double prefactor, unsigned n_part);
+                                double prefactor, std::size_t n_part);
diff --git a/src/core/electrostatics/p3m_gpu_error_cuda.cu b/src/core/electrostatics/p3m_gpu_error_cuda.cu
index c279fe54625..fbc51fecf4a 100644
--- a/src/core/electrostatics/p3m_gpu_error_cuda.cu
+++ b/src/core/electrostatics/p3m_gpu_error_cuda.cu
@@ -23,12 +23,12 @@
  *  The corresponding header file is p3m_gpu_error.hpp.
  */
 
+#include "p3m/math.hpp"
 #include "p3m_gpu_error.hpp"
 
 #include "cuda/utils.cuh"
 
 #include <utils/math/int_pow.hpp>
-#include <utils/math/sinc.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <thrust/device_vector.h>
@@ -38,54 +38,18 @@
 
 #include <algorithm>
 #include <cstddef>
+#include <numbers>
 
 #if defined(OMPI_MPI_H) || defined(_MPI_H)
 #error CU-file includes mpi.h! This should not happen!
 #endif
 
-using Utils::int_pow;
-using Utils::sqr;
-
-/** @todo Extend to higher order. This comes from some 1/sin expansion in
- *  @cite hockney88a
- */
-
-template <int cao>
-__device__ static double p3m_analytic_cotangent_sum(int n, double mesh_i) {
-  const double c = sqr(cos(Utils::pi() * mesh_i * n));
-
-  switch (cao) {
-  case 1:
-    return 1;
-  case 2:
-    return (1.0 + c * 2.0) / 3.0;
-  case 3:
-    return (2.0 + c * (11.0 + c * 2.0)) / 15.0;
-  case 4:
-    return (17.0 + c * (180.0 + c * (114.0 + c * 4.0))) / 315.0;
-  case 5:
-    return (62.0 + c * (1072.0 + c * (1452.0 + c * (247.0 + c * 2.0)))) /
-           2835.0;
-  case 6:
-    return (1382.0 +
-            c * (35396.0 +
-                 c * (83021.0 + c * (34096.0 + c * (2026.0 + c * 4.0))))) /
-           155925.0;
-  case 7:
-    return (21844.0 +
-            c * (776661.0 +
-                 c * (2801040.0 +
-                      c * (2123860.0 +
-                           c * (349500.0 + c * (8166.0 + c * 4.0)))))) /
-           6081075.0;
-  }
-
-  return 0.0;
-}
-
 template <int cao>
 __global__ void p3m_k_space_error_gpu_kernel_ik(int3 mesh, double3 meshi,
                                                 double alpha_L, double *he_q) {
+  using Utils::int_pow;
+  using Utils::sqr;
+
   const int nx =
       -mesh.x / 2 + static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
   const int ny =
@@ -102,197 +66,21 @@ __global__ void p3m_k_space_error_gpu_kernel_ik(int3 mesh, double3 meshi,
   if ((nx != 0) || (ny != 0) || (nz != 0)) {
     const double alpha_L_i = 1. / alpha_L;
     const double n2 = sqr(nx) + sqr(ny) + sqr(nz);
-    const double cs = p3m_analytic_cotangent_sum<cao>(nz, meshi.z) *
-                      p3m_analytic_cotangent_sum<cao>(nx, meshi.x) *
-                      p3m_analytic_cotangent_sum<cao>(ny, meshi.y);
-    const double ex = exp(-sqr(Utils::pi() * alpha_L_i) * n2);
+    const double cs = math::analytic_cotangent_sum<cao>(nz, meshi.z) *
+                      math::analytic_cotangent_sum<cao>(nx, meshi.x) *
+                      math::analytic_cotangent_sum<cao>(ny, meshi.y);
+    const double ex = exp(-sqr(std::numbers::pi * alpha_L_i) * n2);
     const double ex2 = sqr(ex);
     const double U2 =
-        int_pow<2 * cao>(Utils::sinc(meshi.x * nx) * Utils::sinc(meshi.y * ny) *
-                         Utils::sinc(meshi.z * nz));
+        int_pow<2 * cao>(math::sinc(meshi.x * nx) * math::sinc(meshi.y * ny) *
+                         math::sinc(meshi.z * nz));
     auto const alias1 = ex2 / n2;
     auto const d = alias1 - sqr(U2 * ex / cs) / n2;
 
-    if (d > 0 && (d / alias1 > ROUND_ERROR_PREC))
+    if (d > 0. and (d / alias1 > ROUND_ERROR_PREC))
       he_q[lind] = d;
   } else {
-    he_q[lind] = 0;
-  }
-}
-
-__global__ void p3m_k_space_error_gpu_kernel_ad(const int3 mesh,
-                                                const double3 meshi, int cao,
-                                                double alpha_L, double *he_q) {
-  auto const nx =
-      -mesh.x / 2 + static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
-  auto const ny =
-      -mesh.y / 2 + static_cast<int>(blockDim.y * blockIdx.y + threadIdx.y);
-  auto const nz =
-      -mesh.z / 2 + static_cast<int>(blockDim.z * blockIdx.z + threadIdx.z);
-
-  if ((nx >= mesh.x / 2) || (ny >= mesh.y / 2) || (nz >= mesh.z / 2))
-    return;
-
-  int lind = ((nx + mesh.x / 2) * mesh.y * mesh.z + (ny + mesh.y / 2) * mesh.z +
-              (nz + mesh.z / 2));
-
-  auto const alpha_L_i = 1. / alpha_L;
-  double alias1, alias2, alias3, alias4;
-  alias1 = alias2 = alias3 = alias4 = 0;
-
-  if ((nx != 0) || (ny != 0) || (nz != 0)) {
-    for (int mx = -1; mx <= 1; mx++) {
-      auto const nmx = static_cast<double>(nx + mx * mesh.x);
-      for (int my = -1; my <= 1; my++) {
-        auto const nmy = static_cast<double>(ny + my * mesh.y);
-        for (int mz = -1; mz <= 1; mz++) {
-          auto const nmz = static_cast<double>(nz + mz * mesh.z);
-
-          auto const n2 = static_cast<double>(sqr(nmx) + sqr(nmy) + sqr(nmz));
-          auto const ex = exp(-sqr(Utils::pi() * alpha_L_i) * n2);
-          auto const ex2 = sqr(ex);
-          auto const U2 =
-              pow(Utils::sinc(meshi.x * nmx) * Utils::sinc(meshi.y * nmy) *
-                      Utils::sinc(meshi.z * nmz),
-                  2.0 * cao);
-
-          alias1 += ex2 / n2;
-          alias2 += U2 * ex;
-          alias3 += U2 * n2;
-          alias4 += U2;
-        }
-      }
-    }
-
-    if ((alias3 == 0.0) || (alias4 == 0.0))
-      he_q[lind] = 0;
-    else
-      he_q[lind] = alias1 - (alias2 * alias2) / (alias3 * alias4);
-
-  } else {
-    he_q[lind] = 0;
-  }
-}
-
-__global__ void p3m_k_space_error_gpu_kernel_ik_i(const int3 mesh,
-                                                  const double3 meshi, int cao,
-                                                  double alpha_L,
-                                                  double *he_q) {
-
-  auto const nx =
-      -mesh.x / 2 + static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
-  auto const ny =
-      -mesh.y / 2 + static_cast<int>(blockDim.y * blockIdx.y + threadIdx.y);
-  auto const nz =
-      -mesh.z / 2 + static_cast<int>(blockDim.z * blockIdx.z + threadIdx.z);
-
-  if ((nx >= mesh.x / 2) || (ny >= mesh.y / 2) || (nz >= mesh.z / 2))
-    return;
-
-  int lind = ((nx + mesh.x / 2) * mesh.y * mesh.z + (ny + mesh.y / 2) * mesh.z +
-              (nz + mesh.z / 2));
-
-  auto const alpha_L_i = 1. / alpha_L;
-  double alias1, alias2, alias3, alias4;
-  alias1 = alias2 = alias3 = alias4 = 0;
-
-  if ((nx != 0) || (ny != 0) || (nz != 0)) {
-    for (int mx = -1; mx <= 1; mx++) {
-      auto const nmx = static_cast<double>(nx + mx * mesh.x);
-      for (int my = -1; my <= 1; my++) {
-        auto const nmy = static_cast<double>(ny + my * mesh.y);
-        for (int mz = -1; mz <= 1; mz++) {
-          auto const nmz = static_cast<double>(nz + mz * mesh.z);
-
-          auto const n2 = static_cast<double>(sqr(nmx) + sqr(nmy) + sqr(nmz));
-          auto const ex = exp(-sqr(Utils::pi() * alpha_L_i) * n2);
-          auto const ex2 = sqr(ex);
-          auto const U2 =
-              pow(Utils::sinc(meshi.x * nmx) * Utils::sinc(meshi.y * nmy) *
-                      Utils::sinc(meshi.z * nmz),
-                  2.0 * cao);
-
-          alias1 += ex2 / n2;
-          alias2 += U2 * ex * (nx * nmx + ny * nmy + nz * nmz) / n2;
-          alias3 += U2;
-
-          if (((mx + my + mz) % 2) == 0) { // consider only even terms!
-            alias4 += U2;
-          } else {
-            alias4 -= U2;
-          }
-        }
-      }
-    }
-
-    he_q[lind] =
-        alias1 - (alias2 * alias2) / (0.5 * (nx * nx + ny * ny + nz * nz) *
-                                      (alias3 * alias3 + alias4 * alias4));
-
-  } else {
-    he_q[lind] = 0;
-  }
-}
-
-__global__ void p3m_k_space_error_gpu_kernel_ad_i(const int3 mesh,
-                                                  const double3 meshi, int cao,
-                                                  double alpha_L,
-                                                  double *he_q) {
-
-  auto const nx =
-      -mesh.x / 2 + static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
-  auto const ny =
-      -mesh.y / 2 + static_cast<int>(blockDim.y * blockIdx.y + threadIdx.y);
-  auto const nz =
-      -mesh.z / 2 + static_cast<int>(blockDim.z * blockIdx.z + threadIdx.z);
-
-  if ((nx >= mesh.x / 2) || (ny >= mesh.y / 2) || (nz >= mesh.z / 2))
-    return;
-
-  int lind = ((nx + mesh.x / 2) * mesh.y * mesh.z + (ny + mesh.y / 2) * mesh.z +
-              (nz + mesh.z / 2));
-
-  auto const alpha_L_i = 1. / alpha_L;
-  double alias1, alias2, alias3, alias4, alias5, alias6;
-  alias1 = alias2 = alias3 = alias4 = alias5 = alias6 = 0;
-
-  if ((nx != 0) && (ny != 0) && (nz != 0)) {
-    for (int mx = -1; mx <= 1; mx++) {
-      auto const nmx = static_cast<double>(nx + mx * mesh.x);
-      for (int my = -1; my <= 1; my++) {
-        auto const nmy = static_cast<double>(ny + my * mesh.y);
-        for (int mz = -1; mz <= 1; mz++) {
-          auto const nmz = static_cast<double>(nz + mz * mesh.z);
-
-          auto const n2 = sqr(nmx) + sqr(nmy) + sqr(nmz);
-          auto const ex = exp(-sqr(Utils::pi() * alpha_L_i) * n2);
-          auto const ex2 = sqr(ex);
-          auto const U2 =
-              pow(Utils::sinc(meshi.x * nmx) * Utils::sinc(meshi.y * nmy) *
-                      Utils::sinc(meshi.z * nmz),
-                  2.0 * cao);
-
-          alias1 += ex2 / n2;
-          alias2 += U2 * ex;
-          alias3 += U2 * n2;
-          alias4 += U2;
-
-          if (((mx + my + mz) % 2) == 0) { // even term
-            alias5 += U2 * n2;
-            alias6 += U2;
-          } else { // odd term: minus sign!
-            alias5 -= U2 * n2;
-            alias6 -= U2;
-          }
-        }
-      }
-    }
-
-    he_q[lind] =
-        (alias1 - sqr(alias2) / (0.5 * (alias3 * alias4 + alias5 * alias6)));
-
-  } else {
-    he_q[lind] = 0;
+    he_q[lind] = 0.;
   }
 }
 
@@ -351,5 +139,5 @@ double p3m_k_space_error_gpu(double prefactor, const int *mesh, int cao,
 
   auto const he_q_final = thrust::reduce(he_q.begin(), he_q.end());
 
-  return 2 * prefactor * sum_q2 * sqrt(he_q_final / npart) / (box[1] * box[2]);
+  return 2. * prefactor * sum_q2 * sqrt(he_q_final / npart) / (box[1] * box[2]);
 }
diff --git a/src/core/electrostatics/scafacos_impl.cpp b/src/core/electrostatics/scafacos_impl.cpp
index 871fd74d58f..0700bb14ba9 100644
--- a/src/core/electrostatics/scafacos_impl.cpp
+++ b/src/core/electrostatics/scafacos_impl.cpp
@@ -33,15 +33,14 @@
 #include "system/System.hpp"
 #include "tuning.hpp"
 
-#include <utils/Span.hpp>
 #include <utils/Vector.hpp>
 
-#include <boost/range/algorithm/min_element.hpp>
-
 #include <algorithm>
 #include <cassert>
 #include <cmath>
+#include <iterator>
 #include <limits>
+#include <span>
 #include <string>
 
 std::shared_ptr<CoulombScafacos>
@@ -76,8 +75,8 @@ void CoulombScafacosImpl::update_particle_forces() const {
   auto it_fields = fields.begin();
   for (auto &p : cell_structure.local_particles()) {
     p.force() += prefactor * p.q() *
-                 Utils::Vector3d(Utils::Span<const double>(&*it_fields, 3));
-    it_fields += 3;
+                 Utils::Vector3d(std::span<const double>(&*it_fields, 3ul));
+    std::advance(it_fields, 3);
   }
 
   /* Check that the particle number did not change */
@@ -97,8 +96,8 @@ void CoulombScafacosImpl::tune_r_cut() {
   auto const &local_geo = *system.local_geo;
   auto const verlet_skin = system.cell_structure->get_verlet_skin();
 
-  auto const min_box_l = *boost::min_element(box_geo.length());
-  auto const min_local_box_l = *boost::min_element(local_geo.length());
+  auto const min_box_l = std::ranges::min(box_geo.length());
+  auto const min_local_box_l = std::ranges::min(local_geo.length());
 
   /* The bisection code breaks down when r_min < 1 for several methods
    * (e.g. p2nfft, p3m, ewald) if the mesh size is not fixed (ScaFaCoS
diff --git a/src/core/electrostatics/specfunc.cpp b/src/core/electrostatics/specfunc.cpp
index 698995b8129..ec9d70f1cca 100644
--- a/src/core/electrostatics/specfunc.cpp
+++ b/src/core/electrostatics/specfunc.cpp
@@ -47,10 +47,10 @@
 
 #include "specfunc.hpp"
 
-#include <utils/constants.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <cmath>
+#include <numbers>
 #include <tuple>
 #include <utility>
 
@@ -251,7 +251,7 @@ double K0(double x) {
   if (x <= 2.0) {
     auto const c = evaluateAsChebychevSeriesAt(bk0_cs, 0.5 * x * x - 1.0);
     auto const i0 = evaluateAsChebychevSeriesAt(bi0_cs, x * x / 4.5 - 1.0);
-    return (-std::log(x) + Utils::ln_2()) * i0 + c;
+    return (-std::log(x) + std::numbers::ln2) * i0 + c;
   }
   auto const c =
       (x <= 8.0) ? evaluateAsChebychevSeriesAt(ak0_cs, (16.0 / x - 5.0) / 3.0)
@@ -263,7 +263,7 @@ double K1(double x) {
   if (x <= 2.0) {
     auto const c = evaluateAsChebychevSeriesAt(bk1_cs, 0.5 * x * x - 1.0);
     auto const i1 = x * evaluateAsChebychevSeriesAt(bi1_cs, x * x / 4.5 - 1.0);
-    return (std::log(x) - Utils::ln_2()) * i1 + c / x;
+    return (std::log(x) - std::numbers::ln2) * i1 + c / x;
   }
   auto const c =
       (x <= 8.0) ? evaluateAsChebychevSeriesAt(ak1_cs, (16.0 / x - 5.0) / 3.0)
@@ -328,7 +328,7 @@ double LPK0(double x) {
       d0 = x2 * d0 - dd0 + bi0_cs[j];
       dd0 = tmp0;
     }
-    auto const tmp = std::log(x) - Utils::ln_2();
+    auto const tmp = std::log(x) - std::numbers::ln2;
     auto const ret = -tmp * (0.5 * (bi0_cs[0] + x2 * d0) - dd0);
 
     /* K0/K1 correction */
@@ -388,7 +388,7 @@ double LPK1(double x) {
       d1 = x2 * d1 - dd1 + bi1_cs[j];
       dd1 = tmp1;
     }
-    auto const tmp = std::log(x) - Utils::ln_2();
+    auto const tmp = std::log(x) - std::numbers::ln2;
     auto const ret = x * tmp * (0.5 * (bi1_cs[0] + x2 * d1) - dd1);
 
     /* K0/K1 correction */
@@ -464,7 +464,7 @@ std::pair<double, double> LPK01(double x) {
       dd0 = tmp0;
       dd1 = tmp1;
     }
-    auto const tmp = std::log(x) - Utils::ln_2();
+    auto const tmp = std::log(x) - std::numbers::ln2;
     auto k0 = -tmp * (0.5 * (bi0_cs[0] + x2 * d0) - dd0);
     auto k1 = x * tmp * (0.5 * (bi1_cs[0] + x2 * d1) - dd1);
 
diff --git a/src/core/electrostatics/specfunc.cuh b/src/core/electrostatics/specfunc.cuh
deleted file mode 100644
index c256d86ebfe..00000000000
--- a/src/core/electrostatics/specfunc.cuh
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Copyright (C) 2014-2022 The ESPResSo project
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/* Original gsl header
- * specfunc/bessel_K0.cpp
- *
- * Copyright (C) 1996, 1997, 1998, 1999, 2000 Gerard Jungman
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-/* Original Author: G. Jungman */
-
-/** @file
- *  This file contains implementations for the modified Bessel functions of
- *  first and second kind. The implementations are based on the GSL code (see
- *  the original GSL header above) and are duplicated from \ref specfunc.cpp.
- */
-
-#pragma once
-
-#include "config/config.hpp"
-
-#include <utils/constants.hpp>
-
-/** @name Chebyshev expansions based on SLATEC bk0(), bk0e() */
-/**@{*/
-__constant__ static float bk0_data[11] = {
-    -.5f - 0.03532739323390276872f, 0.3442898999246284869f,
-    0.03597993651536150163f,        0.00126461541144692592f,
-    0.00002286212103119451f,        0.00000025347910790261f,
-    0.00000000190451637722f,        0.00000000001034969525f,
-    0.00000000000004259816f,        0.00000000000000013744f,
-    0.00000000000000000035f};
-__constant__ static int bk0_size = 11;
-
-__constant__ static float ak0_data[17] = {
-    2.5f - 0.07643947903327941f, -0.02235652605699819f, 0.00077341811546938f,
-    -0.00004281006688886f,       0.00000308170017386f,  -0.00000026393672220f,
-    0.00000002563713036f,        -0.00000000274270554f, 0.00000000031694296f,
-    -0.00000000003902353f,       0.00000000000506804f,  -0.00000000000068895f,
-    0.00000000000009744f,        -0.00000000000001427f, 0.00000000000000215f,
-    -0.00000000000000033f,       0.00000000000000005f};
-__constant__ static int ak0_size = 16;
-
-__constant__ static float ak02_data[14] = {
-    2.5f - 0.01201869826307592f, -0.00917485269102569f, 0.00014445509317750f,
-    -0.00000401361417543f,       0.00000015678318108f,  -0.00000000777011043f,
-    0.00000000046111825f,        -0.00000000003158592f, 0.00000000000243501f,
-    -0.00000000000020743f,       0.00000000000001925f,  -0.00000000000000192f,
-    0.00000000000000020f,        -0.00000000000000002f};
-__constant__ static int ak02_size = 13;
-/**@}*/
-
-/** @name Chebyshev expansions based on SLATEC besi0() */
-/**@{*/
-__constant__ static float bi0_data[12] = {
-    5.5f - .07660547252839144951f, 1.92733795399380827000f,
-    .22826445869203013390f,        .01304891466707290428f,
-    .00043442709008164874f,        .00000942265768600193f,
-    .00000014340062895106f,        .00000000161384906966f,
-    .00000000001396650044f,        .00000000000009579451f,
-    .00000000000000053339f,        .00000000000000000245f};
-__constant__ static int bi0_size = 12;
-/**@}*/
-
-/** @name Chebyshev expansions based on SLATEC besk1(), besk1e() */
-/**@{*/
-__constant__ static float bk1_data[11] = {
-    1.5f + 0.0253002273389477705f, -0.3531559607765448760f,
-    -0.1226111808226571480f,       -0.0069757238596398643f,
-    -0.0001730288957513052f,       -0.0000024334061415659f,
-    -0.0000000221338763073f,       -0.0000000001411488392f,
-    -0.0000000000006666901f,       -0.0000000000000024274f,
-    -0.0000000000000000070f};
-__constant__ static int bk1_size = 11;
-
-__constant__ static float ak1_data[17] = {
-    2.5f + 0.27443134069738830f, 0.07571989953199368f,  -0.00144105155647540f,
-    0.00006650116955125f,        -0.00000436998470952f, 0.00000035402774997f,
-    -0.00000003311163779f,       0.00000000344597758f,  -0.00000000038989323f,
-    0.00000000004720819f,        -0.00000000000604783f, 0.00000000000081284f,
-    -0.00000000000011386f,       0.00000000000001654f,  -0.00000000000000248f,
-    0.00000000000000038f,        -0.00000000000000006f};
-__constant__ static int ak1_size = 17;
-
-__constant__ static float ak12_data[14] = {
-    2.5f + 0.06379308343739001f, 0.02832887813049721f,  -0.00024753706739052f,
-    0.00000577197245160f,        -0.00000020689392195f, 0.00000000973998344f,
-    -0.00000000055853361f,       0.00000000003732996f,  -0.00000000000282505f,
-    0.00000000000023720f,        -0.00000000000002176f, 0.00000000000000215f,
-    -0.00000000000000022f,       0.00000000000000002f};
-__constant__ static int ak12_size = 14;
-/**@}*/
-
-/** @name Chebyshev expansions based on SLATEC besi1(), besi1e() */
-/**@{*/
-__constant__ static float bi1_data[11] = {
-    1.75f - 0.001971713261099859f, 0.407348876675464810f, 0.034838994299959456f,
-    0.001545394556300123f,         0.000041888521098377f, 0.000000764902676483f,
-    0.000000010042493924f,         0.000000000099322077f, 0.000000000000766380f,
-    0.000000000000004741f,         0.000000000000000024f};
-__constant__ static int bi1_size = 11;
-/**@}*/
-
-__device__ float evaluateAsChebychevSeriesAt(float const *c, int n, float x) {
-  auto const x2 = 2.0f * x;
-  auto dd = c[n - 1];
-  auto d = x2 * dd + c[n - 2];
-  for (int j = n - 3; j >= 1; j--) {
-    auto const tmp = d;
-    d = x2 * d - dd + c[j];
-    dd = tmp;
-  }
-  return x * d - dd + c[0] / 2.0f;
-}
-
-__device__ float evaluateAsTaylorSeriesAt(float const *c, int n, float x) {
-  int cnt = n - 1;
-  auto r = c[cnt];
-  while (--cnt >= 0)
-    r = r * x + c[cnt];
-  return r;
-}
-
-__device__ float dev_K0(float x) {
-  auto const c =
-      evaluateAsChebychevSeriesAt(x <= 2.0f   ? bk0_data
-                                  : x <= 8.0f ? ak0_data
-                                              : ak02_data,
-                                  x <= 2.0f   ? bk0_size
-                                  : x <= 8.0f ? ak0_size
-                                              : ak02_size,
-                                  x <= 2.0f   ? x * x / 2.0f - 1.0f
-                                  : x <= 8.0f ? (16.0f / x - 5.0f) / 3.0f
-                                              : (16.0f / x - 1.0f));
-  if (x <= 2.0f) {
-    auto const I0 =
-        evaluateAsChebychevSeriesAt(bi0_data, bi0_size, x * x / 4.5f - 1.0f);
-    return (-log(x) + Utils::ln_2<float>()) * I0 + c;
-  }
-  return exp(-x) * c * rsqrt(x);
-}
-
-__device__ float dev_K1(float x) {
-  auto const c =
-      evaluateAsChebychevSeriesAt(x <= 2.0f   ? bk1_data
-                                  : x <= 8.0f ? ak1_data
-                                              : ak12_data,
-                                  x <= 2.0f   ? bk1_size
-                                  : x <= 8.0f ? ak1_size
-                                              : ak12_size,
-                                  x <= 2.0f   ? x * x / 2.0f - 1.0f
-                                  : x <= 8.0f ? (16.0f / x - 5.0f) / 3.0f
-                                              : (16.0f / x - 1.0f));
-  if (x <= 2.0f) {
-    auto const I1 = x * evaluateAsChebychevSeriesAt(bi1_data, bi1_size,
-                                                    x * x / 4.5f - 1.0f);
-    return (log(x) - Utils::ln_2<float>()) * I1 + c / x;
-  }
-  return exp(-x) * c * rsqrt(x);
-}
diff --git a/src/core/electrostatics/specfunc.hpp b/src/core/electrostatics/specfunc.hpp
index 5356f23c470..f0f2f6aefc6 100644
--- a/src/core/electrostatics/specfunc.hpp
+++ b/src/core/electrostatics/specfunc.hpp
@@ -36,9 +36,9 @@
 
 #pragma once
 
-#include <utils/Span.hpp>
-
 #include <cassert>
+#include <numeric>
+#include <span>
 #include <utility>
 
 /** Hurwitz zeta function. This function was taken from the GSL code. */
@@ -89,21 +89,19 @@ std::pair<double, double> LPK01(double x);
 /** Evaluate the polynomial interpreted as a Taylor series via the
  *  Horner scheme.
  */
-inline double evaluateAsTaylorSeriesAt(Utils::Span<const double> series,
+inline double evaluateAsTaylorSeriesAt(std::span<const double> series,
                                        double x) {
   assert(not series.empty());
-  auto cnt = static_cast<int>(series.size()) - 1;
-  auto const *c = series.data();
-  auto r = c[cnt];
-  while (--cnt >= 0)
-    r = r * x + c[cnt];
-  return r;
+  // Seed with the leading coefficient: a 0. seed gives NaN for infinite x.
+  return std::accumulate(
+      series.rbegin() + 1, series.rend(), series.back(),
+      [x](double acc, double coeff) { return acc * x + coeff; });
 }
 
 /** Evaluate the polynomial interpreted as a Chebychev series. Requires a
  *  series with at least three coefficients, i.e. no linear approximations!
  */
-inline double evaluateAsChebychevSeriesAt(Utils::Span<const double> series,
+inline double evaluateAsChebychevSeriesAt(std::span<const double> series,
                                           double x) {
   assert(series.size() >= 3);
 
diff --git a/src/core/energy.cpp b/src/core/energy.cpp
index 3f8e21f3271..6bbbf823253 100644
--- a/src/core/energy.cpp
+++ b/src/core/energy.cpp
@@ -31,9 +31,8 @@
 #include "electrostatics/coulomb.hpp"
 #include "magnetostatics/dipoles.hpp"
 
-#include <utils/Span.hpp>
-
 #include <memory>
+#include <span>
 
 namespace System {
 
@@ -65,12 +64,12 @@ std::shared_ptr<Observable_stat> System::calculate_energy() {
 
   short_range_loop(
       [this, coulomb_kernel_ptr = get_ptr(coulomb_kernel), &obs_energy](
-          Particle const &p1, int bond_id, Utils::Span<Particle *> partners) {
+          Particle const &p1, int bond_id, std::span<Particle *> partners) {
         auto const &iaparams = *bonded_ia_params.at(bond_id);
         auto const result = calc_bonded_energy(iaparams, p1, partners, *box_geo,
                                                coulomb_kernel_ptr);
         if (result) {
-          obs_energy.bonded_contribution(bond_id)[0] += result.get();
+          obs_energy.bonded_contribution(bond_id)[0] += result.value();
           return false;
         }
         return true;
diff --git a/src/core/energy_inline.hpp b/src/core/energy_inline.hpp
index 68859ac6b76..5736118405d 100644
--- a/src/core/energy_inline.hpp
+++ b/src/core/energy_inline.hpp
@@ -55,13 +55,14 @@
 #include "errorhandling.hpp"
 #include "exclusions.hpp"
 
-#include <utils/Span.hpp>
 #include <utils/Vector.hpp>
 
-#include <boost/optional.hpp>
-#include <boost/range/algorithm/find_if.hpp>
 #include <boost/variant.hpp>
 
+#include <optional>
+#include <span>
+#include <string>
+
 /** Calculate non-bonded energies between a pair of particles.
  *  @param p1         particle 1.
  *  @param p2         particle 2.
@@ -200,9 +201,9 @@ inline void add_non_bonded_pair_energy(
 #endif
 }
 
-inline boost::optional<double>
+inline std::optional<double>
 calc_bonded_energy(Bonded_IA_Parameters const &iaparams, Particle const &p1,
-                   Utils::Span<Particle *> partners, BoxGeometry const &box_geo,
+                   std::span<Particle *> partners, BoxGeometry const &box_geo,
                    Coulomb::ShortRangeEnergyKernel::kernel_type const *kernel) {
   auto const n_partners = static_cast<int>(partners.size());
 
diff --git a/src/core/error_handling/RuntimeError.hpp b/src/core/error_handling/RuntimeError.hpp
index 54acd74604d..595cc55008d 100644
--- a/src/core/error_handling/RuntimeError.hpp
+++ b/src/core/error_handling/RuntimeError.hpp
@@ -65,12 +65,12 @@ struct RuntimeError {
   /** Boost serialization */
   friend class boost::serialization::access;
   template <class Archive> void serialize(Archive &ar, const unsigned int) {
-    ar &m_level;
-    ar &m_who;
-    ar &m_what;
-    ar &m_function;
-    ar &m_file;
-    ar &m_line;
+    ar & m_level;
+    ar & m_who;
+    ar & m_what;
+    ar & m_function;
+    ar & m_file;
+    ar & m_line;
   }
 
   ErrorLevel m_level;
diff --git a/src/core/fft/CMakeLists.txt b/src/core/fft/CMakeLists.txt
new file mode 100644
index 00000000000..359cc857ac8
--- /dev/null
+++ b/src/core/fft/CMakeLists.txt
@@ -0,0 +1,24 @@
+#
+# Copyright (C) 2018-2024 The ESPResSo project
+#
+# This file is part of ESPResSo.
+#
+# ESPResSo is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ESPResSo is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# fft.cpp needs <fftw3.h>: only build it when FFTW3 is available
+if(FFTW3_FOUND)
+  target_link_libraries(espresso_core PUBLIC FFTW3::FFTW3)
+  target_sources(espresso_core PRIVATE fft.cpp)
+endif()
diff --git a/src/core/p3m/fft.cpp b/src/core/fft/fft.cpp
similarity index 70%
rename from src/core/p3m/fft.cpp
rename to src/core/fft/fft.cpp
index 0c95721c528..29997d3c897 100644
--- a/src/core/p3m/fft.cpp
+++ b/src/core/fft/fft.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 The ESPResSo project
+ * Copyright (C) 2010-2024 The ESPResSo project
  * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
  *   Max-Planck-Institute for Polymer Research, Theory Group
  *
@@ -25,26 +25,25 @@
  *
  */
 
-#include "config/config.hpp"
+#include "fft.hpp"
+#include "vector.hpp"
 
-#if defined(P3M) || defined(DP3M)
-
-#include "p3m/fft.hpp"
-
-#include <utils/Span.hpp>
 #include <utils/Vector.hpp>
 #include <utils/index.hpp>
 #include <utils/math/permute_ifield.hpp>
 
-#include <boost/none.hpp>
-#include <boost/optional.hpp>
+#include <boost/mpi/communicator.hpp>
 
 #include <fftw3.h>
 #include <mpi.h>
 
+#include <algorithm>
 #include <cmath>
+#include <cstddef>
 #include <cstdio>
 #include <cstring>
+#include <optional>
+#include <span>
 #include <stdexcept>
 #include <utility>
 #include <vector>
@@ -60,7 +59,8 @@ using Utils::permute_ifield;
 #define REQ_FFT_BACK 302
 /**@}*/
 
-namespace {
+namespace fft {
+
 /** This ugly function does the bookkeeping: which nodes have to
  *  communicate to each other, when you change the node grid.
  *  Changing the regular decomposition requires communication. This
@@ -81,14 +81,13 @@ namespace {
  *  \param[out] node_list2  Linear node index list for grid2.
  *  \param[out] pos         Positions of the nodes in grid2
- *  \param[out] my_pos      Position of comm.rank() in grid2.
+ *  \param[out] my_pos      Position of @p rank in grid2.
- *  \param[in]  comm        MPI communicator.
+ *  \param[in]  rank        MPI rank.
- *  \return Size of the communication group.
+ *  \return Communication group, or @c std::nullopt if the grids do not match.
  */
-boost::optional<std::vector<int>>
+std::optional<std::vector<int>>
 find_comm_groups(Utils::Vector3i const &grid1, Utils::Vector3i const &grid2,
-                 Utils::Span<const int> node_list1, Utils::Span<int> node_list2,
-                 Utils::Span<int> pos, Utils::Span<int> my_pos,
-                 boost::mpi::communicator const &comm) {
+                 std::span<int const> node_list1, std::span<int> node_list2,
+                 std::span<int> pos, std::span<int> my_pos, int rank) {
   int i;
   /* communication group cell size on grid1 and grid2 */
   int s1[3], s2[3];
@@ -109,20 +108,20 @@ find_comm_groups(Utils::Vector3i const &grid1, Utils::Vector3i const &grid2,
   int my_group = 0;
 
   /* calculate dimension of comm. group cells for both grids */
-  if ((grid1[0] * grid1[1] * grid1[2]) != (grid2[0] * grid2[1] * grid2[2]))
-    return boost::none; /* unlike number of nodes */
+  if (Utils::product(grid1) != Utils::product(grid2))
+    return std::nullopt; /* unlike number of nodes */
   for (i = 0; i < 3; i++) {
     s1[i] = grid1[i] / grid2[i];
     if (s1[i] == 0)
       s1[i] = 1;
     else if (grid1[i] != grid2[i] * s1[i])
-      return boost::none; /* grids do not match!!! */
+      return std::nullopt; /* grids do not match!!! */
 
     s2[i] = grid2[i] / grid1[i];
     if (s2[i] == 0)
       s2[i] = 1;
     else if (grid2[i] != grid1[i] * s2[i])
-      return boost::none; /* grids do not match!!! */
+      return std::nullopt; /* grids do not match!!! */
 
     ds[i] = grid2[i] / s2[i];
     g_size *= s2[i];
@@ -153,7 +152,7 @@ find_comm_groups(Utils::Vector3i const &grid1, Utils::Vector3i const &grid2,
           pos[3 * n + 2] = p2[2];
           if (my_group == 1)
             group[i] = n;
-          if (n == comm.rank() && my_group == 0) {
+          if (n == rank && my_group == 0) {
             my_group = 1;
             c_pos = i;
             my_pos[0] = p2[0];
@@ -177,6 +176,7 @@ find_comm_groups(Utils::Vector3i const &grid1, Utils::Vector3i const &grid2,
   return {group};
 }
 
+namespace {
 /** Calculate the local fft mesh. Calculate the local mesh (@p loc_mesh)
  *  of a node at position (@p n_pos) in a node grid (@p n_grid) for a global
  *  mesh of size (@p mesh) and a mesh offset (@p mesh_off (in mesh units))
@@ -185,7 +185,7 @@ find_comm_groups(Utils::Vector3i const &grid1, Utils::Vector3i const &grid2,
  * \param[in]  n_pos    Position of the node in @p n_grid.
  * \param[in]  n_grid   node grid.
  * \param[in]  mesh     global mesh dimensions.
- * \param[in]  mesh_off global mesh offset (see \ref p3m_data_struct).
+ * \param[in]  mesh_off global mesh offset.
  * \param[out] loc_mesh local mesh dimension.
  * \param[out] start    first point of local mesh in global mesh.
  * \return Number of mesh points in local mesh.
@@ -230,7 +230,7 @@ int calc_local_mesh(const int *n_pos, const int *n_grid, const int *mesh,
  *  \param[in]  pos2     Position of recv node in @p grid2.
  *  \param[in]  grid2    node grid 2.
  *  \param[in]  mesh     global mesh dimensions.
- *  \param[in]  mesh_off global mesh offset (see \ref p3m_data_struct).
+ *  \param[in]  mesh_off global mesh offset.
  *  \param[out] block    send block specification.
  *  \return Size of the send block.
  */
@@ -351,65 +351,66 @@ void pack_block_permute2(double const *const in, double *const out,
   }
 }
 
+} // namespace
+
 /** Communicate the grid data according to the given forward FFT plan.
+ *  \param comm   MPI communicator.
  *  \param plan   FFT communication plan.
  *  \param in     input mesh.
  *  \param out    output mesh.
- *  \param fft    FFT communication plan.
- *  \param comm   MPI communicator.
  */
-void forw_grid_comm(fft_forw_plan plan, const double *in, double *out,
-                    fft_data_struct &fft,
-                    const boost::mpi::communicator &comm) {
-  for (int i = 0; i < plan.group.size(); i++) {
-    plan.pack_function(in, fft.send_buf.data(), &(plan.send_block[6 * i]),
-                       &(plan.send_block[6 * i + 3]), plan.old_mesh,
+void fft_data_struct::forw_grid_comm(boost::mpi::communicator const &comm,
+                                     fft_forw_plan const &plan,
+                                     double const *in, double *out) {
+  for (std::size_t i = 0ul; i < plan.group.size(); i++) {
+    plan.pack_function(in, send_buf.data(), &(plan.send_block[6ul * i]),
+                       &(plan.send_block[6ul * i + 3ul]), plan.old_mesh,
                        plan.element);
 
     if (plan.group[i] != comm.rank()) {
-      MPI_Sendrecv(fft.send_buf.data(), plan.send_size[i], MPI_DOUBLE,
-                   plan.group[i], REQ_FFT_FORW, fft.recv_buf.data(),
+      MPI_Sendrecv(send_buf.data(), plan.send_size[i], MPI_DOUBLE,
+                   plan.group[i], REQ_FFT_FORW, recv_buf.data(),
                    plan.recv_size[i], MPI_DOUBLE, plan.group[i], REQ_FFT_FORW,
                    comm, MPI_STATUS_IGNORE);
     } else { /* Self communication... */
-      std::swap(fft.send_buf, fft.recv_buf);
+      std::swap(send_buf, recv_buf);
     }
-    fft_unpack_block(fft.recv_buf.data(), out, &(plan.recv_block[6 * i]),
-                     &(plan.recv_block[6 * i + 3]), plan.new_mesh,
+    fft_unpack_block(recv_buf.data(), out, &(plan.recv_block[6ul * i]),
+                     &(plan.recv_block[6ul * i + 3ul]), plan.new_mesh,
                      plan.element);
   }
 }
 
 /** Communicate the grid data according to the given backward FFT plan.
+ *  \param comm   MPI communicator.
  *  \param plan_f Forward FFT plan.
  *  \param plan_b Backward FFT plan.
  *  \param in     input mesh.
  *  \param out    output mesh.
- *  \param fft    FFT communication plan.
- *  \param comm   MPI communicator.
  */
-void back_grid_comm(fft_forw_plan plan_f, fft_back_plan plan_b,
-                    const double *in, double *out, fft_data_struct &fft,
-                    const boost::mpi::communicator &comm) {
+void fft_data_struct::back_grid_comm(boost::mpi::communicator const &comm,
+                                     fft_forw_plan const &plan_f,
+                                     fft_back_plan const &plan_b,
+                                     double const *in, double *out) {
   /* Back means: Use the send/receive stuff from the forward plan but
      replace the receive blocks by the send blocks and vice
      versa. Attention then also new_mesh and old_mesh are exchanged */
 
-  for (int i = 0; i < plan_f.group.size(); i++) {
-    plan_b.pack_function(in, fft.send_buf.data(), &(plan_f.recv_block[6 * i]),
-                         &(plan_f.recv_block[6 * i + 3]), plan_f.new_mesh,
+  for (std::size_t i = 0ul; i < plan_f.group.size(); i++) {
+    plan_b.pack_function(in, send_buf.data(), &(plan_f.recv_block[6ul * i]),
+                         &(plan_f.recv_block[6ul * i + 3ul]), plan_f.new_mesh,
                          plan_f.element);
 
     if (plan_f.group[i] != comm.rank()) { /* send first, receive second */
-      MPI_Sendrecv(fft.send_buf.data(), plan_f.recv_size[i], MPI_DOUBLE,
-                   plan_f.group[i], REQ_FFT_BACK, fft.recv_buf.data(),
+      MPI_Sendrecv(send_buf.data(), plan_f.recv_size[i], MPI_DOUBLE,
+                   plan_f.group[i], REQ_FFT_BACK, recv_buf.data(),
                    plan_f.send_size[i], MPI_DOUBLE, plan_f.group[i],
                    REQ_FFT_BACK, comm, MPI_STATUS_IGNORE);
     } else { /* Self communication... */
-      std::swap(fft.send_buf, fft.recv_buf);
+      std::swap(send_buf, recv_buf);
     }
-    fft_unpack_block(fft.recv_buf.data(), out, &(plan_f.send_block[6 * i]),
-                     &(plan_f.send_block[6 * i + 3]), plan_f.old_mesh,
+    fft_unpack_block(recv_buf.data(), out, &(plan_f.send_block[6ul * i]),
+                     &(plan_f.send_block[6ul * i + 3ul]), plan_f.old_mesh,
                      plan_f.element);
   }
 }
@@ -465,7 +466,7 @@ int map_3don2d_grid(int const g3d[3], int g2d[3]) {
 }
 
 /** Calculate most square 2D grid. */
-void calc_2d_grid(int n, int grid[3]) {
+static void calc_2d_grid(int n, int grid[3]) {
   grid[0] = n;
   grid[1] = 1;
   grid[2] = 1;
@@ -478,24 +479,25 @@ void calc_2d_grid(int n, int grid[3]) {
     }
   }
 }
-} // namespace
 
-int fft_init(Utils::Vector3i const &ca_mesh_dim, int const *ca_mesh_margin,
-             Utils::Vector3i const &global_mesh_dim,
-             Utils::Vector3d const &global_mesh_off, int &ks_pnum,
-             fft_data_struct &fft, Utils::Vector3i const &grid,
-             boost::mpi::communicator const &comm) {
+int fft_data_struct::initialize_fft(boost::mpi::communicator const &comm,
+                                    Utils::Vector3i const &ca_mesh_dim,
+                                    int const *ca_mesh_margin,
+                                    Utils::Vector3i const &global_mesh_dim,
+                                    Utils::Vector3d const &global_mesh_off,
+                                    int &ks_pnum, Utils::Vector3i const &grid) {
 
   int n_grid[4][3];         /* The four node grids. */
   int my_pos[4][3];         /* The position of comm.rank() in the node grids. */
   std::vector<int> n_id[4]; /* linear node identity lists for the node grids. */
   std::vector<int> n_pos[4]; /* positions of nodes in the node grids. */
 
+  int const rank = comm.rank();
   int node_pos[3];
-  MPI_Cart_coords(comm, comm.rank(), 3, node_pos);
+  MPI_Cart_coords(comm, rank, 3, node_pos);
 
-  fft.max_comm_size = 0;
-  fft.max_mesh_size = 0;
+  max_comm_size = 0;
+  max_mesh_size = 0;
   for (int i = 0; i < 4; i++) {
     n_id[i].resize(1 * comm.size());
     n_pos[i].resize(3 * comm.size());
@@ -518,225 +520,221 @@ int fft_init(Utils::Vector3i const &ca_mesh_dim, int const *ca_mesh_margin,
   /* FFT node grids (n_grid[1 - 3]) */
   calc_2d_grid(comm.size(), n_grid[1]);
   /* resort n_grid[1] dimensions if necessary */
-  fft.plan[1].row_dir = map_3don2d_grid(n_grid[0], n_grid[1]);
-  fft.plan[0].n_permute = 0;
+  forw[1].row_dir = map_3don2d_grid(n_grid[0], n_grid[1]);
+  forw[0].n_permute = 0;
   for (int i = 1; i < 4; i++)
-    fft.plan[i].n_permute = (fft.plan[1].row_dir + i) % 3;
+    forw[i].n_permute = (forw[1].row_dir + i) % 3;
   for (int i = 0; i < 3; i++) {
     n_grid[2][i] = n_grid[1][(i + 1) % 3];
     n_grid[3][i] = n_grid[1][(i + 2) % 3];
   }
-  fft.plan[2].row_dir = (fft.plan[1].row_dir - 1) % 3;
-  fft.plan[3].row_dir = (fft.plan[1].row_dir - 2) % 3;
+  forw[2].row_dir = (forw[1].row_dir - 1) % 3;
+  forw[3].row_dir = (forw[1].row_dir - 2) % 3;
 
   /* === communication groups === */
   /* copy local mesh off real space charge assignment grid */
   for (int i = 0; i < 3; i++)
-    fft.plan[0].new_mesh[i] = ca_mesh_dim[i];
+    forw[0].new_mesh[i] = ca_mesh_dim[i];
 
   for (int i = 1; i < 4; i++) {
-    using Utils::make_span;
     auto group = find_comm_groups(
         {n_grid[i - 1][0], n_grid[i - 1][1], n_grid[i - 1][2]},
         {n_grid[i][0], n_grid[i][1], n_grid[i][2]}, n_id[i - 1],
-        make_span(n_id[i]), make_span(n_pos[i]), my_pos[i], comm);
+        std::span(n_id[i]), std::span(n_pos[i]), my_pos[i], rank);
     if (not group) {
       /* try permutation */
-      std::swap(n_grid[i][(fft.plan[i].row_dir + 1) % 3],
-                n_grid[i][(fft.plan[i].row_dir + 2) % 3]);
+      std::swap(n_grid[i][(forw[i].row_dir + 1) % 3],
+                n_grid[i][(forw[i].row_dir + 2) % 3]);
 
       group = find_comm_groups(
           {n_grid[i - 1][0], n_grid[i - 1][1], n_grid[i - 1][2]},
-          {n_grid[i][0], n_grid[i][1], n_grid[i][2]}, make_span(n_id[i - 1]),
-          make_span(n_id[i]), make_span(n_pos[i]), my_pos[i], comm);
+          {n_grid[i][0], n_grid[i][1], n_grid[i][2]}, std::span(n_id[i - 1]),
+          std::span(n_id[i]), std::span(n_pos[i]), my_pos[i], rank);
 
       if (not group) {
         throw std::runtime_error("INTERNAL ERROR: fft_find_comm_groups error");
       }
     }
 
-    fft.plan[i].group = *group;
+    forw[i].group = group.value();
 
-    fft.plan[i].send_block.resize(6 * fft.plan[i].group.size());
-    fft.plan[i].send_size.resize(fft.plan[i].group.size());
-    fft.plan[i].recv_block.resize(6 * fft.plan[i].group.size());
-    fft.plan[i].recv_size.resize(fft.plan[i].group.size());
+    forw[i].send_block.resize(6 * forw[i].group.size());
+    forw[i].send_size.resize(forw[i].group.size());
+    forw[i].recv_block.resize(6 * forw[i].group.size());
+    forw[i].recv_size.resize(forw[i].group.size());
 
-    fft.plan[i].new_size = calc_local_mesh(
+    forw[i].new_size = calc_local_mesh(
         my_pos[i], n_grid[i], global_mesh_dim.data(), global_mesh_off.data(),
-        fft.plan[i].new_mesh, fft.plan[i].start);
-    permute_ifield(fft.plan[i].new_mesh, 3, -(fft.plan[i].n_permute));
-    permute_ifield(fft.plan[i].start, 3, -(fft.plan[i].n_permute));
-    fft.plan[i].n_ffts = fft.plan[i].new_mesh[0] * fft.plan[i].new_mesh[1];
+        forw[i].new_mesh, forw[i].start);
+    permute_ifield(forw[i].new_mesh, 3, -(forw[i].n_permute));
+    permute_ifield(forw[i].start, 3, -(forw[i].n_permute));
+    forw[i].n_ffts = forw[i].new_mesh[0] * forw[i].new_mesh[1];
 
     /* === send/recv block specifications === */
-    for (int j = 0; j < fft.plan[i].group.size(); j++) {
+    for (std::size_t j = 0ul; j < forw[i].group.size(); j++) {
       /* send block: comm.rank() to comm-group-node i (identity: node) */
-      int node = fft.plan[i].group[j];
-      fft.plan[i].send_size[j] = calc_send_block(
+      int node = forw[i].group[j];
+      forw[i].send_size[j] = calc_send_block(
           my_pos[i - 1], n_grid[i - 1], &(n_pos[i][3 * node]), n_grid[i],
           global_mesh_dim.data(), global_mesh_off.data(),
-          &(fft.plan[i].send_block[6 * j]));
-      permute_ifield(&(fft.plan[i].send_block[6 * j]), 3,
-                     -(fft.plan[i - 1].n_permute));
-      permute_ifield(&(fft.plan[i].send_block[6 * j + 3]), 3,
-                     -(fft.plan[i - 1].n_permute));
-      if (fft.plan[i].send_size[j] > fft.max_comm_size)
-        fft.max_comm_size = fft.plan[i].send_size[j];
+          &(forw[i].send_block[6ul * j]));
+      permute_ifield(&(forw[i].send_block[6ul * j]), 3,
+                     -(forw[i - 1].n_permute));
+      permute_ifield(&(forw[i].send_block[6ul * j + 3ul]), 3,
+                     -(forw[i - 1].n_permute));
+      if (forw[i].send_size[j] > max_comm_size)
+        max_comm_size = forw[i].send_size[j];
       /* First plan send blocks have to be adjusted, since the CA grid
          may have an additional margin outside the actual domain of the
          node */
       if (i == 1) {
-        for (int k = 0; k < 3; k++)
-          fft.plan[1].send_block[6 * j + k] += ca_mesh_margin[2 * k];
+        for (std::size_t k = 0ul; k < 3ul; k++)
+          forw[1].send_block[6ul * j + k] += ca_mesh_margin[2ul * k];
       }
       /* recv block: comm.rank() from comm-group-node i (identity: node) */
-      fft.plan[i].recv_size[j] = calc_send_block(
+      forw[i].recv_size[j] = calc_send_block(
           my_pos[i], n_grid[i], &(n_pos[i - 1][3 * node]), n_grid[i - 1],
           global_mesh_dim.data(), global_mesh_off.data(),
-          &(fft.plan[i].recv_block[6 * j]));
-      permute_ifield(&(fft.plan[i].recv_block[6 * j]), 3,
-                     -(fft.plan[i].n_permute));
-      permute_ifield(&(fft.plan[i].recv_block[6 * j + 3]), 3,
-                     -(fft.plan[i].n_permute));
-      if (fft.plan[i].recv_size[j] > fft.max_comm_size)
-        fft.max_comm_size = fft.plan[i].recv_size[j];
+          &(forw[i].recv_block[6ul * j]));
+      permute_ifield(&(forw[i].recv_block[6ul * j]), 3, -(forw[i].n_permute));
+      permute_ifield(&(forw[i].recv_block[6ul * j + 3ul]), 3,
+                     -(forw[i].n_permute));
+      if (forw[i].recv_size[j] > max_comm_size)
+        max_comm_size = forw[i].recv_size[j];
     }
 
-    for (int j = 0; j < 3; j++)
-      fft.plan[i].old_mesh[j] = fft.plan[i - 1].new_mesh[j];
+    for (std::size_t j = 0ul; j < 3ul; j++)
+      forw[i].old_mesh[j] = forw[i - 1].new_mesh[j];
     if (i == 1) {
-      fft.plan[i].element = 1;
+      forw[i].element = 1;
     } else {
-      fft.plan[i].element = 2;
-      for (int j = 0; j < fft.plan[i].group.size(); j++) {
-        fft.plan[i].send_size[j] *= 2;
-        fft.plan[i].recv_size[j] *= 2;
+      forw[i].element = 2;
+      for (std::size_t j = 0ul; j < forw[i].group.size(); j++) {
+        forw[i].send_size[j] *= 2;
+        forw[i].recv_size[j] *= 2;
       }
     }
   }
 
   /* Factor 2 for complex fields */
-  fft.max_comm_size *= 2;
-  fft.max_mesh_size = Utils::product(ca_mesh_dim);
+  max_comm_size *= 2;
+  max_mesh_size = Utils::product(ca_mesh_dim);
   for (int i = 1; i < 4; i++)
-    if (2 * fft.plan[i].new_size > fft.max_mesh_size)
-      fft.max_mesh_size = 2 * fft.plan[i].new_size;
+    if (2 * forw[i].new_size > max_mesh_size)
+      max_mesh_size = 2 * forw[i].new_size;
 
   /* === pack function === */
   for (int i = 1; i < 4; i++) {
-    fft.plan[i].pack_function = pack_block_permute2;
+    forw[i].pack_function = pack_block_permute2;
   }
   ks_pnum = 6;
-  if (fft.plan[1].row_dir == 2) {
-    fft.plan[1].pack_function = fft_pack_block;
+  if (forw[1].row_dir == 2) {
+    forw[1].pack_function = fft_pack_block;
     ks_pnum = 4;
-  } else if (fft.plan[1].row_dir == 1) {
-    fft.plan[1].pack_function = pack_block_permute1;
+  } else if (forw[1].row_dir == 1) {
+    forw[1].pack_function = pack_block_permute1;
     ks_pnum = 5;
   }
 
-  fft.send_buf.resize(fft.max_comm_size);
-  fft.recv_buf.resize(fft.max_comm_size);
-  fft.data_buf.resize(fft.max_mesh_size);
-  auto *c_data = (fftw_complex *)(fft.data_buf.data());
+  send_buf.resize(max_comm_size);
+  recv_buf.resize(max_comm_size);
+  data_buf.resize(max_mesh_size);
+  auto *c_data = (fftw_complex *)(data_buf.data());
 
   /* === FFT Routines (Using FFTW / RFFTW package)=== */
   for (int i = 1; i < 4; i++) {
-    fft.plan[i].dir = FFTW_FORWARD;
-    /* FFT plan creation.*/
-
-    if (fft.init_tag)
-      fftw_destroy_plan(fft.plan[i].our_fftw_plan);
-    fft.plan[i].our_fftw_plan = fftw_plan_many_dft(
-        1, &fft.plan[i].new_mesh[2], fft.plan[i].n_ffts, c_data, nullptr, 1,
-        fft.plan[i].new_mesh[2], c_data, nullptr, 1, fft.plan[i].new_mesh[2],
-        fft.plan[i].dir, FFTW_PATIENT);
+    if (init_tag) {
+      forw[i].destroy_plan();
+    }
+    forw[i].dir = FFTW_FORWARD;
+    forw[i].plan_handle =
+        fftw_plan_many_dft(1, &forw[i].new_mesh[2], forw[i].n_ffts, c_data,
+                           nullptr, 1, forw[i].new_mesh[2], c_data, nullptr, 1,
+                           forw[i].new_mesh[2], forw[i].dir, FFTW_PATIENT);
+    assert(forw[i].plan_handle);
   }
 
   /* === The BACK Direction === */
   /* this is needed because slightly different functions are used */
   for (int i = 1; i < 4; i++) {
-    fft.back[i].dir = FFTW_BACKWARD;
-
-    if (fft.init_tag)
-      fftw_destroy_plan(fft.back[i].our_fftw_plan);
-    fft.back[i].our_fftw_plan = fftw_plan_many_dft(
-        1, &fft.plan[i].new_mesh[2], fft.plan[i].n_ffts, c_data, nullptr, 1,
-        fft.plan[i].new_mesh[2], c_data, nullptr, 1, fft.plan[i].new_mesh[2],
-        fft.back[i].dir, FFTW_PATIENT);
-
-    fft.back[i].pack_function = pack_block_permute1;
+    if (init_tag) {
+      back[i].destroy_plan();
+    }
+    back[i].dir = FFTW_BACKWARD;
+    back[i].plan_handle =
+        fftw_plan_many_dft(1, &forw[i].new_mesh[2], forw[i].n_ffts, c_data,
+                           nullptr, 1, forw[i].new_mesh[2], c_data, nullptr, 1,
+                           forw[i].new_mesh[2], back[i].dir, FFTW_PATIENT);
+    back[i].pack_function = pack_block_permute1;
+    assert(back[i].plan_handle);
   }
-  if (fft.plan[1].row_dir == 2) {
-    fft.back[1].pack_function = fft_pack_block;
-  } else if (fft.plan[1].row_dir == 1) {
-    fft.back[1].pack_function = pack_block_permute2;
+  if (forw[1].row_dir == 2) {
+    back[1].pack_function = fft_pack_block;
+  } else if (forw[1].row_dir == 1) {
+    back[1].pack_function = pack_block_permute2;
   }
 
-  fft.init_tag = true;
+  init_tag = true;
 
-  return fft.max_mesh_size;
+  return max_mesh_size;
 }
 
-void fft_perform_forw(double *data, fft_data_struct &fft,
-                      const boost::mpi::communicator &comm) {
+void fft_data_struct::forward_fft(boost::mpi::communicator const &comm,
+                                  double *data) {
   /* ===== first direction  ===== */
 
   auto *c_data = (fftw_complex *)data;
-  auto *c_data_buf = (fftw_complex *)fft.data_buf.data();
+  auto *c_data_buf = (fftw_complex *)data_buf.data();
 
   /* communication to current dir row format (in is data) */
-  forw_grid_comm(fft.plan[1], data, fft.data_buf.data(), fft, comm);
+  forw_grid_comm(comm, forw[1], data, data_buf.data());
 
-  /* complexify the real data array (in is fft.data_buf) */
-  for (int i = 0; i < fft.plan[1].new_size; i++) {
-    data[2 * i + 0] = fft.data_buf[i]; /* real value */
-    data[2 * i + 1] = 0;               /* complex value */
+  /* complexify the real data array (in is data_buf) */
+  for (int i = 0; i < forw[1].new_size; i++) {
+    data[2 * i + 0] = data_buf[i]; /* real value */
+    data[2 * i + 1] = 0;           /* complex value */
   }
   /* perform FFT (in/out is data)*/
-  fftw_execute_dft(fft.plan[1].our_fftw_plan, c_data, c_data);
+  fftw_execute_dft(forw[1].plan_handle, c_data, c_data);
   /* ===== second direction ===== */
   /* communication to current dir row format (in is data) */
-  forw_grid_comm(fft.plan[2], data, fft.data_buf.data(), fft, comm);
-  /* perform FFT (in/out is fft.data_buf) */
-  fftw_execute_dft(fft.plan[2].our_fftw_plan, c_data_buf, c_data_buf);
+  forw_grid_comm(comm, forw[2], data, data_buf.data());
+  /* perform FFT (in/out is data_buf) */
+  fftw_execute_dft(forw[2].plan_handle, c_data_buf, c_data_buf);
   /* ===== third direction  ===== */
-  /* communication to current dir row format (in is fft.data_buf) */
-  forw_grid_comm(fft.plan[3], fft.data_buf.data(), data, fft, comm);
+  /* communication to current dir row format (in is data_buf) */
+  forw_grid_comm(comm, forw[3], data_buf.data(), data);
   /* perform FFT (in/out is data)*/
-  fftw_execute_dft(fft.plan[3].our_fftw_plan, c_data, c_data);
+  fftw_execute_dft(forw[3].plan_handle, c_data, c_data);
 
   /* REMARK: Result has to be in data. */
 }
 
-void fft_perform_back(double *data, bool check_complex, fft_data_struct &fft,
-                      const boost::mpi::communicator &comm) {
+void fft_data_struct::backward_fft(boost::mpi::communicator const &comm,
+                                   double *data, bool check_complex) {
 
   auto *c_data = (fftw_complex *)data;
-  auto *c_data_buf = (fftw_complex *)fft.data_buf.data();
+  auto *c_data_buf = (fftw_complex *)data_buf.data();
 
   /* ===== third direction  ===== */
 
   /* perform FFT (in is data) */
-  fftw_execute_dft(fft.back[3].our_fftw_plan, c_data, c_data);
+  fftw_execute_dft(back[3].plan_handle, c_data, c_data);
   /* communicate (in is data)*/
-  back_grid_comm(fft.plan[3], fft.back[3], data, fft.data_buf.data(), fft,
-                 comm);
+  back_grid_comm(comm, forw[3], back[3], data, data_buf.data());
 
   /* ===== second direction ===== */
-  /* perform FFT (in is fft.data_buf) */
-  fftw_execute_dft(fft.back[2].our_fftw_plan, c_data_buf, c_data_buf);
-  /* communicate (in is fft.data_buf) */
-  back_grid_comm(fft.plan[2], fft.back[2], fft.data_buf.data(), data, fft,
-                 comm);
+  /* perform FFT (in is data_buf) */
+  fftw_execute_dft(back[2].plan_handle, c_data_buf, c_data_buf);
+  /* communicate (in is data_buf) */
+  back_grid_comm(comm, forw[2], back[2], data_buf.data(), data);
 
   /* ===== first direction  ===== */
   /* perform FFT (in is data) */
-  fftw_execute_dft(fft.back[1].our_fftw_plan, c_data, c_data);
+  fftw_execute_dft(back[1].plan_handle, c_data, c_data);
   /* throw away the (hopefully) empty complex component (in is data) */
-  for (int i = 0; i < fft.plan[1].new_size; i++) {
-    fft.data_buf[i] = data[2 * i]; /* real value */
+  for (int i = 0; i < forw[1].new_size; i++) {
+    data_buf[i] = data[2 * i]; /* real value */
     // Vincent:
     if (check_complex and std::abs(data[2 * i + 1]) > 1e-5) {
       printf("Complex value is not zero (i=%d,data=%g)!!!\n", i,
@@ -745,9 +743,8 @@ void fft_perform_back(double *data, bool check_complex, fft_data_struct &fft,
         throw std::runtime_error("Complex value is not zero");
     }
   }
-  /* communicate (in is fft.data_buf) */
-  back_grid_comm(fft.plan[1], fft.back[1], fft.data_buf.data(), data, fft,
-                 comm);
+  /* communicate (in is data_buf) */
+  back_grid_comm(comm, forw[1], back[1], data_buf.data(), data);
 
   /* REMARK: Result has to be in data. */
 }
@@ -799,4 +796,16 @@ void fft_unpack_block(double const *const in, double *const out,
     li_out += s_out_offset;
   }
 }
-#endif
+
+void fft_plan::destroy_plan() {
+  if (plan_handle) {
+    fftw_destroy_plan(plan_handle);
+    plan_handle = nullptr;
+  }
+}
+
+namespace detail {
+void fft_free(void *const p) { ::fftw_free(p); }
+void *fft_malloc(std::size_t length) { return ::fftw_malloc(length); }
+} // namespace detail
+} // namespace fft
diff --git a/src/core/p3m/fft.hpp b/src/core/fft/fft.hpp
similarity index 50%
rename from src/core/p3m/fft.hpp
rename to src/core/fft/fft.hpp
index 0744be24e6d..8747448a371 100644
--- a/src/core/p3m/fft.hpp
+++ b/src/core/fft/fft.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 The ESPResSo project
+ * Copyright (C) 2010-2024 The ESPResSo project
  * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
  *   Max-Planck-Institute for Polymer Research, Theory Group
  *
@@ -18,90 +18,67 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef CORE_P3M_FFT_HPP
-#define CORE_P3M_FFT_HPP
+
+#pragma once
+
 /** \file
  *
  *  Routines, row decomposition, data structures and communication for the
  *  3D-FFT.
  *
- *  The 3D-FFT is split into 3 ond dimensional FFTs. The data is
+ *  The 3D-FFT is split into three 1D-FFTs. The data is
  *  distributed in such a way, that for the actual direction of the
  *  FFT each node has a certain number of rows for which it performs a
  *  1D-FFT. After performing the FFT on that direction the data is
  *  redistributed.
  *
- *  For simplicity at the moment I have implemented a full complex to
- *  complex FFT (even though a real to complex FFT would be
- *  sufficient)
- *
- *  \todo Combine the forward and backward structures.
- *  \todo The packing routines could be moved to utils.hpp when they are needed
- * elsewhere.
- *
- *  For more information about FFT usage, see \ref fft.cpp "fft.cpp".
+ *  For simplicity, a full complex-to-complex FFT is implemented,
+ *  even though a real-to-complex FFT would be sufficient.
  */
 
-#include "config/config.hpp"
-
-#if defined(P3M) || defined(DP3M)
+#include "vector.hpp"
 
 #include <utils/Vector.hpp>
 
-#include <boost/mpi/communicator.hpp>
-
-#include <fftw3.h>
-
+#include <array>
 #include <cstddef>
-#include <new>
+#include <memory>
+#include <optional>
+#include <span>
+#include <utility>
 #include <vector>
 
-/** Aligned allocator for fft data. */
-template <class T> struct fft_allocator {
-  typedef T value_type;
-  fft_allocator() noexcept = default; // default ctor not required
-  template <class U> explicit fft_allocator(const fft_allocator<U> &) {}
-  template <class U> bool operator==(const fft_allocator<U> &) const {
-    return true;
-  }
-  template <class U> bool operator!=(const fft_allocator<U> &) const {
-    return false;
-  }
-
-  T *allocate(const std::size_t n) const {
-    if (n == 0) {
-      return nullptr;
-    }
-    if (n > static_cast<std::size_t>(-1) / sizeof(T)) {
-      throw std::bad_array_new_length();
-    }
-    void *const pv = fftw_malloc(n * sizeof(T));
-    if (!pv) {
-      throw std::bad_alloc();
-    }
-    return static_cast<T *>(pv);
-  }
-  void deallocate(T *const p, std::size_t) const noexcept { fftw_free(p); }
-};
+struct fftw_plan_s;
+namespace boost::mpi {
+class environment;
+class communicator;
+} // namespace boost::mpi
 
-template <class T> using fft_vector = std::vector<T, fft_allocator<T>>;
+namespace fft {
 
-/** Structure for performing a 1D FFT.
- *
- *  This includes the information about the redistribution of the 3D
- *  FFT *grid before the actual FFT.
- */
-struct fft_forw_plan {
-  /** plan direction: 0 = Forward FFT, 1 = Backward FFT. */
+struct fft_plan {
+  using fftw_plan = fftw_plan_s *;
+
+  ~fft_plan() { destroy_plan(); }
+
+  /** plan direction: FFTW_FORWARD or FFTW_BACKWARD. */
   int dir;
+  /** plan for the FFT. */
+  fftw_plan plan_handle = nullptr;
+  /** packing function for send blocks. */
+  void (*pack_function)(double const *const, double *const, int const *,
+                        int const *, int const *, int);
+  void destroy_plan();
+};
+
+/** @brief Plan for a forward 1D FFT of a flattened 3D array. */
+struct fft_forw_plan : public fft_plan {
   /** row direction of that FFT. */
   int row_dir;
   /** permutations from normal coordinate system. */
   int n_permute;
   /** number of 1D FFTs. */
   int n_ffts;
-  /** plan for fft. */
-  fftw_plan our_fftw_plan;
 
   /** size of local mesh before communication. */
   int old_mesh[3];
@@ -115,9 +92,6 @@ struct fft_forw_plan {
   /** group of nodes which have to communicate with each other. */
   std::vector<int> group;
 
-  /** packing function for send blocks. */
-  void (*pack_function)(double const *const, double *const, int const *,
-                        int const *, int const *, int);
   /** Send block specification. 6 integers for each node: start[3], size[3]. */
   std::vector<int> send_block;
   /** Send block communication sizes. */
@@ -130,30 +104,30 @@ struct fft_forw_plan {
   int element;
 };
 
-/** Additional information for backwards FFT. */
-struct fft_back_plan {
-  /** plan direction. (e.g. fftw macro) */
-  int dir;
-  /** plan for fft. */
-  fftw_plan our_fftw_plan;
-
-  /** packing function for send blocks. */
-  void (*pack_function)(double const *const, double *const, int const *,
-                        int const *, int const *, int);
-};
+/** @brief Plan for a backward 1D FFT of a flattened 3D array. */
+struct fft_back_plan : public fft_plan {};
 
-/** Information about the three one dimensional FFTs and how the nodes
- *  have to communicate inbetween.
+/**
+ * @brief Information about the three one-dimensional FFTs and how the nodes
+ * have to communicate in between.
  *
- *  @note FFT numbering starts with 1 for technical reasons (because we have 4
- *        node grids, the index 0 is used for the real space charge assignment
- *        grid).
+ * @note FFT numbering starts with 1 for technical reasons (because we have 4
+ * node grids, the index 0 is used for the real space charge assignment grid).
  */
-struct fft_data_struct { // NOLINT(bugprone-reserved-identifier)
+struct fft_data_struct {
+private:
+  /**
+   * @brief Handle to the MPI environment.
+   * Has to be the first member in the class definition, so that FFT plans
+   * are destroyed before the MPI environment expires (non-static class
+   * members are destroyed in the reverse order of their initialization).
+   */
+  std::shared_ptr<boost::mpi::environment> m_mpi_env;
+
   /** Information for forward FFTs. */
-  fft_forw_plan plan[4];
+  std::array<fft_forw_plan, 4u> forw;
   /** Information for backward FFTs. */
-  fft_back_plan back[4];
+  std::array<fft_back_plan, 4u> back;
 
   /** Whether FFT is initialized or not. */
   bool init_tag = false;
@@ -169,47 +143,64 @@ struct fft_data_struct { // NOLINT(bugprone-reserved-identifier)
   /** receive buffer. */
   std::vector<double> recv_buf;
   /** Buffer for receive data. */
-  fft_vector<double> data_buf;
+  fft::vector<double> data_buf;
+
+public:
+  explicit fft_data_struct(decltype(m_mpi_env) mpi_env)
+      : m_mpi_env{std::move(mpi_env)} {}
+
+  // disable copy construction and assignment: unsafe because we store raw
+  // pointers to FFT plans (avoids double-free and use-after-free)
+  fft_data_struct &operator=(fft_data_struct const &) = delete;
+  fft_data_struct(fft_data_struct const &) = delete;
+
+  /** Initialize everything connected to the 3D-FFT.
+   *
+   *  \param[in]  comm            MPI communicator.
+   *  \param[in]  ca_mesh_dim     Local CA mesh dimensions.
+   *  \param[in]  ca_mesh_margin  Local CA mesh margins.
+   *  \param[in]  global_mesh_dim Global CA mesh dimensions.
+   *  \param[in]  global_mesh_off Global CA mesh offset.
+   *  \param[out] ks_pnum         Number of permutations in k-space.
+   *  \param[in]  grid            Number of nodes in each spatial dimension.
+   *  \return Maximal size of local fft mesh (needed for allocation of ca_mesh).
+   */
+  int initialize_fft(boost::mpi::communicator const &comm,
+                     Utils::Vector3i const &ca_mesh_dim,
+                     int const *ca_mesh_margin,
+                     Utils::Vector3i const &global_mesh_dim,
+                     Utils::Vector3d const &global_mesh_off, int &ks_pnum,
+                     Utils::Vector3i const &grid);
+
+  /** Perform an in-place forward 3D FFT.
+   *  \warning The content of \a data is overwritten.
+   *  \param[in,out] data  Mesh.
+   *  \param[in]     comm  MPI communicator.
+   */
+  void forward_fft(boost::mpi::communicator const &comm, double *data);
+
+  /** Perform an in-place backward 3D FFT.
+   *  \warning The content of \a data is overwritten.
+   *  \param[in,out] data           Mesh.
+   *  \param[in]     check_complex  Throw an error if the complex component is
+   *                                non-zero.
+   *  \param[in]     comm           MPI communicator.
+   */
+  void backward_fft(boost::mpi::communicator const &comm, double *data,
+                    bool check_complex);
+
+  auto get_mesh_size() const { return forw[3u].new_mesh; }
+
+  auto get_mesh_start() const { return forw[3u].start; }
+
+private:
+  void forw_grid_comm(boost::mpi::communicator const &comm,
+                      fft_forw_plan const &plan, double const *in, double *out);
+  void back_grid_comm(boost::mpi::communicator const &comm,
+                      fft_forw_plan const &plan_f, fft_back_plan const &plan_b,
+                      double const *in, double *out);
 };
 
-/** Initialize everything connected to the 3D-FFT.
- *
- *  \param[in]  ca_mesh_dim     Local CA mesh dimensions.
- *  \param[in]  ca_mesh_margin  Local CA mesh margins.
- *  \param[in]  global_mesh_dim Global CA mesh dimensions.
- *  \param[in]  global_mesh_off Global CA mesh offset.
- *  \param[out] ks_pnum         Number of permutations in k-space.
- *  \param[out] fft             FFT plan.
- *  \param[in]  grid            Number of nodes in each spatial dimension.
- *  \param[in]  comm            MPI communicator.
- *  \return Maximal size of local fft mesh (needed for allocation of ca_mesh).
- */
-int fft_init(Utils::Vector3i const &ca_mesh_dim, int const *ca_mesh_margin,
-             Utils::Vector3i const &global_mesh_dim,
-             Utils::Vector3d const &global_mesh_off, int &ks_pnum,
-             fft_data_struct &fft, Utils::Vector3i const &grid,
-             boost::mpi::communicator const &comm);
-
-/** Perform an in-place forward 3D FFT.
- *  \warning The content of \a data is overwritten.
- *  \param[in,out] data  Mesh.
- *  \param[in,out] fft   FFT plan.
- *  \param[in]     comm  MPI communicator
- */
-void fft_perform_forw(double *data, fft_data_struct &fft,
-                      const boost::mpi::communicator &comm);
-
-/** Perform an in-place backward 3D FFT.
- *  \warning The content of \a data is overwritten.
- *  \param[in,out] data           Mesh.
- *  \param[in]     check_complex  Throw an error if the complex component is
- *                                non-zero.
- *  \param[in,out] fft            FFT plan.
- *  \param[in]     comm           MPI communicator.
- */
-void fft_perform_back(double *data, bool check_complex, fft_data_struct &fft,
-                      const boost::mpi::communicator &comm);
-
 /** Pack a block (<tt>size[3]</tt> starting at <tt>start[3]</tt>) of an input
  *  3d-grid with dimension <tt>dim[3]</tt> into an output 3d-block with
  *  dimension <tt>size[3]</tt>.
@@ -245,6 +236,12 @@ void fft_pack_block(double const *in, double *out, int const start[3],
 void fft_unpack_block(double const *in, double *out, int const start[3],
                       int const size[3], int const dim[3], int element);
 
-#endif // defined(P3M) || defined(DP3M)
+int map_3don2d_grid(int const g3d[3], int g2d[3]);
+
+std::optional<std::vector<int>> find_comm_groups(Utils::Vector3i const &,
+                                                 Utils::Vector3i const &,
+                                                 std::span<int const>,
+                                                 std::span<int>, std::span<int>,
+                                                 std::span<int>, int);
 
-#endif
+} // namespace fft
diff --git a/src/core/fft/vector.hpp b/src/core/fft/vector.hpp
new file mode 100644
index 00000000000..e74df7a3de0
--- /dev/null
+++ b/src/core/fft/vector.hpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2010-2024 The ESPResSo project
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
+ *   Max-Planck-Institute for Polymer Research, Theory Group
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <limits>
+#include <stdexcept>
+#include <vector>
+
+namespace fft {
+namespace detail {
+void fft_free(void *p);
+void *fft_malloc(std::size_t length);
+} // namespace detail
+
+/** @brief Aligned allocator for FFT data. */
+template <class T> struct allocator {
+  typedef T value_type;
+  allocator() noexcept = default; // default ctor not required
+  template <class U> explicit allocator(const allocator<U> &) {}
+  template <class U> bool operator==(const allocator<U> &) const {
+    return true;
+  }
+  template <class U> bool operator!=(const allocator<U> &) const {
+    return false;
+  }
+
+  T *allocate(const std::size_t n) const {
+    if (n == 0) {
+      return nullptr;
+    }
+    if (n > std::numeric_limits<std::size_t>::max() / sizeof(T)) {
+      throw std::bad_array_new_length();
+    }
+    void *const pv = detail::fft_malloc(n * sizeof(T));
+    if (!pv) {
+      throw std::bad_alloc();
+    }
+    return static_cast<T *>(pv);
+  }
+
+  void deallocate(T *const p, std::size_t) const noexcept {
+    detail::fft_free(static_cast<void *>(p));
+  }
+};
+
+template <class T> using vector = std::vector<T, allocator<T>>;
+
+} // namespace fft
diff --git a/src/core/forces.cpp b/src/core/forces.cpp
index 5ce4aee3683..bc317eb119b 100644
--- a/src/core/forces.cpp
+++ b/src/core/forces.cpp
@@ -51,6 +51,7 @@
 #include "thermostat.hpp"
 #include "virtual_sites/relative.hpp"
 
+#include <utils/Vector.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <boost/variant.hpp>
@@ -62,6 +63,7 @@
 #include <cassert>
 #include <cmath>
 #include <memory>
+#include <span>
 #include <variant>
 
 /** External particle forces */
@@ -172,7 +174,7 @@ void System::System::calculate_forces() {
   short_range_loop(
       [coulomb_kernel_ptr = get_ptr(coulomb_kernel),
        &bond_breakage = *bond_breakage, &box_geo = *box_geo](
-          Particle &p1, int bond_id, Utils::Span<Particle *> partners) {
+          Particle &p1, int bond_id, std::span<Particle *> partners) {
         return add_bonded_force(p1, bond_id, partners, bond_breakage, box_geo,
                                 coulomb_kernel_ptr);
       },
@@ -216,7 +218,7 @@ void System::System::calculate_forces() {
 
   if (thermostat->lb and (propagation->used_propagations &
                           PropagationMode::TRANS_LB_MOMENTUM_EXCHANGE)) {
-    lb_couple_particles(time_step);
+    lb_couple_particles();
   }
 
 #ifdef CUDA
diff --git a/src/core/forces_inline.hpp b/src/core/forces_inline.hpp
index f6d643d1e71..5f73826b4a3 100644
--- a/src/core/forces_inline.hpp
+++ b/src/core/forces_inline.hpp
@@ -68,23 +68,20 @@
 #include "exclusions.hpp"
 #include "thermostat.hpp"
 
-#include <utils/Span.hpp>
 #include <utils/Vector.hpp>
 
-#include <boost/optional.hpp>
 #include <boost/variant.hpp>
 
 #include <optional>
+#include <span>
 #include <tuple>
 
-inline ParticleForce calc_central_radial_force(Particle const &p1,
-                                               Particle const &p2,
-                                               IA_parameters const &ia_params,
+inline ParticleForce calc_central_radial_force(IA_parameters const &ia_params,
                                                Utils::Vector3d const &d,
                                                double const dist) {
 
   ParticleForce pf{};
-  double force_factor = 0;
+  auto force_factor = 0.;
 /* Lennard-Jones */
 #ifdef LENNARD_JONES
   force_factor += lj_pair_force_factor(ia_params, dist);
@@ -219,7 +216,7 @@ inline void add_non_bonded_pair_force(
 #ifdef EXCLUSIONS
     if (do_nonbonded(p1, p2)) {
 #endif
-      pf += calc_central_radial_force(p1, p2, ia_params, d, dist);
+      pf += calc_central_radial_force(ia_params, d, dist);
       pf += calc_central_radial_charge_force(p1, p2, ia_params, d, dist,
                                              coulomb_kernel);
       pf += calc_non_central_force(p1, p2, ia_params, d, dist);
@@ -293,7 +290,7 @@ inline void add_non_bonded_pair_force(
  *  @param[in] dx          Vector between @p p1 and @p p2.
  *  @param[in] kernel      Coulomb force kernel.
  */
-inline boost::optional<Utils::Vector3d> calc_bond_pair_force(
+inline std::optional<Utils::Vector3d> calc_bond_pair_force(
     Particle const &p1, Particle const &p2,
     Bonded_IA_Parameters const &iaparams, Utils::Vector3d const &dx,
     Coulomb::ShortRangeForceKernel::kernel_type const *kernel) {
@@ -339,7 +336,7 @@ inline bool add_bonded_two_body_force(
   if (auto const *iap = boost::get<ThermalizedBond>(&iaparams)) {
     auto result = iap->forces(p1, p2, dx);
     if (result) {
-      auto const &forces = result.get();
+      auto const &forces = result.value();
 
       p1.force() += std::get<0>(forces);
       p2.force() += std::get<1>(forces);
@@ -349,11 +346,11 @@ inline bool add_bonded_two_body_force(
   } else {
     auto result = calc_bond_pair_force(p1, p2, iaparams, dx, kernel);
     if (result) {
-      p1.force() += result.get();
-      p2.force() -= result.get();
+      p1.force() += result.value();
+      p2.force() -= result.value();
 
 #ifdef NPT
-      npt_add_virial_force_contribution(result.get(), dx);
+      npt_add_virial_force_contribution(result.value(), dx);
 #endif
       return false;
     }
@@ -361,7 +358,7 @@ inline bool add_bonded_two_body_force(
   return true;
 }
 
-inline boost::optional<
+inline std::optional<
     std::tuple<Utils::Vector3d, Utils::Vector3d, Utils::Vector3d>>
 calc_bonded_three_body_force(Bonded_IA_Parameters const &iaparams,
                              BoxGeometry const &box_geo, Particle const &p1,
@@ -398,7 +395,7 @@ inline bool add_bonded_three_body_force(Bonded_IA_Parameters const &iaparams,
   auto const result =
       calc_bonded_three_body_force(iaparams, box_geo, p1, p2, p3);
   if (result) {
-    auto const &forces = result.get();
+    auto const &forces = result.value();
 
     p1.force() += std::get<0>(forces);
     p2.force() += std::get<1>(forces);
@@ -409,8 +406,8 @@ inline bool add_bonded_three_body_force(Bonded_IA_Parameters const &iaparams,
   return true;
 }
 
-inline boost::optional<std::tuple<Utils::Vector3d, Utils::Vector3d,
-                                  Utils::Vector3d, Utils::Vector3d>>
+inline std::optional<std::tuple<Utils::Vector3d, Utils::Vector3d,
+                                Utils::Vector3d, Utils::Vector3d>>
 calc_bonded_four_body_force(Bonded_IA_Parameters const &iaparams,
                             BoxGeometry const &box_geo, Particle const &p1,
                             Particle const &p2, Particle const &p3,
@@ -443,7 +440,7 @@ inline bool add_bonded_four_body_force(Bonded_IA_Parameters const &iaparams,
   auto const result =
       calc_bonded_four_body_force(iaparams, box_geo, p1, p2, p3, p4);
   if (result) {
-    auto const &forces = result.get();
+    auto const &forces = result.value();
 
     p1.force() += std::get<0>(forces);
     p2.force() += std::get<1>(forces);
@@ -457,20 +454,20 @@ inline bool add_bonded_four_body_force(Bonded_IA_Parameters const &iaparams,
 }
 
 inline bool
-add_bonded_force(Particle &p1, int bond_id, Utils::Span<Particle *> partners,
+add_bonded_force(Particle &p1, int bond_id, std::span<Particle *> partners,
                  BondBreakage::BondBreakage &bond_breakage,
                  BoxGeometry const &box_geo,
                  Coulomb::ShortRangeForceKernel::kernel_type const *kernel) {
 
   // Consider for bond breakage
-  if (partners.size() == 1) { // pair bonds
+  if (partners.size() == 1u) { // pair bonds
     auto d = box_geo.get_mi_vector(p1.pos(), partners[0]->pos()).norm();
     if (bond_breakage.check_and_handle_breakage(
             p1.id(), {{partners[0]->id(), std::nullopt}}, bond_id, d)) {
       return false;
     }
   }
-  if (partners.size() == 2) { // angle bond
+  if (partners.size() == 2u) { // angle bond
     auto d =
         box_geo.get_mi_vector(partners[0]->pos(), partners[1]->pos()).norm();
     if (bond_breakage.check_and_handle_breakage(
diff --git a/src/core/ghosts.cpp b/src/core/ghosts.cpp
index 5a992d1870f..23cce1f5b9d 100644
--- a/src/core/ghosts.cpp
+++ b/src/core/ghosts.cpp
@@ -34,7 +34,6 @@
 #include "Particle.hpp"
 #include "system/System.hpp"
 
-#include <utils/Span.hpp>
 #include <utils/serialization/memcpy_archive.hpp>
 
 #include <boost/archive/binary_iarchive.hpp>
@@ -51,6 +50,7 @@
 #include <cstddef>
 #include <functional>
 #include <iterator>
+#include <span>
 #include <vector>
 
 /** Tag for ghosts communications. */
@@ -82,6 +82,8 @@ class CommBuf {
   auto &bonds() { return bondbuf; }
   const auto &bonds() const { return bondbuf; }
 
+  auto make_span() { return std::span(buf.data(), buf.size()); }
+
 private:
   std::vector<char> buf;     ///< Buffer for everything but bonds
   std::vector<char> bondbuf; ///< Buffer for bond lists
@@ -94,7 +96,7 @@ class SerializationSizeCalculator {
 public:
   auto size() const { return m_size; }
 
-  template <class T> auto &operator<<(T &t) {
+  template <class T> auto &operator<<(T &) {
     m_size += sizeof(T);
     return *this;
   }
@@ -126,43 +128,43 @@ serialize_and_reduce(Archive &ar, Particle &p, unsigned int data_parts,
                      BoxGeometry const &box_geo,
                      Utils::Vector3d const *ghost_shift) {
   if (data_parts & GHOSTTRANS_PROPRTS) {
-    ar &p.id() & p.mol_id() & p.type() & p.propagation();
+    ar & p.id() & p.mol_id() & p.type() & p.propagation();
 #ifdef ROTATION
-    ar &p.rotation();
+    ar & p.rotation();
 #ifdef ROTATIONAL_INERTIA
-    ar &p.rinertia();
+    ar & p.rinertia();
 #endif
 #endif
 #ifdef MASS
-    ar &p.mass();
+    ar & p.mass();
 #endif
 #ifdef ELECTROSTATICS
-    ar &p.q();
+    ar & p.q();
 #endif
 #ifdef DIPOLES
-    ar &p.dipm();
+    ar & p.dipm();
 #endif
 #ifdef LB_ELECTROHYDRODYNAMICS
-    ar &p.mu_E();
+    ar & p.mu_E();
 #endif
 #ifdef VIRTUAL_SITES_RELATIVE
-    ar &p.vs_relative();
+    ar & p.vs_relative();
 #endif
 #ifdef THERMOSTAT_PER_PARTICLE
-    ar &p.gamma();
+    ar & p.gamma();
 #ifdef ROTATION
-    ar &p.gamma_rot();
+    ar & p.gamma_rot();
 #endif
 #endif
 #ifdef EXTERNAL_FORCES
-    ar &p.fixed();
-    ar &p.ext_force();
+    ar & p.fixed();
+    ar & p.ext_force();
 #ifdef ROTATION
-    ar &p.ext_torque();
+    ar & p.ext_torque();
 #endif
 #endif
 #ifdef ENGINE
-    ar &p.swimming();
+    ar & p.swimming();
 #endif
   }
   if (data_parts & GHOSTTRANS_POSITION) {
@@ -171,42 +173,42 @@ serialize_and_reduce(Archive &ar, Particle &p, unsigned int data_parts,
       auto pos = p.pos() + *ghost_shift;
       auto img = p.image_box();
       box_geo.fold_position(pos, img);
-      ar &pos;
-      ar &img;
+      ar & pos;
+      ar & img;
     } else {
-      ar &p.pos();
-      ar &p.image_box();
+      ar & p.pos();
+      ar & p.image_box();
     }
 #ifdef ROTATION
-    ar &p.quat();
+    ar & p.quat();
 #endif
 #ifdef BOND_CONSTRAINT
-    ar &p.pos_last_time_step();
+    ar & p.pos_last_time_step();
 #endif
   }
   if (data_parts & GHOSTTRANS_MOMENTUM) {
-    ar &p.v();
+    ar & p.v();
 #ifdef ROTATION
-    ar &p.omega();
+    ar & p.omega();
 #endif
   }
   if (data_parts & GHOSTTRANS_FORCE) {
     if (policy == ReductionPolicy::UPDATE and
         direction == SerializationDirection::LOAD) {
       Utils::Vector3d force;
-      ar &force;
+      ar & force;
       p.force() += force;
     } else {
-      ar &p.force();
+      ar & p.force();
     }
 #ifdef ROTATION
     if (policy == ReductionPolicy::UPDATE and
         direction == SerializationDirection::LOAD) {
       Utils::Vector3d torque;
-      ar &torque;
+      ar & torque;
       p.torque() += torque;
     } else {
-      ar &p.torque();
+      ar & p.torque();
     }
 #endif
   }
@@ -215,10 +217,10 @@ serialize_and_reduce(Archive &ar, Particle &p, unsigned int data_parts,
     if (policy == ReductionPolicy::UPDATE and
         direction == SerializationDirection::LOAD) {
       Utils::Vector3d correction;
-      ar &correction;
+      ar & correction;
       p.rattle_correction() += correction;
     } else {
-      ar &p.rattle_correction();
+      ar & p.rattle_correction();
     }
   }
 #endif
@@ -255,7 +257,7 @@ static void prepare_send_buffer(CommBuf &send_buffer,
   send_buffer.resize(calc_transmit_size(ghost_comm, box_geo, data_parts));
   send_buffer.bonds().clear();
 
-  auto archiver = Utils::MemcpyOArchive{Utils::make_span(send_buffer)};
+  auto archiver = Utils::MemcpyOArchive{send_buffer.make_span()};
 
   /* Construct archive that pushes back to the bond buffer */
   namespace io = boost::iostreams;
@@ -309,7 +311,7 @@ static void put_recv_buffer(CommBuf &recv_buffer,
                             BoxGeometry const &box_geo,
                             unsigned int data_parts) {
   /* put back data */
-  auto archiver = Utils::MemcpyIArchive{Utils::make_span(recv_buffer)};
+  auto archiver = Utils::MemcpyIArchive{recv_buffer.make_span()};
 
   if (data_parts & GHOSTTRANS_PARTNUM) {
     for (auto part_list : ghost_comm.part_lists) {
@@ -348,7 +350,7 @@ static void
 add_rattle_correction_from_recv_buffer(CommBuf &recv_buffer,
                                        const GhostCommunication &ghost_comm) {
   /* put back data */
-  auto archiver = Utils::MemcpyIArchive{Utils::make_span(recv_buffer)};
+  auto archiver = Utils::MemcpyIArchive{recv_buffer.make_span()};
   for (auto &part_list : ghost_comm.part_lists) {
     for (Particle &part : *part_list) {
       ParticleRattle pr;
@@ -362,7 +364,7 @@ add_rattle_correction_from_recv_buffer(CommBuf &recv_buffer,
 static void add_forces_from_recv_buffer(CommBuf &recv_buffer,
                                         const GhostCommunication &ghost_comm) {
   /* put back data */
-  auto archiver = Utils::MemcpyIArchive{Utils::make_span(recv_buffer)};
+  auto archiver = Utils::MemcpyIArchive{recv_buffer.make_span()};
   for (auto &part_list : ghost_comm.part_lists) {
     for (Particle &part : *part_list) {
       ParticleForce pf;
@@ -393,8 +395,8 @@ static void cell_cell_transfer(GhostCommunication const &ghost_comm,
       assert(src_part.size() == dst_part.size());
 
       for (std::size_t i = 0; i < src_part.size(); i++) {
-        auto ar_out = Utils::MemcpyOArchive{Utils::make_span(buffer)};
-        auto ar_in = Utils::MemcpyIArchive{Utils::make_span(buffer)};
+        auto ar_out = Utils::MemcpyOArchive{buffer.make_span()};
+        auto ar_in = Utils::MemcpyIArchive{buffer.make_span()};
         auto &p1 = src_part.begin()[i];
         auto &p2 = dst_part.begin()[i];
         serialize_and_reduce(ar_out, p1, data_parts, ReductionPolicy::UPDATE,
diff --git a/src/core/immersed_boundary/ImmersedBoundaries.cpp b/src/core/immersed_boundary/ImmersedBoundaries.cpp
index 2b59bbefb8d..0beaef4bb25 100644
--- a/src/core/immersed_boundary/ImmersedBoundaries.cpp
+++ b/src/core/immersed_boundary/ImmersedBoundaries.cpp
@@ -28,14 +28,11 @@
 
 #include "bonded_interactions/bonded_interaction_data.hpp"
 
-#include <utils/Span.hpp>
-#include <utils/Vector.hpp>
-#include <utils/constants.hpp>
-
 #include <boost/mpi/collectives/all_reduce.hpp>
 #include <boost/range/algorithm/find_if.hpp>
 
 #include <functional>
+#include <span>
 #include <utility>
 #include <vector>
 
@@ -106,7 +103,7 @@ void ImmersedBoundaries::calc_volumes(CellStructure &cs) {
 
   // Loop over all particles on local node
   cs.bond_loop([&tempVol, &box_geo](Particle &p1, int bond_id,
-                                    Utils::Span<Particle *> partners) {
+                                    std::span<Particle *> partners) {
     auto const vol_cons_params = vol_cons_parameters(p1);
 
     if (vol_cons_params &&
@@ -160,7 +157,7 @@ void ImmersedBoundaries::calc_volume_force(CellStructure &cs) {
   auto const &box_geo = *System::get_system().box_geo;
 
   cs.bond_loop([this, &box_geo](Particle &p1, int bond_id,
-                                Utils::Span<Particle *> partners) {
+                                std::span<Particle *> partners) {
     if (boost::get<IBMTriel>(bonded_ia_params.at(bond_id).get()) != nullptr) {
       // Check if particle has an IBM Triel bonded interaction and an
       // IBM VolCons bonded interaction. Basically this loops over all
diff --git a/src/core/immersed_boundary/ImmersedBoundaries.hpp b/src/core/immersed_boundary/ImmersedBoundaries.hpp
index 2cc7d7e8fd4..bf3a2e02c2a 100644
--- a/src/core/immersed_boundary/ImmersedBoundaries.hpp
+++ b/src/core/immersed_boundary/ImmersedBoundaries.hpp
@@ -16,8 +16,8 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef IMMERSED_BOUNDARY_IMMERSED_BOUNDARIES_HPP
-#define IMMERSED_BOUNDARY_IMMERSED_BOUNDARIES_HPP
+
+#pragma once
 
 #include "config/config.hpp"
 
@@ -43,7 +43,7 @@ class ImmersedBoundaries {
   }
   double get_current_volume(int softID) const {
     assert(softID >= 0);
-    assert(softID < VolumesCurrent.size());
+    assert(static_cast<std::size_t>(softID) < VolumesCurrent.size());
     return VolumesCurrent[static_cast<unsigned int>(softID)];
   }
 
@@ -55,5 +55,3 @@ class ImmersedBoundaries {
   bool VolumeInitDone;
   bool BoundariesFound;
 };
-
-#endif
diff --git a/src/core/immersed_boundary/ibm_common.cpp b/src/core/immersed_boundary/ibm_common.cpp
index 176950a4d99..34e1da702a2 100644
--- a/src/core/immersed_boundary/ibm_common.cpp
+++ b/src/core/immersed_boundary/ibm_common.cpp
@@ -24,30 +24,30 @@
 #include "system/System.hpp"
 
 #include <utils/Vector.hpp>
+#include <utils/serialization/optional.hpp>
 
 #include <boost/mpi/collectives/all_reduce.hpp>
-#include <boost/optional.hpp>
-#include <boost/serialization/optional.hpp>
 
+#include <optional>
 #include <stdexcept>
 
 Utils::Vector3d get_ibm_particle_position(int pid) {
   auto &cell_structure = *System::get_system().cell_structure;
   auto *p = cell_structure.get_local_particle(pid);
-  boost::optional<Particle> opt_part{boost::none};
+  std::optional<Particle> opt_part{std::nullopt};
 
   if (p and not p->is_ghost()) {
     opt_part = *p;
   }
   opt_part = boost::mpi::all_reduce(comm_cart, opt_part,
-                                    [](boost::optional<Particle> const &acc,
-                                       boost::optional<Particle> const &item) {
+                                    [](std::optional<Particle> const &acc,
+                                       std::optional<Particle> const &item) {
                                       if (acc) {
                                         return acc;
                                       }
                                       return item;
                                     });
   if (opt_part)
-    return opt_part.get().pos();
+    return opt_part.value().pos();
   throw std::runtime_error("Immersed Boundary: Particle not found");
-}
\ No newline at end of file
+}
diff --git a/src/core/immersed_boundary/ibm_tribend.cpp b/src/core/immersed_boundary/ibm_tribend.cpp
index 35b922dd5fe..49585855af5 100644
--- a/src/core/immersed_boundary/ibm_tribend.cpp
+++ b/src/core/immersed_boundary/ibm_tribend.cpp
@@ -27,6 +27,7 @@
 
 #include <algorithm>
 #include <cmath>
+#include <numbers>
 #include <tuple>
 
 std::tuple<Utils::Vector3d, Utils::Vector3d, Utils::Vector3d, Utils::Vector3d>
@@ -99,7 +100,7 @@ IBMTribend::IBMTribend(const int ind1, const int ind2, const int ind3,
 
   // Compute theta0
   if (flat) {
-    theta0 = 0;
+    theta0 = 0.;
   } else {
     // Get particles
     auto const pos1 = get_ibm_particle_position(ind1);
@@ -121,13 +122,13 @@ IBMTribend::IBMTribend(const int ind1, const int ind2, const int ind3,
     auto const n2 = n2l / n2l.norm();
 
     // calculate theta0 by taking the acos of the scalar n1*n2
-    auto const sc = std::min(1.0, n1 * n2);
+    auto const sc = std::min(1., n1 * n2);
 
     theta0 = acos(sc);
 
     auto const desc = dx1 * vector_product(n1, n2);
-    if (desc < 0)
-      theta0 = 2.0 * Utils::pi() - theta0;
+    if (desc < 0.)
+      theta0 = 2. * std::numbers::pi - theta0;
   }
 
   // NOTE: This is the bare bending modulus used by the program.
diff --git a/src/core/immersed_boundary/ibm_tribend.hpp b/src/core/immersed_boundary/ibm_tribend.hpp
index 3a013d040db..3ac3068ccfd 100644
--- a/src/core/immersed_boundary/ibm_tribend.hpp
+++ b/src/core/immersed_boundary/ibm_tribend.hpp
@@ -60,8 +60,8 @@ struct IBMTribend {
   friend boost::serialization::access;
   template <typename Archive>
   void serialize(Archive &ar, long int /* version */) {
-    ar &kb;
-    ar &theta0;
+    ar & kb;
+    ar & theta0;
   }
 };
 
diff --git a/src/core/immersed_boundary/ibm_triel.cpp b/src/core/immersed_boundary/ibm_triel.cpp
index b5c80cb4c07..14f3fa9ccf2 100644
--- a/src/core/immersed_boundary/ibm_triel.cpp
+++ b/src/core/immersed_boundary/ibm_triel.cpp
@@ -26,8 +26,7 @@
 #include <utils/Vector.hpp>
 #include <utils/math/sqr.hpp>
 
-#include <boost/optional.hpp>
-
+#include <optional>
 #include <tuple>
 
 namespace {
@@ -71,7 +70,7 @@ RotateForces(Utils::Vector2d const &f1_rot, Utils::Vector2d const &f2_rot,
 }
 } // namespace
 
-boost::optional<std::tuple<Utils::Vector3d, Utils::Vector3d, Utils::Vector3d>>
+std::optional<std::tuple<Utils::Vector3d, Utils::Vector3d, Utils::Vector3d>>
 IBMTriel::calc_forces(Utils::Vector3d const &vec1,
                       Utils::Vector3d const &vec2) const {
 
diff --git a/src/core/immersed_boundary/ibm_triel.hpp b/src/core/immersed_boundary/ibm_triel.hpp
index 429ce876a44..b4cd1c0620b 100644
--- a/src/core/immersed_boundary/ibm_triel.hpp
+++ b/src/core/immersed_boundary/ibm_triel.hpp
@@ -24,8 +24,7 @@
 
 #include <utils/Vector.hpp>
 
-#include <boost/optional.hpp>
-
+#include <optional>
 #include <tuple>
 
 enum class tElasticLaw { NeoHookean, Skalak };
@@ -67,26 +66,26 @@ struct IBMTriel {
    *  The equations can be found in Appendix C of @cite kruger12a.
    *  @return the forces on @p p1, @p p2, @p p3
    */
-  boost::optional<std::tuple<Utils::Vector3d, Utils::Vector3d, Utils::Vector3d>>
+  std::optional<std::tuple<Utils::Vector3d, Utils::Vector3d, Utils::Vector3d>>
   calc_forces(Utils::Vector3d const &vec1, Utils::Vector3d const &vec2) const;
 
 private:
   friend boost::serialization::access;
   template <typename Archive>
   void serialize(Archive &ar, long int /* version */) {
-    ar &l0;
-    ar &lp0;
-    ar &sinPhi0;
-    ar &cosPhi0;
-    ar &area0;
-    ar &a1;
-    ar &a2;
-    ar &b1;
-    ar &b2;
-    ar &maxDist;
-    ar &elasticLaw;
-    ar &k1;
-    ar &k2;
+    ar & l0;
+    ar & lp0;
+    ar & sinPhi0;
+    ar & cosPhi0;
+    ar & area0;
+    ar & a1;
+    ar & a2;
+    ar & b1;
+    ar & b2;
+    ar & maxDist;
+    ar & elasticLaw;
+    ar & k1;
+    ar & k2;
   }
 };
 
diff --git a/src/core/immersed_boundary/ibm_volcons.hpp b/src/core/immersed_boundary/ibm_volcons.hpp
index 4db131865dd..afa992f1f36 100644
--- a/src/core/immersed_boundary/ibm_volcons.hpp
+++ b/src/core/immersed_boundary/ibm_volcons.hpp
@@ -41,9 +41,9 @@ struct IBMVolCons {
   friend boost::serialization::access;
   template <typename Archive>
   void serialize(Archive &ar, long int /* version */) {
-    ar &softID;
-    ar &volRef;
-    ar &kappaV;
+    ar & softID;
+    ar & volRef;
+    ar & kappaV;
   }
 };
 
diff --git a/src/core/integrate.cpp b/src/core/integrate.cpp
index 61b925b2669..dddc9da6621 100644
--- a/src/core/integrate.cpp
+++ b/src/core/integrate.cpp
@@ -294,7 +294,8 @@ void walberla_agrid_sanity_checks(std::string method,
                       << "left waLBerla: [" << lattice_left << "]"
                       << "\nMPI rank " << ::this_node << ": "
                       << "right ESPResSo: [" << geo_right << "], "
-                      << "right waLBerla: [" << lattice_right << "]";
+                      << "right waLBerla: [" << lattice_right << "]"
+                      << "\nfor method: " << method;
     throw std::runtime_error(
         "waLBerla and ESPResSo disagree about domain decomposition.");
   }
@@ -588,7 +589,7 @@ int System::System::integrate(int n_steps, int reuse_forces) {
     if (thermostat->lb and
         (propagation.used_propagations & PropagationMode::TRANS_LB_TRACER)) {
       lb_tracers_add_particle_force_to_fluid(*cell_structure, *box_geo,
-                                             *local_geo, lb, time_step);
+                                             *local_geo, lb);
     }
 #endif
     integrator_step_2(particles, propagation, *thermostat, time_step);
diff --git a/src/core/integrators/steepest_descent.cpp b/src/core/integrators/steepest_descent.cpp
index 0e7b9b3540b..414d9c0b7c0 100644
--- a/src/core/integrators/steepest_descent.cpp
+++ b/src/core/integrators/steepest_descent.cpp
@@ -52,7 +52,7 @@ bool steepest_descent_step(const ParticleRange &particles) {
     auto f = 0.0;
 
     // For all Cartesian coordinates
-    for (unsigned int j = 0; j < 3; j++) {
+    for (auto j = 0u; j < 3u; ++j) {
       // Skip, if coordinate is fixed
       if (!p.is_fixed_along(j)) {
         // Square of force on particle
diff --git a/src/core/io/writer/h5md_core.cpp b/src/core/io/writer/h5md_core.cpp
index 175ae92636c..54c7017f392 100644
--- a/src/core/io/writer/h5md_core.cpp
+++ b/src/core/io/writer/h5md_core.cpp
@@ -415,9 +415,11 @@ void File::write(const ParticleRange &particles, double time, int step,
         [&](auto const &p) { return box_geo.folded_position(p.pos()); });
   }
   if (m_fields & H5MD_OUT_IMG) {
-    write_td_particle_property<3>(prefix, n_part_global, particles,
-                                  datasets["particles/atoms/image/value"],
-                                  [](auto const &p) { return p.image_box(); });
+    write_td_particle_property<3>(
+        prefix, n_part_global, particles,
+        datasets["particles/atoms/image/value"], [&](auto const &p) {
+          return box_geo.folded_image_box(p.pos(), p.image_box());
+        });
   }
   if (m_fields & H5MD_OUT_VEL) {
     write_td_particle_property<3>(prefix, n_part_global, particles,
@@ -446,7 +448,7 @@ void File::write_connectivity(const ParticleRange &particles) {
     auto nbonds_local = static_cast<decltype(bond)::index>(bond.shape()[1]);
     for (auto const b : p.bonds()) {
       auto const partner_ids = b.partner_ids();
-      if (partner_ids.size() == 1) {
+      if (partner_ids.size() == 1u) {
         bond.resize(boost::extents[1][nbonds_local + 1][2]);
         bond[0][nbonds_local][0] = p.id();
         bond[0][nbonds_local][1] = partner_ids[0];
diff --git a/src/core/lb/LBNone.hpp b/src/core/lb/LBNone.hpp
index bf913c53818..61248c5fe56 100644
--- a/src/core/lb/LBNone.hpp
+++ b/src/core/lb/LBNone.hpp
@@ -33,6 +33,7 @@ struct LBNone {
   double get_tau() const { throw NoLBActive{}; }
   double get_kT() const { throw NoLBActive{}; }
   Utils::VectorXd<9> get_pressure_tensor() const { throw NoLBActive{}; }
+  bool is_gpu() const { throw NoLBActive{}; }
   std::optional<Utils::Vector3d> get_velocity_at_pos(Utils::Vector3d const &,
                                                      bool) const {
     throw NoLBActive{};
@@ -41,8 +42,16 @@ struct LBNone {
                                            bool) const {
     throw NoLBActive{};
   }
-  bool add_force_at_pos(Utils::Vector3d const &pos,
-                        Utils::Vector3d const &force) const {
+  bool add_force_at_pos(Utils::Vector3d const &,
+                        Utils::Vector3d const &) const {
+    throw NoLBActive{};
+  }
+  void add_forces_at_pos(std::vector<Utils::Vector3d> const &,
+                         std::vector<Utils::Vector3d> const &) const {
+    throw NoLBActive{};
+  }
+  std::vector<Utils::Vector3d>
+  get_velocities_at_pos(std::vector<Utils::Vector3d> const &) const {
     throw NoLBActive{};
   }
   Utils::Vector3d get_momentum() const { throw NoLBActive{}; }
@@ -52,6 +61,7 @@ struct LBNone {
   void lebc_sanity_checks(unsigned int, unsigned int) const {
     throw NoLBActive{};
   }
+  void veto_boxl_change() const { throw NoLBActive{}; }
   void on_cell_structure_change() const { throw NoLBActive{}; }
   void on_boxl_change() const { throw NoLBActive{}; }
   void on_node_grid_change() const { throw NoLBActive{}; }
diff --git a/src/core/lb/LBWalberla.cpp b/src/core/lb/LBWalberla.cpp
index 8d8995a9274..d1dec131603 100644
--- a/src/core/lb/LBWalberla.cpp
+++ b/src/core/lb/LBWalberla.cpp
@@ -39,6 +39,8 @@
 
 namespace LB {
 
+bool LBWalberla::is_gpu() const { return lb_fluid->is_gpu(); }
+
 double LBWalberla::get_kT() const { return lb_fluid->get_kT(); }
 
 Utils::VectorXd<9> LBWalberla::get_pressure_tensor() const {
@@ -73,6 +75,16 @@ bool LBWalberla::add_force_at_pos(Utils::Vector3d const &pos,
   return lb_fluid->add_force_at_pos(pos, force);
 }
 
+void LBWalberla::add_forces_at_pos(std::vector<Utils::Vector3d> const &pos,
+                                   std::vector<Utils::Vector3d> const &forces) {
+  lb_fluid->add_forces_at_pos(pos, forces); // pure delegation, no scaling here
+}
+
+std::vector<Utils::Vector3d>
+LBWalberla::get_velocities_at_pos(std::vector<Utils::Vector3d> const &pos) {
+  return lb_fluid->get_velocities_at_pos(pos); // pure delegation, no scaling
+}
+
 void LBWalberla::veto_time_step(double time_step) const {
   walberla_tau_sanity_checks("LB", lb_params->get_tau(), time_step);
 }
diff --git a/src/core/lb/LBWalberla.hpp b/src/core/lb/LBWalberla.hpp
index 16e35eb2b9f..14cf5d02f27 100644
--- a/src/core/lb/LBWalberla.hpp
+++ b/src/core/lb/LBWalberla.hpp
@@ -58,6 +58,7 @@ struct LBWalberla {
   auto get_agrid() const { return lb_params->get_agrid(); }
   auto get_lattice_speed() const { return get_agrid() / get_tau(); }
   Utils::VectorXd<9> get_pressure_tensor() const;
+  bool is_gpu() const;
   std::optional<Utils::Vector3d>
   get_velocity_at_pos(Utils::Vector3d const &pos,
                       bool consider_points_in_halo) const;
@@ -66,6 +67,10 @@ struct LBWalberla {
   Utils::Vector3d get_momentum() const;
   bool add_force_at_pos(Utils::Vector3d const &pos,
                         Utils::Vector3d const &force);
+  void add_forces_at_pos(std::vector<Utils::Vector3d> const &pos,
+                         std::vector<Utils::Vector3d> const &forces);
+  std::vector<Utils::Vector3d>
+  get_velocities_at_pos(std::vector<Utils::Vector3d> const &pos);
   void propagate();
   void veto_time_step(double time_step) const;
   void veto_kT(double kT) const;
@@ -74,9 +79,10 @@ struct LBWalberla {
                           unsigned int shear_plane_normal) const;
 
   void on_cell_structure_change() const {}
-  void on_boxl_change() const {
+  void veto_boxl_change() const {
     throw std::runtime_error("MD cell geometry change not supported by LB");
   }
+  void on_boxl_change() const { veto_boxl_change(); }
   void on_node_grid_change() const {
     throw std::runtime_error("MPI topology change not supported by LB");
   }
diff --git a/src/core/lb/Solver.cpp b/src/core/lb/Solver.cpp
index 8951595d829..009f953c578 100644
--- a/src/core/lb/Solver.cpp
+++ b/src/core/lb/Solver.cpp
@@ -63,7 +63,10 @@ static void check_solver(std::unique_ptr<Solver::Implementation> const &ptr) {
 
 bool Solver::is_solver_set() const { return LB::is_solver_set(impl); }
 
-void Solver::reset() { System::get_system().lb.impl->solver = std::nullopt; }
+void Solver::reset() {
+  System::get_system().lb.impl->solver = std::nullopt;
+  m_conv = Conversions{};
+}
 
 void Solver::propagate() {
   check_solver(impl);
@@ -107,6 +110,12 @@ void Solver::on_cell_structure_change() {
   }
 }
 
+void Solver::veto_boxl_change() const {
+  auto const veto = [](auto const &ptr) { ptr->veto_boxl_change(); };
+  if (impl->solver) // nothing to ask when no LB solver is set
+    std::visit(veto, *impl->solver);
+}
+
 void Solver::on_boxl_change() {
   if (impl->solver) {
     std::visit([](auto &ptr) { ptr->on_boxl_change(); }, *impl->solver);
@@ -131,6 +140,11 @@ void Solver::on_temperature_change() {
   }
 }
 
+bool Solver::is_gpu() const {
+  check_solver(impl); // NOTE(review): presumably rejects an unset solver
+  return std::visit([](auto &ptr) { return ptr->is_gpu(); }, *impl->solver);
+}
+
 double Solver::get_agrid() const {
   check_solver(impl);
   return std::visit([](auto &ptr) { return ptr->get_agrid(); }, *impl->solver);
@@ -159,9 +173,8 @@ Solver::get_interpolated_velocity(Utils::Vector3d const &pos) const {
      (Eq. (11) Ahlrichs and Duenweg, JCP 111(17):8225 (1999)) */
   return std::visit(
       [&](auto &ptr) {
-        auto const agrid = ptr->get_agrid();
         auto const &box_geo = *System::get_system().box_geo;
-        auto const lb_pos = box_geo.folded_position(pos) / agrid;
+        auto const lb_pos = box_geo.folded_position(pos) * m_conv.pos_to_lb;
         return ptr->get_velocity_at_pos(lb_pos, false);
       },
       *impl->solver);
@@ -171,9 +184,8 @@ std::optional<double>
 Solver::get_interpolated_density(Utils::Vector3d const &pos) const {
   return std::visit(
       [&](auto &ptr) {
-        auto const agrid = ptr->get_agrid();
         auto const &box_geo = *System::get_system().box_geo;
-        auto const lb_pos = box_geo.folded_position(pos) / agrid;
+        auto const lb_pos = box_geo.folded_position(pos) * m_conv.pos_to_lb;
         return ptr->get_density_at_pos(lb_pos, false);
       },
       *impl->solver);
@@ -183,10 +195,46 @@ Utils::Vector3d
 Solver::get_coupling_interpolated_velocity(Utils::Vector3d const &pos) const {
   return std::visit(
       [&](auto &ptr) {
-        auto const agrid = ptr->get_agrid();
-        auto const res = ptr->get_velocity_at_pos(pos / agrid, true);
+        auto const res = ptr->get_velocity_at_pos(pos * m_conv.pos_to_lb, true);
         assert(res);
-        return *res * (ptr->get_agrid() / ptr->get_tau());
+        return *res * m_conv.vel_to_md;
+      },
+      *impl->solver);
+}
+
+std::vector<Utils::Vector3d> Solver::get_coupling_interpolated_velocities(
+    std::vector<Utils::Vector3d> const &pos) const {
+  // batch variant: scale inputs by pos_to_lb, scale outputs by vel_to_md
+  auto const interpolate = [&](auto &ptr) {
+    std::vector<Utils::Vector3d> lattice_pos;
+    lattice_pos.reserve(pos.size());
+    for (auto const &md_pos : pos) {
+      lattice_pos.emplace_back(md_pos * m_conv.pos_to_lb);
+    }
+    auto velocities = ptr->get_velocities_at_pos(lattice_pos);
+    for (auto &velocity : velocities) {
+      velocity *= m_conv.vel_to_md;
+    }
+    return velocities;
+  };
+  return std::visit(interpolate, *impl->solver);
+}
+
+void Solver::add_forces_at_pos(std::vector<Utils::Vector3d> const &pos,
+                               std::vector<Utils::Vector3d> const &forces) {
+  std::visit(
+      [&](auto &ptr) { // scale both inputs by m_conv, then forward as a batch
+        std::vector<Utils::Vector3d> pos_lb;
+        std::vector<Utils::Vector3d> force_lb;
+        pos_lb.reserve(pos.size());
+        force_lb.reserve(forces.size()); // sized by the vector it mirrors
+        for (auto const &pos_md : pos) {
+          pos_lb.emplace_back(pos_md * m_conv.pos_to_lb);
+        }
+        for (auto const &force_md : forces) {
+          force_lb.emplace_back(force_md * m_conv.force_to_lb);
+        }
+        ptr->add_forces_at_pos(pos_lb, force_lb);
       },
       *impl->solver);
 }
@@ -195,7 +243,8 @@ void Solver::add_force_density(Utils::Vector3d const &pos,
                                Utils::Vector3d const &force_density) {
   std::visit(
       [&](auto &ptr) {
-        if (not ptr->add_force_at_pos(pos / ptr->get_agrid(), force_density)) {
+        if (not ptr->add_force_at_pos(pos * m_conv.pos_to_lb,
+                                      force_density * m_conv.force_to_lb)) {
           throw std::runtime_error("Cannot apply force to LB");
         }
       },
@@ -226,6 +275,9 @@ void Solver::set<LBWalberla>(std::shared_ptr<LBWalberlaBase> lb_fluid,
   auto const &lebc = system.box_geo->lees_edwards_bc();
   lb_fluid->check_lebc(lebc.shear_direction, lebc.shear_plane_normal);
   impl->solver = lb_instance;
+  auto const agrid = lb_instance->get_agrid();
+  auto const tau = lb_instance->get_tau();
+  m_conv = Conversions{1. / agrid, agrid / tau, tau * tau / agrid};
 }
 #endif // WALBERLA
 
diff --git a/src/core/lb/Solver.hpp b/src/core/lb/Solver.hpp
index e6012696d0c..6eefdc27cfe 100644
--- a/src/core/lb/Solver.hpp
+++ b/src/core/lb/Solver.hpp
@@ -34,6 +34,11 @@ namespace LB {
 
 struct Solver : public System::Leaf<Solver> {
   struct Implementation;
+  struct Conversions {
+    double pos_to_lb;   ///< multiplies MD positions; set to 1 / agrid
+    double vel_to_md;   ///< multiplies LB velocities; set to agrid / tau
+    double force_to_lb; ///< multiplies MD forces; set to tau * tau / agrid
+  };
 
   Solver();
 
@@ -90,6 +95,8 @@ struct Solver : public System::Leaf<Solver> {
   void lebc_sanity_checks(unsigned int shear_direction,
                           unsigned int shear_plane_normal) const;
 
+  bool is_gpu() const;
+
   /**
    * @brief Get the LB time step.
    */
@@ -135,12 +142,19 @@ struct Solver : public System::Leaf<Solver> {
   /**
    * @brief Calculate the interpolated fluid velocity in MD units.
    * Special method used only for particle coupling. Uses the LB ghost layer.
+   * Achieved by linear interpolation (eq. 11 in @cite ahlrichs99a).
    * @param pos Position in MD units at which the velocity is to be calculated.
    * @retval interpolated fluid velocity.
    */
   Utils::Vector3d
   get_coupling_interpolated_velocity(Utils::Vector3d const &pos) const;
 
+  std::vector<Utils::Vector3d> get_coupling_interpolated_velocities(
+      std::vector<Utils::Vector3d> const &pos) const;
+
+  void add_forces_at_pos(std::vector<Utils::Vector3d> const &pos,
+                         std::vector<Utils::Vector3d> const &forces);
+
   /**
    * @brief Add a force density to the fluid at the given position.
    * @param pos            Position at which the force density is to be applied.
@@ -154,10 +168,12 @@ struct Solver : public System::Leaf<Solver> {
   void on_cell_structure_change();
   void on_timestep_change();
   void on_temperature_change();
+  void veto_boxl_change() const;
 
 private:
   /** @brief Pointer-to-implementation. */
   std::unique_ptr<Implementation> impl;
+  Conversions m_conv{};
 };
 
 } // namespace LB
diff --git a/src/core/lb/particle_coupling.cpp b/src/core/lb/particle_coupling.cpp
index 48b1afa9f15..f5e49f505a1 100644
--- a/src/core/lb/particle_coupling.cpp
+++ b/src/core/lb/particle_coupling.cpp
@@ -33,7 +33,6 @@
 #include <utils/Vector.hpp>
 
 #include <boost/mpi.hpp>
-#include <boost/serialization/optional.hpp>
 
 #ifdef CALIPER
 #include <caliper/cali.h>
@@ -46,14 +45,6 @@
 #include <stdexcept>
 #include <vector>
 
-void add_md_force(LB::Solver &lb, Utils::Vector3d const &pos,
-                  Utils::Vector3d const &force, double time_step) {
-  /* transform momentum transfer to lattice units
-     (eq. (12) @cite ahlrichs99a) */
-  auto const delta_j = (time_step / lb.get_lattice_speed()) * force;
-  lb.add_force_density(pos, delta_j);
-}
-
 static Thermostat::GammaType lb_handle_particle_anisotropy(Particle const &p,
                                                            double lb_gamma) {
 #ifdef THERMOSTAT_PER_PARTICLE
@@ -69,20 +60,26 @@ static Thermostat::GammaType lb_handle_particle_anisotropy(Particle const &p,
 #endif // THERMOSTAT_PER_PARTICLE
 }
 
-Utils::Vector3d lb_drag_force(LB::Solver const &lb, double lb_gamma,
-                              Particle const &p,
-                              Utils::Vector3d const &shifted_pos,
-                              Utils::Vector3d const &vel_offset) {
-  /* calculate fluid velocity at particle's position
-     this is done by linear interpolation (eq. (11) @cite ahlrichs99a) */
-  auto const v_fluid = lb.get_coupling_interpolated_velocity(shifted_pos);
-  auto const v_drift = v_fluid + vel_offset;
+static Utils::Vector3d lb_drag_force(Particle const &p, double lb_gamma,
+                                     Utils::Vector3d const &v_fluid) {
+#ifdef LB_ELECTROHYDRODYNAMICS
+  auto const v_drift = v_fluid + p.mu_E();
+#else
+  auto const &v_drift = v_fluid;
+#endif
   auto const gamma = lb_handle_particle_anisotropy(p, lb_gamma);
 
   /* calculate viscous force (eq. (9) @cite ahlrichs99a) */
   return Utils::hadamard_product(gamma, v_drift - p.v());
 }
 
+Utils::Vector3d lb_drag_force(LB::Solver const &lb, double lb_gamma,
+                              Particle const &p,
+                              Utils::Vector3d const &shifted_pos) {
+  auto const v_fluid = lb.get_coupling_interpolated_velocity(shifted_pos);
+  return lb_drag_force(p, lb_gamma, v_fluid);
+}
+
 /**
  * @brief Check if a position is within the local box + halo.
  *
@@ -113,6 +110,52 @@ bool in_local_halo(LocalBox const &local_box, Utils::Vector3d const &pos,
   return in_local_domain(local_box, pos, halo);
 }
 
+static void positions_in_halo_impl(Utils::Vector3d const &pos_folded,
+                                   Utils::Vector3d const &halo_lower_corner,
+                                   Utils::Vector3d const &halo_upper_corner,
+                                   BoxGeometry const &box_geo,
+                                   std::vector<Utils::Vector3d> &res) {
+  // Append to res every shifted copy of pos_folded accepted by in_box().
+  // Lees-Edwards: pre-calc positional offset folded into the simulation box
+  double folded_le_offset = 0.;
+  if (box_geo.type() == BoxType::LEES_EDWARDS) {
+    auto const &le = box_geo.lees_edwards_bc();
+    folded_le_offset = Algorithm::periodic_fold(
+        le.pos_offset, box_geo.length()[le.shear_direction]);
+  }
+  // Visit all 27 shifts of -1, 0 or +1 box lengths along each axis.
+  for (int i : {-1, 0, 1}) {
+    for (int j : {-1, 0, 1}) {
+      for (int k : {-1, 0, 1}) {
+        Utils::Vector3d shift{{double(i), double(j), double(k)}};
+
+        // Lees Edwards: folded position incl. LE pos offset
+        // This is needed to ensure that the position from which `pos_shifted`
+        // is calculated below, is always in the primary simulation box.
+        auto with_le_offset = [&](auto pos) {
+          auto const &le = box_geo.lees_edwards_bc();
+          pos[le.shear_direction] = Algorithm::periodic_fold(
+              pos[le.shear_direction] +
+                  shift[le.shear_plane_normal] * folded_le_offset,
+              box_geo.length()[le.shear_direction]);
+          return pos;
+        };
+
+        Utils::Vector3d pos_shifted =
+            (box_geo.type() != BoxType::LEES_EDWARDS) ? // no Lees Edwards
+                pos_folded + Utils::hadamard_product(box_geo.length(), shift)
+                                                      : // Lees Edwards
+                with_le_offset(pos_folded) +
+                    Utils::hadamard_product(box_geo.length(), shift);
+
+        if (in_box(pos_shifted, halo_lower_corner, halo_upper_corner)) {
+          res.emplace_back(pos_shifted);
+        }
+      }
+    }
+  }
+}
+
 /**
  * @brief Return a vector of positions shifted by +,- box length in each
  * coordinate
@@ -125,36 +168,35 @@ std::vector<Utils::Vector3d> positions_in_halo(Utils::Vector3d const &pos,
   auto const halo_vec = Utils::Vector3d::broadcast(halo);
   auto const fully_inside_lower = local_box.my_left() + 2. * halo_vec;
   auto const fully_inside_upper = local_box.my_right() - 2. * halo_vec;
-  if (in_box(pos, fully_inside_lower, fully_inside_upper)) {
-    return {pos};
+  auto const pos_folded = box_geo.folded_position(pos);
+  if (in_box(pos_folded, fully_inside_lower, fully_inside_upper)) {
+    return {pos_folded};
   }
   auto const halo_lower_corner = local_box.my_left() - halo_vec;
   auto const halo_upper_corner = local_box.my_right() + halo_vec;
-
   std::vector<Utils::Vector3d> res;
-  for (int i : {-1, 0, 1}) {
-    for (int j : {-1, 0, 1}) {
-      for (int k : {-1, 0, 1}) {
-        Utils::Vector3d shift{{double(i), double(j), double(k)}};
-        Utils::Vector3d pos_shifted =
-            pos + Utils::hadamard_product(box_geo.length(), shift);
-
-        if (box_geo.type() == BoxType::LEES_EDWARDS) {
-          auto le = box_geo.lees_edwards_bc();
-          auto normal_shift = (pos_shifted - pos)[le.shear_plane_normal];
-          if (normal_shift > std::numeric_limits<double>::epsilon())
-            pos_shifted[le.shear_direction] += le.pos_offset;
-          if (normal_shift < -std::numeric_limits<double>::epsilon())
-            pos_shifted[le.shear_direction] -= le.pos_offset;
-        }
+  positions_in_halo_impl(pos_folded, halo_lower_corner, halo_upper_corner,
+                         box_geo, res);
+  return res;
+}
 
-        if (in_box(pos_shifted, halo_lower_corner, halo_upper_corner)) {
-          res.push_back(pos_shifted);
-        }
-      }
+static auto lees_edwards_vel_shift(Utils::Vector3d const &pos_shifted_by_box_l,
+                                   Utils::Vector3d const &orig_pos,
+                                   BoxGeometry const &box_geo) {
+  Utils::Vector3d vel_shift{{0., 0., 0.}};
+  if (box_geo.type() == BoxType::LEES_EDWARDS) {
+    auto le = box_geo.lees_edwards_bc();
+    auto normal_shift =
+        (pos_shifted_by_box_l - orig_pos)[le.shear_plane_normal];
+    // normal_shift is +,- box_l or 0 up to floating point errors
+    if (normal_shift > std::numeric_limits<double>::epsilon()) {
+      vel_shift[le.shear_direction] -= le.shear_velocity;
+    }
+    if (normal_shift < -std::numeric_limits<double>::epsilon()) {
+      vel_shift[le.shear_direction] += le.shear_velocity;
     }
   }
-  return res;
+  return vel_shift;
 }
 
 namespace LB {
@@ -171,45 +213,121 @@ Utils::Vector3d ParticleCoupling::get_noise_term(Particle const &p) const {
   return m_noise_pref_wo_gamma * Utils::hadamard_product(sqrt(gamma), noise);
 }
 
-void ParticleCoupling::kernel(Particle &p) {
-  auto const agrid = m_lb.get_agrid();
+void ParticleCoupling::kernel(std::vector<Particle *> const &particles) {
+  if (particles.empty()) {
+    return;
+  }
+  enum coupling_modes { none, particle_force, swimmer_force_on_fluid };
+  auto const halo = 0.5 * m_lb.get_agrid();
+  auto const halo_vec = Utils::Vector3d::broadcast(halo);
+  auto const fully_inside_lower = m_local_box.my_left() + 2. * halo_vec;
+  auto const fully_inside_upper = m_local_box.my_right() - 2. * halo_vec;
+  auto const halo_lower_corner = m_local_box.my_left() - halo_vec;
+  auto const halo_upper_corner = m_local_box.my_right() + halo_vec;
+  std::vector<Utils::Vector3d> positions_velocity_coupling;
+  std::vector<Utils::Vector3d> positions_force_coupling;
+  std::vector<Utils::Vector3d> force_coupling_forces;
+  std::vector<uint8_t> positions_force_coupling_counter;
+  std::vector<Particle *> coupled_particles;
+  for (auto ptr : particles) {
+    auto &p = *ptr;
+    auto span_size = 1u;
+    auto const folded_pos = m_box_geo.folded_position(p.pos());
+    if (in_box(folded_pos, fully_inside_lower, fully_inside_upper)) {
+      positions_force_coupling.emplace_back(folded_pos);
+    } else {
+      auto const old_size = positions_force_coupling.size();
+      positions_in_halo_impl(folded_pos, halo_lower_corner, halo_upper_corner,
+                             m_box_geo, positions_force_coupling);
+      auto const new_size = positions_force_coupling.size();
+      span_size = static_cast<uint8_t>(new_size - old_size);
+    }
+    auto coupling_mode = none;
+#ifdef ENGINE
+    if (p.swimming().is_engine_force_on_fluid) {
+      coupling_mode = swimmer_force_on_fluid;
+    }
+#endif
+    if (coupling_mode == none) {
+      for (auto end = positions_force_coupling.end(), it = end - span_size;
+           it != end; ++it) {
+        auto const &pos = *it;
+        if (pos >= halo_lower_corner and pos < halo_upper_corner) {
+          positions_velocity_coupling.emplace_back(pos);
+          coupling_mode = particle_force;
+          break;
+        }
+      }
+    }
+    if (coupling_mode == none) {
+      positions_force_coupling.erase(positions_force_coupling.end() - span_size,
+                                     positions_force_coupling.end());
+    } else {
+      coupled_particles.emplace_back(ptr);
+      positions_force_coupling_counter.emplace_back(span_size);
+    }
+  }
 
-  // Calculate coupling force
-  Utils::Vector3d force_on_particle = {};
-  auto const halo_pos = positions_in_halo(m_box_geo.folded_position(p.pos()),
-                                          m_box_geo, m_local_box, agrid);
+  if (coupled_particles.empty()) {
+    return;
+  }
+  auto interpolated_velocities =
+      m_lb.get_coupling_interpolated_velocities(positions_velocity_coupling);
 
+  auto const &domain_lower_corner = m_local_box.my_left();
+  auto const &domain_upper_corner = m_local_box.my_right();
+  auto it_interpolated_velocities = interpolated_velocities.begin();
+  auto it_positions_force_coupling = positions_force_coupling.begin();
+  auto it_positions_velocity_coupling = positions_velocity_coupling.begin();
+  auto it_positions_force_coupling_counter =
+      positions_force_coupling_counter.begin();
+  for (auto ptr : coupled_particles) {
+    auto &p = *ptr;
+    auto coupling_mode = particle_force;
 #ifdef ENGINE
-  if (not p.swimming().is_engine_force_on_fluid)
+    if (p.swimming().is_engine_force_on_fluid) {
+      coupling_mode = swimmer_force_on_fluid;
+    }
 #endif
-    for (auto const &pos : halo_pos) {
-      if (in_local_halo(m_local_box, pos, agrid)) {
-        auto const vel_offset = lb_drift_velocity_offset(p);
-        auto const drag_force =
-            lb_drag_force(m_lb, m_thermostat.gamma, p, pos, vel_offset);
-        auto const random_force = get_noise_term(p);
-        force_on_particle = drag_force + random_force;
-        break;
+    Utils::Vector3d force_on_particle = {};
+    if (coupling_mode == particle_force) {
+      auto &v_fluid = *it_interpolated_velocities;
+      if (m_box_geo.type() == BoxType::LEES_EDWARDS) {
+        // Account for the case where the interpolated velocity has been read
+        // from a ghost of the particle across the LE boundary (or vice versa)
+        // Then the particle velocity is shifted by +,- the LE shear velocity
+        auto const vel_correction = lees_edwards_vel_shift(
+            *it_positions_velocity_coupling, p.pos(), m_box_geo);
+        v_fluid += vel_correction;
       }
+      auto const drag_force = lb_drag_force(p, m_thermostat.gamma, v_fluid);
+      auto const random_force = get_noise_term(p);
+      force_on_particle = drag_force + random_force;
+      ++it_interpolated_velocities;
+      ++it_positions_velocity_coupling;
     }
 
-  auto force_on_fluid = -force_on_particle;
+    auto force_on_fluid = -force_on_particle;
 #ifdef ENGINE
-  if (p.swimming().is_engine_force_on_fluid) {
-    force_on_fluid = p.calc_director() * p.swimming().f_swim;
-  }
+    if (coupling_mode == swimmer_force_on_fluid) {
+      force_on_fluid = p.calc_director() * p.swimming().f_swim;
+    }
 #endif
 
-  // couple positions including shifts by one box length to add
-  // forces to ghost layers
-  for (auto const &pos : halo_pos) {
-    if (in_local_domain(m_local_box, pos)) {
-      /* Particle is in our LB volume, so this node
-       * is responsible to adding its force */
-      p.force() += force_on_particle;
+    auto const span_size = *it_positions_force_coupling_counter;
+    ++it_positions_force_coupling_counter;
+    for (uint8_t i{0u}; i < span_size; ++i) {
+      auto &pos = *it_positions_force_coupling;
+      if (pos >= domain_lower_corner and pos < domain_upper_corner) {
+        /* Particle is in our LB volume, so this node
+         * is responsible for adding its force */
+        p.force() += force_on_particle;
+      }
+      force_coupling_forces.emplace_back(force_on_fluid);
+      ++it_positions_force_coupling;
     }
-    add_md_force(m_lb, pos, force_on_fluid, m_time_step);
   }
+  m_lb.add_forces_at_pos(positions_force_coupling, force_coupling_forces);
 }
 
 #if defined(THERMOSTAT_PER_PARTICLE) and defined(PARTICLE_ANISOTROPY)
@@ -228,7 +346,7 @@ static void lb_coupling_sanity_checks(Particle const &p) {
 
 } // namespace LB
 
-void System::System::lb_couple_particles(double time_step) {
+void System::System::lb_couple_particles() {
 #ifdef CALIPER
   CALI_CXX_MARK_FUNCTION;
 #endif
@@ -240,18 +358,19 @@ void System::System::lb_couple_particles(double time_step) {
     }
     auto const real_particles = cell_structure->local_particles();
     auto const ghost_particles = cell_structure->ghost_particles();
-    LB::ParticleCoupling coupling{*thermostat->lb, lb, *box_geo, *local_geo,
-                                  time_step};
+    LB::ParticleCoupling coupling{*thermostat->lb, lb, *box_geo, *local_geo};
     LB::CouplingBookkeeping bookkeeping{*cell_structure};
+    std::vector<Particle *> particles{};
     for (auto const *particle_range : {&real_particles, &ghost_particles}) {
       for (auto &p : *particle_range) {
         if (not LB::is_tracer(p) and bookkeeping.should_be_coupled(p)) {
 #if defined(THERMOSTAT_PER_PARTICLE) and defined(PARTICLE_ANISOTROPY)
           LB::lb_coupling_sanity_checks(p);
 #endif
-          coupling.kernel(p);
+          particles.emplace_back(&p);
         }
       }
     }
+    coupling.kernel(particles);
   }
 }
diff --git a/src/core/lb/particle_coupling.hpp b/src/core/lb/particle_coupling.hpp
index d5e7cdf3a5b..4bb61b50e5c 100644
--- a/src/core/lb/particle_coupling.hpp
+++ b/src/core/lb/particle_coupling.hpp
@@ -26,7 +26,6 @@
 #include "PropagationMode.hpp"
 #include "cell_system/CellStructure.hpp"
 #include "lb/Solver.hpp"
-#include "system/System.hpp"
 #include "thermostat.hpp"
 
 #include <utils/Vector.hpp>
@@ -44,39 +43,26 @@
 bool in_local_halo(LocalBox const &local_box, Utils::Vector3d const &pos,
                    double agrid);
 
-/**
- * @brief Add a force to the lattice force density.
- * @param lb Hydrodynamics solver
- * @param pos Position of the force in LB units.
- * @param force Force in MD units.
- * @param time_step MD time step.
- */
-void add_md_force(LB::Solver &lb, Utils::Vector3d const &pos,
-                  Utils::Vector3d const &force, double time_step);
-
 // internal function exposed for unit testing
 std::vector<Utils::Vector3d> positions_in_halo(Utils::Vector3d const &pos,
-                                               BoxGeometry const &box,
-                                               LocalBox const &local_geo,
+                                               BoxGeometry const &box_geo,
+                                               LocalBox const &local_box,
                                                double agrid);
 
 /** @brief Calculate drag force on a single particle.
  *
- *  See section II.C. @cite ahlrichs99a
+ *  See section II.C. and eq. 9 in @cite ahlrichs99a.
  *
  *  @param[in] lb          The coupled fluid
  *  @param[in] lb_gamma    The friction coefficient
  *  @param[in] p           The coupled particle
- *  @param[in] shifted_pos The particle position in LB units with optional shift
- *  @param[in] vel_offset  Velocity offset in MD units to be added to
- *                         interpolated LB velocity before calculating the force
+ *  @param[in] shifted_pos The particle position in MD units with optional shift
  *
  *  @return The viscous coupling force
  */
 Utils::Vector3d lb_drag_force(LB::Solver const &lb, double lb_gamma,
                               Particle const &p,
-                              Utils::Vector3d const &shifted_pos,
-                              Utils::Vector3d const &vel_offset);
+                              Utils::Vector3d const &shifted_pos);
 
 namespace LB {
 
@@ -85,16 +71,14 @@ class ParticleCoupling {
   LB::Solver &m_lb;
   BoxGeometry const &m_box_geo;
   LocalBox const &m_local_box;
-  double m_time_step;
   double m_noise_pref_wo_gamma;
   bool m_thermalized;
 
 public:
   ParticleCoupling(LBThermostat const &thermostat, LB::Solver &lb,
-                   BoxGeometry const &box_geo, LocalBox const &local_box,
-                   double time_step)
+                   BoxGeometry const &box_geo, LocalBox const &local_box)
       : m_thermostat{thermostat}, m_lb{lb}, m_box_geo{box_geo},
-        m_local_box{local_box}, m_time_step{time_step} {
+        m_local_box{local_box} {
     /* Eq. (16) @cite ahlrichs99a, without the gamma term.
      * The factor 12 comes from the fact that we use random numbers
      * from -0.5 to 0.5 (equally distributed) which have variance 1/12.
@@ -103,23 +87,11 @@ class ParticleCoupling {
     auto constexpr variance_inv = 12.;
     auto const kT = lb.get_kT() * Utils::sqr(lb.get_lattice_speed());
     m_thermalized = (kT != 0.);
-    m_noise_pref_wo_gamma = std::sqrt(variance_inv * 2. * kT / time_step);
+    m_noise_pref_wo_gamma = std::sqrt(variance_inv * 2. * kT / lb.get_tau());
   }
 
   Utils::Vector3d get_noise_term(Particle const &p) const;
-  void kernel(Particle &p);
-
-  /**
-   * @brief Calculate particle drift velocity offset due to ENGINE and
-   * ELECTROHYDRODYNAMICS.
-   */
-  auto lb_drift_velocity_offset(Particle const &p) const {
-    Utils::Vector3d vel_offset{};
-#ifdef LB_ELECTROHYDRODYNAMICS
-    vel_offset += p.mu_E();
-#endif
-    return vel_offset;
-  }
+  void kernel(std::vector<Particle *> const &particles);
 };
 
 /**
diff --git a/src/core/lees_edwards/LeesEdwardsBC.hpp b/src/core/lees_edwards/LeesEdwardsBC.hpp
index 916eb31e491..1e4a935fe27 100644
--- a/src/core/lees_edwards/LeesEdwardsBC.hpp
+++ b/src/core/lees_edwards/LeesEdwardsBC.hpp
@@ -16,8 +16,8 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef CORE_LEES_EDWARDS_LEES_EDWARDS_BC_HPP
-#define CORE_LEES_EDWARDS_LEES_EDWARDS_BC_HPP
+
+#pragma once
 
 #include <utils/Vector.hpp>
 
@@ -32,7 +32,7 @@ struct LeesEdwardsBC {
   unsigned int shear_direction = invalid_dir;
   unsigned int shear_plane_normal = invalid_dir;
   Utils::Vector3d distance(Utils::Vector3d const &d, Utils::Vector3d const &l,
-                           Utils::Vector3d const &hal_l,
+                           Utils::Vector3d const &,
                            Utils::Vector3d const &l_inv,
                            std::bitset<3> const periodic) const {
 
@@ -58,5 +58,3 @@ struct LeesEdwardsBC {
     return res;
   }
 };
-
-#endif
diff --git a/src/core/lees_edwards/protocols.hpp b/src/core/lees_edwards/protocols.hpp
index bb45fd8084f..615558b1f87 100644
--- a/src/core/lees_edwards/protocols.hpp
+++ b/src/core/lees_edwards/protocols.hpp
@@ -30,8 +30,8 @@ namespace LeesEdwards {
 
 /** Lees-Edwards protocol for un-altered periodic boundary conditions */
 struct Off {
-  double shear_velocity(double time) const { return 0.; }
-  double pos_offset(double time) const { return 0.; }
+  double shear_velocity(double) const { return 0.; }
+  double pos_offset(double) const { return 0.; }
 };
 
 /** Lees-Edwards protocol for linear shearing */
@@ -39,9 +39,9 @@ struct LinearShear {
   LinearShear()
       : m_initial_pos_offset{0.}, m_shear_velocity{0.}, m_time_0{0.} {}
   LinearShear(double initial_offset, double shear_velocity, double time_0)
-      : m_initial_pos_offset{initial_offset},
-        m_shear_velocity{shear_velocity}, m_time_0{time_0} {}
-  double shear_velocity(double time) const { return m_shear_velocity; }
+      : m_initial_pos_offset{initial_offset}, m_shear_velocity{shear_velocity},
+        m_time_0{time_0} {}
+  double shear_velocity(double) const { return m_shear_velocity; }
   double pos_offset(double time) const {
     return m_initial_pos_offset + (time - m_time_0) * m_shear_velocity;
   }
@@ -53,18 +53,22 @@ struct LinearShear {
 /** Lees-Edwards protocol for oscillatory shearing */
 struct OscillatoryShear {
   OscillatoryShear()
-      : m_initial_pos_offset{0.}, m_amplitude{0.}, m_omega{0.}, m_time_0{0.}, m_decay_rate{0.} {}
+      : m_initial_pos_offset{0.}, m_amplitude{0.}, m_omega{0.}, m_time_0{0.},
+        m_decay_rate{0.} {}
   OscillatoryShear(double initial_offset, double amplitude, double omega,
-                   double time_0, double decay_rate)
-      : m_initial_pos_offset{initial_offset},
-        m_amplitude{amplitude}, m_omega{omega}, m_time_0{time_0}, m_decay_rate{decay_rate} {}
+                   double time_0)
+      : m_initial_pos_offset{initial_offset}, m_amplitude{amplitude},
+        m_omega{omega}, m_time_0{time_0} {}
   double pos_offset(double time) const {
-    return m_initial_pos_offset +
-           m_amplitude *exp(-(time-m_time_0)*m_decay_rate) * std::sin(m_omega * (time - m_time_0));
+    return m_initial_pos_offset + m_amplitude *
+                                      exp(-(time - m_time_0) * m_decay_rate) *
+                                      std::sin(m_omega * (time - m_time_0));
   }
   double shear_velocity(double time) const {
-    return -m_decay_rate*exp(-(time-m_time_0)*m_decay_rate) * m_amplitude * std::sin(m_omega * (time - m_time_0)) +\
-           exp(-(time-m_time_0)*m_decay_rate)* m_omega *m_amplitude *std::cos(m_omega*(time-m_time_0));
+    return -m_decay_rate * exp(-(time - m_time_0) * m_decay_rate) *
+               m_amplitude * std::sin(m_omega * (time - m_time_0)) +
+           exp(-(time - m_time_0) * m_decay_rate) * m_omega * m_amplitude *
+               std::cos(m_omega * (time - m_time_0));
   }
   double m_initial_pos_offset;
   double m_amplitude;
diff --git a/src/core/magnetostatics/CMakeLists.txt b/src/core/magnetostatics/CMakeLists.txt
index 8abb34a93db..2d1e828a1e0 100644
--- a/src/core/magnetostatics/CMakeLists.txt
+++ b/src/core/magnetostatics/CMakeLists.txt
@@ -19,7 +19,6 @@
 target_sources(
   espresso_core
   PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/dipoles.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/barnes_hut_gpu.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/dipolar_direct_sum.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/dipolar_direct_sum_gpu.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/dlc.cpp
diff --git a/src/core/magnetostatics/barnes_hut_gpu.cpp b/src/core/magnetostatics/barnes_hut_gpu.cpp
deleted file mode 100644
index 7fc44a5ab6e..00000000000
--- a/src/core/magnetostatics/barnes_hut_gpu.cpp
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (C) 2016-2022 The ESPResSo project
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "config/config.hpp"
-
-#ifdef DIPOLAR_BARNES_HUT
-
-#include "magnetostatics/barnes_hut_gpu.hpp"
-#include "magnetostatics/barnes_hut_gpu_cuda.cuh"
-
-#include "communication.hpp"
-#include "cuda/utils.hpp"
-#include "errorhandling.hpp"
-#include "system/GpuParticleData.hpp"
-#include "system/System.hpp"
-
-DipolarBarnesHutGpu::DipolarBarnesHutGpu(double prefactor, double epssq,
-                                         double itolsq) {
-  set_prefactor(prefactor);
-  m_epssq = epssq;
-  m_itolsq = itolsq;
-  if (m_itolsq <= 0.) {
-    throw std::domain_error("Parameter 'itolsq' must be > 0");
-  }
-  if (m_epssq <= 0.) {
-    throw std::domain_error("Parameter 'epssq' must be > 0");
-  }
-}
-
-void DipolarBarnesHutGpu::on_activation() const {
-  auto &gpu_particle_data = get_system().gpu;
-  gpu_particle_data.enable_property(GpuParticleData::prop::force);
-  gpu_particle_data.enable_property(GpuParticleData::prop::torque);
-  gpu_particle_data.enable_property(GpuParticleData::prop::pos);
-  gpu_particle_data.enable_property(GpuParticleData::prop::dip);
-  if (this_node == 0) {
-    setBHPrecision(static_cast<float>(m_epssq), static_cast<float>(m_itolsq));
-  }
-}
-
-template <class... Args, class... ArgRef>
-int call_kernel(void (*fp)(Args...), ArgRef &&...args) {
-  int error_code = ES_ERROR;
-  try {
-    fp(args...);
-    error_code = ES_OK;
-  } catch (std::runtime_error const &err) {
-    runtimeErrorMsg() << "DipolarBarnesHutGpu: " << err.what();
-  }
-  return error_code;
-}
-
-int DipolarBarnesHutGpu::initialize_data_structure() {
-  auto &gpu = get_system().gpu;
-  auto const n_part = static_cast<int>(gpu.n_particles());
-  auto const error_code = call_kernel(allocBHmemCopy, n_part, &m_bh_data);
-
-  if (error_code == ES_OK) {
-    auto const positions_device = gpu.get_particle_positions_device();
-    auto const dipoles_device = gpu.get_particle_dipoles_device();
-    fill_bh_data(positions_device, dipoles_device, &m_bh_data);
-    initBHgpu(m_bh_data.blocks);
-    buildBoxBH(m_bh_data.blocks);
-    buildTreeBH(m_bh_data.blocks);
-    summarizeBH(m_bh_data.blocks);
-    sortBH(m_bh_data.blocks);
-  }
-
-  return error_code;
-}
-
-void DipolarBarnesHutGpu::add_long_range_forces() {
-  auto &gpu_particle_data = get_system().gpu;
-  gpu_particle_data.update();
-  if (this_node == 0) {
-    if (initialize_data_structure() == ES_OK) {
-      auto forces_device = gpu_particle_data.get_particle_forces_device();
-      auto torques_device = gpu_particle_data.get_particle_torques_device();
-      call_kernel(forceBH, &m_bh_data, static_cast<float>(prefactor),
-                  forces_device, torques_device);
-    }
-  }
-}
-
-void DipolarBarnesHutGpu::long_range_energy() {
-  auto &gpu_particle_data = get_system().gpu;
-  gpu_particle_data.update();
-  if (this_node == 0) {
-    if (initialize_data_structure() == ES_OK) {
-      auto energy = &(gpu_particle_data.get_energy_device()->dipolar);
-      call_kernel(energyBH, &m_bh_data, static_cast<float>(prefactor), energy);
-    }
-  }
-}
-
-#endif // DIPOLAR_BARNES_HUT
diff --git a/src/core/magnetostatics/barnes_hut_gpu.hpp b/src/core/magnetostatics/barnes_hut_gpu.hpp
deleted file mode 100644
index 0e68a210242..00000000000
--- a/src/core/magnetostatics/barnes_hut_gpu.hpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (C) 2016-2022 The ESPResSo project
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "config/config.hpp"
-
-#ifdef DIPOLAR_BARNES_HUT
-
-#include "magnetostatics/actor.hpp"
-#include "magnetostatics/barnes_hut_gpu_cuda.cuh"
-
-struct DipolarBarnesHutGpu : public Dipoles::Actor<DipolarBarnesHutGpu> {
-  double m_epssq;
-  double m_itolsq;
-  DipolarBarnesHutGpu(double prefactor, double epssq, double itolsq);
-  ~DipolarBarnesHutGpu() { deallocBH(&m_bh_data); }
-
-  void on_activation() const;
-  void on_boxl_change() const {}
-  void on_node_grid_change() const {}
-  void on_periodicity_change() const {}
-  void on_cell_structure_change() const {}
-  void init() const {}
-  void sanity_checks() const {}
-
-  void add_long_range_forces();
-  void long_range_energy();
-
-private:
-  /// Container for pointers to device memory.
-  BHData m_bh_data = {0,       0,       0,       nullptr, nullptr,
-                      nullptr, nullptr, nullptr, nullptr, nullptr,
-                      nullptr, nullptr, nullptr, nullptr};
-  int initialize_data_structure();
-};
-
-#endif // DIPOLAR_BARNES_HUT
diff --git a/src/core/magnetostatics/barnes_hut_gpu_cuda.cu b/src/core/magnetostatics/barnes_hut_gpu_cuda.cu
deleted file mode 100644
index 879d8f60df0..00000000000
--- a/src/core/magnetostatics/barnes_hut_gpu_cuda.cu
+++ /dev/null
@@ -1,1266 +0,0 @@
-/*
- * Copyright (C) 2016-2022 The ESPResSo project
- * Copyright (C) 2012 Alexander (Polyakov) Peletskyi
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/** @file
- *  The method is based on @cite burtscher11a.
- */
-
-#include "config/config.hpp"
-
-#ifdef DIPOLAR_BARNES_HUT
-
-#include "magnetostatics/barnes_hut_gpu_cuda.cuh"
-
-#include "cuda/init.hpp"
-#include "cuda/utils.cuh"
-
-#include <thrust/device_ptr.h>
-#include <thrust/reduce.h>
-
-#include <cuda.h>
-
-#include <algorithm>
-#include <cstdio>
-#include <stdexcept>
-
-// Method performance/accuracy parameters
-__constant__ float epssqd[1], itolsqd[1];
-// blkcntd is a factual blocks' count.
-// bottomd is a bottom Barnes-Hut node (the division octant cell) in a linear
-// array representation. maxdepthd is a largest length of the octree "branch"
-// till the "leaf".
-__device__ volatile int bottomd, maxdepthd, blkcntd;
-// half edge of the BH box
-__device__ volatile float radiusd;
-// the struct containing all the device pointers
-__device__ __constant__ volatile BHData bhpara[1];
-
-// The "half-convolution" multi-thread reduction.
-// The thread with a lower index will operate longer and
-// final result (here: the sum) is flowing down towards zero thread.
-__device__ void dds_sumReduction_BH(float *input, float *sum) {
-  auto tid = static_cast<int>(threadIdx.x);
-  for (auto i = static_cast<int>(blockDim.x); i > 1; i /= 2) {
-    __syncthreads();
-    if (tid < i / 2)
-      input[tid] += input[i / 2 + tid];
-    if ((i % 2 == 1) && (tid == 0))
-      input[tid] += input[i - 1];
-  }
-  __syncthreads();
-  if (threadIdx.x == 0) {
-    sum[0] = input[0];
-  }
-}
-
-/******************************************************************************/
-/*** initialize memory ********************************************************/
-/******************************************************************************/
-
-__global__ void initializationKernel() {
-  auto const ind = blockDim.x * blockIdx.x + threadIdx.x;
-  if (ind == 0) {
-    *bhpara->err = 0;
-    *bhpara->max_lps = 0;
-    maxdepthd = 1;
-    blkcntd = 0;
-  }
-}
-
-/******************************************************************************/
-/*** compute center and radius ************************************************/
-/******************************************************************************/
-
-__global__ __launch_bounds__(THREADS1, FACTOR1) void boundingBoxKernel() {
-  // min/max positions per the thread:
-  float minp[3], maxp[3];
-  // min/max positions per block:
-  __shared__ float smin[3 * THREADS1], smax[3 * THREADS1];
-  for (int l = 0; l < 3; l++) {
-    minp[l] = maxp[l] = bhpara->r[l];
-  }
-
-  // Scan all bodies.
-  // In order to iterate over all bodies assigned to thread,
-  // it is necessary to step over all threads in the GPU:
-  // inc = [number of blocks: gridDim.x] * [THREADS1 per block within given
-  // kernel]. Hence, this approach could handle an infinite number of bodies
-  // (particles)
-  auto i = static_cast<int>(threadIdx.x);
-  int const inc = THREADS1 * static_cast<int>(gridDim.x);
-  // j is an absolute index of the particle.
-  // It is shifted over a count of the passed block threads behind: blockIdx.x *
-  // THREADS1. NOTE: this loop is extrema search among all particles of the
-  // given thread in the present block. However, one is not among all threads of
-  // this block.
-  for (auto j = i + static_cast<int>(blockIdx.x) * THREADS1;
-       j < bhpara->nbodies; j += inc)
-    for (int l = 0; l < 3; l++) {
-      auto const val = bhpara->r[3 * j + l];
-      minp[l] = min(minp[l], val);
-      maxp[l] = max(maxp[l], val);
-    }
-
-  // For a start point of a reduction in the given block shared memory
-  // of the i-th thread extrema:
-  for (int l = 0; l < 3; l++) {
-    smin[3 * i + l] = minp[l];
-    smax[3 * i + l] = maxp[l];
-  }
-
-  // Now it's time to (min)maximize among all threads of the given block.
-  // Each mim/max operation will be applied between
-  // the shared memory smin/smax and the given thread.
-  // The "half-convolution" multi-thread reduction
-  // the thread with a lower index will operate longer and
-  // final result (here: the shared memory extrema)
-  // is flowing down towards zero thread.
-  for (int t = THREADS1 / 2; t > 0; t /= 2) {
-    __syncthreads();
-    if (i < t) {
-      auto k = i + t;
-      for (int l = 0; l < 3; l++) {
-        smin[3 * i + l] = minp[l] = min(minp[l], smin[3 * k + l]);
-        smax[3 * i + l] = maxp[l] = max(maxp[l], smax[3 * k + l]);
-        // very last minp/maxp assignment will be made by zero thread (i == 0)
-      }
-    }
-  }
-
-  // Thread i == 0 is responsible for a writing
-  // of a given block result into the global memory
-  // and other per-block operations.
-  if (i == 0) {
-    // per k-th block
-    auto k = static_cast<int>(blockIdx.x);
-    for (int l = 0; l < 3; l++) {
-      // global memory storage of the per-block extrema
-      bhpara->minp[3 * k + l] = minp[l];
-      bhpara->maxp[3 * k + l] = maxp[l];
-      // note, that we are in zero thread and its variables minp/maxp
-      // contain de facto already reduced (see above) shared extrema smin/smax
-    }
-
-    auto const n_blocks = static_cast<int>(gridDim.x) - 1;
-    // The block increment is performing by its zero thread.
-    if (n_blocks == atomicInc((unsigned int *)&blkcntd, n_blocks)) {
-      // I'm the (randomly) last block, so combine all other blocks' results
-      // over the index j:
-      for (int im = 0; im <= n_blocks; im++)
-        for (int l = 0; l < 3; l++) {
-          minp[l] = min(minp[l], bhpara->minp[3 * im + l]);
-          maxp[l] = max(maxp[l], bhpara->maxp[3 * im + l]);
-        }
-
-      // Compute 'radius':
-      auto const val = max(maxp[0] - minp[0], maxp[1] - minp[1]);
-      radiusd = max(val, maxp[2] - minp[2]) * 0.5f;
-
-      // NOTE: now the extrema are global.
-      // Present code fragment will be executed once: in zero thread of the last
-      // block.
-
-      k = bhpara->nnodes;
-      // Create the root node of the Barnes-Hut octree.
-      // Bottom node is defined with max possible index just to start
-      // It will be updated within further tree building in
-      // corresponding kernel.
-      bottomd = k;
-      // Weight of the root node init.
-      bhpara->mass[k] = -1.0f;
-      // Sorting init for the tree root.
-      bhpara->start[k] = 0;
-      // Position of the root node should be in the center of just defined BH
-      // box:
-      for (int l = 0; l < 3; l++)
-        bhpara->r[3 * k + l] = (minp[l] + maxp[l]) * 0.5f;
-      // Init further tree building octo- meaning their absence at the
-      // beginning:
-      for (i = 0; i < 8; i++)
-        bhpara->child[8 * k + i] = -1;
-    }
-  }
-}
-
-/******************************************************************************/
-/*** build tree ***************************************************************/
-/******************************************************************************/
-
-__global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() {
-  int j, depth;
-  float r;
-  float pos[3];
-  float p[3];
-  int n;
-  float root[3];
-
-  // Radius is determined in boundingBoxKernel
-  auto const radius = radiusd;
-  // The root node has been created at the end of the boundingBoxKernel.
-  // Cache the root data:
-  for (int l = 0; l < 3; l++)
-    root[l] = bhpara->r[3 * bhpara->nnodes + l];
-  // Maximum tree depth within the given thread.
-  int localmaxdepth = 1;
-  // Skip the branch following and start from the root.
-  int skip = 1;
-  // Number of loops for the threads sync algorithm
-  int lps = 0;
-  // Increment to move among the bodies assigned to the given thread.
-  // Hence, one should step over all other threads in GPU with
-  // a quantity of blockDim.x * gridDim.x.
-  auto const inc = static_cast<int>(blockDim.x * gridDim.x);
-  // Just a regular 1D GPU index
-  auto i = static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
-
-  // Iterate over all bodies assigned to thread.
-  while (i < bhpara->nbodies) {
-    if (skip != 0) {
-      // New body, so start traversing at root. Skip it further.
-      skip = 0;
-      // Particle position corresponding to the given thread and block:
-      for (int l = 0; l < 3; l++)
-        p[l] = bhpara->r[3 * i + l];
-      // Let's start a moving via the tree from the root node 8 * nbodiesd:
-      n = bhpara->nnodes;
-      depth = 1;
-      r = radius;
-      // Determine which child to follow.
-      // j=0..7 determines the octant in a binary representations
-      j = 0;
-      for (int l = 0; l < 3; l++)
-        if (root[l] < p[l])
-          j += static_cast<int>(pow(2, l));
-    }
-
-    // Follow path to leaf cell. Should not happen at the first iteration of
-    // this loop.
-    int ch = bhpara->child[n * 8 + j];
-
-    // Global memory writing related threads sync
-    if (lps++ > THREADS2) {
-      *bhpara->max_lps = lps;
-    }
-    //.. now wait for global memory updates. This impacts on race conditions and
-    // frameworks level optimizations. Further kernels contain a similar code
-    // fragment.
-    __threadfence();
-
-    // The child with the index higher than nbodiesd (number of particles) means
-    // that it is an octant cell, not a body.
-    // Actually, we need nnodesd == 8 * nbodiesd nodes for the cells storage.
-    // Let's iterate this "while" loop before we will reach the particle
-    // or an absence of a child:
-    while (ch >= bhpara->nbodies) {
-      n = ch;
-      depth++;
-      // Going down through the octree depth: radius split corresponds to the
-      // cube split. Corresponding octants are cells, not bodies. Smaller radius
-      // will be used until the next skip == 1
-      r *= 0.5f;
-      j = 0;
-      // Determine which child octant to follow based on body coordinates.
-      // j=0..7 determines the octant in a binary representations.
-      for (int l = 0; l < 3; l++)
-        if (bhpara->r[3 * n + l] < p[l])
-          j += static_cast<int>(pow(2, l));
-      ch = bhpara->child[n * 8 + j];
-    }
-    // Now we are deep enough in the tree, passed all levels of cells and
-    // reached the body (particle).
-    if (ch != -2) { // Skip if child pointer is locked (-2) and try again later.
-      int const locked = n * 8 + j;
-      // Try to lock and iterate towards next body:
-      if (ch == atomicCAS((int *)&bhpara->child[locked], ch, -2)) {
-        // If we are here then child[locked] was equal to "ch" and now is
-        // assigned to -2, it is locked. We will not came here in a case if this
-        // child will be already locked by other threads because other particle
-        // could already do it in other thread. Several particles simultaneously
-        // could force a new octants' split. In this case we will not came here
-        // and "i += inc" will be not executed below. Hence, the present loop
-        // "while (i < nbodiesd)" will mandatory repeat in the given thread for
-        // the given i-th particle until other threads will unlock this cell for
-        // either a body insertion and/or new octant level cells creation.
-        if (ch == -2) {
-          // Cannot be here..
-          *bhpara->err = 1;
-          break;
-        }
-        if (ch == -1) {
-          // If -1 (i.e. no child index) then just insert a new body
-          bhpara->child[locked] = i;
-        } else {
-          int patch = -1;
-          // There already is a body and/or cell(s) in this position.
-          // We should start from a loop of the deepest child cell
-          // determination. Then, we need to create new cells and to distribute
-          // existing and new bodies between these new cells.
-          do {
-            // If we are here then the tree bottom level is moving further:
-            // Note, that the bottomd is not a global tree depth.
-            // It is rather a tree size in 1D representation of nnodesd == 8 *
-            // nbodiesd nodes. These lists will be correctly handled by the
-            // sortKernel later.
-            depth++;
-            int const cell = atomicSub((int *)&bottomd, 1) - 1;
-            if (cell <= bhpara->nbodies) {
-              // This should not happen. A cell cannot have such index. Error.
-              *bhpara->err = 1;
-              bottomd = bhpara->nnodes;
-            }
-            // The "patch" is saving the information about a first cell created
-            // in the current thread before it continues to dive into the tree
-            // depth. Hence, the "patch" defines the new branch inception index.
-            patch = max(patch, cell);
-
-            // Note: "j" is defined by the body position against the parent cell
-            // within above "while (ch >= nbodiesd)" loop.
-            // The new cell will be placed below relatively the center of
-            // corresponding j-th octant:
-            for (int l = 0; l < 3; l++)
-              pos[l] = static_cast<float>((j >> l) & 1) * r;
-            // Note, that negative octants correspond to pos[l] == 0 and
-            // positive octants correspond to pos[l] == r.
-
-            // Going down through the octree depth: radius split corresponds to
-            // the cube split. Corresponding octants are cells, not bodies.
-            // Smaller radius will be used until the next skip == 1
-            r *= 0.5f;
-
-            // Init the node weight coefficients.
-            // Note: particles has mass=1.0 is defined in allocBHmemCopy().
-            bhpara->mass[cell] = -1.0f;
-
-            // The startd array is crucial for the sortKernel.
-            // The original root node already has startd = 0.
-            // Here, we assign -1 to the cells in contrast to bodies.
-            // Bodies do not need this array. They need only array sortd,
-            // which will be defined in the sortKernel for a usage by the force
-            // and energy calculation kernels.
-            bhpara->start[cell] = -1;
-
-            // Now, let's save the cell coordinates locally (pos[l]) and
-            // globally (xd[3 * cell + l]). This location should be shifted from
-            // the octant center defined above (pos[l] before this assignment).
-            // Parent cell coordinates bhpara->r[3 * n + l] will be added.
-            // Parent radius now is equal to 2 * r, where "r" is already updated
-            // above: r *= 0.5f. Hence, the negative octant is defined above by
-            // pos[l] == 0 and positive - by pos[l] == 2 * r. In order to
-            // transform these coordinates into relative octant positions, we
-            // need to add -r to obtain -r and r for negative and positive
-            // octants. Now, the child (cell) octants centers are deriving from
-            // the parent (n) octant center:
-            for (int l = 0; l < 3; l++)
-              pos[l] = bhpara->r[3 * cell + l] =
-                  bhpara->r[3 * n + l] - r + pos[l];
-
-            // By default, the new cell has no children in all k-th octants:
-            for (int k = 0; k < 8; k++)
-              bhpara->child[cell * 8 + k] = -1;
-
-            // This condition should always be true cause "patch" is -1 at the
-            // beginning and the bottomd/cell reduces further.
-            if (patch != cell) {
-              // New cell is assigned as a child of previous "n" parent:
-              bhpara->child[n * 8 + j] = cell;
-            }
-
-            // pos[l] already contains the child cell coordinates.
-            // Let's assign "child" then. First the octant should be selected:
-            j = 0;
-            for (int l = 0; l < 3; l++)
-              if (pos[l] < bhpara->r[3 * ch + l])
-                j += static_cast<int>(pow(2, l));
-            // New element just appeared in the chain of cells. Hence, that what
-            // supposed to be a child ("ch") before entering the present
-            // iteration, now will be a child of the new cell (after this
-            // smallest octant split into new octants):
-            bhpara->child[cell * 8 + j] = ch;
-
-            // Now cell is claimed to be a parent of further iteration of the
-            // present loop.
-            n = cell;
-            j = 0;
-            __threadfence();
-            // Let's handle the particle position (p[l]) corresponding to the
-            // given thread and block against new octant cell (pos[l]):
-            for (int l = 0; l < 3; l++)
-              if (pos[l] < p[l])
-                j += static_cast<int>(pow(2, l));
-
-            // Now the current cell's child should be considering in the new
-            // particle new octant:
-            ch = bhpara->child[n * 8 + j];
-            // Repeat until the two bodies will be different children.
-            // Hence, the current "child" should have no children.
-            // It is equivalent to an absence of other particles
-            // in the i-th particle new smallest octant, otherwise we should
-            // split octants further until these two particles will come to
-            // different octants:
-          } while (ch >= 0);
-
-          // i-th particle assignment as a child to the last created cell:
-          bhpara->child[n * 8 + j] = i;
-          // Push out the subtree among the whole grid.
-          // Data setting must be completed after this point.
-          __threadfence();
-          // The final locked child index is defined by a maximal cell index,
-          // i.e. by a beginning of the new tree of cells created within
-          // the loop "while (ch >= 0)".
-          // The "patch" defines the new just created branch inception index:
-          bhpara->child[locked] = patch;
-        }
-
-        localmaxdepth = max(depth, localmaxdepth);
-        // Each thread started from the skip=1 and made the above
-        // tree building loop procedure (while (ch >= 0)).
-        // They should do the same for remaining (each) particles.
-        // Note, that bottomd is a global variable and it is already updated.
-        // This is already taken into account in further sortKernel logic.
-        // Hence, move on to the next body assigned to the given thread:
-        i += inc;
-        skip = 1;
-        lps = 0;
-      }
-    }
-  }
-  // Record maximum tree depth:
-  atomicMax((int *)&maxdepthd, localmaxdepth);
-}
-
-/******************************************************************************/
-/*** compute center of mass ***************************************************/
-/******************************************************************************/
-
-__global__ __launch_bounds__(THREADS3, FACTOR3) void summarizationKernel() {
-  int i, j, cnt;
-  // the node "mass" and its count respectively:
-  float m, cm;
-  // position of equivalent total dipole and its magnitude:
-  // (like a mass and the center of mass)
-  float p[3], u[3];
-  // Per-block BH tree caching:
-  __shared__ int child[THREADS3 * 8];
-
-  // no children by default:
-  for (i = 0; i < 8; i++)
-    child[i * THREADS3 + threadIdx.x] = -1;
-  auto const bottom = bottomd;
-  // Increment towards other particles assigned to the given thread:
-  auto const inc = static_cast<int>(blockDim.x * gridDim.x);
-  // Nodes iteration "k" should start from the "bottomd" level of the cells,
-  // which is a minimal index of the last created cell.
-  // Starting "k" value should be aligned using the warp size
-  // according to the designed threads performance.
-  // k = (bottom & (-WARPSIZE)) + threadIdx.x + blockIdx.x * blockDim.x;
-  auto k = bottom + static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
-  // Threads below the bottom line could proceed to their next cells.
-  // if (k < bottom) k += inc;
-
-  // Assume no missing children:
-  int missing = 0;
-  int missing_max = 0;
-  int iteration = 0;
-  int repeat_flag = 0;
-  __syncthreads(); // throttle
-  // threads sync related
-  int lps = 0;
-  // Iterate over all cells (not particles) assigned to the thread:
-  while (k <= bhpara->nnodes) {
-    if (lps++ > THREADS3) {
-      *bhpara->max_lps = lps;
-      __threadfence();
-    }
-    if (bhpara->mass[k] < 0.) {
-      iteration++;
-      if (missing == 0) {
-        // New cell, so initialize:
-        cm = 0.0f;
-        for (int l = 0; l < 3; l++) {
-          p[l] = 0.0f;
-          u[l] = 0.0f;
-        }
-        cnt = 0;
-        j = 0;
-        for (i = 0; i < 8; i++) {
-          auto const ch = bhpara->child[k * 8 + i];
-          if (ch >= 0) {
-            if (i != j) {
-              // Move child to front (needed later for a speed only).
-              // The child's octant change is incorrect from
-              // a tree organization perspective. However, the sum
-              // will be the same.
-              bhpara->child[k * 8 + i] = -1;
-              bhpara->child[k * 8 + j] = ch;
-            }
-            // Cache a missing child in the block shared memory:
-            child[missing * THREADS3 + threadIdx.x] = ch;
-            m = bhpara->mass[ch];
-            // Is a child the particle? Only particles have non-negative mass
-            // initialized originally. Another option: a cell which already
-            // aggregated masses of other cells and particles. "missing" means
-            // that a non-zero contribution of such kind is missing:
-            missing++;
-            if (m >= 0.0f) {
-              // child is ready
-              missing--;
-              // The child is a cell, not a body (ch >= nbodiesd).
-              // Also, the previous condition (m >= 0.0f) reveals
-              // that its' children total mass is already calculated.
-              // Hence, below command "countd[k] = cnt" is already executed by
-              // other threads/blocks and we can add this count
-              if (ch >= bhpara->nbodies) { // count bodies (needed later)
-                // As far as a child is a cell, its "countd" was already
-                // calculated.
-                cnt += bhpara->count[ch] - 1;
-              }
-              // add child's contribution
-              cm += m;
-              for (int l = 0; l < 3; l++) {
-                p[l] += bhpara->r[3 * ch + l] * m;
-                u[l] += bhpara->u[3 * ch + l];
-              }
-            }
-            j++;
-          } // if (ch >= 0)
-        }
-        missing_max = missing;
-        // Count of children:
-        cnt += j;
-      }
-
-      //__syncthreads();    // throttle
-
-      if (missing != 0) {
-        for (int im = 0; im < missing_max; im++) {
-          // poll missing child
-          auto const ch = child[im * THREADS3 + threadIdx.x];
-          if (ch >= 0) {
-            m = bhpara->mass[ch];
-            // Is a child the particle? Only particles have non-negative mass
-            // initialized originally. Another option: a cell which already
-            // aggregated masses of other cells and particles.
-            if (m >= 0.0f) {
-              // child is now ready
-              missing--;
-              child[im * THREADS3 + threadIdx.x] = -1;
-              // The child is a cell, not a body (ch >= nbodiesd).
-              if (ch >= bhpara->nbodies) {
-                // count bodies (needed later)
-                cnt += bhpara->count[ch] - 1;
-              }
-              // add child's contribution
-              cm += m;
-              for (int l = 0; l < 3; l++) {
-                p[l] += bhpara->r[3 * ch + l] * m;
-                u[l] += bhpara->u[3 * ch + l];
-              }
-            } // m >= 0.0f
-          }   // ch >= 0
-        }     // missing_max
-        // repeat until we are done or child is not ready
-      }
-
-      //__syncthreads(); // throttle
-
-      // (missing == 0) could be true and threads will move to next particles (k
-      // += inc) only if previous conditions (m >= 0.0f) will be true. It can
-      // happen only if cell will obtain the mass (only here below: "massd[k] =
-      // cm") or they will find the very last child: particles. Before that:
-      // do/while loop will continue.
-      if (missing == 0) {
-        // all children are ready, so store computed information
-        bhpara->count[k] = cnt;
-        m = 1.0f / cm;
-        for (int l = 0; l < 3; l++) {
-          bhpara->r[3 * k + l] = p[l] * m;
-          bhpara->u[3 * k + l] = u[l];
-        }
-        __threadfence(); // make sure data are visible before setting mass
-        bhpara->mass[k] = cm;
-        __threadfence();
-        k += inc;
-        iteration = 0;
-        lps = 0;
-      }
-      //__syncthreads(); // throttle
-      if (iteration > THREADS3 + 1) {
-        k += inc;
-        repeat_flag = 1;
-        iteration = 0;
-        missing = 0;
-      }
-    } else {
-      k += inc;
-    }
-    if ((k > bhpara->nnodes) && (repeat_flag)) {
-      repeat_flag = 0;
-      missing = 0;
-      k = bottom + static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
-    }
-  } // while
-}
-
-/******************************************************************************/
-/*** sort bodies **************************************************************/
-/******************************************************************************/
-
-// This kernel concurrently places the bodies into an array such that the bodies
-// appear in the same order in the array as they would during an in-order
-// traversal of the octree. This sorting groups spatially close bodies (in the
-// same octant cells) together, and these grouped bodies are crucial to speed up
-// forceCalculationKernel and energyCalculationKernel
-__global__ __launch_bounds__(THREADS4, FACTOR4) void sortKernel() {
-  auto const bottom = bottomd;
-  auto const dec = static_cast<int>(blockDim.x * gridDim.x);
-  // Start from the end of the nnodesd == 8 * nbodiesd.
-  // Reverse order is required now cause octant cells which are more close
-  // to the root have a larger count of entities inside (countd[k]).
-  // Particles should be sorted over all entities count in the tree array
-  // representation made by treeBuildingKernel.
-  int k = bhpara->nnodes + 1 - dec +
-          static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
-  // threads sync related
-  int lps = 0;
-
-  // iterate over all cells assigned to thread
-  while (k >= bottom) {
-    int start = bhpara->start[k];
-    // Threads sync related
-    if (lps++ > THREADS4) {
-      *bhpara->max_lps = lps;
-      __threadfence();
-    }
-    // Let's start from the root which has only startd=0 defined
-    // in boundingBoxKernel. All other bodies and cells have -1.
-    if (start >= 0) {
-      for (int i = 0; i < 8; i++) {
-        int const ch = bhpara->child[k * 8 + i];
-        if (ch >= bhpara->nbodies) {
-          // child is a cell
-          bhpara->start[ch] = start;  // set start ID of child
-          start += bhpara->count[ch]; // add # of bodies in subtree
-        } else if (ch >= 0) {
-          // Child is a body.
-          // This particle should be saved with a stepping over
-          // a count of particles in the cells.
-          // treeBuildingKernel already has ordered cells in a
-          // linear array way. The sortKernel just order random particle
-          // indices in the same order. Hence, they will be much faster accessed
-          // by forceCalculationKernel and energyCalculationKernel.
-          bhpara->sort[start] = ch; // record body in 'sorted' array
-          start++;
-        }
-      }
-      k -= dec; // move on to next cell
-      // Threads sync related
-      lps = 0;
-    }
-    //__syncthreads(); // throttle
-  }
-}
-
-/******************************************************************************/
-/*** compute dipole-dipole force and torque ***********************************/
-/******************************************************************************/
-
-__global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel(
-    float pf, float *force, float *torque) {
-  int i, t;
-  // dr is a distance between particles.
-  // f,h, and N are a target force, field, and torque respectively.
-  // u and uc are dipole moments of two particles.
-  float dr[3], f[3], h[3], u[3], uc[3], N[3];
-  // Shared memory aggregates of each warp-specific stacks
-  // with the MAXDEPTH size per each warp:
-  // "node" is the BH octant sub-cell in the stack.
-  // "pos"=0..7 - which octant we are examining now in the stack.
-  // dq is an array used to determine that the given BH cell is far enough.
-  __shared__ int pos[MAXDEPTH * THREADS5 / WARPSIZE],
-      node[MAXDEPTH * THREADS5 / WARPSIZE];
-  __shared__ float dq[MAXDEPTH * THREADS5 / WARPSIZE];
-
-  // Zero thread of the block initialize shared data for all warps.
-  if (threadIdx.x == 0) {
-    // Precompute values that depend only on tree level.
-    // The method's parameters (a trade-off accuracy/performance)
-    // which determine that the
-    // cell is far enough are "itolsqd" and
-    // "epssqd" which define a fraction of the octant cell and
-    // an additive distance respectively.
-    // Their joint contribution for the given tree depth are
-    // calculated within the array dq[i], which will
-    // be compared later with squared distance between the particle
-    // and the cell depending on a cell level.
-    // Original tree box edge (2*radiusd) should be halved
-    // as much as the tree depth takes place.
-    dq[0] = radiusd * radiusd * *itolsqd;
-    for (i = 1; i < maxdepthd; i++) {
-      dq[i] = dq[i - 1] * 0.25f; // halving of the squared distance
-      dq[i - 1] += *epssqd;      // increase thickness of previous cell
-    }
-    dq[i - 1] += *epssqd;
-
-    // Only maximal Barnes-Hut tree depth is allowed.
-    // This error is technically possible, however, most applications
-    // are far from the 1/2^32 particles' convergence.
-    if (maxdepthd > MAXDEPTH) {
-      *bhpara->err = maxdepthd;
-    }
-  }
-  __syncthreads();
-
-  // Only maximal Barnes-Hut tree depth is allowed.
-  if (maxdepthd <= MAXDEPTH) {
-    // How many warps are behind the current thread (?):
-    auto const base = static_cast<int>(threadIdx.x) / WARPSIZE;
-    // Figure out first thread in each warp (lane 0):
-    auto const sbase = base * WARPSIZE;
-    // Initial stack index is its MAXDEPTH portion start for the given warp
-    // count base:
-    auto const j = base * MAXDEPTH;
-
-    // How far the thread is from the warp beginning (?):
-    auto const diff = static_cast<int>(threadIdx.x) - sbase;
-    // Make multiple copies to avoid index calculations later:
-    if (diff < MAXDEPTH) {
-      // Each thread copies its own dq[] element to a part of
-      // dq array dedicated to the given warp:
-      dq[diff + j] = dq[diff];
-    }
-    __syncthreads();
-
-    // Iterate over all bodies assigned to thread:
-    for (auto k = static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
-         k < bhpara->nbodies; k += static_cast<int>(blockDim.x * gridDim.x)) {
-      // Sorted body indexes assigned to me:
-      i = bhpara->sort[k]; // get permuted/sorted index
-      // Cache the particle position info:
-      for (int l = 0; l < 3; l++) {
-        u[l] = bhpara->u[3 * i + l];
-        h[l] = 0.0f;
-        f[l] = 0.0f;
-      }
-
-      // Force will be calculated for i-th particle.
-      // All other space is of interest, hence all cells will be considered.
-      // Hence, we should start from the root node (whole Barnes-Hut cube).
-      // Initialize iteration stack, i.e., push root node onto stack.
-      // Let's start from zero octant.
-      auto depth = j;
-      if (sbase == threadIdx.x) {
-        node[j] = bhpara->nnodes;
-        pos[j] = 0;
-      }
-
-      while (depth >= j) {
-        // Stack is not empty (depth is still higher than j-level).
-        // Hence, there are still some children to consider.
-        while ((t = pos[depth]) < 8) {
-          // Node on top of stack has more children to process:
-          auto const n = bhpara->child[node[depth] * 8 + t]; // child pointer
-          if (sbase == threadIdx.x) {
-            // I'm the first thread in the warp.
-            // Let me check for the current depth level of the tree
-            // whether we have more children among 8 octant cells.
-            // Hence, let's go to the next octant in the next iteration only:
-            pos[depth] = t + 1;
-          }
-          __threadfence_block();
-          // There is a child (octant cell) with a dipole moment uxd[3 * n + l]
-          // and the center position bhpara->r[3 * n + l]:
-          if (n >= 0) {
-            auto tmp = 0.0f; // compute distance squared
-            for (int l = 0; l < 3; l++) {
-              dr[l] = -bhpara->r[3 * n + l] + bhpara->r[3 * i + l];
-              tmp += dr[l] * dr[l];
-            }
-
-            // NOTE: i-th particle is specific for the given thread.
-            // However, the above-mentioned index "i" is already sorted by the
-            // sortKernel. Hence, below stack-diving operations will be
-            // performed by different threads of the same warp almost
-            // synchronously because adjacent threads have particles located in
-            // the space not far from each other (=adjacent octants of
-            // corresponding local maxdepth). Hence, such particles will have a
-            // similar stack of the below execution from a perspective of other
-            // Barnes-Hut tree branches distancing against them. Same global
-            // memory fragments (child[]) will be loaded to the same warp
-            // access and no corresponding threads' divergence will take place.
-            // Only in this way, zero thread could control others: see "if
-            // (sbase == threadIdx.x)" above and below.. and this control will
-            // be correct. It will be even with a single stack arrays per the
-            // single warp: The pos, node and dq array fragments are shared
-            // between all threads of the whole warp.
-
-            // Check if all threads agree that cell is far enough away (or is a
-            // body, i.e. n < nbodiesd).
-            if ((n < bhpara->nbodies) ||
-                __all_sync(__activemask(), tmp >= dq[depth])) {
-              if (n != i) {
-
-                auto const d1 = sqrtf(tmp);
-                auto const dd5 = __fdividef(1.0f, tmp * tmp * d1);
-                auto b = 0.0f;
-                auto b2 = 0.0f;
-                auto umd5 = 0.0f;
-                for (int l = 0; l < 3; l++) {
-                  uc[l] = bhpara->u[3 * n + l];
-                  b += uc[l] * dr[l];
-                  b2 += u[l] * dr[l];
-                  umd5 += u[l] * uc[l];
-                }
-                umd5 *= -3.0f * dd5;
-
-                auto const bb2d7 = 15.0f * b * b2 * __fdividef(dd5, tmp);
-
-                for (int l = 0; l < 3; l++) {
-                  h[l] += (b * 3.0f * dr[l] - tmp * uc[l]) * dd5;
-                  f[l] += -dr[l] * (umd5 + bb2d7) +
-                          3.0f * (b * u[l] + b2 * uc[l]) * dd5;
-                }
-              }
-            } else {
-              // If it is not then let's split octants further (more depth).
-              // Push the cell onto the stack:
-              depth++;
-              if (sbase == threadIdx.x) {
-                // The given warp should start from this child as a root
-                // further:
-                node[depth] = n;
-                // Let's start from it zero octant:
-                pos[depth] = 0;
-              }
-              __threadfence_block();
-            }
-          } else {
-            // Early out because all remaining children are also zero.
-            // We should move to the next octant or to the next depth if other
-            // threads already checked other octants:
-            depth = max(j, depth - 1);
-          }
-        }
-        depth--; // Done with this level
-      }
-
-      // Torque:
-      N[0] = u[1] * h[2] - u[2] * h[1];
-      N[1] = u[2] * h[0] - u[0] * h[2];
-      N[2] = u[0] * h[1] - u[1] * h[0];
-
-      for (int l = 0; l < 3; l++) {
-        if (f[l] != f[l] || h[l] != h[l]) { // nan
-          printf("Force Kernel: NAN in particle[%d]\n", i);
-          printf("x = %f, y = %f, z = %f,\n", bhpara->u[3 * i + 0],
-                 bhpara->u[3 * i + 1], bhpara->u[3 * i + 2]);
-          printf("fx = %f, fy = %f, fz = %f,\n", f[0], f[1], f[2]);
-          printf("hx = %f, hy = %f, hz = %f,\n", h[0], h[1], h[2]);
-        }
-        atomicAdd(force + 3 * i + l, f[l] * pf);
-        atomicAdd(torque + 3 * i + l, N[l] * pf);
-      }
-    }
-  }
-}
-
-/******************************************************************************/
-/*** compute energy ***********************************************************/
-/******************************************************************************/
-
-__global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel(
-    float pf, float *energySum) {
-  // Accumulates -u.h for every body with a tree walk; the algorithm is almost
-  // identical to @ref forceCalculationKernel. See comments there.
-
-  int i, n, t;
-  float dr[3], h[3], u[3], uc[3];
-  __shared__ int pos[MAXDEPTH * THREADS5 / WARPSIZE],
-      node[MAXDEPTH * THREADS5 / WARPSIZE];
-  __shared__ float dq[MAXDEPTH * THREADS5 / WARPSIZE];
-  float sum = 0.0;
-  extern __shared__ float res[]; // one slot per thread, sized at launch
-
-  if (threadIdx.x == 0) {
-    // precompute values that depend only on tree level
-    dq[0] = radiusd * radiusd * *itolsqd;
-    for (i = 1; i < maxdepthd; i++) {
-      dq[i] = dq[i - 1] * 0.25f;
-      dq[i - 1] += *epssqd;
-    }
-    dq[i - 1] += *epssqd;
-
-    if (maxdepthd > MAXDEPTH) {
-      *bhpara->err = maxdepthd; // tree deeper than the stack: flag an error
-    }
-  }
-  __syncthreads();
-
-  if (maxdepthd <= MAXDEPTH) {
-    // figure out first thread in each warp (lane 0)
-    auto const base = static_cast<int>(threadIdx.x) / WARPSIZE;
-    auto const sbase = base * WARPSIZE;
-    auto const j = base * MAXDEPTH;
-
-    auto const diff = static_cast<int>(threadIdx.x) - sbase;
-    // make multiple copies to avoid index calculations later
-    if (diff < MAXDEPTH) {
-      dq[diff + j] = dq[diff];
-    }
-    __syncthreads();
-
-    // iterate over all bodies assigned to thread
-    for (auto k = static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
-         k < bhpara->nbodies; k += static_cast<int>(blockDim.x * gridDim.x)) {
-      i = bhpara->sort[k]; // get permuted/sorted index
-      // cache the dipole moment of body i and reset its field accumulator
-      for (int l = 0; l < 3; l++) {
-        u[l] = bhpara->u[3 * i + l];
-        h[l] = 0.0f;
-      }
-
-      // initialize iteration stack, i.e., push root node onto stack
-      auto depth = j;
-      if (sbase == threadIdx.x) {
-        node[j] = bhpara->nnodes;
-        pos[j] = 0;
-      }
-
-      while (depth >= j) {
-        // stack is not empty
-        while ((t = pos[depth]) < 8) {
-          // node on top of stack has more children to process
-          n = bhpara->child[node[depth] * 8 + t]; // load child pointer
-          if (sbase == threadIdx.x) {
-            // I'm the first thread in the warp
-            pos[depth] = t + 1;
-          }
-          __threadfence_block();
-          if (n >= 0) {
-            auto tmp = 0.0f;
-            for (int l = 0; l < 3; l++) {
-              dr[l] = -bhpara->r[3 * n + l] + bhpara->r[3 * i + l];
-              tmp += dr[l] * dr[l];
-            }
-            // check if all threads agree that cell is far enough away
-            // (or is a body)
-            if ((n < bhpara->nbodies) ||
-                __all_sync(__activemask(), tmp >= dq[depth])) {
-              if (n != i) {
-                auto const d1 = sqrtf(tmp);
-                auto const dd5 = __fdividef(1.0f, tmp * tmp * d1); // 1 / |dr|^5
-                auto b = 0.0f;
-                for (int l = 0; l < 3; l++) {
-                  uc[l] = bhpara->u[3 * n + l];
-                  b += uc[l] * dr[l];
-                }
-
-                for (int l = 0; l < 3; l++) // field of dipole uc at dr
-                  h[l] += (b * 3.0f * dr[l] - tmp * uc[l]) * dd5;
-              }
-            } else {
-              // push cell onto stack
-              depth++;
-              if (sbase == threadIdx.x) {
-                node[depth] = n;
-                pos[depth] = 0;
-              }
-              __threadfence_block();
-            }
-          } else {
-            depth = max(j, depth - 1); // early out because all remaining
-                                       // children are also zero
-          }
-        }
-        depth--; // done with this level
-      }
-
-      for (int l = 0; l < 3; l++) {
-        sum += -u[l] * h[l];
-        if (h[l] != h[l]) { // nan
-          printf("Energy Kernel: NAN in particle[%d]\n", i);
-          printf("x = %f, y = %f, z = %f,\n", bhpara->u[3 * i + 0],
-                 bhpara->u[3 * i + 1], bhpara->u[3 * i + 2]);
-          printf("hx = %f, hy = %f, hz = %f,\n", h[0], h[1], h[2]);
-        }
-      }
-    }
-  }
-
-  sum *= pf;
-  // self-consistent field energy: every body sums over all the others and
-  // the Barnes-Hut traversal is not symmetric, so this factor 1/2 probably
-  // cannot be avoided:
-  sum /= 2.0f;
-  // Save per thread result into block shared mem
-  res[threadIdx.x] = sum;
-  // Sum results within a block
-  __syncthreads(); // Wait until all threads in the block are done
-  dds_sumReduction_BH(res, &(energySum[blockIdx.x]));
-}
-
-// Required BH CUDA init: launches the initialization kernel, then selects
-// the preferred cache configuration of each Barnes-Hut kernel.
-void initBHgpu(int blocks) {
-  dim3 launch_grid(blocks * FACTOR5, 1, 1);
-  dim3 launch_block(THREADS5, 1, 1);
-
-  KERNELCALL(initializationKernel, launch_grid, launch_block);
-
-  // According to the experimental performance optimization,
-  // these kernels prefer shared memory ...
-  cudaFuncSetCacheConfig(boundingBoxKernel, cudaFuncCachePreferShared);
-  cudaFuncSetCacheConfig(summarizationKernel, cudaFuncCachePreferShared);
-  // ... and these kernels prefer the L1 cache:
-  cudaFuncSetCacheConfig(treeBuildingKernel, cudaFuncCachePreferL1);
-  cudaFuncSetCacheConfig(sortKernel, cudaFuncCachePreferL1);
-  cudaFuncSetCacheConfig(forceCalculationKernel, cudaFuncCachePreferL1);
-  cudaFuncSetCacheConfig(energyCalculationKernel, cudaFuncCachePreferL1);
-
-  cudaGetLastError(); // reset error value
-}
-
-// Building Barnes-Hut spatial min/max position box.
-void buildBoxBH(int blocks) {
-  dim3 launch_grid(blocks * FACTOR1, 1, 1);
-  dim3 launch_block(THREADS1, 1, 1);
-
-  // synchronize before the launch; this return value is not checked
-  cudaDeviceSynchronize();
-
-  KERNELCALL(boundingBoxKernel, launch_grid, launch_block);
-  // synchronize again and hand the status to cuda_safe_mem
-  cuda_safe_mem(cudaDeviceSynchronize());
-}
-
-// Build the Barnes-Hut tree as a linear child array representation
-// of the octant cells and the particles inside them.
-void buildTreeBH(int blocks) {
-  // one-dimensional launch: blocks * FACTOR2 blocks of THREADS2 threads
-  dim3 launch_grid(blocks * FACTOR2, 1, 1);
-  dim3 launch_block(THREADS2, 1, 1);
-
-  KERNELCALL(treeBuildingKernel, launch_grid, launch_block);
-
-  // synchronize and hand the status to cuda_safe_mem
-  cuda_safe_mem(cudaDeviceSynchronize());
-}
-
-// Compute the masses and cell index counts of the octant cells,
-// as well as their centers of mass and total dipole moments,
-// on all possible levels of the BH tree.
-void summarizeBH(int blocks) {
-  // one-dimensional launch: blocks * FACTOR3 blocks of THREADS3 threads
-  dim3 launch_grid(blocks * FACTOR3, 1, 1);
-  dim3 launch_block(THREADS3, 1, 1);
-
-  KERNELCALL(summarizationKernel, launch_grid, launch_block);
-
-  // synchronize and hand the status to cuda_safe_mem
-  cuda_safe_mem(cudaDeviceSynchronize());
-}
-
-// Sort the particle indexes according to the BH tree representation.
-// Crucial for the per-warp performance tuning of forceCalculationKernel and
-// energyCalculationKernel.
-void sortBH(int blocks) {
-  // one-dimensional launch: blocks * FACTOR4 blocks of THREADS4 threads
-  dim3 launch_grid(blocks * FACTOR4, 1, 1);
-  dim3 launch_block(THREADS4, 1, 1);
-
-  KERNELCALL(sortKernel, launch_grid, launch_block);
-
-  // synchronize and hand the status to cuda_safe_mem
-  cuda_safe_mem(cudaDeviceSynchronize());
-}
-
-// Force calculation: runs the force kernel, then reads back the device
-// error flag and throws std::runtime_error if it is non-zero.
-void forceBH(BHData *bh_data, float k, float *f, float *torque) {
-  dim3 launch_grid(bh_data->blocks * FACTOR5, 1, 1);
-  dim3 launch_block(THREADS5, 1, 1);
-
-  KERNELCALL(forceCalculationKernel, launch_grid, launch_block, k, f, torque);
-  cuda_safe_mem(cudaDeviceSynchronize());
-
-  // copy the error flag from device memory to the host
-  int error_code = 0;
-  cuda_safe_mem(cudaMemcpy(&error_code, bh_data->err, sizeof(int),
-                           cudaMemcpyDeviceToHost));
-
-  if (error_code != 0) {
-    throw std::runtime_error("force kernel encountered a functional error");
-  }
-}
-
-// Energy calculation: run the energy kernel, reduce the per-block sums, copy
-// the total to *E; throws std::runtime_error if the device error flag is set.
-void energyBH(BHData *bh_data, float k, float *E) {
-  dim3 grid(1, 1, 1);
-  dim3 block(1, 1, 1);
-
-  grid.x = bh_data->blocks * FACTOR5;
-  block.x = THREADS5;
-
-  float *energySum;
-  cuda_safe_mem(cudaMalloc(&energySum, sizeof(float) * grid.x));
-  // cleanup the memory for the energy sum
-  cuda_safe_mem(cudaMemset(energySum, 0, sizeof(float) * grid.x));
-
-  KERNELCALL_shared(energyCalculationKernel, grid, block,
-                    block.x * sizeof(float), k, energySum);
-  cuda_safe_mem(cudaDeviceSynchronize());
-
-  // Sum the results of all blocks (one energy part per block in the kernel)
-  thrust::device_ptr<float> t(energySum);
-  float x = thrust::reduce(t, t + grid.x);
-  cuda_safe_mem(cudaMemcpy(E, &x, sizeof(float), cudaMemcpyHostToDevice));
-
-  cuda_safe_mem(cudaFree(energySum));
-
-  int error_code = 0;
-  cuda_safe_mem(cudaMemcpy(&error_code, bh_data->err, sizeof(int),
-                           cudaMemcpyDeviceToHost));
-  if (error_code) {
-    throw std::runtime_error("energy kernel encountered a functional error");
-  }
-}
-
-void setBHPrecision(float epssq, float itolsq) {
-  // upload both precision parameters to their device symbols
-  auto constexpr kind = cudaMemcpyHostToDevice;
-  cuda_safe_mem(cudaMemcpyToSymbol(epssqd, &epssq, sizeof(float), 0, kind));
-  cuda_safe_mem(cudaMemcpyToSymbol(itolsqd, &itolsq, sizeof(float), 0, kind));
-}
-
-void deallocBH(BHData *bh_data) {
-  auto &bh = *bh_data;
-  // first release every device buffer referenced by the container
-  cuda_safe_mem(cudaFree(bh.err));
-  cuda_safe_mem(cudaFree(bh.max_lps));
-  cuda_safe_mem(cudaFree(bh.child));
-  cuda_safe_mem(cudaFree(bh.count));
-  cuda_safe_mem(cudaFree(bh.start));
-  cuda_safe_mem(cudaFree(bh.sort));
-  cuda_safe_mem(cudaFree(bh.mass));
-  cuda_safe_mem(cudaFree(bh.maxp));
-  cuda_safe_mem(cudaFree(bh.minp));
-  cuda_safe_mem(cudaFree(bh.r));
-  cuda_safe_mem(cudaFree(bh.u));
-
-  // then clear the stale pointers: integer buffers ...
-  bh.err = bh.max_lps = nullptr;
-  bh.child = bh.count = nullptr;
-  bh.start = bh.sort = nullptr;
-  // ... and floating-point buffers
-  bh.mass = nullptr;
-  bh.maxp = bh.minp = nullptr;
-  bh.r = bh.u = nullptr;
-}
-
-void allocBHmemCopy(int nbodies, BHData *bh_data) { // (re)allocates buffers
-  if (bh_data->nbodies == nbodies) // particle count unchanged: nothing to do
-    return;
-
-  bh_data->nbodies = nbodies;
-
-  auto const devID = cuda_get_device();
-  auto const dev = cuda_get_device_props(devID);
-
-  bh_data->blocks = dev.n_cores;
-  // Each node corresponds to a split of the cubic box in 3D space into equal
-  // cubic boxes; hence 8 nodes per particle is a theoretical octree limit:
-  bh_data->nnodes = bh_data->nbodies * 8;
-
-  int const n_total_threads = 1024 * bh_data->blocks;
-  if (bh_data->nnodes < n_total_threads)
-    bh_data->nnodes = n_total_threads;
-  else // round down to a multiple of n_total_threads
-    bh_data->nnodes = (bh_data->nnodes / n_total_threads) * n_total_threads;
-
-  cuda_safe_mem(cudaFree(bh_data->err)); // free the old buffer first
-  cuda_safe_mem(cudaMalloc((void **)&(bh_data->err), sizeof(int)));
-
-  cuda_safe_mem(cudaFree(bh_data->max_lps));
-  cuda_safe_mem(cudaMalloc((void **)&(bh_data->max_lps), sizeof(int)));
-
-  cuda_safe_mem(cudaFree(bh_data->child));
-  cuda_safe_mem(cudaMalloc((void **)&(bh_data->child),
-                           sizeof(int) * (bh_data->nnodes + 1) * 8));
-
-  cuda_safe_mem(cudaFree(bh_data->count));
-  cuda_safe_mem(cudaMalloc((void **)&(bh_data->count),
-                           sizeof(int) * (bh_data->nnodes + 1)));
-
-  cuda_safe_mem(cudaFree(bh_data->start));
-  cuda_safe_mem(cudaMalloc((void **)&(bh_data->start),
-                           sizeof(int) * (bh_data->nnodes + 1)));
-
-  cuda_safe_mem(cudaFree(bh_data->sort));
-  cuda_safe_mem(cudaMalloc((void **)&(bh_data->sort),
-                           sizeof(int) * (bh_data->nnodes + 1)));
-
-  // Weight coefficients of the nnodes nodes: both particles and octant cells
-  cuda_safe_mem(cudaFree(bh_data->mass));
-  cuda_safe_mem(cudaMalloc((void **)&(bh_data->mass),
-                           sizeof(float) * (bh_data->nnodes + 1)));
-
-  // The n particles have unitary weight coefficients.
-  // Cells will be defined with -1 later.
-  auto *mass_tmp = new float[bh_data->nbodies];
-  for (int i = 0; i < bh_data->nbodies; i++) {
-    mass_tmp[i] = 1.0f;
-  }
-  cuda_safe_mem(cudaMemcpy(bh_data->mass, mass_tmp,
-                           sizeof(float) * bh_data->nbodies,
-                           cudaMemcpyHostToDevice));
-  delete[] mass_tmp;
-  // (max[3*i], max[3*i+1], max[3*i+2])
-  // are the dynamical spatial constraints of the octree box;
-  // this array is updated by each block at each interaction calculation
-  // within the boundingBoxKernel
-  cuda_safe_mem(cudaFree(bh_data->maxp));
-  cuda_safe_mem(cudaMalloc((void **)&(bh_data->maxp),
-                           sizeof(float) * bh_data->blocks * FACTOR1 * 3));
-  // (min[3*i], min[3*i+1], min[3*i+2])
-  // are the dynamical spatial constraints of the octree box;
-  // this array is updated by each block at each interaction calculation
-  // within the boundingBoxKernel
-  cuda_safe_mem(cudaFree(bh_data->minp));
-  cuda_safe_mem(cudaMalloc((void **)&(bh_data->minp),
-                           sizeof(float) * bh_data->blocks * FACTOR1 * 3));
-
-  cuda_safe_mem(cudaFree(bh_data->r));
-  cuda_safe_mem(
-      cudaMalloc(&(bh_data->r), 3 * (bh_data->nnodes + 1) * sizeof(float)));
-
-  cuda_safe_mem(cudaFree(bh_data->u));
-  cuda_safe_mem(
-      cudaMalloc(&(bh_data->u), 3 * (bh_data->nnodes + 1) * sizeof(float)));
-}
-
-void fill_bh_data(float const *r, float const *dip, BHData const *bh_data) {
-  cuda_safe_mem(cudaMemcpyToSymbol(bhpara, bh_data, sizeof(BHData), 0,
-                                   cudaMemcpyHostToDevice));
-  auto const n_bytes = 3 * bh_data->nbodies * sizeof(float); // 3 per body
-  cuda_safe_mem(cudaMemcpy(bh_data->r, r, n_bytes, cudaMemcpyDeviceToDevice));
-  cuda_safe_mem(cudaMemcpy(bh_data->u, dip, n_bytes, cudaMemcpyDeviceToDevice));
-}
-
-#endif // DIPOLAR_BARNES_HUT
diff --git a/src/core/magnetostatics/barnes_hut_gpu_cuda.cuh b/src/core/magnetostatics/barnes_hut_gpu_cuda.cuh
deleted file mode 100644
index 1f1cf266a7a..00000000000
--- a/src/core/magnetostatics/barnes_hut_gpu_cuda.cuh
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (C) 2016-2022 The ESPResSo project
- * Copyright (C) 2012 Alexander (Polyakov) Peletskyi
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "config/config.hpp"
-
-#ifdef DIPOLAR_BARNES_HUT
-
-struct BHData {
-  /// Base number of CUDA blocks.
-  int blocks;
-  /// Each node corresponds to a split of the cubic box in 3D space into equal
-  /// cubic boxes; hence 8 octant nodes per particle is a theoretical octree
-  /// limit. @c nbodies is the number of particles and @c nnodes the maximal
-  /// number of octree nodes.
-  int nbodies;
-  int nnodes;
-  /// particle positions on the device:
-  float *r;
-  /// particle dipole moments on the device:
-  float *u;
-  /// Not a real mass. Just a node weight coefficient.
-  float *mass;
-  /// min positions' coordinates of the Barnes-Hut box.
-  float *minp;
-  /// max positions' coordinates of the Barnes-Hut box.
-  float *maxp;
-  /// Error report.
-  int *err;
-  /// Indices of particles sorted according to the tree linear representation.
-  int *sort;
-  /// The tree linear representation.
-  int *child;
-  /// Supplementary array: counts for the tree nodes (division octant
-  /// cells/particles inside).
-  int *count;
-  /// Start indices for the per-cell sorting.
-  int *start;
-  /// Traces the max loops for the threads' synchronization.
-  int *max_lps;
-};
-
-/// @name Barnes-Hut thread count for different kernels.
-/// @{
-#define THREADS1 512
-#define THREADS2 1024
-#define THREADS3 1024
-#define THREADS4 1024
-#define THREADS5 256
-/// @}
-
-/// @name Barnes-Hut block factor for different kernels.
-/// block count = factor * number of blocks
-/// @{
-#define FACTOR1 2
-#define FACTOR2 1
-#define FACTOR3 1 /* must all be resident at the same time */
-#define FACTOR4 1 /* must all be resident at the same time */
-#define FACTOR5 4
-/// @}
-
-/// Barnes-Hut warp size.
-#define WARPSIZE 32
-/// Maximal depth of the Barnes-Hut tree branching.
-#define MAXDEPTH 32
-
-/// Barnes-Hut parameters setter.
-void setBHPrecision(float epssq, float itolsq);
-
-/// An allocation of the GPU device memory and an initialization where it is
-/// needed.
-void allocBHmemCopy(int nbodies, BHData *bh_data);
-
-/// A deallocation of the GPU device memory.
-void deallocBH(BHData *bh_data);
-
-/// Copy Barnes-Hut data to @ref bhpara and copy particle data.
-/// @param r       device particle positions to copy
-/// @param dip     device particle dipoles to copy
-/// @param bh_data Barnes-Hut container
-void fill_bh_data(float const *r, float const *dip, BHData const *bh_data);
-
-/// Barnes-Hut CUDA initialization.
-void initBHgpu(int blocks);
-
-/// Building Barnes-Hut spatial min/max position box
-void buildBoxBH(int blocks);
-
-/// Building Barnes-Hut tree in a linear child array representation
-/// of octant cells and particles inside.
-void buildTreeBH(int blocks);
-
-/// Calculate octant cells masses and cell index counts.
-/// Determine cells centers of mass and total dipole moments
-/// on all possible levels of the Barnes-Hut tree.
-void summarizeBH(int blocks);
-
-/// Sort particle indexes according to the Barnes-Hut tree representation.
-/// Crucial for the per-warp performance tuning of @c forceCalculationKernel
-/// and @c energyCalculationKernel.
-void sortBH(int blocks);
-
-/// Barnes-Hut force calculation.
-void forceBH(BHData *bh_data, float k, float *f, float *torque);
-
-/// Barnes-Hut energy calculation.
-void energyBH(BHData *bh_data, float k, float *E);
-
-#endif // DIPOLAR_BARNES_HUT
diff --git a/src/core/magnetostatics/dipolar_direct_sum.cpp b/src/core/magnetostatics/dipolar_direct_sum.cpp
index 2b3630c480c..66fe6ec9319 100644
--- a/src/core/magnetostatics/dipolar_direct_sum.cpp
+++ b/src/core/magnetostatics/dipolar_direct_sum.cpp
@@ -32,13 +32,11 @@
 #include "system/System.hpp"
 
 #include <utils/cartesian_product.hpp>
-#include <utils/constants.hpp>
 #include <utils/math/sqr.hpp>
 #include <utils/mpi/iall_gatherv.hpp>
 
 #include <boost/mpi/collectives.hpp>
 #include <boost/mpi/communicator.hpp>
-#include <boost/range/counting_range.hpp>
 
 #include <mpi.h>
 
@@ -46,6 +44,7 @@
 #include <cassert>
 #include <cmath>
 #include <iterator>
+#include <ranges>
 #include <stdexcept>
 #include <tuple>
 #include <utility>
@@ -152,9 +151,9 @@ template <typename F> void for_each_image(Utils::Vector3i const &ncut, F f) {
           f(nx, ny, nz);
         }
       },
-      boost::counting_range(-ncut[0], ncut[0] + 1),
-      boost::counting_range(-ncut[1], ncut[1] + 1),
-      boost::counting_range(-ncut[2], ncut[2] + 1));
+      std::views::iota(-ncut[0], ncut[0] + 1),
+      std::views::iota(-ncut[1], ncut[1] + 1),
+      std::views::iota(-ncut[2], ncut[2] + 1));
 }
 
 /**
@@ -164,7 +163,9 @@ struct PosMom {
   Utils::Vector3d pos;
   Utils::Vector3d m;
 
-  template <class Archive> void serialize(Archive &ar, long int) { ar &pos &m; }
+  template <class Archive> void serialize(Archive &ar, long int) {
+    ar & pos & m;
+  }
 };
 
 /**
@@ -218,8 +219,7 @@ T image_sum(InputIterator begin, InputIterator end, InputIterator it,
 }
 
 static auto gather_particle_data(BoxGeometry const &box_geo,
-                                 ParticleRange const &particles,
-                                 int n_replicas) {
+                                 ParticleRange const &particles) {
   auto const &comm = ::comm_cart;
   std::vector<Particle *> local_particles;
   std::vector<PosMom> local_posmom;
@@ -291,7 +291,7 @@ void DipolarDirectSum::add_long_range_forces(
   auto const &box_geo = *get_system().box_geo;
   auto const &box_l = box_geo.length();
   auto [local_particles, all_posmom, reqs, offset] =
-      gather_particle_data(box_geo, particles, n_replicas);
+      gather_particle_data(box_geo, particles);
 
   /* Number of image boxes considered */
   auto const ncut = get_n_cut(box_geo, n_replicas);
@@ -379,7 +379,7 @@ double
 DipolarDirectSum::long_range_energy(ParticleRange const &particles) const {
   auto const &box_geo = *get_system().box_geo;
   auto [local_particles, all_posmom, reqs, offset] =
-      gather_particle_data(box_geo, particles, n_replicas);
+      gather_particle_data(box_geo, particles);
 
   /* Number of image boxes considered */
   auto const ncut = get_n_cut(box_geo, n_replicas);
@@ -419,7 +419,7 @@ void DipolarDirectSum::dipole_field_at_part(
   auto const &box_geo = *get_system().box_geo;
   /* collect particle data */
   auto [local_particles, all_posmom, reqs, offset] =
-      gather_particle_data(box_geo, particles, n_replicas);
+      gather_particle_data(box_geo, particles);
 
   auto const ncut = get_n_cut(box_geo, n_replicas);
   auto const with_replicas = (ncut.norm2() > 0);
diff --git a/src/core/magnetostatics/dipolar_direct_sum_gpu_cuda.cu b/src/core/magnetostatics/dipolar_direct_sum_gpu_cuda.cu
index 3f6c58f94c8..bff0f93c323 100644
--- a/src/core/magnetostatics/dipolar_direct_sum_gpu_cuda.cu
+++ b/src/core/magnetostatics/dipolar_direct_sum_gpu_cuda.cu
@@ -34,6 +34,7 @@
 #error CU-file includes mpi.h! This should not happen!
 #endif
 
+// LCOV_EXCL_START
 __device__ inline float scalar_product(float const *a, float const *b) {
   return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
 }
@@ -131,6 +132,7 @@ __device__ float dipole_ia_energy(float pf, float const *r1, float const *r2,
   // Energy
   return pf * (pe1 * r3_inv - pe4 * pe2 * pe3);
 }
+// LCOV_EXCL_STOP
 
 __global__ void DipolarDirectSum_kernel_force(float pf, unsigned int n,
                                               float *pos, float *dip, float *f,
@@ -183,6 +185,7 @@ __global__ void DipolarDirectSum_kernel_force(float pf, unsigned int n,
   }
 }
 
+// LCOV_EXCL_START
 __device__ void dds_sumReduction(float *input, float *sum) {
   auto const tid = static_cast<int>(threadIdx.x);
   for (auto i = static_cast<int>(blockDim.x); i > 1; i /= 2) {
@@ -197,6 +200,7 @@ __device__ void dds_sumReduction(float *input, float *sum) {
     sum[0] = input[0];
   }
 }
+// LCOV_EXCL_STOP
 
 __global__ void DipolarDirectSum_kernel_energy(float pf, unsigned int n,
                                                float *pos, float *dip,
diff --git a/src/core/magnetostatics/dipoles.cpp b/src/core/magnetostatics/dipoles.cpp
index 4f60e1fa8bf..5b9e3c98a56 100644
--- a/src/core/magnetostatics/dipoles.cpp
+++ b/src/core/magnetostatics/dipoles.cpp
@@ -33,8 +33,6 @@
 #include "errorhandling.hpp"
 #include "system/System.hpp"
 
-#include <utils/Vector.hpp>
-#include <utils/constants.hpp>
 #include <utils/demangle.hpp>
 
 #include <cassert>
@@ -135,11 +133,6 @@ struct LongRangeForce {
     actor->add_long_range_forces();
   }
 #endif
-#ifdef DIPOLAR_BARNES_HUT
-  void operator()(std::shared_ptr<DipolarBarnesHutGpu> const &actor) const {
-    actor->add_long_range_forces();
-  }
-#endif
 #ifdef SCAFACOS_DIPOLES
   void operator()(std::shared_ptr<DipolarScafacos> const &actor) const {
     actor->add_long_range_forces();
@@ -171,12 +164,6 @@ struct LongRangeEnergy {
     return 0.;
   }
 #endif
-#ifdef DIPOLAR_BARNES_HUT
-  double operator()(std::shared_ptr<DipolarBarnesHutGpu> const &actor) const {
-    actor->long_range_energy();
-    return 0.;
-  }
-#endif
 #ifdef SCAFACOS_DIPOLES
   double operator()(std::shared_ptr<DipolarScafacos> const &actor) const {
     return actor->long_range_energy();
diff --git a/src/core/magnetostatics/dipoles.hpp b/src/core/magnetostatics/dipoles.hpp
index 5f9c1b340f9..3e4d3d96c60 100644
--- a/src/core/magnetostatics/dipoles.hpp
+++ b/src/core/magnetostatics/dipoles.hpp
@@ -27,7 +27,6 @@
 
 #include "magnetostatics/solver.hpp"
 
-#include "magnetostatics/barnes_hut_gpu.hpp"
 #include "magnetostatics/dipolar_direct_sum.hpp"
 #include "magnetostatics/dipolar_direct_sum_gpu.hpp"
 #include "magnetostatics/dlc.hpp"
@@ -49,9 +48,6 @@ using MagnetostaticsActor =
 #ifdef DIPOLAR_DIRECT_SUM
                  std::shared_ptr<DipolarDirectSumGpu>,
 #endif
-#ifdef DIPOLAR_BARNES_HUT
-                 std::shared_ptr<DipolarBarnesHutGpu>,
-#endif
 #ifdef DP3M
                  std::shared_ptr<DipolarP3M>,
 #endif
diff --git a/src/core/magnetostatics/dipoles_inline.hpp b/src/core/magnetostatics/dipoles_inline.hpp
index a072a100afb..a3184dc17b9 100644
--- a/src/core/magnetostatics/dipoles_inline.hpp
+++ b/src/core/magnetostatics/dipoles_inline.hpp
@@ -99,23 +99,23 @@ struct ShortRangeEnergyKernel {
 inline std::optional<Solver::ShortRangeForceKernel>
 Solver::pair_force_kernel() const {
 #ifdef DIPOLES
-  if (impl->solver) {
+  if (auto &solver = impl->solver; solver.has_value()) {
     auto const visitor = Dipoles::ShortRangeForceKernel();
-    return std::visit(visitor, *impl->solver);
+    return std::visit(visitor, *solver);
   }
 #endif // DIPOLES
-  return {};
+  return std::nullopt;
 }
 
 inline std::optional<Solver::ShortRangeEnergyKernel>
 Solver::pair_energy_kernel() const {
 #ifdef DIPOLES
-  if (impl->solver) {
+  if (auto &solver = impl->solver; solver.has_value()) {
     auto const visitor = Dipoles::ShortRangeEnergyKernel();
-    return std::visit(visitor, *impl->solver);
+    return std::visit(visitor, *solver);
   }
 #endif // DIPOLES
-  return {};
+  return std::nullopt;
 }
 
 } // namespace Dipoles
diff --git a/src/core/magnetostatics/dlc.cpp b/src/core/magnetostatics/dlc.cpp
index 51e7662a9a6..e327a11d0fb 100644
--- a/src/core/magnetostatics/dlc.cpp
+++ b/src/core/magnetostatics/dlc.cpp
@@ -37,7 +37,6 @@
 #include "errorhandling.hpp"
 #include "system/System.hpp"
 
-#include <utils/constants.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <boost/mpi/collectives/all_reduce.hpp>
@@ -51,6 +50,7 @@
 #include <cmath>
 #include <cstdio>
 #include <functional>
+#include <numbers>
 #include <numeric>
 #include <stdexcept>
 #include <variant>
@@ -113,8 +113,8 @@ static void dipolar_force_corrections(int kcut,
                                       std::vector<Utils::Vector3d> &ts,
                                       ParticleRange const &particles,
                                       BoxGeometry const &box_geo) {
-  auto const facux = 2. * Utils::pi() * box_geo.length_inv()[0];
-  auto const facuy = 2. * Utils::pi() * box_geo.length_inv()[1];
+  auto const facux = 2. * std::numbers::pi * box_geo.length_inv()[0];
+  auto const facuy = 2. * std::numbers::pi * box_geo.length_inv()[1];
 
   auto const n_local_particles = particles.size();
   std::vector<double> ReSjp(n_local_particles);
@@ -233,7 +233,7 @@ static void dipolar_force_corrections(int kcut,
   // Multiply by the factors we have left during the loops
 
   auto const piarea =
-      Utils::pi() * box_geo.length_inv()[0] * box_geo.length_inv()[1];
+      std::numbers::pi * box_geo.length_inv()[0] * box_geo.length_inv()[1];
   for (std::size_t j = 0; j < n_local_particles; ++j) {
     fs[j] *= piarea;
     ts[j] *= piarea;
@@ -247,8 +247,8 @@ static void dipolar_force_corrections(int kcut,
 static double dipolar_energy_correction(int kcut,
                                         ParticleRange const &particles,
                                         BoxGeometry const &box_geo) {
-  auto const facux = 2. * Utils::pi() * box_geo.length_inv()[0];
-  auto const facuy = 2. * Utils::pi() * box_geo.length_inv()[1];
+  auto const facux = 2. * std::numbers::pi * box_geo.length_inv()[0];
+  auto const facuy = 2. * std::numbers::pi * box_geo.length_inv()[1];
 
   double energy = 0.;
   double sum_S[4] = {0., 0., 0., 0.};
@@ -296,7 +296,7 @@ static double dipolar_energy_correction(int kcut,
   }
 
   auto const piarea =
-      Utils::pi() * box_geo.length_inv()[0] * box_geo.length_inv()[1];
+      std::numbers::pi * box_geo.length_inv()[0] * box_geo.length_inv()[1];
   energy *= -piarea;
   return (this_node == 0) ? energy : 0.;
 }
@@ -306,7 +306,7 @@ void DipolarLayerCorrection::add_force_corrections(
   assert(dlc.far_cut > 0.);
   auto const &box_geo = *get_system().box_geo;
   auto const volume = box_geo.volume();
-  auto const correc = 4. * Utils::pi() / volume;
+  auto const correc = 4. * std::numbers::pi / volume;
 
   // --- Create arrays that should contain the corrections to
   //     the forces and torques, and set them to zero.
@@ -358,7 +358,7 @@ double DipolarLayerCorrection::energy_correction(
   assert(dlc.far_cut > 0.);
   auto const &box_geo = *get_system().box_geo;
   auto const volume = box_geo.volume();
-  auto const pref = prefactor * 2. * Utils::pi() / volume;
+  auto const pref = prefactor * 2. * std::numbers::pi / volume;
 
   // Check if particles aren't in the forbidden gap region
   // This loop is needed, because there is no other guaranteed
@@ -430,12 +430,12 @@ double DipolarLayerCorrection::tune_far_cut() const {
   }
 
   auto constexpr limitkc = 200;
-  auto const piarea = Utils::pi() / (lx * ly);
+  auto const piarea = std::numbers::pi / (lx * ly);
   auto const nmp = static_cast<double>(count_magnetic_particles(system));
   auto const h = dlc.box_h;
   auto far_cut = -1.;
   for (int kc = 1; kc < limitkc; kc++) {
-    auto const gc = kc * 2. * Utils::pi() / lx;
+    auto const gc = kc * 2. * std::numbers::pi / lx;
     auto const fa0 = sqrt(9. * exp(+2. * gc * h) * g1_DLC_dip(gc, lz - h) +
                           9. * exp(-2. * gc * h) * g1_DLC_dip(gc, lz + h) +
                           22. * g1_DLC_dip(gc, lz));
diff --git a/src/core/magnetostatics/dp3m.cpp b/src/core/magnetostatics/dp3m.cpp
index d262e857b0f..b3ab8871fab 100644
--- a/src/core/magnetostatics/dp3m.cpp
+++ b/src/core/magnetostatics/dp3m.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 The ESPResSo project
+ * Copyright (C) 2010-2024 The ESPResSo project
  * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
  *   Max-Planck-Institute for Polymer Research, Theory Group
  *
@@ -31,12 +31,13 @@
 
 #include "magnetostatics/dp3m.hpp"
 
+#include "fft/fft.hpp"
 #include "p3m/TuningAlgorithm.hpp"
 #include "p3m/TuningLogger.hpp"
 #include "p3m/common.hpp"
-#include "p3m/fft.hpp"
 #include "p3m/influence_function_dipolar.hpp"
 #include "p3m/interpolation.hpp"
+#include "p3m/math.hpp"
 #include "p3m/send_mesh.hpp"
 
 #include "BoxGeometry.hpp"
@@ -54,10 +55,8 @@
 #include "tuning.hpp"
 
 #include <utils/Vector.hpp>
-#include <utils/constants.hpp>
 #include <utils/integral_parameter.hpp>
 #include <utils/math/int_pow.hpp>
-#include <utils/math/sinc.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <boost/mpi/collectives/all_reduce.hpp>
@@ -65,11 +64,16 @@
 
 #include <algorithm>
 #include <array>
+#include <cstddef>
 #include <cstdio>
 #include <functional>
+#include <iterator>
+#include <numbers>
 #include <optional>
+#include <span>
 #include <sstream>
 #include <stdexcept>
+#include <tuple>
 #include <vector>
 
 void DipolarP3M::count_magnetic_particles() {
@@ -102,17 +106,14 @@ double dp3m_rtbisection(double box_size, double r_cut_iL, int n_c_part,
                         double tuned_accuracy);
 
 double DipolarP3M::calc_average_self_energy_k_space() const {
-  auto const start = Utils::Vector3i{dp3m.fft.plan[3].start};
-  auto const size = Utils::Vector3i{dp3m.fft.plan[3].new_mesh};
-
   auto const &box_geo = *get_system().box_geo;
   auto const node_phi = grid_influence_function_self_energy(
-      dp3m.params, start, start + size, dp3m.g_energy);
+      dp3m.params, dp3m.mesh.start, dp3m.mesh.stop, dp3m.g_energy);
 
   double phi = 0.;
   boost::mpi::reduce(comm_cart, node_phi, phi, std::plus<>(), 0);
   phi /= 3. * box_geo.length()[0] * Utils::int_pow<3>(dp3m.params.mesh[0]);
-  return phi * Utils::pi();
+  return phi * std::numbers::pi;
 }
 
 void DipolarP3M::init() {
@@ -127,21 +128,10 @@ void DipolarP3M::init() {
 
   dp3m.params.cao3 = Utils::int_pow<3>(dp3m.params.cao);
   dp3m.params.recalc_a_ai_cao_cut(box_geo.length());
-  dp3m.local_mesh.calc_local_ca_mesh(dp3m.params, local_geo, verlet_skin, 0.);
-
-  dp3m.sm.resize(comm_cart, dp3m.local_mesh);
-
-  int ca_mesh_size =
-      fft_init(dp3m.local_mesh.dim, dp3m.local_mesh.margin, dp3m.params.mesh,
-               dp3m.params.mesh_off, dp3m.ks_pnum, dp3m.fft,
-               ::communicator.node_grid, comm_cart);
-  dp3m.rs_mesh.resize(ca_mesh_size);
-  dp3m.ks_mesh.resize(ca_mesh_size);
-
-  for (auto &val : dp3m.rs_mesh_dip) {
-    val.resize(ca_mesh_size);
-  }
 
+  assert(dp3m.fft);
+  dp3m.local_mesh.calc_local_ca_mesh(dp3m.params, local_geo, verlet_skin, 0.);
+  dp3m.fft->init_fft();
   dp3m.calc_differential_operator();
 
   /* fix box length dependent constants */
@@ -170,15 +160,16 @@ DipolarP3M::DipolarP3M(P3MParameters &&parameters, double prefactor,
 
 namespace {
 template <int cao> struct AssignDipole {
-  void operator()(dp3m_data_struct &dp3m, Utils::Vector3d const &real_pos,
+  void operator()(decltype(DipolarP3M::dp3m) &dp3m,
+                  Utils::Vector3d const &real_pos,
                   Utils::Vector3d const &dip) const {
     auto const weights = p3m_calculate_interpolation_weights<cao>(
         real_pos, dp3m.params.ai, dp3m.local_mesh);
     p3m_interpolate<cao>(dp3m.local_mesh, weights,
                          [&dip, &dp3m](int ind, double w) {
-                           dp3m.rs_mesh_dip[0][ind] += w * dip[0];
-                           dp3m.rs_mesh_dip[1][ind] += w * dip[1];
-                           dp3m.rs_mesh_dip[2][ind] += w * dip[2];
+                           dp3m.mesh.rs_fields[0][ind] += w * dip[0];
+                           dp3m.mesh.rs_fields[1][ind] += w * dip[1];
+                           dp3m.mesh.rs_fields[2][ind] += w * dip[2];
                          });
 
     dp3m.inter_weights.store<cao>(weights);
@@ -190,9 +181,9 @@ void DipolarP3M::dipole_assign(ParticleRange const &particles) {
   dp3m.inter_weights.reset(dp3m.params.cao);
 
   /* prepare local FFT mesh */
-  for (auto &i : dp3m.rs_mesh_dip)
+  for (auto &rs_mesh_field : dp3m.mesh.rs_fields)
     for (int j = 0; j < dp3m.local_mesh.size; j++)
-      i[j] = 0.;
+      rs_mesh_field[j] = 0.;
 
   for (auto const &p : particles) {
     if (p.dipm() != 0.) {
@@ -204,7 +195,7 @@ void DipolarP3M::dipole_assign(ParticleRange const &particles) {
 
 namespace {
 template <int cao> struct AssignTorques {
-  void operator()(dp3m_data_struct const &dp3m, double prefac, int d_rs,
+  void operator()(decltype(DipolarP3M::dp3m) &dp3m, double prefac, int d_rs,
                   ParticleRange const &particles) const {
 
     /* magnetic particle index */
@@ -217,7 +208,7 @@ template <int cao> struct AssignTorques {
         Utils::Vector3d E{};
         p3m_interpolate(dp3m.local_mesh, w,
                         [&E, &dp3m, d_rs](int ind, double w) {
-                          E[d_rs] += w * dp3m.rs_mesh[ind];
+                          E[d_rs] += w * dp3m.mesh.rs_scalar[ind];
                         });
 
         p.torque() -= vector_product(p.calc_dip(), prefac * E);
@@ -228,7 +219,7 @@ template <int cao> struct AssignTorques {
 };
 
 template <int cao> struct AssignForces {
-  void operator()(dp3m_data_struct const &dp3m, double prefac, int d_rs,
+  void operator()(decltype(DipolarP3M::dp3m) &dp3m, double prefac, int d_rs,
                   ParticleRange const &particles) const {
 
     /* magnetic particle index */
@@ -240,9 +231,9 @@ template <int cao> struct AssignForces {
 
         Utils::Vector3d E{};
         p3m_interpolate(dp3m.local_mesh, w, [&E, &dp3m](int ind, double w) {
-          E[0] += w * dp3m.rs_mesh_dip[0][ind];
-          E[1] += w * dp3m.rs_mesh_dip[1][ind];
-          E[2] += w * dp3m.rs_mesh_dip[2][ind];
+          E[0] += w * dp3m.mesh.rs_fields[0][ind];
+          E[1] += w * dp3m.mesh.rs_fields[1][ind];
+          E[2] += w * dp3m.mesh.rs_fields[2][ind];
         });
 
         p.force()[d_rs] += p.calc_dip() * prefac * E;
@@ -269,20 +260,7 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag,
 
   if (dp3m.sum_mu2 > 0.) {
     dipole_assign(particles);
-    /* Gather information for FFT grid inside the nodes domain (inner local
-     * mesh) and perform forward 3D FFT (Charge Assignment Mesh). */
-    std::array<double *, 3> meshes = {{dp3m.rs_mesh_dip[0].data(),
-                                       dp3m.rs_mesh_dip[1].data(),
-                                       dp3m.rs_mesh_dip[2].data()}};
-
-    dp3m.sm.gather_grid(Utils::make_span(meshes), comm_cart,
-                        dp3m.local_mesh.dim);
-
-    fft_perform_forw(dp3m.rs_mesh_dip[0].data(), dp3m.fft, comm_cart);
-    fft_perform_forw(dp3m.rs_mesh_dip[1].data(), dp3m.fft, comm_cart);
-    fft_perform_forw(dp3m.rs_mesh_dip[2].data(), dp3m.fft, comm_cart);
-    // Note: after these calls, the grids are in the order yzx and not xyz
-    // anymore!!!
+    dp3m.fft->perform_fwd_fft();
   }
 
   /* === k-space energy calculation  === */
@@ -293,35 +271,33 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag,
     if (dp3m.sum_mu2 > 0.) {
       /* i*k differentiation for dipolar gradients:
        * |(\Fourier{\vect{mu}}(k)\cdot \vect{k})|^2 */
-      int ind = 0;
-      int i = 0;
-      int j[3];
-      double node_energy = 0.0;
-      for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; j[0]++) {
-        for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1]; j[1]++) {
-          for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2]; j[2]++) {
-            node_energy +=
-                dp3m.g_energy[i] *
-                (Utils::sqr(
-                     dp3m.rs_mesh_dip[0][ind] *
-                         dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] +
-                     dp3m.rs_mesh_dip[1][ind] *
-                         dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] +
-                     dp3m.rs_mesh_dip[2][ind] *
-                         dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]]) +
-                 Utils::sqr(
-                     dp3m.rs_mesh_dip[0][ind + 1] *
-                         dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] +
-                     dp3m.rs_mesh_dip[1][ind + 1] *
-                         dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] +
-                     dp3m.rs_mesh_dip[2][ind + 1] *
-                         dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]]));
-            ind += 2;
-            i++;
-          }
-        }
-      }
-      node_energy *= dipole_prefac * Utils::pi() * box_geo.length_inv()[0];
+
+      auto constexpr mesh_start = Utils::Vector3i::broadcast(0);
+      auto const &offset = dp3m.mesh.start;
+      auto const &d_op = dp3m.d_op[0u];
+      auto const &mesh_dip = dp3m.mesh.rs_fields;
+      auto const [KX, KY, KZ] = dp3m.fft->get_permutations();
+      auto indices = Utils::Vector3i{};
+      auto index = std::size_t(0u);
+      auto it_energy = dp3m.g_energy.begin();
+      auto node_energy = 0.;
+      for_each_3d(mesh_start, dp3m.mesh.size, indices, [&]() {
+        auto const shift = indices + offset;
+        // Re(mu)*k
+        auto const re = mesh_dip[0u][index] * d_op[shift[KX]] +
+                        mesh_dip[1u][index] * d_op[shift[KY]] +
+                        mesh_dip[2u][index] * d_op[shift[KZ]];
+        ++index;
+        // Im(mu)*k
+        auto const im = mesh_dip[0u][index] * d_op[shift[KX]] +
+                        mesh_dip[1u][index] * d_op[shift[KY]] +
+                        mesh_dip[2u][index] * d_op[shift[KZ]];
+        ++index;
+        node_energy += *it_energy * (Utils::sqr(re) + Utils::sqr(im));
+        std::advance(it_energy, 1);
+      });
+
+      node_energy *= dipole_prefac * std::numbers::pi * box_geo.length_inv()[0];
       boost::mpi::reduce(comm_cart, node_energy, energy, std::plus<>(), 0);
 
       if (dp3m.energy_correction == 0.)
@@ -329,8 +305,8 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag,
 
       if (this_node == 0) {
         /* self energy correction */
-        energy -= prefactor * dp3m.sum_mu2 * Utils::sqrt_pi_i() * (2. / 3.) *
-                  Utils::int_pow<3>(dp3m.params.alpha);
+        energy -= prefactor * dp3m.sum_mu2 * std::numbers::inv_sqrtpi *
+                  (2. / 3.) * Utils::int_pow<3>(dp3m.params.alpha);
 
         /* dipolar energy correction due to systematic Madelung-self effects */
         energy += prefactor * dp3m.energy_correction / box_geo.volume();
@@ -344,162 +320,107 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag,
      * DIPOLAR TORQUES (k-space)
      ****************************/
     if (dp3m.sum_mu2 > 0.) {
-      auto const two_pi_L_i = 2. * Utils::pi() * box_geo.length_inv()[0];
-      /* fill in ks_mesh array for torque calculation */
-      int ind = 0;
-      int i = 0;
-      int j[3];
-      double tmp0, tmp1;
-
-      for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; j[0]++) { // j[0]=n_y
-        for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1];
-             j[1]++) { // j[1]=n_z
-          for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2];
-               j[2]++) { // j[2]=n_x
-            // tmp0 = Re(mu)*k,   tmp1 = Im(mu)*k
-
-            tmp0 = dp3m.rs_mesh_dip[0][ind] *
-                       dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] +
-                   dp3m.rs_mesh_dip[1][ind] *
-                       dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] +
-                   dp3m.rs_mesh_dip[2][ind] *
-                       dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]];
-
-            tmp1 = dp3m.rs_mesh_dip[0][ind + 1] *
-                       dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] +
-                   dp3m.rs_mesh_dip[1][ind + 1] *
-                       dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] +
-                   dp3m.rs_mesh_dip[2][ind + 1] *
-                       dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]];
-
-            /* the optimal influence function is the same for torques
-               and energy */
-            dp3m.ks_mesh[ind] = tmp0 * dp3m.g_energy[i];
-            dp3m.ks_mesh[ind + 1] = tmp1 * dp3m.g_energy[i];
-            ind += 2;
-            i++;
-          }
-        }
-      }
+      auto const wavenumber = 2. * std::numbers::pi * box_geo.length_inv()[0u];
+      auto constexpr mesh_start = Utils::Vector3i::broadcast(0);
+      auto const &mesh_stop = dp3m.mesh.size;
+      auto const offset = dp3m.mesh.start;
+      auto const &d_op = dp3m.d_op[0u];
+      auto const [KX, KY, KZ] = dp3m.fft->get_permutations();
+      auto &mesh_dip = dp3m.mesh.rs_fields;
+      auto indices = Utils::Vector3i{};
+      auto index = std::size_t(0u);
+
+      /* fill in mesh.ks_scalar array for torque calculation */
+      auto it_energy = dp3m.g_energy.begin();
+      index = 0u;
+      for_each_3d(mesh_start, mesh_stop, indices, [&]() {
+        auto const shift = indices + offset;
+        // Re(mu)*k
+        auto const re = mesh_dip[0u][index] * d_op[shift[KX]] +
+                        mesh_dip[1u][index] * d_op[shift[KY]] +
+                        mesh_dip[2u][index] * d_op[shift[KZ]];
+        dp3m.mesh.ks_scalar[index] = *it_energy * re;
+        ++index;
+        // Im(mu)*k
+        auto const im = mesh_dip[0u][index] * d_op[shift[KX]] +
+                        mesh_dip[1u][index] * d_op[shift[KY]] +
+                        mesh_dip[2u][index] * d_op[shift[KZ]];
+        dp3m.mesh.ks_scalar[index] = *it_energy * im;
+        ++index;
+        std::advance(it_energy, 1);
+      });
 
       /* Force component loop */
       for (int d = 0; d < 3; d++) {
-        auto const d_rs = (d + dp3m.ks_pnum) % 3;
-        ind = 0;
-        for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; j[0]++) {
-          for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1]; j[1]++) {
-            for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2]; j[2]++) {
-              dp3m.rs_mesh[ind] =
-                  dp3m.d_op[0][j[d] + dp3m.fft.plan[3].start[d]] *
-                  dp3m.ks_mesh[ind];
-              ind++;
-              dp3m.rs_mesh[ind] =
-                  dp3m.d_op[0][j[d] + dp3m.fft.plan[3].start[d]] *
-                  dp3m.ks_mesh[ind];
-              ind++;
-            }
-          }
-        }
-
-        /* Back FFT force component mesh */
-        fft_perform_back(dp3m.rs_mesh.data(), false, dp3m.fft, comm_cart);
-        /* redistribute force component mesh */
-        dp3m.sm.spread_grid(dp3m.rs_mesh.data(), comm_cart,
-                            dp3m.local_mesh.dim);
+        index = 0u;
+        for_each_3d(mesh_start, mesh_stop, indices, [&]() {
+          auto const pos = indices[d] + offset[d];
+          dp3m.mesh.rs_scalar[index] = d_op[pos] * dp3m.mesh.ks_scalar[index];
+          ++index;
+          dp3m.mesh.rs_scalar[index] = d_op[pos] * dp3m.mesh.ks_scalar[index];
+          ++index;
+        });
+        dp3m.fft->perform_space_back_fft();
         /* Assign force component from mesh to particle */
+        auto const d_rs = (d + dp3m.mesh.ks_pnum) % 3;
         Utils::integral_parameter<int, AssignTorques, 1, 7>(
-            dp3m.params.cao, dp3m, dipole_prefac * two_pi_L_i, d_rs, particles);
+            dp3m.params.cao, dp3m, dipole_prefac * wavenumber, d_rs, particles);
       }
 
       /***************************
          DIPOLAR FORCES (k-space)
       ****************************/
-
       // Compute forces after torques because the algorithm below overwrites the
-      // grids dp3m.rs_mesh_dip !
+      // grids dp3m.mesh.rs_fields !
       // Note: I'll do here 9 inverse FFTs. By symmetry, we can reduce this
       // number to 6 !
-      /* fill in ks_mesh array for force calculation */
-      ind = 0;
-      i = 0;
-      for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0]; j[0]++) { // j[0]=n_y
-        for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1];
-             j[1]++) { // j[1]=n_z
-          for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2];
-               j[2]++) { // j[2]=n_x
-            // tmp0 = Im(mu)*k,   tmp1 = -Re(mu)*k
-            tmp0 = dp3m.rs_mesh_dip[0][ind + 1] *
-                       dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] +
-                   dp3m.rs_mesh_dip[1][ind + 1] *
-                       dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] +
-                   dp3m.rs_mesh_dip[2][ind + 1] *
-                       dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]];
-            tmp1 = dp3m.rs_mesh_dip[0][ind] *
-                       dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] +
-                   dp3m.rs_mesh_dip[1][ind] *
-                       dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] +
-                   dp3m.rs_mesh_dip[2][ind] *
-                       dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]];
-            dp3m.ks_mesh[ind] = tmp0 * dp3m.g_force[i];
-            dp3m.ks_mesh[ind + 1] = -tmp1 * dp3m.g_force[i];
-            ind += 2;
-            i++;
-          }
-        }
-      }
+      /* fill in mesh.ks_scalar array for force calculation */
+      auto it_force = dp3m.g_force.begin();
+      index = 0u;
+      for_each_3d(mesh_start, mesh_stop, indices, [&]() {
+        auto const shift = indices + offset;
+        // Re(mu)*k
+        auto const re = mesh_dip[0u][index] * d_op[shift[KX]] +
+                        mesh_dip[1u][index] * d_op[shift[KY]] +
+                        mesh_dip[2u][index] * d_op[shift[KZ]];
+        ++index;
+        // Im(mu)*k
+        auto const im = mesh_dip[0u][index] * d_op[shift[KX]] +
+                        mesh_dip[1u][index] * d_op[shift[KY]] +
+                        mesh_dip[2u][index] * d_op[shift[KZ]];
+        ++index;
+        dp3m.mesh.ks_scalar[index - 2] = *it_force * im;
+        dp3m.mesh.ks_scalar[index - 1] = *it_force * (-re);
+        std::advance(it_force, 1);
+      });
 
       /* Force component loop */
-      for (int d = 0; d < 3; d++) { /* direction in k-space: */
-        auto const d_rs = (d + dp3m.ks_pnum) % 3;
-        ind = 0;
-        for (j[0] = 0; j[0] < dp3m.fft.plan[3].new_mesh[0];
-             j[0]++) { // j[0]=n_y
-          for (j[1] = 0; j[1] < dp3m.fft.plan[3].new_mesh[1];
-               j[1]++) { // j[1]=n_z
-            for (j[2] = 0; j[2] < dp3m.fft.plan[3].new_mesh[2];
-                 j[2]++) { // j[2]=n_x
-              tmp0 = dp3m.d_op[0][j[d] + dp3m.fft.plan[3].start[d]] *
-                     dp3m.ks_mesh[ind];
-              dp3m.rs_mesh_dip[0][ind] =
-                  dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] * tmp0;
-              dp3m.rs_mesh_dip[1][ind] =
-                  dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] * tmp0;
-              dp3m.rs_mesh_dip[2][ind] =
-                  dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]] * tmp0;
-              ind++;
-              tmp0 = dp3m.d_op[0][j[d] + dp3m.fft.plan[3].start[d]] *
-                     dp3m.ks_mesh[ind];
-              dp3m.rs_mesh_dip[0][ind] =
-                  dp3m.d_op[0][j[2] + dp3m.fft.plan[3].start[2]] * tmp0;
-              dp3m.rs_mesh_dip[1][ind] =
-                  dp3m.d_op[0][j[0] + dp3m.fft.plan[3].start[0]] * tmp0;
-              dp3m.rs_mesh_dip[2][ind] =
-                  dp3m.d_op[0][j[1] + dp3m.fft.plan[3].start[1]] * tmp0;
-              ind++;
-            }
-          }
-        }
-        /* Back FFT force component mesh */
-        fft_perform_back(dp3m.rs_mesh_dip[0].data(), false, dp3m.fft,
-                         comm_cart);
-        fft_perform_back(dp3m.rs_mesh_dip[1].data(), false, dp3m.fft,
-                         comm_cart);
-        fft_perform_back(dp3m.rs_mesh_dip[2].data(), false, dp3m.fft,
-                         comm_cart);
-        /* redistribute force component mesh */
-        std::array<double *, 3> meshes = {{dp3m.rs_mesh_dip[0].data(),
-                                           dp3m.rs_mesh_dip[1].data(),
-                                           dp3m.rs_mesh_dip[2].data()}};
-
-        dp3m.sm.spread_grid(Utils::make_span(meshes), comm_cart,
-                            dp3m.local_mesh.dim);
+      for (int d = 0; d < 3; d++) {
+        index = 0u;
+        for_each_3d(mesh_start, mesh_stop, indices, [&]() {
+          auto const shift = indices + offset;
+          auto const f1 =
+              d_op[indices[d] + offset[d]] * dp3m.mesh.ks_scalar[index];
+          mesh_dip[0u][index] = d_op[shift[KX]] * f1;
+          mesh_dip[1u][index] = d_op[shift[KY]] * f1;
+          mesh_dip[2u][index] = d_op[shift[KZ]] * f1;
+          ++index;
+          auto const f2 =
+              d_op[indices[d] + offset[d]] * dp3m.mesh.ks_scalar[index];
+          mesh_dip[0u][index] = d_op[shift[KX]] * f2;
+          mesh_dip[1u][index] = d_op[shift[KY]] * f2;
+          mesh_dip[2u][index] = d_op[shift[KZ]] * f2;
+          ++index;
+        });
+        dp3m.fft->perform_field_back_fft();
         /* Assign force component from mesh to particle */
+        auto const d_rs = (d + dp3m.mesh.ks_pnum) % 3;
         Utils::integral_parameter<int, AssignForces, 1, 7>(
-            dp3m.params.cao, dp3m, dipole_prefac * Utils::sqr(two_pi_L_i), d_rs,
+            dp3m.params.cao, dp3m, dipole_prefac * Utils::sqr(wavenumber), d_rs,
             particles);
       }
     } /* if (dp3m.sum_mu2 > 0) */
-  }   /* if (force_flag) */
+  } /* if (force_flag) */
 
   if (dp3m.params.epsilon != P3M_EPSILON_METALLIC) {
     auto const surface_term =
@@ -522,7 +443,7 @@ double DipolarP3M::long_range_kernel(bool force_flag, bool energy_flag,
 double DipolarP3M::calc_surface_term(bool force_flag, bool energy_flag,
                                      ParticleRange const &particles) {
   auto const &box_geo = *get_system().box_geo;
-  auto const pref = prefactor * 4. * Utils::pi() / box_geo.volume() /
+  auto const pref = prefactor * 4. * std::numbers::pi / box_geo.volume() /
                     (2. * dp3m.params.epsilon + 1.);
   auto const n_local_part = particles.size();
 
@@ -532,21 +453,21 @@ double DipolarP3M::calc_surface_term(bool force_flag, bool energy_flag,
   std::vector<double> my(n_local_part);
   std::vector<double> mz(n_local_part);
 
-  int ip = 0;
+  std::size_t ip = 0u;
   for (auto const &p : particles) {
     auto const dip = p.calc_dip();
-    mx[ip] = dip[0];
-    my[ip] = dip[1];
-    mz[ip] = dip[2];
+    mx[ip] = dip[0u];
+    my[ip] = dip[1u];
+    mz[ip] = dip[2u];
     ip++;
   }
 
   // we will need the sum of all dipolar momenta vectors
   auto local_dip = Utils::Vector3d{};
-  for (int i = 0; i < n_local_part; i++) {
-    local_dip[0] += mx[i];
-    local_dip[1] += my[i];
-    local_dip[2] += mz[i];
+  for (std::size_t i = 0u; i < n_local_part; i++) {
+    local_dip[0u] += mx[i];
+    local_dip[1u] += my[i];
+    local_dip[2u] += mz[i];
   }
   auto const box_dip =
       boost::mpi::all_reduce(comm_cart, local_dip, std::plus<>());
@@ -554,7 +475,7 @@ double DipolarP3M::calc_surface_term(bool force_flag, bool energy_flag,
   double energy = 0.;
   if (energy_flag) {
     double sum_e = 0.;
-    for (int i = 0; i < n_local_part; i++) {
+    for (std::size_t i = 0u; i < n_local_part; i++) {
       sum_e += mx[i] * box_dip[0] + my[i] * box_dip[1] + mz[i] * box_dip[2];
     }
     energy =
@@ -567,18 +488,18 @@ double DipolarP3M::calc_surface_term(bool force_flag, bool energy_flag,
     std::vector<double> sumiy(n_local_part);
     std::vector<double> sumiz(n_local_part);
 
-    for (int i = 0; i < n_local_part; i++) {
-      sumix[i] = my[i] * box_dip[2] - mz[i] * box_dip[1];
-      sumiy[i] = mz[i] * box_dip[0] - mx[i] * box_dip[2];
-      sumiz[i] = mx[i] * box_dip[1] - my[i] * box_dip[0];
+    for (std::size_t i = 0u; i < n_local_part; i++) {
+      sumix[i] = my[i] * box_dip[2u] - mz[i] * box_dip[1u];
+      sumiy[i] = mz[i] * box_dip[0u] - mx[i] * box_dip[2u];
+      sumiz[i] = mx[i] * box_dip[1u] - my[i] * box_dip[0u];
     }
 
-    ip = 0;
+    ip = 0u;
     for (auto &p : particles) {
       auto &torque = p.torque();
-      torque[0] -= pref * sumix[ip];
-      torque[1] -= pref * sumiy[ip];
-      torque[2] -= pref * sumiz[ip];
+      torque[0u] -= pref * sumix[ip];
+      torque[1u] -= pref * sumiy[ip];
+      torque[2u] -= pref * sumiz[ip];
       ip++;
     }
   }
@@ -587,27 +508,23 @@ double DipolarP3M::calc_surface_term(bool force_flag, bool energy_flag,
 }
 
 void DipolarP3M::calc_influence_function_force() {
-  auto const start = Utils::Vector3i{dp3m.fft.plan[3].start};
-  auto const size = Utils::Vector3i{dp3m.fft.plan[3].new_mesh};
-
-  dp3m.g_force = grid_influence_function<3>(dp3m.params, start, start + size,
-                                            get_system().box_geo->length());
+  dp3m.g_force =
+      grid_influence_function<3>(dp3m.params, dp3m.mesh.start, dp3m.mesh.stop,
+                                 get_system().box_geo->length_inv());
 }
 
 void DipolarP3M::calc_influence_function_energy() {
-  auto const start = Utils::Vector3i{dp3m.fft.plan[3].start};
-  auto const size = Utils::Vector3i{dp3m.fft.plan[3].new_mesh};
-
-  dp3m.g_energy = grid_influence_function<2>(dp3m.params, start, start + size,
-                                             get_system().box_geo->length());
+  dp3m.g_energy =
+      grid_influence_function<2>(dp3m.params, dp3m.mesh.start, dp3m.mesh.stop,
+                                 get_system().box_geo->length_inv());
 }
 
 class DipolarTuningAlgorithm : public TuningAlgorithm {
-  dp3m_data_struct &dp3m;
+  decltype(DipolarP3M::dp3m) &dp3m;
   int m_mesh_max = -1, m_mesh_min = -1;
 
 public:
-  DipolarTuningAlgorithm(System::System &system, dp3m_data_struct &input_dp3m,
+  DipolarTuningAlgorithm(System::System &system, decltype(dp3m) &input_dp3m,
                          double prefactor, int timings)
       : TuningAlgorithm(system, prefactor, timings), dp3m{input_dp3m} {}
 
@@ -642,7 +559,7 @@ class DipolarTuningAlgorithm : public TuningAlgorithm {
                                    dp3m.sum_dip_part, dp3m.sum_mu2, 0.001);
     // alpha cannot be zero for dipoles because real-space formula breaks down
 
-    if (Utils::sqrt_2() * rs_err > dp3m.params.accuracy) {
+    if (std::numbers::sqrt2 * rs_err > dp3m.params.accuracy) {
       /* assume rs_err = ks_err -> rs_err = accuracy/sqrt(2.0) -> alpha_L */
       alpha_L = dp3m_rtbisection(
           box_geo.length()[0], r_cut_iL, dp3m.sum_dip_part, dp3m.sum_mu2,
@@ -750,65 +667,71 @@ void DipolarP3M::tune() {
 }
 
 /** Tuning dipolar-P3M */
-static auto dp3m_tune_aliasing_sums(int nx, int ny, int nz, int mesh,
+static auto dp3m_tune_aliasing_sums(Utils::Vector3i const &shift, int mesh,
                                     double mesh_i, int cao, double alpha_L_i) {
-  using Utils::sinc;
-
-  auto const factor1 = Utils::sqr(Utils::pi() * alpha_L_i);
 
+  auto constexpr mesh_start = Utils::Vector3i::broadcast(-P3M_BRILLOUIN);
+  auto constexpr mesh_stop = Utils::Vector3i::broadcast(P3M_BRILLOUIN + 1);
+  auto const factor1 = Utils::sqr(std::numbers::pi * alpha_L_i);
   auto alias1 = 0.;
   auto alias2 = 0.;
-  for (int mx = -P3M_BRILLOUIN; mx <= P3M_BRILLOUIN; mx++) {
-    auto const nmx = nx + mx * mesh;
-    auto const fnmx = mesh_i * nmx;
-    for (int my = -P3M_BRILLOUIN; my <= P3M_BRILLOUIN; my++) {
-      auto const nmy = ny + my * mesh;
-      auto const fnmy = mesh_i * nmy;
-      for (int mz = -P3M_BRILLOUIN; mz <= P3M_BRILLOUIN; mz++) {
-        auto const nmz = nz + mz * mesh;
-        auto const fnmz = mesh_i * nmz;
-
-        auto const nm2 = Utils::sqr(nmx) + Utils::sqr(nmy) + Utils::sqr(nmz);
-        auto const ex = std::exp(-factor1 * nm2);
-
-        auto const U2 = pow(sinc(fnmx) * sinc(fnmy) * sinc(fnmz), 2. * cao);
-
-        alias1 += Utils::sqr(ex) * nm2;
-        alias2 += U2 * ex * pow((nx * nmx + ny * nmy + nz * nmz), 3.) / nm2;
-      }
-    }
-  }
+
+  Utils::Vector3i indices{};
+  Utils::Vector3i nm{};
+  Utils::Vector3d fnm{};
+  for_each_3d(
+      mesh_start, mesh_stop, indices,
+      [&]() {
+        auto const norm_sq = nm.norm2();
+        auto const ex = std::exp(-factor1 * norm_sq);
+        auto const U2 = std::pow(Utils::product(fnm), 2 * cao);
+        alias1 += Utils::sqr(ex) * norm_sq;
+        alias2 += U2 * ex * std::pow(shift * nm, 3) / norm_sq;
+      },
+      [&](unsigned dim, int n) {
+        nm[dim] = shift[dim] + n * mesh;
+        fnm[dim] = math::sinc(nm[dim] * mesh_i);
+      });
+
   return std::make_pair(alias1, alias2);
 }
 
 /** Calculate the k-space error of dipolar-P3M */
 static double dp3m_k_space_error(double box_size, int mesh, int cao,
                                  int n_c_part, double sum_q2, double alpha_L) {
-  double he_q = 0.;
-  auto const mesh_i = 1. / mesh;
-  auto const alpha_L_i = 1. / alpha_L;
 
-  for (int nx = -mesh / 2; nx < mesh / 2; nx++)
-    for (int ny = -mesh / 2; ny < mesh / 2; ny++)
-      for (int nz = -mesh / 2; nz < mesh / 2; nz++)
-        if ((nx != 0) || (ny != 0) || (nz != 0)) {
-          auto const n2 = Utils::sqr(nx) + Utils::sqr(ny) + Utils::sqr(nz);
-          auto const cs = p3m_analytic_cotangent_sum(nx, mesh_i, cao) *
-                          p3m_analytic_cotangent_sum(ny, mesh_i, cao) *
-                          p3m_analytic_cotangent_sum(nz, mesh_i, cao);
+  auto const cotangent_sum = math::get_analytic_cotangent_sum_kernel(cao);
+  auto const mesh_i = 1. / static_cast<double>(mesh);
+  auto const alpha_L_i = 1. / alpha_L;
+  auto const mesh_stop = Utils::Vector3i::broadcast(mesh / 2);
+  auto const mesh_start = -mesh_stop;
+  auto indices = Utils::Vector3i{};
+  auto values = Utils::Vector3d{};
+  auto he_q = 0.;
+
+  for_each_3d(
+      mesh_start, mesh_stop, indices,
+      [&]() {
+        if ((indices[0] != 0) or (indices[1] != 0) or (indices[2] != 0)) {
+          auto const n2 = indices.norm2();
+          auto const cs = Utils::product(values);
           auto const [alias1, alias2] =
-              dp3m_tune_aliasing_sums(nx, ny, nz, mesh, mesh_i, cao, alpha_L_i);
+              dp3m_tune_aliasing_sums(indices, mesh, mesh_i, cao, alpha_L_i);
           auto const d =
               alias1 - Utils::sqr(alias2 / cs) /
                            Utils::int_pow<3>(static_cast<double>(n2));
           /* at high precision, d can become negative due to extinction;
              also, don't take values that have no significant digits left*/
-          if (d > 0 && (fabs(d / alias1) > ROUND_ERROR_PREC))
+          if (d > 0. and std::fabs(d / alias1) > ROUND_ERROR_PREC)
             he_q += d;
         }
+      },
+      [&values, &mesh_i, cotangent_sum](unsigned dim, int n) {
+        values[dim] = cotangent_sum(n, mesh_i);
+      });
 
-  return 8. * Utils::sqr(Utils::pi()) / 3. * sum_q2 * sqrt(he_q / n_c_part) /
-         Utils::int_pow<4>(box_size);
+  return 8. * Utils::sqr(std::numbers::pi) / 3. * sum_q2 *
+         std::sqrt(he_q / n_c_part) / Utils::int_pow<4>(box_size);
 }
 
 /** Calculate the value of the errors for the REAL part of the force in terms
@@ -854,7 +777,7 @@ double dp3m_rtbisection(double box_size, double r_cut_iL, int n_c_part,
                         double tuned_accuracy) {
   constexpr int JJ_RTBIS_MAX = 40;
 
-  auto const constant = tuned_accuracy / Utils::sqrt_2();
+  auto const constant = tuned_accuracy / std::numbers::sqrt2;
 
   auto const f1 =
       dp3m_real_space_error(box_size, r_cut_iL, n_c_part, sum_q2, x1) -
@@ -886,7 +809,7 @@ void DipolarP3M::sanity_checks_boxl() const {
   auto const &system = get_system();
   auto const &box_geo = *system.box_geo;
   auto const &local_geo = *system.local_geo;
-  for (unsigned int i = 0u; i < 3u; i++) {
+  for (auto i = 0u; i < 3u; i++) {
     /* check k-space cutoff */
     if (dp3m.params.cao_cut[i] >= box_geo.length_half()[i]) {
       std::stringstream msg;
@@ -955,9 +878,9 @@ void DipolarP3M::calc_energy_correction() {
   auto const &box_geo = *get_system().box_geo;
   auto const Ukp3m = calc_average_self_energy_k_space() * box_geo.volume();
   auto const Ewald_volume = Utils::int_pow<3>(dp3m.params.alpha_L);
-  auto const Eself = -2. * Ewald_volume * Utils::sqrt_pi_i() / 3.;
+  auto const Eself = -2. * Ewald_volume * std::numbers::inv_sqrtpi / 3.;
   dp3m.energy_correction =
-      -dp3m.sum_mu2 * (Ukp3m + Eself + 2. * Utils::pi() / 3.);
+      -dp3m.sum_mu2 * (Ukp3m + Eself + 2. * std::numbers::pi / 3.);
 }
 
 #ifdef NPT
@@ -965,4 +888,5 @@ void npt_add_virial_magnetic_contribution(double energy) {
   npt_add_virial_contribution(energy);
 }
 #endif // NPT
+
 #endif // DP3M
diff --git a/src/core/magnetostatics/dp3m.hpp b/src/core/magnetostatics/dp3m.hpp
index 85bfcdeb638..0bdf4f05ec2 100644
--- a/src/core/magnetostatics/dp3m.hpp
+++ b/src/core/magnetostatics/dp3m.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 The ESPResSo project
+ * Copyright (C) 2010-2024 The ESPResSo project
  * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
  *   Max-Planck-Institute for Polymer Research, Theory Group
  *
@@ -39,7 +39,6 @@
 
 #include "p3m/common.hpp"
 #include "p3m/data_struct.hpp"
-#include "p3m/fft.hpp"
 #include "p3m/interpolation.hpp"
 #include "p3m/send_mesh.hpp"
 
@@ -47,11 +46,12 @@
 #include "ParticleRange.hpp"
 
 #include <utils/Vector.hpp>
-#include <utils/constants.hpp>
 #include <utils/math/AS_erfc_part.hpp>
 
 #include <array>
 #include <cmath>
+#include <numbers>
+#include <utility>
 #include <vector>
 
 #ifdef NPT
@@ -59,42 +59,28 @@
 void npt_add_virial_magnetic_contribution(double energy);
 #endif
 
-struct dp3m_data_struct : public p3m_data_struct_base {
-  explicit dp3m_data_struct(P3MParameters &&parameters)
-      : p3m_data_struct_base{std::move(parameters)} {}
-
-  /** local mesh. */
-  P3MLocalMesh local_mesh;
-  /** real space mesh (local) for CA/FFT. */
-  fft_vector<double> rs_mesh;
-  /** real space mesh (local) for CA/FFT of the dipolar field. */
-  std::array<fft_vector<double>, 3> rs_mesh_dip;
-  /** k-space mesh (local) for k-space calculation and FFT. */
-  std::vector<double> ks_mesh;
-
-  /** number of dipolar particles (only on head node). */
-  int sum_dip_part = 0;
-  /** Sum of square of magnetic dipoles (only on head node). */
-  double sum_mu2 = 0.;
-
-  /** position shift for calculation of first assignment mesh point. */
-  double pos_shift = 0.;
+/** @brief Dipolar P3M solver. */
+struct DipolarP3M : public Dipoles::Actor<DipolarP3M> {
+  struct p3m_data_struct_impl : public p3m_data_struct {
+    explicit p3m_data_struct_impl(P3MParameters &&parameters)
+        : p3m_data_struct{std::move(parameters)} {}
 
-  p3m_interpolation_cache inter_weights;
+    /** number of dipolar particles (only on head node). */
+    int sum_dip_part = 0;
+    /** Sum of square of magnetic dipoles (only on head node). */
+    double sum_mu2 = 0.;
 
-  /** send/recv mesh sizes */
-  p3m_send_mesh sm;
+    /** position shift for calculation of first assignment mesh point. */
+    double pos_shift = 0.;
 
-  /** cached k-space self-energy correction */
-  double energy_correction = 0.;
+    p3m_interpolation_cache inter_weights;
 
-  fft_data_struct fft;
-};
+    /** cached k-space self-energy correction */
+    double energy_correction = 0.;
+  };
 
-/** @brief Dipolar P3M solver. */
-struct DipolarP3M : public Dipoles::Actor<DipolarP3M> {
   /** Dipolar P3M parameters. */
-  dp3m_data_struct dp3m;
+  p3m_data_struct_impl dp3m;
 
   /** Magnetostatics prefactor. */
   int tune_timings;
@@ -209,7 +195,7 @@ struct DipolarP3M : public Dipoles::Actor<DipolarP3M> {
     auto const mir = dip1 * d;
     auto const mjr = dip2 * d;
 
-    auto const coeff = 2. * dp3m.params.alpha * Utils::sqrt_pi_i();
+    auto const coeff = 2. * dp3m.params.alpha * std::numbers::inv_sqrtpi;
     auto const dist2i = 1. / dist2;
     auto const exp_adist2 = exp(-Utils::sqr(adist));
 
@@ -268,7 +254,7 @@ struct DipolarP3M : public Dipoles::Actor<DipolarP3M> {
     auto const mir = dip1 * d;
     auto const mjr = dip2 * d;
 
-    auto const coeff = 2. * dp3m.params.alpha * Utils::sqrt_pi_i();
+    auto const coeff = 2. * dp3m.params.alpha * std::numbers::inv_sqrtpi;
     auto const dist2i = 1. / dist2;
     auto const exp_adist2 = exp(-Utils::sqr(adist));
 
diff --git a/src/core/magnetostatics/scafacos_impl.cpp b/src/core/magnetostatics/scafacos_impl.cpp
index 20c39bafa15..95eebbb9f65 100644
--- a/src/core/magnetostatics/scafacos_impl.cpp
+++ b/src/core/magnetostatics/scafacos_impl.cpp
@@ -31,12 +31,13 @@
 #include "communication.hpp"
 #include "system/System.hpp"
 
-#include <utils/Span.hpp>
 #include <utils/Vector.hpp>
 #include <utils/matrix.hpp>
 
 #include <cassert>
+#include <iterator>
 #include <memory>
+#include <span>
 #include <string>
 
 std::shared_ptr<DipolarScafacos>
@@ -72,13 +73,13 @@ void DipolarScafacosImpl::update_particle_forces() const {
   auto const &cell_structure = *get_system().cell_structure;
 
   auto it_potentials = potentials.begin();
-  auto it_f = std::size_t{0ul};
+  auto index = std::size_t{0ul};
   for (auto &p : cell_structure.local_particles()) {
     // The scafacos term "potential" here in fact refers to the magnetic
     // field. So, the torques are given by m \times B
     auto const dip = p.calc_dip();
     auto const t = vector_product(
-        dip, Utils::Vector3d(Utils::Span<const double>(&*it_potentials, 3)));
+        dip, Utils::Vector3d(std::span<const double>(&*it_potentials, 3ul)));
     // The force is given by G m, where G is a matrix
     // which comes from the "fields" output of scafacos like this
     // 0 1 2
@@ -86,16 +87,16 @@ void DipolarScafacosImpl::update_particle_forces() const {
     // 2 4 5
     // where the numbers refer to indices in the "field" output from scafacos
     auto const G = Utils::Matrix<double, 3, 3>{
-        {fields[it_f + 0ul], fields[it_f + 1ul], fields[it_f + 2ul]},
-        {fields[it_f + 1ul], fields[it_f + 3ul], fields[it_f + 4ul]},
-        {fields[it_f + 2ul], fields[it_f + 4ul], fields[it_f + 5ul]}};
+        {fields[index + 0ul], fields[index + 1ul], fields[index + 2ul]},
+        {fields[index + 1ul], fields[index + 3ul], fields[index + 4ul]},
+        {fields[index + 2ul], fields[index + 4ul], fields[index + 5ul]}};
     auto const f = G * dip;
 
     // Add to particles
     p.force() += prefactor * f;
     p.torque() += prefactor * t;
-    it_f += 6ul;
-    it_potentials += 3;
+    index += 6ul;
+    std::advance(it_potentials, 3);
   }
 
   /* Check that the particle number did not change */
diff --git a/src/core/nonbonded_interactions/ljcos.cpp b/src/core/nonbonded_interactions/ljcos.cpp
index 1fa4d75e266..bc24335f65f 100644
--- a/src/core/nonbonded_interactions/ljcos.cpp
+++ b/src/core/nonbonded_interactions/ljcos.cpp
@@ -27,9 +27,9 @@
 #ifdef LJCOS
 #include "nonbonded_interaction_data.hpp"
 
-#include <utils/constants.hpp>
 #include <utils/math/sqr.hpp>
 
+#include <numbers>
 #include <stdexcept>
 
 LJcos_Parameters::LJcos_Parameters(double epsilon, double sigma, double cutoff,
@@ -44,11 +44,13 @@ LJcos_Parameters::LJcos_Parameters(double epsilon, double sigma, double cutoff,
   if (cutoff < 0.) {
     throw std::domain_error("LJcos parameter 'cutoff' has to be >= 0");
   }
-  auto const facsq = Utils::cbrt_2() * Utils::sqr(sig);
 
-  rmin = sqrt(Utils::cbrt_2()) * sig;
-  alfa = Utils::pi() / (Utils::sqr(cut) - facsq);
-  beta = Utils::pi() * (1. - (1. / (Utils::sqr(cut) / facsq - 1.)));
+  constexpr auto cbrt2 = 1.25992104989487316476721060727822835057025;
+  auto const facsq = cbrt2 * Utils::sqr(sig);
+
+  rmin = sqrt(cbrt2) * sig;
+  alfa = std::numbers::pi / (Utils::sqr(cut) - facsq);
+  beta = std::numbers::pi * (1. - (1. / (Utils::sqr(cut) / facsq - 1.)));
 }
 
 #endif // LJCOS
diff --git a/src/core/nonbonded_interactions/ljcos2.hpp b/src/core/nonbonded_interactions/ljcos2.hpp
index 482da3b435a..969788cfd34 100644
--- a/src/core/nonbonded_interactions/ljcos2.hpp
+++ b/src/core/nonbonded_interactions/ljcos2.hpp
@@ -36,31 +36,30 @@
 
 #include "nonbonded_interaction_data.hpp"
 
-#include <utils/constants.hpp>
 #include <utils/math/int_pow.hpp>
 #include <utils/math/sqr.hpp>
 
 #include <cmath>
+#include <numbers>
 
 /** Calculate Lennard-Jones cosine squared force factor */
 inline double ljcos2_pair_force_factor(IA_parameters const &ia_params,
                                        double dist) {
   if (dist < (ia_params.ljcos2.cut + ia_params.ljcos2.offset)) {
     auto const r_off = dist - ia_params.ljcos2.offset;
-    auto fac = 0.0;
+    auto fac = 0.;
     if (r_off < ia_params.ljcos2.rchange) {
       auto const frac6 = Utils::int_pow<6>(ia_params.ljcos2.sig / r_off);
-      fac =
-          48.0 * ia_params.ljcos2.eps * frac6 * (frac6 - 0.5) / (r_off * dist);
+      fac = 48. * ia_params.ljcos2.eps * frac6 * (frac6 - 0.5) / (r_off * dist);
     } else if (r_off < ia_params.ljcos2.rchange + ia_params.ljcos2.w) {
-      fac = -ia_params.ljcos2.eps * Utils::pi() / 2 / ia_params.ljcos2.w /
+      fac = -ia_params.ljcos2.eps * std::numbers::pi / 2. / ia_params.ljcos2.w /
             dist *
-            sin(Utils::pi() * (r_off - ia_params.ljcos2.rchange) /
+            sin(std::numbers::pi * (r_off - ia_params.ljcos2.rchange) /
                 ia_params.ljcos2.w);
     }
     return fac;
   }
-  return 0.0;
+  return 0.;
 }
 
 /** Calculate Lennard-Jones cosine squared energy */
@@ -69,16 +68,16 @@ inline double ljcos2_pair_energy(IA_parameters const &ia_params, double dist) {
     auto const r_off = dist - ia_params.ljcos2.offset;
     if (r_off < ia_params.ljcos2.rchange) {
       auto const frac6 = Utils::int_pow<6>(ia_params.ljcos2.sig / r_off);
-      return 4.0 * ia_params.ljcos2.eps * (Utils::sqr(frac6) - frac6);
+      return 4. * ia_params.ljcos2.eps * (Utils::sqr(frac6) - frac6);
     }
     if (r_off < (ia_params.ljcos2.rchange + ia_params.ljcos2.w)) {
-      return -ia_params.ljcos2.eps / 2 *
-             (cos(Utils::pi() * (r_off - ia_params.ljcos2.rchange) /
+      return -ia_params.ljcos2.eps / 2. *
+             (cos(std::numbers::pi * (r_off - ia_params.ljcos2.rchange) /
                   ia_params.ljcos2.w) +
-              1);
+              1.);
     }
   }
-  return 0.0;
+  return 0.;
 }
 
 #endif /* ifdef LJCOS2 */
diff --git a/src/core/npt.cpp b/src/core/npt.cpp
index d75e4bbe068..15cfdda6d6d 100644
--- a/src/core/npt.cpp
+++ b/src/core/npt.cpp
@@ -86,11 +86,11 @@ NptIsoParameters::NptIsoParameters(double ext_pressure, double piston,
   p_vel = ::nptiso.p_vel;
 
   /* set the NpT geometry */
-  for (auto const i : {0, 1, 2}) {
+  for (auto const i : {0u, 1u, 2u}) {
     if (rescale[i]) {
       geometry |= ::nptgeom_dir[i];
       dimension += 1;
-      non_const_dim = i;
+      non_const_dim = static_cast<int>(i);
     }
   }
 
diff --git a/src/core/object-in-fluid/oif_global_forces.cpp b/src/core/object-in-fluid/oif_global_forces.cpp
index 14296433318..30fee91916c 100644
--- a/src/core/object-in-fluid/oif_global_forces.cpp
+++ b/src/core/object-in-fluid/oif_global_forces.cpp
@@ -25,11 +25,11 @@
 
 #include "bonded_interactions/bonded_interaction_data.hpp"
 
-#include <utils/Span.hpp>
 #include <utils/Vector.hpp>
-#include <utils/constants.hpp>
 #include <utils/math/triangle_functions.hpp>
 
+#include <span>
+
 int max_oif_objects = 0;
 
 Utils::Vector2d calc_oif_global(int molType, BoxGeometry const &box_geo,
@@ -39,9 +39,8 @@ Utils::Vector2d calc_oif_global(int molType, BoxGeometry const &box_geo,
   // z volume
   double VOL_partVol = 0.;
 
-  cs.bond_loop([&partArea, &VOL_partVol, &box_geo,
-                molType](Particle &p1, int bond_id,
-                         Utils::Span<Particle *> partners) {
+  cs.bond_loop([&partArea, &VOL_partVol, &box_geo, molType](
+                   Particle &p1, int bond_id, std::span<Particle *> partners) {
     if (p1.mol_id() != molType)
       return false;
 
@@ -74,9 +73,8 @@ void add_oif_global_forces(Utils::Vector2d const &area_volume, int molType,
   double area = area_volume[0];
   double VOL_volume = area_volume[1];
 
-  cs.bond_loop([&box_geo, area, VOL_volume,
-                molType](Particle &p1, int bond_id,
-                         Utils::Span<Particle *> partners) {
+  cs.bond_loop([&box_geo, area, VOL_volume, molType](
+                   Particle &p1, int bond_id, std::span<Particle *> partners) {
     if (p1.mol_id() != molType)
       return false;
 
diff --git a/src/core/object-in-fluid/oif_global_forces_params.hpp b/src/core/object-in-fluid/oif_global_forces_params.hpp
index fbb60ef7159..8906b759a9b 100644
--- a/src/core/object-in-fluid/oif_global_forces_params.hpp
+++ b/src/core/object-in-fluid/oif_global_forces_params.hpp
@@ -51,10 +51,10 @@ struct OifGlobalForcesBond {
   friend boost::serialization::access;
   template <typename Archive>
   void serialize(Archive &ar, long int /* version */) {
-    ar &A0_g;
-    ar &ka_g;
-    ar &V0;
-    ar &kv;
+    ar & A0_g;
+    ar & ka_g;
+    ar & V0;
+    ar & kv;
   }
 };
 
diff --git a/src/core/object-in-fluid/oif_local_forces.hpp b/src/core/object-in-fluid/oif_local_forces.hpp
index b4d622cbd6f..3f741d1d75e 100644
--- a/src/core/object-in-fluid/oif_local_forces.hpp
+++ b/src/core/object-in-fluid/oif_local_forces.hpp
@@ -85,15 +85,15 @@ struct OifLocalForcesBond {
   friend boost::serialization::access;
   template <typename Archive>
   void serialize(Archive &ar, long int /* version */) {
-    ar &r0;
-    ar &ks;
-    ar &kslin;
-    ar &phi0;
-    ar &kb;
-    ar &A01;
-    ar &A02;
-    ar &kal;
-    ar &kvisc;
+    ar & r0;
+    ar & ks;
+    ar & kslin;
+    ar & phi0;
+    ar & kb;
+    ar & A01;
+    ar & A02;
+    ar & kal;
+    ar & kvisc;
   }
 };
 
diff --git a/src/core/observables/CylindricalProfileObservable.hpp b/src/core/observables/CylindricalProfileObservable.hpp
index 988343f7ac3..c9e7ee0c816 100644
--- a/src/core/observables/CylindricalProfileObservable.hpp
+++ b/src/core/observables/CylindricalProfileObservable.hpp
@@ -23,8 +23,6 @@
 
 #include <utils/math/cylindrical_transformation_parameters.hpp>
 
-#include <boost/range/algorithm.hpp>
-
 #include <memory>
 #include <utility>
 
diff --git a/src/core/observables/EnergyObservable.hpp b/src/core/observables/EnergyObservable.hpp
index 44407c89b65..c073170e75a 100644
--- a/src/core/observables/EnergyObservable.hpp
+++ b/src/core/observables/EnergyObservable.hpp
@@ -30,9 +30,9 @@ namespace Observables {
 
 class Energy : public Observable {
 public:
-  std::vector<std::size_t> shape() const override { return {1}; }
+  std::vector<std::size_t> shape() const override { return {1u}; }
   std::vector<double>
-  operator()(boost::mpi::communicator const &comm) const override {
+  operator()(boost::mpi::communicator const &) const override {
     return {System::get_system().calculate_energy()->accumulate(0u)};
   }
 };
diff --git a/src/core/observables/PidObservable.hpp b/src/core/observables/PidObservable.hpp
index 4e9894a61d6..9e4b7787f10 100644
--- a/src/core/observables/PidObservable.hpp
+++ b/src/core/observables/PidObservable.hpp
@@ -17,8 +17,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#ifndef OBSERVABLES_PIDOBSERVABLE_HPP
-#define OBSERVABLES_PIDOBSERVABLE_HPP
+#pragma once
 
 #include <particle_observables/observable.hpp>
 
@@ -31,10 +30,10 @@
 
 #include <boost/mpi/collectives/gather.hpp>
 #include <boost/mpi/collectives/reduce.hpp>
-#include <boost/range/algorithm/copy.hpp>
 #include <boost/serialization/utility.hpp>
 #include <boost/serialization/vector.hpp>
 
+#include <algorithm>
 #include <cassert>
 #include <cstddef>
 #include <iterator>
@@ -85,7 +84,7 @@ template <class _, std::size_t N> struct shape_impl<Utils::Vector<_, N>> {
 template <class T> struct shape_impl<std::vector<T>> {
   static std::vector<std::size_t> eval(std::size_t n_part) {
     std::vector<std::size_t> ret{n_part};
-    boost::copy(shape_impl<T>::eval(n_part), std::back_inserter(ret));
+    std::ranges::copy(shape_impl<T>::eval(n_part), std::back_inserter(ret));
 
     return ret;
   }
@@ -157,7 +156,7 @@ get_all_particle_positions(boost::mpi::communicator const &comm,
   auto const argsort = detail::get_argsort(comm, local_pids, sorted_pids);
 
   std::vector<std::vector<pos_type>> global_positions{};
-  global_positions.reserve(comm.size());
+  global_positions.reserve(static_cast<std::size_t>(comm.size()));
   boost::mpi::gather(comm, local_positions, global_positions, 0);
 
   if (comm.rank() != 0) {
@@ -214,7 +213,7 @@ template <class ObsType> class ParticleObservable : public PidObservable {
   std::vector<double>
   evaluate(boost::mpi::communicator const &comm,
            ParticleReferenceRange const &local_particles,
-           ParticleObservables::traits<Particle> const &traits) const override {
+           ParticleObservables::traits<Particle> const &) const override {
     if constexpr (is_map<ObsType>::value) {
       std::vector<double> local_traits{};
       local_traits.reserve(local_particles.size());
@@ -271,4 +270,3 @@ template <class ObsType> class ParticleObservable : public PidObservable {
 };
 
 } // namespace Observables
-#endif
diff --git a/src/core/observables/PressureObservable.hpp b/src/core/observables/PressureObservable.hpp
index 65d26d1b9ad..a0aff183665 100644
--- a/src/core/observables/PressureObservable.hpp
+++ b/src/core/observables/PressureObservable.hpp
@@ -30,9 +30,9 @@ namespace Observables {
 
 class Pressure : public Observable {
 public:
-  std::vector<std::size_t> shape() const override { return {1}; }
+  std::vector<std::size_t> shape() const override { return {1u}; }
   std::vector<double>
-  operator()(boost::mpi::communicator const &comm) const override {
+  operator()(boost::mpi::communicator const &) const override {
     auto const obs = System::get_system().calculate_pressure();
 
     return {(obs->accumulate(0., 0u) + obs->accumulate(0., 4u) +
diff --git a/src/core/observables/PressureTensor.hpp b/src/core/observables/PressureTensor.hpp
index d7e1f2ace5b..4c987481ce3 100644
--- a/src/core/observables/PressureTensor.hpp
+++ b/src/core/observables/PressureTensor.hpp
@@ -30,9 +30,9 @@ namespace Observables {
 
 class PressureTensor : public Observable {
 public:
-  std::vector<std::size_t> shape() const override { return {3, 3}; }
+  std::vector<std::size_t> shape() const override { return {3u, 3u}; }
   std::vector<double>
-  operator()(boost::mpi::communicator const &comm) const override {
+  operator()(boost::mpi::communicator const &) const override {
     auto const obs = System::get_system().calculate_pressure();
 
     std::vector<double> result;
diff --git a/src/core/observables/ProfileObservable.hpp b/src/core/observables/ProfileObservable.hpp
index c045b1fd1d7..d032eb834ed 100644
--- a/src/core/observables/ProfileObservable.hpp
+++ b/src/core/observables/ProfileObservable.hpp
@@ -23,8 +23,7 @@
 
 #include <utils/math/make_lin_space.hpp>
 
-#include <boost/range/algorithm.hpp>
-
+#include <algorithm>
 #include <array>
 #include <cstddef>
 #include <stdexcept>
@@ -75,18 +74,15 @@ class ProfileObservable : virtual public Observable {
   /** Calculate the bin edges for each dimension */
   std::array<std::vector<double>, 3> edges() const {
     std::array<std::vector<double>, 3> profile_edges = {
-        {std::vector<double>(m_n_bins[0] + 1),
-         std::vector<double>(m_n_bins[1] + 1),
-         std::vector<double>(m_n_bins[2] + 1)}};
-    boost::copy(Utils::make_lin_space(m_limits[0].first, m_limits[0].second,
-                                      m_n_bins[0] + 1),
-                profile_edges[0].begin());
-    boost::copy(Utils::make_lin_space(m_limits[1].first, m_limits[1].second,
-                                      m_n_bins[1] + 1),
-                profile_edges[1].begin());
-    boost::copy(Utils::make_lin_space(m_limits[2].first, m_limits[2].second,
-                                      m_n_bins[2] + 1),
-                profile_edges[2].begin());
+        {std::vector<double>(m_n_bins[0u] + 1u),
+         std::vector<double>(m_n_bins[1u] + 1u),
+         std::vector<double>(m_n_bins[2u] + 1u)}};
+    for (auto i = 0u; i < 3u; ++i) {
+      std::ranges::copy(Utils::make_lin_space(m_limits[i].first,
+                                              m_limits[i].second,
+                                              m_n_bins[i] + 1u),
+                        profile_edges[i].begin());
+    }
     return profile_edges;
   }
 };
diff --git a/src/core/observables/RDF.cpp b/src/core/observables/RDF.cpp
index e631fefc783..a42fd7d8c84 100644
--- a/src/core/observables/RDF.cpp
+++ b/src/core/observables/RDF.cpp
@@ -23,14 +23,15 @@
 #include "system/System.hpp"
 
 #include <utils/Vector.hpp>
-#include <utils/constants.hpp>
 #include <utils/for_each_pair.hpp>
 #include <utils/math/int_pow.hpp>
 
-#include <boost/range/algorithm/transform.hpp>
+#include <boost/mpi/communicator.hpp>
 #include <boost/range/combine.hpp>
 
 #include <cmath>
+#include <cstddef>
+#include <numbers>
 #include <vector>
 
 namespace Observables {
@@ -64,7 +65,7 @@ RDF::evaluate(boost::mpi::communicator const &comm,
   auto const &box_geo = *System::get_system().box_geo;
   auto const bin_width = (max_r - min_r) / static_cast<double>(n_r_bins);
   auto const inv_bin_width = 1.0 / bin_width;
-  std::vector<double> res(n_values(), 0.0);
+  std::vector<double> res(n_r_bins, 0.0);
   long int cnt = 0;
   auto op = [this, inv_bin_width, &cnt, &res, &box_geo](auto const &pos1,
                                                         auto const &pos2) {
@@ -102,11 +103,11 @@ RDF::evaluate(boost::mpi::communicator const &comm,
     return res;
   // normalization
   auto const volume = box_geo.volume();
-  for (int i = 0; i < n_r_bins; ++i) {
-    auto const r_in = i * bin_width + min_r;
+  for (std::size_t i = 0u; i < res.size(); ++i) {
+    auto const r_in = static_cast<double>(i) * bin_width + min_r;
     auto const r_out = r_in + bin_width;
     auto const bin_volume =
-        (4.0 / 3.0) * Utils::pi() *
+        (4. / 3.) * std::numbers::pi *
         (Utils::int_pow<3>(r_out) - Utils::int_pow<3>(r_in));
     res[i] *= volume / (bin_volume * static_cast<double>(cnt));
   }
diff --git a/src/core/observables/RDF.hpp b/src/core/observables/RDF.hpp
index ccebb1bd37b..bb21a293514 100644
--- a/src/core/observables/RDF.hpp
+++ b/src/core/observables/RDF.hpp
@@ -17,8 +17,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#ifndef OBSERVABLES_RDF_HPP
-#define OBSERVABLES_RDF_HPP
+#pragma once
 
 #include "Observable.hpp"
 
@@ -53,14 +52,15 @@ class RDF : public Observable {
 
   std::vector<std::size_t> shape() const override { return {n_r_bins}; }
 
-  explicit RDF(std::vector<int> ids1, std::vector<int> ids2, int n_r_bins,
-               double min_r, double max_r)
+  RDF(std::vector<int> ids1, std::vector<int> ids2, int n_r_bins, double min_r,
+      double max_r)
       : m_ids1(std::move(ids1)), m_ids2(std::move(ids2)), min_r(min_r),
-        max_r(max_r), n_r_bins(n_r_bins) {
+        max_r(max_r) {
     if (max_r <= min_r)
       throw std::runtime_error("max_r has to be > min_r");
     if (n_r_bins <= 0)
       throw std::domain_error("n_r_bins has to be >= 1");
+    this->n_r_bins = static_cast<std::size_t>(n_r_bins);
   }
   std::vector<double>
   operator()(boost::mpi::communicator const &comm) const final;
@@ -72,4 +72,3 @@ class RDF : public Observable {
 };
 
 } // Namespace Observables
-#endif
diff --git a/src/core/observables/utils_histogram.hpp b/src/core/observables/utils_histogram.hpp
index 591423e2c3e..20ea26adc3a 100644
--- a/src/core/observables/utils_histogram.hpp
+++ b/src/core/observables/utils_histogram.hpp
@@ -36,7 +36,7 @@ template <class Pos>
 auto gather(boost::mpi::communicator const &comm,
             std::vector<Pos> const &local_pos) {
   std::vector<std::vector<Pos>> global_pos{};
-  global_pos.reserve(comm.size());
+  global_pos.reserve(static_cast<std::size_t>(comm.size()));
   boost::mpi::gather(comm, local_pos, global_pos, 0);
   return global_pos;
 }
@@ -46,12 +46,11 @@ template <class Pos, class Val>
 auto gather(boost::mpi::communicator const &comm,
             std::vector<Pos> const &local_pos,
             std::vector<Val> const &local_val) {
-  auto const world_size = comm.size();
   std::vector<std::vector<Pos>> global_pos{};
-  global_pos.reserve(world_size);
+  global_pos.reserve(static_cast<std::size_t>(comm.size()));
   boost::mpi::gather(comm, local_pos, global_pos, 0);
   std::vector<std::vector<Val>> global_val{};
-  global_val.reserve(world_size);
+  global_val.reserve(static_cast<std::size_t>(comm.size()));
   boost::mpi::gather(comm, local_val, global_val, 0);
   return std::make_pair(global_pos, global_val);
 }
diff --git a/src/core/p3m/CMakeLists.txt b/src/core/p3m/CMakeLists.txt
index f05697c8378..2a6dd407baf 100644
--- a/src/core/p3m/CMakeLists.txt
+++ b/src/core/p3m/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2018-2022 The ESPResSo project
+# Copyright (C) 2018-2024 The ESPResSo project
 #
 # This file is part of ESPResSo.
 #
@@ -17,13 +17,5 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-if(FFTW3_FOUND)
-  target_link_libraries(espresso_core PUBLIC FFTW3::FFTW3)
-endif()
-
-target_sources(
-  espresso_core
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/TuningAlgorithm.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/send_mesh.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp)
+target_sources(espresso_core PRIVATE common.cpp send_mesh.cpp
+                                     TuningAlgorithm.cpp FFTBackendLegacy.cpp)
diff --git a/src/core/p3m/FFTBackendLegacy.cpp b/src/core/p3m/FFTBackendLegacy.cpp
new file mode 100644
index 00000000000..146c241de19
--- /dev/null
+++ b/src/core/p3m/FFTBackendLegacy.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (C) 2010-2024 The ESPResSo project
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
+ *   Max-Planck-Institute for Polymer Research, Theory Group
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "config/config.hpp"
+
+#if defined(P3M) or defined(DP3M)
+
+#include "FFTBackendLegacy.hpp"
+
+#include "communication.hpp"
+
+#include "fft/fft.hpp"
+
+#include <utils/Vector.hpp>
+
+#include <array>
+#include <memory>
+#include <span>
+#include <utility>
+
+FFTBackendLegacy::FFTBackendLegacy(p3m_data_struct &obj, bool dipolar)
+    : FFTBackend(obj), dipolar{dipolar},
+      fft{std::make_unique<fft::fft_data_struct>(
+          ::Communication::mpiCallbacksHandle()->share_mpi_env())} {}
+
+FFTBackendLegacy::~FFTBackendLegacy() = default;
+
+void FFTBackendLegacy::update_mesh_data() {
+  auto const mesh_size_ptr = fft->get_mesh_size();
+  auto const mesh_start_ptr = fft->get_mesh_start();
+  for (auto i = 0u; i < 3u; ++i) { // copy size and start reported by fft
+    mesh.size[i] = mesh_size_ptr[i];
+    mesh.start[i] = mesh_start_ptr[i];
+  }
+  mesh.stop = mesh.start + mesh.size;
+  mesh.ks_scalar = std::span(ks_mesh); // non-owning views of the buffers
+  mesh.rs_scalar = std::span(rs_mesh);
+  for (auto i = 0u; i < 3u; ++i) {
+    mesh.rs_fields[i] = std::span(rs_mesh_fields[i]);
+  }
+}
+
+void FFTBackendLegacy::init_fft() {
+  mesh_comm.resize(::comm_cart, local_mesh);
+  auto ca_mesh_size = fft->initialize_fft(
+      ::comm_cart, local_mesh.dim, local_mesh.margin, params.mesh,
+      params.mesh_off, mesh.ks_pnum, ::communicator.node_grid);
+  rs_mesh.resize(ca_mesh_size);
+  if (dipolar) { // k-space mesh is only allocated in dipolar mode
+    ks_mesh.resize(ca_mesh_size);
+  }
+  for (auto &rs_mesh_field : rs_mesh_fields) {
+    rs_mesh_field.resize(ca_mesh_size);
+  }
+  update_mesh_data(); // rebuild the spans in `mesh` after the resizes
+}
+
+void FFTBackendLegacy::perform_field_back_fft() {
+  /* Back FFT each of the three field component meshes */
+  for (auto &rs_mesh_field : rs_mesh_fields) {
+    fft->backward_fft(::comm_cart, rs_mesh_field.data(),
+                      check_complex_residuals);
+  }
+  /* redistribute the three field component meshes */
+  std::array<double *, 3u> meshes = {{rs_mesh_fields[0u].data(),
+                                      rs_mesh_fields[1u].data(),
+                                      rs_mesh_fields[2u].data()}};
+  mesh_comm.spread_grid(::comm_cart, meshes, local_mesh.dim);
+}
+
+void FFTBackendLegacy::perform_fwd_fft() {
+  if (dipolar) { // dipolar mode: transform the three field meshes
+    std::array<double *, 3u> meshes = {{rs_mesh_fields[0u].data(),
+                                        rs_mesh_fields[1u].data(),
+                                        rs_mesh_fields[2u].data()}};
+    mesh_comm.gather_grid(::comm_cart, meshes, local_mesh.dim);
+    for (auto &rs_mesh_field : rs_mesh_fields) {
+      fft->forward_fft(::comm_cart, rs_mesh_field.data());
+    }
+  } else { // otherwise: transform the single scalar mesh
+    mesh_comm.gather_grid(::comm_cart, rs_mesh.data(), local_mesh.dim);
+    fft->forward_fft(::comm_cart, rs_mesh.data());
+  }
+  update_mesh_data(); // rebuild the sizes and spans stored in `mesh`
+}
+
+void FFTBackendLegacy::perform_space_back_fft() {
+  /* Back FFT the scalar real-space mesh */
+  fft->backward_fft(::comm_cart, rs_mesh.data(), check_complex_residuals);
+  /* redistribute the scalar real-space mesh */
+  mesh_comm.spread_grid(::comm_cart, rs_mesh.data(), local_mesh.dim);
+}
+
+#endif // defined(P3M) or defined(DP3M)
diff --git a/src/core/p3m/FFTBackendLegacy.hpp b/src/core/p3m/FFTBackendLegacy.hpp
new file mode 100644
index 00000000000..2ae1a379aef
--- /dev/null
+++ b/src/core/p3m/FFTBackendLegacy.hpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2010-2024 The ESPResSo project
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
+ *   Max-Planck-Institute for Polymer Research, Theory Group
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "config/config.hpp"
+
+#if defined(P3M) or defined(DP3M)
+
+#include "common.hpp"
+#include "data_struct.hpp"
+#include "send_mesh.hpp"
+
+#include "fft/vector.hpp"
+
+#include <array>
+#include <memory>
+#include <tuple>
+
+namespace fft {
+struct fft_data_struct;
+} // namespace fft
+
+/**
+ * @brief Historic FFT backend based on FFTW3.
+ * The 3D FFT is split into three 1D FFTs.
+ */
+class FFTBackendLegacy : public FFTBackend {
+  bool dipolar; ///< if set, the forward FFT runs on the three field meshes
+  std::unique_ptr<fft::fft_data_struct> fft;
+  /** @brief k-space mesh (local) for k-space calculations. */
+  std::vector<double> ks_mesh;
+  /** @brief real-space mesh (local) for CA/FFT. */
+  fft::vector<double> rs_mesh;
+  /** @brief real-space mesh (local) for the electric or dipolar field. */
+  std::array<fft::vector<double>, 3> rs_mesh_fields;
+  p3m_send_mesh mesh_comm;
+
+public:
+  explicit FFTBackendLegacy(p3m_data_struct &obj, bool dipolar);
+  ~FFTBackendLegacy() override; // defined in .cpp: incomplete fft_data_struct
+  void init_fft() override;
+  void perform_fwd_fft() override;
+  void perform_field_back_fft() override;
+  void perform_space_back_fft() override;
+  void update_mesh_data(); ///< refresh the sizes and spans stored in `mesh`
+
+  /**
+   * @brief Index helpers for reciprocal space.
+   * After the FFT the data is in order YZX, which
+   * means that Y is the slowest changing index.
+   */
+  std::tuple<int, int, int> get_permutations() const override {
+    constexpr static int KX = 2;
+    constexpr static int KY = 0;
+    constexpr static int KZ = 1;
+    return {KX, KY, KZ};
+  }
+};
+
+#endif // defined(P3M) or defined(DP3M)
diff --git a/src/core/p3m/TuningAlgorithm.cpp b/src/core/p3m/TuningAlgorithm.cpp
index 8386efb9cdd..ab6ad80c9d2 100644
--- a/src/core/p3m/TuningAlgorithm.cpp
+++ b/src/core/p3m/TuningAlgorithm.cpp
@@ -34,8 +34,6 @@
 #include "communication.hpp"
 #include "system/System.hpp"
 
-#include <boost/range/algorithm/min_element.hpp>
-
 #include <algorithm>
 #include <cassert>
 #include <cmath>
@@ -62,8 +60,8 @@ void TuningAlgorithm::determine_r_cut_limits() {
   auto const verlet_skin = m_system.cell_structure->get_verlet_skin();
   auto const r_cut_iL = get_params().r_cut_iL;
   if (r_cut_iL == 0.) {
-    auto const min_box_l = *boost::min_element(box_geo.length());
-    auto const min_local_box_l = *boost::min_element(local_geo.length());
+    auto const min_box_l = std::ranges::min(box_geo.length());
+    auto const min_local_box_l = std::ranges::min(local_geo.length());
     m_r_cut_iL_min = 0.;
     m_r_cut_iL_max = std::min(min_local_box_l, min_box_l / 2.) - verlet_skin;
     m_r_cut_iL_min *= box_geo.length_inv()[0];
@@ -130,12 +128,12 @@ double TuningAlgorithm::get_mc_time(Utils::Vector3i const &mesh, int cao,
   /* initial checks. */
   auto const k_cut_per_dir = (static_cast<double>(cao) / 2.) *
                              Utils::hadamard_division(box_geo.length(), mesh);
-  auto const k_cut = *boost::min_element(k_cut_per_dir);
-  auto const min_box_l = *boost::min_element(box_geo.length());
-  auto const min_local_box_l = *boost::min_element(local_geo.length());
+  auto const k_cut = std::ranges::min(k_cut_per_dir);
+  auto const min_box_l = std::ranges::min(box_geo.length());
+  auto const min_local_box_l = std::ranges::min(local_geo.length());
   auto const k_cut_max = std::min(min_box_l, min_local_box_l) - verlet_skin;
 
-  if (cao >= *boost::min_element(mesh) or k_cut >= k_cut_max) {
+  if (cao >= std::ranges::min(mesh) or k_cut >= k_cut_max) {
     m_logger->log_cao_too_large(mesh[0], cao);
     return -P3M_TUNE_CAO_TOO_LARGE;
   }
diff --git a/src/core/p3m/TuningAlgorithm.hpp b/src/core/p3m/TuningAlgorithm.hpp
index 758d6f5ee20..0b4d87cdadd 100644
--- a/src/core/p3m/TuningAlgorithm.hpp
+++ b/src/core/p3m/TuningAlgorithm.hpp
@@ -92,8 +92,8 @@ class TuningAlgorithm {
 
 public:
   TuningAlgorithm(System::System &system, double prefactor, int timings)
-      : m_system{system}, m_timings{timings}, m_n_trials{0ul}, m_prefactor{
-                                                                   prefactor} {}
+      : m_system{system}, m_timings{timings}, m_n_trials{0ul},
+        m_prefactor{prefactor} {}
 
   virtual ~TuningAlgorithm() = default;
 
diff --git a/src/core/p3m/common.cpp b/src/core/p3m/common.cpp
index 83011667d6b..6290c46d6ab 100644
--- a/src/core/p3m/common.cpp
+++ b/src/core/p3m/common.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 The ESPResSo project
+ * Copyright (C) 2010-2024 The ESPResSo project
  * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
  *   Max-Planck-Institute for Polymer Research, Theory Group
  *
@@ -21,59 +21,15 @@
 
 #include "config/config.hpp"
 
-#if defined(P3M) || defined(DP3M)
+#if defined(P3M) or defined(DP3M)
 
 #include "common.hpp"
 
 #include "LocalBox.hpp"
 
 #include <utils/Vector.hpp>
-#include <utils/constants.hpp>
-#include <utils/math/sqr.hpp>
 
 #include <cmath>
-#include <stdexcept>
-
-double p3m_analytic_cotangent_sum(int n, double mesh_i, int cao) {
-  auto const c =
-      Utils::sqr(std::cos(Utils::pi() * mesh_i * static_cast<double>(n)));
-
-  switch (cao) {
-  case 1: {
-    return 1.0;
-  }
-  case 2: {
-    return (1.0 + c * 2.0) / 3.0;
-  }
-  case 3: {
-    return (2.0 + c * (11.0 + c * 2.0)) / 15.0;
-  }
-  case 4: {
-    return (17.0 + c * (180.0 + c * (114.0 + c * 4.0))) / 315.0;
-  }
-  case 5: {
-    return (62.0 + c * (1072.0 + c * (1452.0 + c * (247.0 + c * 2.0)))) /
-           2835.0;
-  }
-  case 6: {
-    return (1382.0 +
-            c * (35396.0 +
-                 c * (83021.0 + c * (34096.0 + c * (2026.0 + c * 4.0))))) /
-           155925.0;
-  }
-  case 7: {
-    return (21844.0 +
-            c * (776661.0 +
-                 c * (2801040.0 +
-                      c * (2123860.0 +
-                           c * (349500.0 + c * (8166.0 + c * 4.0)))))) /
-           6081075.0;
-  }
-  default: {
-    throw std::logic_error("Invalid value cao=" + std::to_string(cao));
-  }
-  }
-}
 
 void P3MLocalMesh::calc_local_ca_mesh(P3MParameters const &params,
                                       LocalBox const &local_geo, double skin,
@@ -149,4 +105,4 @@ void P3MLocalMesh::calc_local_ca_mesh(P3MParameters const &params,
   q_21_off = dim[2] * (dim[1] - params.cao);
 }
 
-#endif /* defined(P3M) || defined(DP3M) */
+#endif // defined(P3M) or defined(DP3M)
diff --git a/src/core/p3m/common.hpp b/src/core/p3m/common.hpp
index 275c40bce2e..af9a9af5959 100644
--- a/src/core/p3m/common.hpp
+++ b/src/core/p3m/common.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 The ESPResSo project
+ * Copyright (C) 2010-2024 The ESPResSo project
  * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
  *   Max-Planck-Institute for Polymer Research, Theory Group
  *
@@ -38,32 +38,22 @@
 
 #include <utils/Vector.hpp>
 
+#include <algorithm>
 #include <array>
 #include <vector>
 
 /** This value indicates metallic boundary conditions. */
 auto constexpr P3M_EPSILON_METALLIC = 0.0;
 
-#if defined(P3M) || defined(DP3M)
+#if defined(P3M) or defined(DP3M)
 
 #include "LocalBox.hpp"
 
-#include <array>
+#include <cstddef>
+#include <span>
 #include <stdexcept>
-#include <vector>
-
-namespace detail {
-/** @brief Index helpers for direct and reciprocal space.
- *  After the FFT the data is in order YZX, which
- *  means that Y is the slowest changing index.
- */
-namespace FFT_indexing {
-enum FFT_REAL_VECTOR : int { RX = 0, RY = 1, RZ = 2 };
-enum FFT_WAVE_VECTOR : int { KY = 0, KZ = 1, KX = 2 };
-} // namespace FFT_indexing
-} // namespace detail
 
-/** Structure to hold P3M parameters and some dependent variables. */
+/** @brief Structure to hold P3M parameters and some dependent variables. */
 struct P3MParameters {
   /** tuning or production? */
   bool tuning;
@@ -179,9 +169,8 @@ struct P3MParameters {
   }
 };
 
-/** Structure for local mesh parameters. */
+/** @brief Properties of the local mesh. */
 struct P3MLocalMesh {
-  /* local mesh characterization. */
   /** dimension (size) of local mesh. */
   Utils::Vector3i dim;
   /** number of local mesh points. */
@@ -212,7 +201,7 @@ struct P3MLocalMesh {
    */
   void recalc_ld_pos(P3MParameters const &params) {
     // spatial position of left down mesh point
-    for (unsigned int i = 0; i < 3; i++) {
+    for (auto i = 0u; i < 3u; i++) {
       ld_pos[i] = (ld_ind[i] + params.mesh_off[i]) * params.a[i];
     }
   }
@@ -226,17 +215,27 @@ struct P3MLocalMesh {
                           double space_layer);
 };
 
-/** One of the aliasing sums used to compute k-space errors.
- *  Fortunately the one which is most important (because it converges
- *  most slowly, since it is not damped exponentially) can be
- *  calculated analytically. The result (which depends on the order of
- *  the spline interpolation) can be written as an even trigonometric
- *  polynomial. The results are tabulated here (the employed formula
- *  is eq. (7.66) in @cite hockney88a).
- */
-double p3m_analytic_cotangent_sum(int n, double mesh_i, int cao);
+/** @brief Local mesh FFT buffers. */
+struct P3MFFTMesh {
+  /** @brief k-space scalar mesh for k-space calculations. */
+  std::span<double> ks_scalar;
+  /** @brief real-space scalar mesh for charge assignment and FFT. */
+  std::span<double> rs_scalar;
+  /** @brief real-space 3D meshes for the electric or dipolar field. */
+  std::array<std::span<double>, 3> rs_fields;
+
+  /** @brief Indices of the lower left corner of the local mesh grid. */
+  Utils::Vector3i start;
+  /** @brief Indices of the upper right corner of the local mesh grid. */
+  Utils::Vector3i stop;
+  /** @brief Extents of the local mesh grid. */
+  Utils::Vector3i size;
+
+  /** @brief number of permutations in k_space */
+  int ks_pnum = 0;
+};
 
-#endif /* P3M || DP3M */
+#endif // defined(P3M) or defined(DP3M)
 
 namespace detail {
 /** Calculate indices that shift @ref P3MParameters::mesh "mesh" by `mesh/2`.
@@ -250,8 +249,8 @@ std::array<std::vector<int>, 3> inline calc_meshift(
     Utils::Vector3i const &mesh_size, bool zero_out_midpoint = false) {
   std::array<std::vector<int>, 3> ret{};
 
-  for (unsigned int i = 0; i < 3; i++) {
-    ret[i] = std::vector<int>(mesh_size[i]);
+  for (auto i = 0u; i < 3u; ++i) {
+    ret[i] = std::vector<int>(static_cast<std::size_t>(mesh_size[i]));
 
     for (int j = 1; j <= mesh_size[i] / 2; j++) {
       ret[i][j] = j;
diff --git a/src/core/p3m/data_struct.hpp b/src/core/p3m/data_struct.hpp
index 410cffbbf94..9b607031991 100644
--- a/src/core/p3m/data_struct.hpp
+++ b/src/core/p3m/data_struct.hpp
@@ -1,6 +1,6 @@
 /*
- * Copyright (C) 2010-2022 The ESPResSo project
- * Copyright (C) 2002-2010
+ * Copyright (C) 2010-2024 The ESPResSo project
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
  *   Max-Planck-Institute for Polymer Research, Theory Group
  *
  * This file is part of ESPResSo.
@@ -23,20 +23,38 @@
 
 #include "config/config.hpp"
 
-#if defined(P3M) || defined(DP3M)
+#if defined(P3M) or defined(DP3M)
 
 #include "common.hpp"
 
 #include <array>
+#include <cassert>
+#include <memory>
+#include <tuple>
+#include <utility>
 #include <vector>
 
-struct p3m_data_struct_base {
-  explicit p3m_data_struct_base(P3MParameters &&parameters)
-      : params{std::move(parameters)}, ks_pnum{0} {}
+class FFTBackend;
 
+/**
+ * @brief Base class for the electrostatics and magnetostatics P3M algorithms.
+ * Contains a handle to the FFT backend, information about the local mesh,
+ * the differential operator, and various buffers.
+ */
+struct p3m_data_struct {
+  explicit p3m_data_struct(P3MParameters &&parameters)
+      : params{std::move(parameters)} {}
+
+  /** @brief P3M base parameters. */
   P3MParameters params;
+  /** @brief Local mesh properties. */
+  P3MLocalMesh local_mesh;
+  /** @brief Local mesh FFT buffers. */
+  P3MFFTMesh mesh;
 
-  /** Spatial differential operator in k-space. We use an i*k differentiation.
+  /**
+   * @brief Spatial differential operator in k-space.
+   * We use an i*k differentiation.
    */
   std::array<std::vector<int>, 3> d_op;
   /** Force optimised influence function (k-space) */
@@ -44,8 +62,8 @@ struct p3m_data_struct_base {
   /** Energy optimised influence function (k-space) */
   std::vector<double> g_energy;
 
-  /** number of permutations in k_space */
-  int ks_pnum;
+  /** FFT backend. */
+  std::unique_ptr<FFTBackend> fft;
 
   /** Calculate the Fourier transformed differential operator.
    *  Remark: This is done on the level of n-vectors and not k-vectors,
@@ -54,6 +72,40 @@ struct p3m_data_struct_base {
   void calc_differential_operator() {
     d_op = detail::calc_meshift(params.mesh, true);
   }
+
+  template <typename T, class... Args> void make_fft_instance(Args... args) {
+    assert(fft == nullptr);
+    fft = std::make_unique<T>(*this, args...);
+  }
+};
+
+/**
+ * @brief API for the FFT backend of the P3M algorithm.
+ * Any FFT backend must implement this interface.
+ * The backend can read some members of @ref p3m_data_struct
+ * but can only modify the FFT buffers in @ref P3MFFTMesh.
+ */
+class FFTBackend {
+protected:
+  P3MParameters const &params;
+  P3MLocalMesh const &local_mesh;
+  P3MFFTMesh &mesh;
+
+public:
+  bool check_complex_residuals = false;
+  explicit FFTBackend(p3m_data_struct &obj)
+      : params{obj.params}, local_mesh{obj.local_mesh}, mesh{obj.mesh} {}
+  virtual ~FFTBackend() = default;
+  /** @brief Initialize the FFT plans and buffers. */
+  virtual void init_fft() = 0;
+  /** @brief Carry out the forward FFT of the scalar or field meshes. */
+  virtual void perform_fwd_fft() = 0;
+  /** @brief Carry out the backward FFT of the scalar mesh. */
+  virtual void perform_field_back_fft() = 0;
+  /** @brief Carry out the backward FFT of the field meshes. */
+  virtual void perform_space_back_fft() = 0;
+  /** @brief Get indices of the k-space data layout. */
+  virtual std::tuple<int, int, int> get_permutations() const = 0;
 };
 
-#endif
+#endif // defined(P3M) or defined(DP3M)
diff --git a/src/core/p3m/for_each_3d.hpp b/src/core/p3m/for_each_3d.hpp
new file mode 100644
index 00000000000..cc4bb2553d0
--- /dev/null
+++ b/src/core/p3m/for_each_3d.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2024 The ESPResSo project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <concepts>
+#include <cstddef>
+
+namespace detail {
+
+constexpr inline void noop_projector(unsigned, int) {}
+
+template <typename T>
+concept IndexVectorConcept = requires(T vector) {
+  { vector[0] } -> std::convertible_to<std::size_t>;
+};
+
+} // namespace detail
+
+/**
+ * @brief Repeat an operation on every element of a 3D grid.
+ *
+ * Intermediate values that depend on the iterated coordinates
+ * are calculated and stored once per iteration. This is useful
+ * when the operation is costly.
+ *
+ * @param start       Initial values for the loop counters.
+ * @param stop        Final values (one-past-the-end) for the loop counters.
+ * @param counters    Loop counters.
+ * @param kernel      Functor to execute.
+ * @param projector   Projection of the current loop counter.
+ * @tparam Kernel     Nullary function.
+ * @tparam Projector  Binary function that takes a nesting depth and a loop
+ *                    counter as arguments and projects a value.
+ */
+template <class Kernel, class Projector = decltype(detail::noop_projector)>
+  requires std::invocable<Kernel> and std::invocable<Projector, unsigned, int>
+void for_each_3d(detail::IndexVectorConcept auto &&start,
+                 detail::IndexVectorConcept auto &&stop,
+                 detail::IndexVectorConcept auto &&counters, Kernel &&kernel,
+                 Projector &&projector = detail::noop_projector) {
+  auto &nx = counters[0u];
+  auto &ny = counters[1u];
+  auto &nz = counters[2u];
+  for (nx = start[0u]; nx < stop[0u]; ++nx) {
+    projector(0u, nx);
+    for (ny = start[1u]; ny < stop[1u]; ++ny) {
+      projector(1u, ny);
+      for (nz = start[2u]; nz < stop[2u]; ++nz) {
+        projector(2u, nz);
+        kernel();
+      }
+    }
+  }
+}
diff --git a/src/core/p3m/influence_function.hpp b/src/core/p3m/influence_function.hpp
index 7e1d45c3306..eb259d89d92 100644
--- a/src/core/p3m/influence_function.hpp
+++ b/src/core/p3m/influence_function.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019-2022 The ESPResSo project
+ * Copyright (C) 2019-2024 The ESPResSo project
  *
  * This file is part of ESPResSo.
  *
@@ -16,23 +16,22 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef ESPRESSO_P3M_INFLUENCE_FUNCTION_HPP
-#define ESPRESSO_P3M_INFLUENCE_FUNCTION_HPP
+
+#pragma once
 
 #include "p3m/common.hpp"
+#include "p3m/for_each_3d.hpp"
+#include "p3m/math.hpp"
 
 #include <utils/Vector.hpp>
-#include <utils/constants.hpp>
 #include <utils/index.hpp>
 #include <utils/math/int_pow.hpp>
-#include <utils/math/sinc.hpp>
 #include <utils/math/sqr.hpp>
 
-#include <boost/range/numeric.hpp>
-
 #include <cmath>
 #include <cstddef>
 #include <functional>
+#include <numbers>
 #include <utility>
 #include <vector>
 
@@ -48,50 +47,48 @@
  *
  * @param cao Charge assignment order.
  * @param alpha Ewald splitting parameter.
- * @param k k Vector to evaluate the function for.
+ * @param k k-vector to evaluate the function for.
  * @param h Grid spacing.
  */
 template <std::size_t S, std::size_t m>
 double G_opt(int cao, double alpha, Utils::Vector3d const &k,
              Utils::Vector3d const &h) {
-  using namespace detail::FFT_indexing;
-  using Utils::int_pow;
-  using Utils::sinc;
-
-  auto constexpr two_pi = 2. * Utils::pi();
-  auto constexpr two_pi_i = 1. / two_pi;
-  auto constexpr limit = 30.;
 
   auto const k2 = k.norm2();
-  if (k2 == 0.0) {
-    return 0.0;
+  if (k2 == 0.) {
+    return 0.;
   }
 
-  double numerator = 0.0;
-  double denominator = 0.0;
-
-  for (int mx = -m; mx <= m; mx++) {
-    for (int my = -m; my <= m; my++) {
-      for (int mz = -m; mz <= m; mz++) {
-        auto const km =
-            k + two_pi * Utils::Vector3d{mx / h[RX], my / h[RY], mz / h[RZ]};
-        auto const U2 = std::pow(sinc(km[RX] * h[RX] * two_pi_i) *
-                                     sinc(km[RY] * h[RY] * two_pi_i) *
-                                     sinc(km[RZ] * h[RZ] * two_pi_i),
-                                 2 * cao);
-
+  auto constexpr limit = 30.;
+  auto constexpr m_start = Utils::Vector3i::broadcast(-m);
+  auto constexpr m_stop = Utils::Vector3i::broadcast(m + 1);
+  auto const exponent_prefactor = Utils::sqr(1. / (2. * alpha));
+  auto const wavevector = (2. * std::numbers::pi) / h;
+  auto const wavevector_i = 1. / wavevector;
+  auto indices = Utils::Vector3i{};
+  auto km = Utils::Vector3d{};
+  auto fnm = Utils::Vector3d{};
+  auto numerator = 0.;
+  auto denominator = 0.;
+
+  for_each_3d(
+      m_start, m_stop, indices,
+      [&]() {
+        auto const U2 = std::pow(Utils::product(fnm), 2 * cao);
         auto const km2 = km.norm2();
-        auto const exponent = Utils::sqr(1. / (2. * alpha)) * km2;
+        auto const exponent = exponent_prefactor * km2;
         if (exponent < limit) {
-          auto const f3 = std::exp(-exponent) * (4. * Utils::pi() / km2);
-          numerator += U2 * f3 * int_pow<S>(k * km);
+          auto const f3 = std::exp(-exponent) * (4. * std::numbers::pi / km2);
+          numerator += U2 * f3 * Utils::int_pow<S>(k * km);
         }
         denominator += U2;
-      }
-    }
-  }
+      },
+      [&](unsigned dim, int n) {
+        km[dim] = k[dim] + n * wavevector[dim];
+        fnm[dim] = math::sinc(km[dim] * wavevector_i[dim]);
+      });
 
-  return numerator / (int_pow<S>(k2) * Utils::sqr(denominator));
+  return numerator / (Utils::int_pow<S>(k2) * Utils::sqr(denominator));
 }
 
 /**
@@ -104,26 +101,28 @@ double G_opt(int cao, double alpha, Utils::Vector3d const &k,
  *          1 for electric field...
  * @tparam m Number of aliasing terms to take into account.
  *
- * @param params P3M parameters
- * @param n_start Lower left corner of the grid
- * @param n_end Upper right corner of the grid.
- * @param box_l Box size
+ * @param params P3M parameters.
+ * @param n_start Lower left corner of the grid.
+ * @param n_stop Upper right corner of the grid.
+ * @param KX k-space x-axis index.
+ * @param KY k-space y-axis index.
+ * @param KZ k-space z-axis index.
+ * @param inv_box_l Inverse box length.
  * @return Values of G_opt at regular grid points.
  */
 template <std::size_t S, std::size_t m = 0>
-std::vector<double> grid_influence_function(const P3MParameters &params,
-                                            const Utils::Vector3i &n_start,
-                                            const Utils::Vector3i &n_end,
-                                            const Utils::Vector3d &box_l) {
-  using namespace detail::FFT_indexing;
+std::vector<double> grid_influence_function(P3MParameters const &params,
+                                            Utils::Vector3i const &n_start,
+                                            Utils::Vector3i const &n_stop,
+                                            int const KX, int const KY,
+                                            int const KZ,
+                                            Utils::Vector3d const &inv_box_l) {
 
   auto const shifts = detail::calc_meshift(params.mesh);
-
-  auto const size = n_end - n_start;
+  auto const size = n_stop - n_start;
 
   /* The influence function grid */
-  auto g =
-      std::vector<double>(boost::accumulate(size, 1, std::multiplies<>()), 0.);
+  auto g = std::vector<double>(Utils::product(size), 0.);
 
   /* Skip influence function calculation in tuning mode,
      the results need not be correct for timing. */
@@ -131,31 +130,23 @@ std::vector<double> grid_influence_function(const P3MParameters &params,
     return g;
   }
 
-  auto const h = Utils::Vector3d{params.a};
-
-  Utils::Vector3i n{};
-  for (n[0] = n_start[0]; n[0] < n_end[0]; n[0]++) {
-    for (n[1] = n_start[1]; n[1] < n_end[1]; n[1]++) {
-      for (n[2] = n_start[2]; n[2] < n_end[2]; n[2]++) {
-        auto const ind = Utils::get_linear_index(n - n_start, size,
-                                                 Utils::MemoryOrder::ROW_MAJOR);
-        if ((n[KX] % (params.mesh[RX] / 2) == 0) &&
-            (n[KY] % (params.mesh[RY] / 2) == 0) &&
-            (n[KZ] % (params.mesh[RZ] / 2) == 0)) {
-          g[ind] = 0.0;
-        } else {
-          auto const k = 2 * Utils::pi() *
-                         Utils::Vector3d{shifts[RX][n[KX]] / box_l[RX],
-                                         shifts[RY][n[KY]] / box_l[RY],
-                                         shifts[RZ][n[KZ]] / box_l[RZ]};
-
-          g[ind] = G_opt<S, m>(params.cao, params.alpha, k, h);
-        }
-      }
+  auto const wavevector = (2. * std::numbers::pi) * inv_box_l;
+  auto const half_mesh = params.mesh / 2;
+  auto indices = Utils::Vector3i{};
+  auto index = std::size_t(0u);
+
+  for_each_3d(n_start, n_stop, indices, [&]() {
+    if ((indices[KX] % half_mesh[0u] != 0) or
+        (indices[KY] % half_mesh[1u] != 0) or
+        (indices[KZ] % half_mesh[2u] != 0)) {
+      auto const k =
+          Utils::Vector3d{{shifts[0u][indices[KX]] * wavevector[0u],
+                           shifts[1u][indices[KY]] * wavevector[1u],
+                           shifts[2u][indices[KZ]] * wavevector[2u]}};
+      g[index] = G_opt<S, m>(params.cao, params.alpha, k, params.a);
     }
-  }
+    ++index;
+  });
 
   return g;
 }
-
-#endif
diff --git a/src/core/p3m/influence_function_dipolar.hpp b/src/core/p3m/influence_function_dipolar.hpp
index da52a265e05..d52a1a6d675 100644
--- a/src/core/p3m/influence_function_dipolar.hpp
+++ b/src/core/p3m/influence_function_dipolar.hpp
@@ -18,27 +18,25 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef ESPRESSO_CORE_P3M_INFLUENCE_FUNCTION_DIPOLAR_HPP
-#define ESPRESSO_CORE_P3M_INFLUENCE_FUNCTION_DIPOLAR_HPP
+
+#pragma once
 
 #include "config/config.hpp"
 
 #if defined(DP3M)
 
 #include "p3m/common.hpp"
+#include "p3m/for_each_3d.hpp"
+#include "p3m/math.hpp"
 
 #include <utils/Vector.hpp>
-#include <utils/constants.hpp>
-#include <utils/index.hpp>
 #include <utils/math/int_pow.hpp>
-#include <utils/math/sinc.hpp>
 #include <utils/math/sqr.hpp>
 
-#include <boost/range/numeric.hpp>
-
 #include <cmath>
 #include <cstddef>
 #include <functional>
+#include <numbers>
 #include <vector>
 
 /** Calculate the aliasing sums for the optimal influence function.
@@ -56,40 +54,41 @@
 template <std::size_t S>
 double G_opt_dipolar(P3MParameters const &params, Utils::Vector3i const &shift,
                      Utils::Vector3i const &d_op) {
-  using Utils::int_pow;
-  using Utils::sinc;
+
   auto constexpr limit = P3M_BRILLOUIN;
   auto constexpr exp_limit = 30.;
-  auto const exponent = 2. * params.cao;
-
-  auto numerator = 0.0;
-  auto denominator = 0.0;
-
-  auto const f1 = 1.0 / static_cast<double>(params.mesh[0]);
-  auto const f2 = Utils::sqr(Utils::pi() / params.alpha_L);
-
-  for (int mx = -limit; mx <= limit; mx++) {
-    auto const nmx = shift[0] + params.mesh[0] * mx;
-    auto const sx = std::pow(sinc(f1 * nmx), exponent);
-    for (int my = -limit; my <= limit; my++) {
-      auto const nmy = shift[1] + params.mesh[0] * my;
-      auto const sy = sx * std::pow(sinc(f1 * nmy), exponent);
-      for (int mz = -limit; mz <= limit; mz++) {
-        auto const nmz = shift[2] + params.mesh[0] * mz;
-        auto const sz = sy * std::pow(sinc(f1 * nmz), exponent);
-        auto const nm2 = Utils::sqr(nmx) + Utils::sqr(nmy) + Utils::sqr(nmz);
-        auto const exp_term = f2 * nm2;
+  auto constexpr m_start = Utils::Vector3i::broadcast(-limit);
+  auto constexpr m_stop = Utils::Vector3i::broadcast(limit + 1);
+  auto const cao = params.cao;
+  auto const mesh = params.mesh[0];
+  auto const offset =
+      static_cast<Utils::Vector3d>(shift) / static_cast<double>(mesh);
+  auto const f2 = Utils::sqr(std::numbers::pi / params.alpha_L);
+  auto indices = Utils::Vector3i{};
+  auto nm = Utils::Vector3i{};
+  auto fnm = Utils::Vector3d{};
+  auto numerator = 0.;
+  auto denominator = 0.;
+
+  for_each_3d(
+      m_start, m_stop, indices,
+      [&]() {
+        auto const norm_sq = nm.norm2();
+        auto const sz = std::pow(Utils::product(fnm), 2 * cao);
+        auto const exp_term = f2 * norm_sq;
         if (exp_term < exp_limit) {
-          auto const f3 = sz * std::exp(-exp_term) / nm2;
-          auto const n_nm = d_op[0] * nmx + d_op[1] * nmy + d_op[2] * nmz;
-          numerator += f3 * int_pow<S>(n_nm);
+          auto const f3 = sz * std::exp(-exp_term) / norm_sq;
+          numerator += f3 * Utils::int_pow<S>(d_op * nm);
         }
         denominator += sz;
-      }
-    }
-  }
-  return numerator / (int_pow<S>(static_cast<double>(d_op.norm2())) *
-                      Utils::sqr(denominator));
+      },
+      [&](unsigned dim, int n) {
+        nm[dim] = shift[dim] + n * mesh;
+        fnm[dim] = math::sinc(offset[dim] + n); // == nm[dim] / mesh
+      });
+
+  return numerator / (Utils::int_pow<S>(static_cast<double>(d_op.norm2())) *
+                      Utils::int_pow<2>(denominator));
 }
 
 /**
@@ -102,21 +101,20 @@ double G_opt_dipolar(P3MParameters const &params, Utils::Vector3i const &shift,
  *
  * @param params DP3M parameters
  * @param n_start Lower left corner of the grid
- * @param n_end Upper right corner of the grid.
- * @param box_l Box size
+ * @param n_stop Upper right corner of the grid.
+ * @param inv_box_l Inverse box length
  * @return Values of the influence function at regular grid points.
  */
 template <std::size_t S>
 std::vector<double> grid_influence_function(P3MParameters const &params,
                                             Utils::Vector3i const &n_start,
-                                            Utils::Vector3i const &n_end,
-                                            Utils::Vector3d const &box_l) {
+                                            Utils::Vector3i const &n_stop,
+                                            Utils::Vector3d const &inv_box_l) {
 
-  auto const size = n_end - n_start;
+  auto const size = n_stop - n_start;
 
   /* The influence function grid */
-  auto g =
-      std::vector<double>(boost::accumulate(size, 1, std::multiplies<>()), 0.);
+  auto g = std::vector<double>(Utils::product(size), 0.);
 
   /* Skip influence function calculation in tuning mode,
      the results need not be correct for timing. */
@@ -124,61 +122,56 @@ std::vector<double> grid_influence_function(P3MParameters const &params,
     return g;
   }
 
-  double fak1 = Utils::int_pow<3>(static_cast<double>(params.mesh[0])) * 2.0 /
-                Utils::sqr(box_l[0]);
-
-  auto const shifts = detail::calc_meshift(params.mesh, false);
-  auto const d_ops = detail::calc_meshift(params.mesh, true);
-
-  Utils::Vector3i n{};
-  for (n[0] = n_start[0]; n[0] < n_end[0]; n[0]++) {
-    for (n[1] = n_start[1]; n[1] < n_end[1]; n[1]++) {
-      for (n[2] = n_start[2]; n[2] < n_end[2]; n[2]++) {
-        auto const ind = Utils::get_linear_index(n - n_start, size,
-                                                 Utils::MemoryOrder::ROW_MAJOR);
-
-        if (((n[0] % (params.mesh[0] / 2) == 0) &&
-             (n[1] % (params.mesh[0] / 2) == 0) &&
-             (n[2] % (params.mesh[0] / 2) == 0))) {
-          g[ind] = 0.0;
-        } else {
-          auto const shift = Utils::Vector3i{shifts[0][n[0]], shifts[0][n[1]],
-                                             shifts[0][n[2]]};
-          auto const d_op =
-              Utils::Vector3i{d_ops[0][n[0]], d_ops[0][n[1]], d_ops[0][n[2]]};
-          auto const fak2 = G_opt_dipolar<S>(params, shift, d_op);
-          g[ind] = fak1 * fak2;
+  auto prefactor = Utils::int_pow<3>(static_cast<double>(params.mesh[0])) * 2. *
+                   Utils::int_pow<2>(inv_box_l[0]);
+
+  auto const offset = detail::calc_meshift(params.mesh, false)[0];
+  auto const d_op = detail::calc_meshift(params.mesh, true)[0];
+  auto const half_mesh = params.mesh[0] / 2;
+  auto indices = Utils::Vector3i{};
+  auto shift_off = Utils::Vector3i{};
+  auto d_op_off = Utils::Vector3i{};
+  auto index = std::size_t(0u);
+
+  for_each_3d(
+      n_start, n_stop, indices,
+      [&]() {
+        if (((indices[0] % half_mesh != 0) or (indices[1] % half_mesh != 0) or
+             (indices[2] % half_mesh != 0))) {
+          g[index] = prefactor * G_opt_dipolar<S>(params, shift_off, d_op_off);
         }
-      }
-    }
-  }
+        ++index;
+      },
+      [&](unsigned dim, int n) {
+        d_op_off[dim] = d_op[n];
+        shift_off[dim] = offset[n];
+      });
+
   return g;
 }
 
 inline double G_opt_dipolar_self_energy(P3MParameters const &params,
                                         Utils::Vector3i const &shift) {
-  using Utils::sinc;
+
   auto constexpr limit = P3M_BRILLOUIN + 1;
-  auto const exponent = 2. * params.cao;
-
-  auto u_sum = 0.0;
-
-  auto const f1 = 1.0 / static_cast<double>(params.mesh[0]);
-
-  for (int mx = -limit; mx <= limit; mx++) {
-    auto const nmx = shift[0] + params.mesh[0] * mx;
-    auto const sx = std::pow(sinc(f1 * nmx), exponent);
-    for (int my = -limit; my <= limit; my++) {
-      auto const nmy = shift[1] + params.mesh[0] * my;
-      auto const sy = sx * std::pow(sinc(f1 * nmy), exponent);
-      for (int mz = -limit; mz <= limit; mz++) {
-        auto const nmz = shift[2] + params.mesh[0] * mz;
-        auto const sz = sy * std::pow(sinc(f1 * nmz), exponent);
-        u_sum += sz;
-      }
-    }
-  }
-  return u_sum;
+  auto constexpr m_start = Utils::Vector3i::broadcast(-limit);
+  auto constexpr m_stop = Utils::Vector3i::broadcast(limit + 1);
+  auto const cao = params.cao;
+  auto const mesh = params.mesh[0];
+  auto const offset =
+      static_cast<Utils::Vector3d>(shift) / static_cast<double>(mesh);
+  auto indices = Utils::Vector3i{};
+  auto fnm = Utils::Vector3d{};
+  auto energy = 0.;
+
+  for_each_3d(
+      m_start, m_stop, indices,
+      [&]() { energy += std::pow(Utils::product(fnm), 2 * cao); },
+      [&](unsigned dim, int n) {
+        fnm[dim] = math::sinc(offset[dim] + n); // (shift + n * mesh) / mesh
+      });
+
+  return energy;
 }
 
 /**
@@ -186,42 +179,39 @@ inline double G_opt_dipolar_self_energy(P3MParameters const &params,
  *
  * @param params DP3M parameters
  * @param n_start Lower left corner of the grid
- * @param n_end Upper right corner of the grid.
+ * @param n_stop Upper right corner of the grid.
  * @param g Energies on the grid.
  * @return Total self-energy.
  */
 inline double grid_influence_function_self_energy(
     P3MParameters const &params, Utils::Vector3i const &n_start,
-    Utils::Vector3i const &n_end, std::vector<double> const &g) {
-  auto const size = n_end - n_start;
-
-  auto const shifts = detail::calc_meshift(params.mesh, false);
-  auto const d_ops = detail::calc_meshift(params.mesh, true);
-
-  double energy = 0.0;
-  Utils::Vector3i n{};
-  for (n[0] = n_start[0]; n[0] < n_end[0]; n[0]++) {
-    for (n[1] = n_start[1]; n[1] < n_end[1]; n[1]++) {
-      for (n[2] = n_start[2]; n[2] < n_end[2]; n[2]++) {
-        if (((n[0] % (params.mesh[0] / 2) == 0) &&
-             (n[1] % (params.mesh[0] / 2) == 0) &&
-             (n[2] % (params.mesh[0] / 2) == 0))) {
-          energy += 0.0;
-        } else {
-          auto const ind = Utils::get_linear_index(
-              n - n_start, size, Utils::MemoryOrder::ROW_MAJOR);
-          auto const shift = Utils::Vector3i{shifts[0][n[0]], shifts[0][n[1]],
-                                             shifts[0][n[2]]};
-          auto const d_op =
-              Utils::Vector3i{d_ops[0][n[0]], d_ops[0][n[1]], d_ops[0][n[2]]};
-          auto const U2 = G_opt_dipolar_self_energy(params, shift);
-          energy += g[ind] * U2 * d_op.norm2();
+    Utils::Vector3i const &n_stop, std::vector<double> const &g) {
+
+  auto const offset = detail::calc_meshift(params.mesh, false)[0];
+  auto const d_op = detail::calc_meshift(params.mesh, true)[0];
+  auto const half_mesh = params.mesh[0] / 2;
+  auto indices = Utils::Vector3i{};
+  auto shift_off = Utils::Vector3i{};
+  auto d_op_off = Utils::Vector3i{};
+  auto index = std::size_t(0u);
+  auto energy = 0.;
+
+  for_each_3d(
+      n_start, n_stop, indices,
+      [&]() {
+        if (((indices[0] % half_mesh != 0) or (indices[1] % half_mesh != 0) or
+             (indices[2] % half_mesh != 0))) {
+          auto const U2 = G_opt_dipolar_self_energy(params, shift_off);
+          energy += g[index] * U2 * d_op_off.norm2();
         }
-      }
-    }
-  }
+        ++index;
+      },
+      [&](unsigned dim, int n) {
+        d_op_off[dim] = d_op[n];
+        shift_off[dim] = offset[n];
+      });
+
   return energy;
 }
 
-#endif
-#endif
+#endif // defined(DP3M)
diff --git a/src/core/p3m/interpolation.hpp b/src/core/p3m/interpolation.hpp
index 0536324ae93..8aa6b76525e 100644
--- a/src/core/p3m/interpolation.hpp
+++ b/src/core/p3m/interpolation.hpp
@@ -18,18 +18,18 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef ESPRESSO_CORE_P3M_INTERPOLATION_HPP
-#define ESPRESSO_CORE_P3M_INTERPOLATION_HPP
 
-#include <utils/Span.hpp>
+#pragma once
+
 #include <utils/index.hpp>
 #include <utils/math/bspline.hpp>
 
-#include <boost/range/algorithm/copy.hpp>
-
+#include <algorithm>
 #include <cassert>
 #include <cstddef>
+#include <span>
 #include <tuple>
+#include <utility>
 #include <vector>
 
 /**
@@ -84,9 +84,9 @@ class p3m_interpolation_cache {
 
     ca_fmp.push_back(w.ind);
     auto it = std::back_inserter(ca_frac);
-    boost::copy(w.w_x, it);
-    boost::copy(w.w_y, it);
-    boost::copy(w.w_z, it);
+    std::ranges::copy(w.w_x, it);
+    std::ranges::copy(w.w_y, it);
+    std::ranges::copy(w.w_z, it);
   }
 
   /**
@@ -102,17 +102,17 @@ class p3m_interpolation_cache {
    */
   template <int cao> InterpolationWeights<cao> load(std::size_t i) const {
     assert(cao == m_cao);
-
-    using Utils::make_const_span;
     assert(i < size());
 
     InterpolationWeights<cao> ret;
     ret.ind = ca_fmp[i];
 
-    auto const offset = ca_frac.data() + 3 * i * cao;
-    boost::copy(make_const_span(offset + 0 * cao, cao), ret.w_x.begin());
-    boost::copy(make_const_span(offset + 1 * cao, cao), ret.w_y.begin());
-    boost::copy(make_const_span(offset + 2 * cao, cao), ret.w_z.begin());
+    auto const view = std::span(std::as_const(ca_frac));
+    auto const offset = 3ul * i * static_cast<std::size_t>(cao);
+
+    std::ranges::copy(view.subspan(offset + 0ul * cao, cao), ret.w_x.begin());
+    std::ranges::copy(view.subspan(offset + 1ul * cao, cao), ret.w_y.begin());
+    std::ranges::copy(view.subspan(offset + 2ul * cao, cao), ret.w_z.begin());
 
     return ret;
   }
@@ -206,5 +206,3 @@ void p3m_interpolate(P3MLocalMesh const &local_mesh,
     q_ind += local_mesh.q_21_off;
   }
 }
-
-#endif
diff --git a/src/core/p3m/math.hpp b/src/core/p3m/math.hpp
new file mode 100644
index 00000000000..2da9b38dd19
--- /dev/null
+++ b/src/core/p3m/math.hpp
@@ -0,0 +1,151 @@
+/*
+ * Copyright (C) 2010-2024 The ESPResSo project
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
+ *   Max-Planck-Institute for Polymer Research, Theory Group
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <utils/device_qualifier.hpp>
+#include <utils/math/sqr.hpp>
+
+#ifndef __CUDACC__
+#include <cmath>
+#endif
+#include <numbers>
+#include <stdexcept>
+#include <string>
+
+namespace math {
+
+/** @brief Return the absolute value of x (double overload, via fabs). */
+inline DEVICE_QUALIFIER auto abs(double x) { return fabs(x); }
+
+/** @brief Return the absolute value of x (float overload, via fabsf). */
+inline DEVICE_QUALIFIER auto abs(float x) { return fabsf(x); }
+
+/**
+ * @brief Calculate the function @f$ \mathrm{sinc}(x) = \sin(\pi x)/(\pi x) @f$.
+ *
+ * (same convention as in @cite hockney88a). In order to avoid divisions
+ * by 0, arguments whose modulus does not exceed @f$ \epsilon = 0.1 @f$
+ * are evaluated by an 8th order Taylor expansion in @f$ \pi x @f$.
+ * Note that the difference between sinc(x) and this expansion
+ * is smaller than 0.235e-12, if |x| is smaller than @f$ \epsilon @f$.
+ * (The next term in the expansion is the 10th order contribution, i.e.
+ * @f$ \pi^{10} x^{10}/39916800 \approx 2.346 \cdot 10^{-3} \cdot x^{10} @f$).
+ */
+template <typename T> DEVICE_QUALIFIER auto sinc(T x) {
+  auto constexpr epsilon = T(0.1);
+#if not defined(__CUDACC__)
+  using std::sin;
+#endif
+
+  auto const pix = std::numbers::pi_v<T> * x;
+
+  if (::math::abs(x) > epsilon)
+    return sin(pix) / pix;
+
+  auto constexpr factorial = [](int n) consteval {
+    int acc{1}, c{1};
+    while (c < n) {
+      acc *= ++c;
+    }
+    return acc;
+  };
+
+  /* Taylor coefficients (-1)^k / (2k+1)! of sinc, in powers of (pi x)^2 */
+  auto constexpr c0 = T(+1) / T(factorial(1));
+  auto constexpr c2 = T(-1) / T(factorial(3));
+  auto constexpr c4 = T(+1) / T(factorial(5));
+  auto constexpr c6 = T(-1) / T(factorial(7));
+  auto constexpr c8 = T(+1) / T(factorial(9));
+
+  auto const pix2 = pix * pix;
+  return c0 + pix2 * (c2 + pix2 * (c4 + pix2 * (c6 + pix2 * c8)));
+}
+
+/**
+ * @brief One of the aliasing sums used to compute k-space errors.
+ * Fortunately the one which is most important (because it converges
+ * most slowly, since it is not damped exponentially) can be calculated
+ * analytically. The result is tabulated here for each order @p cao of the
+ * spline interpolation (1 to 7) as a polynomial in cos^2(theta), where
+ * theta = pi * n * mesh_i (the employed formula is eq. 7-66
+ * p. 233 in @cite hockney88a).
+ */
+template <int cao>
+DEVICE_QUALIFIER auto analytic_cotangent_sum(int n, double mesh_i) {
+  static_assert(cao >= 1 and cao <= 7);
+#if not defined(__CUDACC__)
+  using std::cos;
+#endif
+  auto const theta = static_cast<double>(n) * mesh_i * std::numbers::pi;
+  auto const c = Utils::sqr(cos(theta));
+
+  if constexpr (cao == 1) {
+    return 1.;
+  }
+  if constexpr (cao == 2) {
+    return (1. + c * 2.) / 3.;
+  }
+  if constexpr (cao == 3) {
+    return (2. + c * (11. + c * 2.)) / 15.;
+  }
+  if constexpr (cao == 4) {
+    return (17. + c * (180. + c * (114. + c * 4.))) / 315.;
+  }
+  if constexpr (cao == 5) {
+    return (62. + c * (1072. + c * (1452. + c * (247. + c * 2.)))) / 2835.;
+  }
+  if constexpr (cao == 6) {
+    return (1382. +
+            c * (35396. + c * (83021. + c * (34096. + c * (2026. + c * 4.))))) /
+           155925.;
+  }
+  return (21844. +
+          c * (776661. +
+               c * (2801040. +
+                    c * (2123860. + c * (349500. + c * (8166. + c * 4.)))))) /
+         6081075.;
+}
+
+/** @brief Get the @ref analytic_cotangent_sum instance for a runtime @p cao.
+ *  @throw std::logic_error if @p cao is outside of the range [1, 7]. */
+inline auto get_analytic_cotangent_sum_kernel(int cao) {
+  switch (cao) {
+  case 1:
+    return &analytic_cotangent_sum<1>;
+  case 2:
+    return &analytic_cotangent_sum<2>;
+  case 3:
+    return &analytic_cotangent_sum<3>;
+  case 4:
+    return &analytic_cotangent_sum<4>;
+  case 5:
+    return &analytic_cotangent_sum<5>;
+  case 6:
+    return &analytic_cotangent_sum<6>;
+  case 7:
+    return &analytic_cotangent_sum<7>;
+  default: // no kernel is tabulated outside of the range [1, 7]
+    throw std::logic_error("Invalid value cao=" + std::to_string(cao));
+  }
+}
+
+} // namespace math
diff --git a/src/core/p3m/send_mesh.cpp b/src/core/p3m/send_mesh.cpp
index f08262317d4..9bb377a7688 100644
--- a/src/core/p3m/send_mesh.cpp
+++ b/src/core/p3m/send_mesh.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 The ESPResSo project
+ * Copyright (C) 2010-2024 The ESPResSo project
  * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
  *   Max-Planck-Institute for Polymer Research, Theory Group
  *
@@ -18,15 +18,15 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
+
 #include "config/config.hpp"
 
-#if defined(P3M) || defined(DP3M)
+#if defined(P3M) or defined(DP3M)
 
+#include "fft/fft.hpp"
 #include "p3m/common.hpp"
-#include "p3m/fft.hpp"
 #include "p3m/send_mesh.hpp"
 
-#include <utils/Span.hpp>
 #include <utils/Vector.hpp>
 #include <utils/mpi/cart_comm.hpp>
 
@@ -35,6 +35,7 @@
 #include <mpi.h>
 
 #include <cstddef>
+#include <span>
 #include <utility>
 
 /** Add values of a 3d-grid input block (size[3]) to values of 3d-grid
@@ -70,8 +71,8 @@ static void p3m_add_block(double const *in, double *out, int const start[3],
   }
 }
 
-void p3m_send_mesh::resize(const boost::mpi::communicator &comm,
-                           const P3MLocalMesh &local_mesh) {
+void p3m_send_mesh::resize(boost::mpi::communicator const &comm,
+                           P3MLocalMesh const &local_mesh) {
   int done[3] = {0, 0, 0};
   /* send grids */
   for (int i = 0; i < 3; i++) {
@@ -145,9 +146,9 @@ void p3m_send_mesh::resize(const boost::mpi::communicator &comm,
   }
 }
 
-void p3m_send_mesh::gather_grid(Utils::Span<double *> meshes,
-                                const boost::mpi::communicator &comm,
-                                const Utils::Vector3i &dim) {
+void p3m_send_mesh::gather_grid(boost::mpi::communicator const &comm,
+                                std::span<double *> meshes,
+                                Utils::Vector3i const &dim) {
   auto const node_neighbors = Utils::Mpi::cart_neighbors<3>(comm);
   send_grid.resize(max * meshes.size());
   recv_grid.resize(max * meshes.size());
@@ -159,8 +160,8 @@ void p3m_send_mesh::gather_grid(Utils::Span<double *> meshes,
     /* pack send block */
     if (s_size[s_dir] > 0)
       for (std::size_t i = 0; i < meshes.size(); i++) {
-        fft_pack_block(meshes[i], send_grid.data() + i * s_size[s_dir],
-                       s_ld[s_dir], s_dim[s_dir], dim.data(), 1);
+        fft::fft_pack_block(meshes[i], send_grid.data() + i * s_size[s_dir],
+                            s_ld[s_dir], s_dim[s_dir], dim.data(), 1);
       }
 
     /* communication */
@@ -183,9 +184,9 @@ void p3m_send_mesh::gather_grid(Utils::Span<double *> meshes,
   }
 }
 
-void p3m_send_mesh::spread_grid(Utils::Span<double *> meshes,
-                                const boost::mpi::communicator &comm,
-                                const Utils::Vector3i &dim) {
+void p3m_send_mesh::spread_grid(boost::mpi::communicator const &comm,
+                                std::span<double *> meshes,
+                                Utils::Vector3i const &dim) {
   auto const node_neighbors = Utils::Mpi::cart_neighbors<3>(comm);
   send_grid.resize(max * meshes.size());
   recv_grid.resize(max * meshes.size());
@@ -197,8 +198,8 @@ void p3m_send_mesh::spread_grid(Utils::Span<double *> meshes,
     /* pack send block */
     if (r_size[r_dir] > 0)
       for (std::size_t i = 0; i < meshes.size(); i++) {
-        fft_pack_block(meshes[i], send_grid.data() + i * r_size[r_dir],
-                       r_ld[r_dir], r_dim[r_dir], dim.data(), 1);
+        fft::fft_pack_block(meshes[i], send_grid.data() + i * r_size[r_dir],
+                            r_ld[r_dir], r_dim[r_dir], dim.data(), 1);
       }
     /* communication */
     if (node_neighbors[r_dir] != comm.rank()) {
@@ -213,11 +214,11 @@ void p3m_send_mesh::spread_grid(Utils::Span<double *> meshes,
     /* un pack recv block */
     if (s_size[s_dir] > 0) {
       for (std::size_t i = 0; i < meshes.size(); i++) {
-        fft_unpack_block(recv_grid.data() + i * s_size[s_dir], meshes[i],
-                         s_ld[s_dir], s_dim[s_dir], dim.data(), 1);
+        fft::fft_unpack_block(recv_grid.data() + i * s_size[s_dir], meshes[i],
+                              s_ld[s_dir], s_dim[s_dir], dim.data(), 1);
       }
     }
   }
 }
 
-#endif
+#endif // defined(P3M) or defined(DP3M)
diff --git a/src/core/p3m/send_mesh.hpp b/src/core/p3m/send_mesh.hpp
index bcee38e72e5..8fa73435880 100644
--- a/src/core/p3m/send_mesh.hpp
+++ b/src/core/p3m/send_mesh.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 The ESPResSo project
+ * Copyright (C) 2010-2024 The ESPResSo project
  * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
  *   Max-Planck-Institute for Polymer Research, Theory Group
  *
@@ -18,20 +18,20 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef ESPRESSO_CORE_P3M_SEND_MESH_HPP
-#define ESPRESSO_CORE_P3M_SEND_MESH_HPP
+
+#pragma once
 
 #include "config/config.hpp"
 
-#if defined(P3M) || defined(DP3M)
+#if defined(P3M) or defined(DP3M)
 
 #include "p3m/common.hpp"
 
-#include <utils/Span.hpp>
 #include <utils/Vector.hpp>
 
 #include <boost/mpi/communicator.hpp>
 
+#include <span>
 #include <vector>
 
 /** Structure for send/recv meshes. */
@@ -66,22 +66,20 @@ class p3m_send_mesh {
   std::vector<double> recv_grid;
 
 public:
-  void resize(const boost::mpi::communicator &comm,
-              const P3MLocalMesh &local_mesh);
-  void gather_grid(Utils::Span<double *> meshes,
-                   const boost::mpi::communicator &comm,
-                   const Utils::Vector3i &dim);
-  void gather_grid(double *mesh, const boost::mpi::communicator &comm,
-                   const Utils::Vector3i &dim) {
-    gather_grid(Utils::make_span(&mesh, 1), comm, dim);
+  void resize(boost::mpi::communicator const &comm,
+              P3MLocalMesh const &local_mesh);
+  void gather_grid(boost::mpi::communicator const &comm,
+                   std::span<double *> meshes, Utils::Vector3i const &dim);
+  void gather_grid(boost::mpi::communicator const &comm, double *mesh,
+                   Utils::Vector3i const &dim) {
+    gather_grid(comm, std::span(&mesh, 1u), dim);
   }
-  void spread_grid(Utils::Span<double *> meshes,
-                   const boost::mpi::communicator &comm,
-                   const Utils::Vector3i &dim);
-  void spread_grid(double *mesh, const boost::mpi::communicator &comm,
-                   const Utils::Vector3i &dim) {
-    spread_grid(Utils::make_span(&mesh, 1), comm, dim);
+  void spread_grid(boost::mpi::communicator const &comm,
+                   std::span<double *> meshes, Utils::Vector3i const &dim);
+  void spread_grid(boost::mpi::communicator const &comm, double *mesh,
+                   Utils::Vector3i const &dim) {
+    spread_grid(comm, std::span(&mesh, 1u), dim);
   }
 };
-#endif
-#endif
+
+#endif // defined(P3M) or defined(DP3M)
diff --git a/src/core/particle_node.cpp b/src/core/particle_node.cpp
index 3238d86631a..8bd95199b4d 100644
--- a/src/core/particle_node.cpp
+++ b/src/core/particle_node.cpp
@@ -30,7 +30,6 @@
 #include "system/System.hpp"
 
 #include <utils/Cache.hpp>
-#include <utils/Span.hpp>
 #include <utils/Vector.hpp>
 #include <utils/keys.hpp>
 #include <utils/mpi/gatherv.hpp>
@@ -40,14 +39,12 @@
 #include <boost/mpi/collectives/gather.hpp>
 #include <boost/mpi/collectives/reduce.hpp>
 #include <boost/mpi/collectives/scatter.hpp>
-#include <boost/optional.hpp>
-#include <boost/range/algorithm/sort.hpp>
-#include <boost/range/numeric.hpp>
 
 #include <algorithm>
 #include <cmath>
 #include <functional>
 #include <iterator>
+#include <span>
 #include <stdexcept>
 #include <string>
 #include <unordered_map>
@@ -221,7 +218,7 @@ REGISTER_CALLBACK(mpi_get_particles_local)
  *
  * @returns The particle list.
  */
-static std::vector<Particle> mpi_get_particles(Utils::Span<const int> ids) {
+static std::vector<Particle> mpi_get_particles(std::span<const int> ids) {
   mpi_call(mpi_get_particles_local);
   /* Return value */
   std::vector<Particle> parts(ids.size());
@@ -264,7 +261,7 @@ static std::vector<Particle> mpi_get_particles(Utils::Span<const int> ids) {
   return parts;
 }
 
-void prefetch_particle_data(Utils::Span<const int> in_ids) {
+void prefetch_particle_data(std::span<const int> in_ids) {
   /* Nothing to do on a single node. */
   // NOLINTNEXTLINE(clang-analyzer-core.NonNullParamChecker)
   if (comm_cart.size() == 1)
@@ -422,10 +419,9 @@ static void clear_particle_type_map() {
  * largest id.
  */
 static int calculate_max_seen_id() {
-  return boost::accumulate(particle_node, -1,
-                           [](int max, const std::pair<int, int> &kv) {
-                             return std::max(max, kv.first);
-                           });
+  return std::accumulate(
+      particle_node.begin(), particle_node.end(), -1,
+      [](int max, auto const &kv) { return std::max(max, kv.first); });
 }
 
 /**
@@ -586,7 +582,7 @@ std::vector<int> get_particle_ids() {
     build_particle_node();
 
   auto ids = Utils::keys(particle_node);
-  boost::sort(ids);
+  std::ranges::sort(ids);
 
   return ids;
 }
diff --git a/src/core/particle_node.hpp b/src/core/particle_node.hpp
index 76b7c6567c1..f41a04ff6d7 100644
--- a/src/core/particle_node.hpp
+++ b/src/core/particle_node.hpp
@@ -30,10 +30,10 @@
 
 #include "Particle.hpp"
 
-#include <utils/Span.hpp>
 #include <utils/Vector.hpp>
 
 #include <cstddef>
+#include <span>
 #include <vector>
 
 namespace type_tracking {
@@ -60,7 +60,7 @@ const Particle &get_particle_data(int p_id);
  *
  * @param ids Ids of the particles that should be fetched.
  */
-void prefetch_particle_data(Utils::Span<const int> ids);
+void prefetch_particle_data(std::span<const int> ids);
 
 /** @brief Invalidate the fetch cache for get_particle_data. */
 void invalidate_fetch_cache();
diff --git a/src/core/polymer.cpp b/src/core/polymer.cpp
index ad7c67244da..02d3b3b7c25 100644
--- a/src/core/polymer.cpp
+++ b/src/core/polymer.cpp
@@ -39,23 +39,23 @@
 #include "system/System.hpp"
 
 #include <utils/Vector.hpp>
-#include <utils/constants.hpp>
 #include <utils/math/vec_rotate.hpp>
 
 #include <boost/mpi/collectives/all_reduce.hpp>
-#include <boost/optional.hpp>
 
 #include <cmath>
 #include <cstddef>
 #include <functional>
 #include <memory>
+#include <numbers>
+#include <optional>
 #include <stdexcept>
 #include <vector>
 
 template <class RNG>
 static Utils::Vector3d random_position(BoxGeometry const &box_geo, RNG &rng) {
   Utils::Vector3d v;
-  for (int i = 0; i < 3; ++i)
+  for (auto i = 0u; i < 3u; ++i)
     v[i] = box_geo.length()[i] * rng();
   return v;
 }
@@ -63,7 +63,7 @@ static Utils::Vector3d random_position(BoxGeometry const &box_geo, RNG &rng) {
 template <class RNG> static Utils::Vector3d random_unit_vector(RNG &rng) {
   Utils::Vector3d v;
   double const phi = acos(1. - 2. * rng());
-  double const theta = 2. * Utils::pi() * rng();
+  double const theta = 2. * std::numbers::pi * rng();
   v[0] = sin(phi) * cos(theta);
   v[1] = sin(phi) * sin(theta);
   v[2] = cos(phi);
@@ -192,15 +192,15 @@ draw_polymer_positions(System::System const &system, int const n_polymers,
 
   /* Try up to max_tries times to draw a valid position */
   auto draw_valid_monomer_position =
-      [&](int p, int m) -> boost::optional<Utils::Vector3d> {
-    for (unsigned i = 0; i < max_tries; i++) {
+      [&](int p, int m) -> std::optional<Utils::Vector3d> {
+    for (auto i = 0; i < max_tries; i++) {
       auto const trial_pos = draw_monomer_position(p, m);
       if (is_valid_pos(trial_pos)) {
         return trial_pos;
       }
     }
 
-    return {};
+    return std::nullopt;
   };
 
   // create remaining monomers' positions by backtracking.
diff --git a/src/core/pressure.cpp b/src/core/pressure.cpp
index 0a06a6a2065..a0e14ca5b91 100644
--- a/src/core/pressure.cpp
+++ b/src/core/pressure.cpp
@@ -34,13 +34,13 @@
 #include "system/System.hpp"
 #include "virtual_sites/relative.hpp"
 
-#include <utils/Span.hpp>
 #include <utils/Vector.hpp>
 #include <utils/flatten.hpp>
 
-#include <boost/range/algorithm/copy.hpp>
-
+#include <algorithm>
+#include <cstddef>
 #include <memory>
+#include <span>
 
 namespace System {
 std::shared_ptr<Observable_stat> System::calculate_pressure() {
@@ -70,16 +70,16 @@ std::shared_ptr<Observable_stat> System::calculate_pressure() {
   short_range_loop(
       [this, coulomb_force_kernel_ptr = get_ptr(coulomb_force_kernel),
        &obs_pressure](Particle const &p1, int bond_id,
-                      Utils::Span<Particle *> partners) {
+                      std::span<Particle *> partners) {
         auto const &iaparams = *bonded_ia_params.at(bond_id);
         auto const result = calc_bonded_pressure_tensor(
             iaparams, p1, partners, *box_geo, coulomb_force_kernel_ptr);
         if (result) {
-          auto const &tensor = result.get();
+          auto const &tensor = result.value();
           /* pressure tensor part */
-          for (int k = 0; k < 3; k++)
-            for (int l = 0; l < 3; l++)
-              obs_pressure.bonded_contribution(bond_id)[k * 3 + l] +=
+          for (std::size_t k = 0u; k < 3u; k++)
+            for (std::size_t l = 0u; l < 3u; l++)
+              obs_pressure.bonded_contribution(bond_id)[k * 3u + l] +=
                   tensor(k, l);
 
           return false;
@@ -101,7 +101,7 @@ std::shared_ptr<Observable_stat> System::calculate_pressure() {
 #ifdef ELECTROSTATICS
   /* calculate k-space part of electrostatic interaction. */
   auto const coulomb_pressure = coulomb.calc_pressure_long_range(local_parts);
-  boost::copy(coulomb_pressure, obs_pressure.coulomb.begin() + 9);
+  std::ranges::copy(coulomb_pressure, obs_pressure.coulomb.begin() + 9u);
 #endif
 #ifdef DIPOLES
   /* calculate k-space part of magnetostatic interaction. */
@@ -111,8 +111,8 @@ std::shared_ptr<Observable_stat> System::calculate_pressure() {
 #ifdef VIRTUAL_SITES_RELATIVE
   if (!obs_pressure.virtual_sites.empty()) {
     auto const vs_pressure = vs_relative_pressure_tensor(*cell_structure);
-    boost::copy(Utils::flatten(vs_pressure),
-                obs_pressure.virtual_sites.begin());
+    std::ranges::copy(Utils::flatten(vs_pressure),
+                      obs_pressure.virtual_sites.begin());
   }
 #endif
 
diff --git a/src/core/pressure_inline.hpp b/src/core/pressure_inline.hpp
index 924440b3463..91ff4a26d07 100644
--- a/src/core/pressure_inline.hpp
+++ b/src/core/pressure_inline.hpp
@@ -34,13 +34,13 @@
 #include "exclusions.hpp"
 #include "forces_inline.hpp"
 
-#include <utils/Span.hpp>
 #include <utils/Vector.hpp>
 #include <utils/math/tensor_product.hpp>
 
-#include <boost/optional.hpp>
 #include <boost/variant.hpp>
 
+#include <optional>
+#include <span>
 #include <string>
 #include <tuple>
 
@@ -64,7 +64,7 @@ inline void add_non_bonded_pair_virials(
   if (do_nonbonded(p1, p2))
 #endif
   {
-    auto const force = calc_central_radial_force(p1, p2, ia_params, d, dist).f +
+    auto const force = calc_central_radial_force(ia_params, d, dist).f +
                        calc_central_radial_charge_force(p1, p2, ia_params, d,
                                                         dist, kernel_forces)
                            .f +
@@ -79,9 +79,9 @@ inline void add_non_bonded_pair_virials(
     /* real space Coulomb */
     auto const p_coulomb = (*kernel_pressure)(p1.q() * p2.q(), d, dist);
 
-    for (int i = 0; i < 3; i++) {
-      for (int j = 0; j < 3; j++) {
-        obs_pressure.coulomb[i * 3 + j] += p_coulomb(i, j);
+    for (std::size_t i = 0u; i < 3u; i++) {
+      for (std::size_t j = 0u; j < 3u; j++) {
+        obs_pressure.coulomb[i * 3u + j] += p_coulomb(i, j);
       }
     }
   }
@@ -96,7 +96,7 @@ inline void add_non_bonded_pair_virials(
 #endif // DIPOLES
 }
 
-inline boost::optional<Utils::Matrix<double, 3, 3>>
+inline std::optional<Utils::Matrix<double, 3, 3>>
 calc_bonded_virial_pressure_tensor(
     Bonded_IA_Parameters const &iaparams, Particle const &p1,
     Particle const &p2, BoxGeometry const &box_geo,
@@ -104,7 +104,7 @@ calc_bonded_virial_pressure_tensor(
   auto const dx = box_geo.get_mi_vector(p1.pos(), p2.pos());
   auto const result = calc_bond_pair_force(p1, p2, iaparams, dx, kernel);
   if (result) {
-    auto const &force = result.get();
+    auto const &force = result.value();
 
     return Utils::tensor_product(force, dx);
   }
@@ -112,7 +112,7 @@ calc_bonded_virial_pressure_tensor(
   return {};
 }
 
-inline boost::optional<Utils::Matrix<double, 3, 3>>
+inline std::optional<Utils::Matrix<double, 3, 3>>
 calc_bonded_three_body_pressure_tensor(Bonded_IA_Parameters const &iaparams,
                                        Particle const &p1, Particle const &p2,
                                        Particle const &p3,
@@ -130,7 +130,7 @@ calc_bonded_three_body_pressure_tensor(Bonded_IA_Parameters const &iaparams,
         calc_bonded_three_body_force(iaparams, box_geo, p1, p2, p3);
     if (result) {
       Utils::Vector3d force2, force3;
-      std::tie(std::ignore, force2, force3) = result.get();
+      std::tie(std::ignore, force2, force3) = result.value();
 
       return Utils::tensor_product(force2, dx21) +
              Utils::tensor_product(force3, dx31);
@@ -145,9 +145,9 @@ calc_bonded_three_body_pressure_tensor(Bonded_IA_Parameters const &iaparams,
   return {};
 }
 
-inline boost::optional<Utils::Matrix<double, 3, 3>> calc_bonded_pressure_tensor(
+inline std::optional<Utils::Matrix<double, 3, 3>> calc_bonded_pressure_tensor(
     Bonded_IA_Parameters const &iaparams, Particle const &p1,
-    Utils::Span<Particle *> partners, BoxGeometry const &box_geo,
+    std::span<Particle *> partners, BoxGeometry const &box_geo,
     Coulomb::ShortRangeForceKernel::kernel_type const *kernel) {
   switch (number_of_partners(iaparams)) {
   case 1:
@@ -174,7 +174,7 @@ inline void add_kinetic_virials(Particle const &p1,
     return;
 
   /* kinetic pressure */
-  for (int k = 0; k < 3; k++)
-    for (int l = 0; l < 3; l++)
-      obs_pressure.kinetic[k * 3 + l] += p1.v()[k] * p1.v()[l] * p1.mass();
+  for (std::size_t k = 0u; k < 3u; k++)
+    for (std::size_t l = 0u; l < 3u; l++)
+      obs_pressure.kinetic[k * 3u + l] += p1.v()[k] * p1.v()[l] * p1.mass();
 }
diff --git a/src/core/random.hpp b/src/core/random.hpp
index 5c350f5cda2..3bfc0d5a4c2 100644
--- a/src/core/random.hpp
+++ b/src/core/random.hpp
@@ -27,13 +27,13 @@
  */
 
 #include <utils/Vector.hpp>
-#include <utils/constants.hpp>
 #include <utils/u32_to_u64.hpp>
 #include <utils/uniform.hpp>
 
 #include <Random123/philox.h>
 
 #include <cstddef>
+#include <numbers>
 #include <random>
 #include <vector>
 
@@ -165,18 +165,17 @@ auto noise_gaussian(uint64_t counter, uint32_t seed, int key1, int key2 = 0) {
   // optimizations: the modulo is cached (logarithms are expensive), the
   // sin/cos are evaluated simultaneously by gcc or separately by Clang
   Utils::VectorXd<N> noise{};
-  constexpr double two_pi = 2.0 * Utils::pi();
   {
-    auto const modulo = sqrt(-2.0 * log(u[0]));
-    auto const angle = two_pi * u[1];
+    auto const modulo = sqrt(-2. * log(u[0]));
+    auto const angle = 2. * std::numbers::pi * u[1];
     noise[0] = modulo * cos(angle);
     if (N > 1) {
       noise[1] = modulo * sin(angle);
     }
   }
   if (N > 2) {
-    auto const modulo = sqrt(-2.0 * log(u[2]));
-    auto const angle = two_pi * u[3];
+    auto const modulo = sqrt(-2. * log(u[2]));
+    auto const angle = 2. * std::numbers::pi * u[3];
     noise[2] = modulo * cos(angle);
     if (N > 3) {
       noise[3] = modulo * sin(angle);
diff --git a/src/core/rattle.cpp b/src/core/rattle.cpp
index 631926784b0..47a1dfd19d7 100644
--- a/src/core/rattle.cpp
+++ b/src/core/rattle.cpp
@@ -35,6 +35,10 @@
 #include <boost/mpi/collectives/all_reduce.hpp>
 #include <boost/range/algorithm.hpp>
 
+#include <cmath>
+#include <functional>
+#include <span>
+
 /**
  * @brief copy current position
  *
@@ -108,20 +112,19 @@ static bool compute_correction_vector(CellStructure &cs,
                                       BoxGeometry const &box_geo,
                                       Kernel kernel) {
   bool correction = false;
-  cs.bond_loop(
-      [&correction, &kernel, &box_geo](Particle &p1, int bond_id,
-                                       Utils::Span<Particle *> partners) {
-        auto const &iaparams = *bonded_ia_params.at(bond_id);
-
-        if (auto const *bond = boost::get<RigidBond>(&iaparams)) {
-          auto const corrected = kernel(*bond, box_geo, p1, *partners[0]);
-          if (corrected)
-            correction = true;
-        }
-
-        /* Rigid bonds cannot break */
-        return false;
-      });
+  cs.bond_loop([&correction, &kernel, &box_geo](
+                   Particle &p1, int bond_id, std::span<Particle *> partners) {
+    auto const &iaparams = *bonded_ia_params.at(bond_id);
+
+    if (auto const *bond = boost::get<RigidBond>(&iaparams)) {
+      auto const corrected = kernel(*bond, box_geo, p1, *partners[0]);
+      if (corrected)
+        correction = true;
+    }
+
+    /* Rigid bonds cannot break */
+    return false;
+  });
 
   return correction;
 }
@@ -209,10 +212,8 @@ static bool calculate_velocity_correction(RigidBond const &ia_params,
  * @brief Apply velocity corrections
  *
  * @param particles particle range
- * @param box_geo Box geometry
  */
-static void apply_velocity_correction(ParticleRange const &particles,
-                                      BoxGeometry const &box_geo) {
+static void apply_velocity_correction(ParticleRange const &particles) {
   boost::for_each(particles,
                   [](Particle &p) { p.v() += p.rattle_params().correction; });
 }
@@ -237,7 +238,7 @@ void correct_velocity_shake(CellStructure &cs, BoxGeometry const &box_geo) {
 
     cs.ghosts_reduce_rattle_correction();
 
-    apply_velocity_correction(particles, box_geo);
+    apply_velocity_correction(particles);
     cs.ghosts_update(Cells::DATA_PART_MOMENTUM);
   }
 
diff --git a/src/core/reaction_methods/ReactionAlgorithm.cpp b/src/core/reaction_methods/ReactionAlgorithm.cpp
index e18d390a74c..341fc0a191b 100644
--- a/src/core/reaction_methods/ReactionAlgorithm.cpp
+++ b/src/core/reaction_methods/ReactionAlgorithm.cpp
@@ -29,7 +29,6 @@
 #include "system/System.hpp"
 
 #include <utils/Vector.hpp>
-#include <utils/constants.hpp>
 #include <utils/contains.hpp>
 
 #include <boost/mpi/collectives/all_reduce.hpp>
@@ -43,6 +42,7 @@
 #include <iterator>
 #include <limits>
 #include <map>
+#include <numbers>
 #include <optional>
 #include <stdexcept>
 #include <string>
@@ -336,8 +336,7 @@ void ReactionAlgorithm::check_exclusion_range(int p_id, int p_type) {
   if (neighbor_search_order_n) {
     auto all_ids = get_particle_ids_parallel();
     /* remove the inserted particle id */
-    all_ids.erase(std::remove(all_ids.begin(), all_ids.end(), p_id),
-                  all_ids.end());
+    std::erase(all_ids, p_id);
     particle_ids = all_ids;
   } else {
     auto &system = System::get_system();
@@ -465,7 +464,7 @@ Utils::Vector3d ReactionAlgorithm::get_random_position_in_box() {
     auto const random_radius =
         m_cyl_radius * std::sqrt(m_uniform_real_distribution(m_generator));
     auto const random_phi =
-        2. * Utils::pi() * m_uniform_real_distribution(m_generator);
+        2. * std::numbers::pi * m_uniform_real_distribution(m_generator);
     out_pos[0] = m_cyl_x + random_radius * cos(random_phi);
     out_pos[1] = m_cyl_y + random_radius * sin(random_phi);
     out_pos[2] = box_geo.length()[2] * m_uniform_real_distribution(m_generator);
diff --git a/src/core/reaction_methods/tests/CMakeLists.txt b/src/core/reaction_methods/tests/CMakeLists.txt
index 7dd67a9e634..e63fa9630a3 100644
--- a/src/core/reaction_methods/tests/CMakeLists.txt
+++ b/src/core/reaction_methods/tests/CMakeLists.txt
@@ -1,4 +1,5 @@
-# Copyright (C) 2021-2022 The ESPResSo project
+#
+# Copyright (C) 2021-2024 The ESPResSo project
 #
 # This file is part of ESPResSo.
 #
@@ -14,14 +15,13 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
 
-include(unit_test)
+include(espresso_unit_test)
 
-unit_test(NAME SingleReaction_test SRC SingleReaction_test.cpp DEPENDS
-          espresso::core)
-unit_test(NAME ReactionAlgorithm_test SRC ReactionAlgorithm_test.cpp DEPENDS
-          espresso::core Boost::mpi MPI::MPI_CXX NUM_PROC 2)
-unit_test(NAME particle_tracking_test SRC particle_tracking_test.cpp DEPENDS
-          espresso::core Boost::mpi MPI::MPI_CXX)
-unit_test(NAME reaction_methods_utils_test SRC reaction_methods_utils_test.cpp
-          DEPENDS espresso::core)
+espresso_unit_test(SRC SingleReaction_test.cpp DEPENDS espresso::core)
+espresso_unit_test(SRC ReactionAlgorithm_test.cpp DEPENDS espresso::core
+                   Boost::mpi MPI::MPI_CXX NUM_PROC 2)
+espresso_unit_test(SRC particle_tracking_test.cpp DEPENDS espresso::core
+                   Boost::mpi MPI::MPI_CXX)
+espresso_unit_test(SRC reaction_methods_utils_test.cpp DEPENDS espresso::core)
diff --git a/src/core/scafacos/ScafacosContext.cpp b/src/core/scafacos/ScafacosContext.cpp
index 342dce89f70..8cd66b601ce 100644
--- a/src/core/scafacos/ScafacosContext.cpp
+++ b/src/core/scafacos/ScafacosContext.cpp
@@ -43,9 +43,9 @@ get_system_params() {
   auto const &system = System::get_system();
   auto const &box_geo = *system.box_geo;
   auto const &cell_structure = *system.cell_structure;
-  auto periodicity = Utils::Vector3i{static_cast<int>(box_geo.periodic(0)),
-                                     static_cast<int>(box_geo.periodic(1)),
-                                     static_cast<int>(box_geo.periodic(2))};
+  auto periodicity = Utils::Vector3i{static_cast<int>(box_geo.periodic(0u)),
+                                     static_cast<int>(box_geo.periodic(1u)),
+                                     static_cast<int>(box_geo.periodic(2u))};
   auto const n_part = boost::mpi::all_reduce(
       comm_cart, cell_structure.local_particles().size(), std::plus<>());
   return {box_geo.length(), periodicity, n_part};
diff --git a/src/core/stokesian_dynamics/sd_interface.cpp b/src/core/stokesian_dynamics/sd_interface.cpp
index 14685359462..d4464773f6d 100644
--- a/src/core/stokesian_dynamics/sd_interface.cpp
+++ b/src/core/stokesian_dynamics/sd_interface.cpp
@@ -34,7 +34,6 @@
 #include <utils/mpi/gather_buffer.hpp>
 #include <utils/mpi/scatter_buffer.hpp>
 
-#include <boost/range/algorithm.hpp>
 #include <boost/serialization/is_bitwise_serializable.hpp>
 
 #include <algorithm>
@@ -61,9 +60,9 @@ struct SD_particle_data {
   ParticleForce ext_force;
 
   template <class Archive> void serialize(Archive &ar, long int /* version */) {
-    ar &type;
-    ar &pos;
-    ar &ext_force;
+    ar & type;
+    ar & pos;
+    ar & ext_force;
   }
 };
 
@@ -132,8 +131,9 @@ void propagate_vel_pos_sd(ParticleRangeStokesian const &particles,
   static std::vector<SD_particle_data> parts_buffer{};
 
   parts_buffer.clear();
-  boost::transform(particles, std::back_inserter(parts_buffer),
-                   [](auto const &p) { return SD_particle_data(p); });
+  std::transform(particles.begin(), particles.end(),
+                 std::back_inserter(parts_buffer),
+                 [](auto const &p) { return SD_particle_data(p); });
   Utils::Mpi::gather_buffer(parts_buffer, ::comm_cart, 0);
 
   /* Buffer that holds local particle data, and all particles on the head
diff --git a/src/core/system/GpuParticleData.cpp b/src/core/system/GpuParticleData.cpp
index 89d95f9dcb8..75f4df9e26d 100644
--- a/src/core/system/GpuParticleData.cpp
+++ b/src/core/system/GpuParticleData.cpp
@@ -28,7 +28,6 @@
 #include "cuda/CudaHostAllocator.hpp"
 #include "system/System.hpp"
 
-#include <utils/Span.hpp>
 #include <utils/Vector.hpp>
 #include <utils/mpi/gather_buffer.hpp>
 #include <utils/mpi/scatter_buffer.hpp>
@@ -37,6 +36,8 @@
 #include <boost/serialization/is_bitwise_serializable.hpp>
 #include <boost/serialization/split_free.hpp>
 
+#include <cstddef>
+#include <span>
 #include <vector>
 
 void GpuParticleData::enable_particle_transfer() {
@@ -84,7 +85,7 @@ BOOST_SERIALIZATION_SPLIT_FREE(GpuParticleData::GpuParticle)
 static void pack_particles(ParticleRange const &particles,
                            GpuParticleData::GpuParticle *buffer) {
   auto const &box = *System::get_system().box_geo;
-  unsigned long int i = 0u;
+  std::size_t i = 0u;
   for (auto const &p : particles) {
     buffer[i].p = static_cast<Utils::Vector3f>(box.folded_position(p.pos()));
 #ifdef DIPOLES
@@ -129,11 +130,11 @@ void GpuParticleData::gather_particle_data(
  *                this is only touched if ROTATION is active.
  */
 static void add_forces_and_torques(ParticleRange const &particles,
-                                   Utils::Span<const float> forces,
-                                   Utils::Span<const float> torques) {
-  unsigned long int i = 0u;
+                                   std::span<const float> forces,
+                                   std::span<const float> torques) {
+  std::size_t i = 0ul;
   for (auto &p : particles) {
-    for (unsigned long int j = 0u; j < 3u; j++) {
+    for (std::size_t j = 0ul; j < 3ul; j++) {
       p.force()[j] += static_cast<double>(forces[3ul * i + j]);
 #ifdef ROTATION
       p.torque()[j] += static_cast<double>(torques[3ul * i + j]);
@@ -155,8 +156,8 @@ static void add_forces_and_torques(ParticleRange const &particles,
  *                     relevant on the head node.
  */
 void GpuParticleData::particles_scatter_forces(
-    ParticleRange const &particles, Utils::Span<float> host_forces,
-    Utils::Span<float> host_torques) const {
+    ParticleRange const &particles, std::span<float> host_forces,
+    std::span<float> host_torques) const {
 
   auto const size = 3ul * particles.size();
   auto const n_elements = static_cast<int>(size);
diff --git a/src/core/system/GpuParticleData.hpp b/src/core/system/GpuParticleData.hpp
index a97755b5cd4..6a933286935 100644
--- a/src/core/system/GpuParticleData.hpp
+++ b/src/core/system/GpuParticleData.hpp
@@ -27,12 +27,12 @@
 #include "cuda/CudaHostAllocator.hpp"
 #include "system/Leaf.hpp"
 
-#include <utils/Span.hpp>
 #include <utils/Vector.hpp>
 
 #include <bitset>
 #include <cstddef>
 #include <memory>
+#include <span>
 
 /**
  * @brief Particle data communication manager for the GPU.
@@ -99,12 +99,12 @@ class GpuParticleData : public System::Leaf<GpuParticleData> {
                             pinned_vector<GpuParticle> &particle_data_host,
                             int this_node);
   void particles_scatter_forces(ParticleRange const &particles,
-                                Utils::Span<float> host_forces,
-                                Utils::Span<float> host_torques) const;
+                                std::span<float> host_forces,
+                                std::span<float> host_torques) const;
 
 public:
   GpuParticleData() = default;
-  ~GpuParticleData();
+  ~GpuParticleData() = default;
 
   void update() {
     if (m_need_particles_update and m_communication_enabled) {
diff --git a/src/core/system/GpuParticleData_cuda.cu b/src/core/system/GpuParticleData_cuda.cu
index 1c9794769d1..1c0be762de4 100644
--- a/src/core/system/GpuParticleData_cuda.cu
+++ b/src/core/system/GpuParticleData_cuda.cu
@@ -33,8 +33,6 @@
 #include "cuda/init.hpp"
 #include "cuda/utils.cuh"
 
-#include <utils/Span.hpp>
-
 #include <thrust/copy.h>
 #include <thrust/device_vector.h>
 
@@ -43,6 +41,7 @@
 #include <cstddef>
 #include <cstdio>
 #include <memory>
+#include <span>
 
 #if defined(OMPI_MPI_H) || defined(_MPI_H)
 #error CU-file includes mpi.h! This should not happen!
@@ -139,11 +138,11 @@ public:
     }
   }
 #endif
-  Utils::Span<float> get_particle_forces_host_span() {
+  std::span<float> get_particle_forces_host_span() {
     return {particle_forces_host.data(), particle_forces_host.size()};
   }
 #ifdef ROTATION
-  Utils::Span<float> get_particle_torques_host_span() {
+  std::span<float> get_particle_torques_host_span() {
     return {particle_torques_host.data(), particle_torques_host.size()};
   }
 #endif
@@ -153,8 +152,6 @@ void GpuParticleData::initialize() {
   m_data = GpuParticleData::Storage::make_shared(get_system().cleanup_queue);
 }
 
-GpuParticleData::~GpuParticleData() {}
-
 std::size_t GpuParticleData::n_particles() const {
   return m_data->particle_data_device.size();
 }
@@ -199,12 +196,11 @@ void GpuParticleData::enable_property(std::size_t property) {
 }
 
 bool GpuParticleData::has_compatible_device_impl() const {
-  auto result = true;
-  try {
+  auto result = false;
+  invoke_skip_cuda_exceptions([&result]() {
     cuda_check_device();
-  } catch (cuda_runtime_error const &err) {
-    result = false;
-  }
+    result = true;
+  });
   return result;
 }
 
@@ -215,8 +211,7 @@ void GpuParticleData::gpu_init_particle_comm() {
   try {
     cuda_check_device();
   } catch (cuda_runtime_error const &err) {
-    fprintf(stderr, "ERROR: %s\n", err.what());
-    errexit();
+    throw cuda_fatal_error(err.what());
   }
   m_data->realloc_device_memory();
 }
@@ -275,7 +270,7 @@ void GpuParticleData::copy_forces_to_host(ParticleRange const &particles,
 #ifdef ROTATION
     auto torques_buffer = m_data->get_particle_torques_host_span();
 #else
-    auto torques_buffer = Utils::Span<float>{nullptr, std::size_t{0ul}};
+    auto torques_buffer = std::span<float>();
 #endif
 
     // add forces and torques to the particles
diff --git a/src/core/system/Leaf.hpp b/src/core/system/Leaf.hpp
index 25633b15bbb..d6c911457c2 100644
--- a/src/core/system/Leaf.hpp
+++ b/src/core/system/Leaf.hpp
@@ -54,7 +54,7 @@ template <typename Class> class Leaf {
     m_system = system;
   }
 
-  void detach_system(std::shared_ptr<System> const &system) {
+  void detach_system([[maybe_unused]] std::shared_ptr<System> const &system) {
     assert(system);
     assert(not m_system.expired());
     assert(system == m_system.lock());
diff --git a/src/core/system/System.cpp b/src/core/system/System.cpp
index a87f56ca344..b44db5df266 100644
--- a/src/core/system/System.cpp
+++ b/src/core/system/System.cpp
@@ -44,7 +44,14 @@
 #include <utils/Vector.hpp>
 #include <utils/mpi/all_compare.hpp>
 
+#include <boost/mpi/collectives/all_reduce.hpp>
+
+#include <algorithm>
+#include <cstddef>
+#include <functional>
 #include <memory>
+#include <stdexcept>
+#include <utility>
 
 namespace System {
 
@@ -131,7 +138,17 @@ void System::set_min_global_cut(double value) {
 
 void System::set_cell_structure_topology(CellStructureType topology) {
   if (topology == CellStructureType::REGULAR) {
-    cell_structure->set_regular_decomposition(get_interaction_range());
+    if (cell_structure->decomposition_type() == CellStructureType::REGULAR) {
+      // get fully connected info from existing regular decomposition
+      auto &old_regular_decomposition =
+          dynamic_cast<RegularDecomposition const &>(
+              std::as_const(*cell_structure).decomposition());
+      cell_structure->set_regular_decomposition(
+          get_interaction_range(),
+          old_regular_decomposition.fully_connected_boundary());
+    } else { // prev. decomposition is not a regular decomposition
+      cell_structure->set_regular_decomposition(get_interaction_range(), {});
+    }
   } else if (topology == CellStructureType::NSQUARE) {
     cell_structure->set_atom_decomposition();
   } else {
@@ -167,6 +184,20 @@ void System::on_boxl_change(bool skip_method_adaption) {
   Constraints::constraints.on_boxl_change();
 }
 
+void System::veto_boxl_change(bool skip_particle_checks) const {
+  // Throws if particles exist (unless skipped), then polls the other vetoes.
+  auto const check_particles = not skip_particle_checks;
+  // The reduction sums the local particle counts across ::comm_cart.
+  if (check_particles and boost::mpi::all_reduce(::comm_cart,
+          cell_structure->local_particles().size(), std::plus<>()) > 0ul) {
+    throw std::runtime_error(
+        "Cannot reset the box length when particles are present");
+  }
+  Constraints::constraints.veto_boxl_change();
+  lb.veto_boxl_change();
+  ek.veto_boxl_change();
+}
+
 void System::on_node_grid_change() {
   update_local_geo();
   lb.on_node_grid_change();
diff --git a/src/core/system/System.hpp b/src/core/system/System.hpp
index 377cd8f1db7..2bf9dbeb5b5 100644
--- a/src/core/system/System.hpp
+++ b/src/core/system/System.hpp
@@ -196,7 +196,7 @@ class System : public std::enable_shared_from_this<System> {
   /** @brief Calculate initial particle forces from active thermostats. */
   void thermostat_force_init();
   /** @brief Calculate particle-lattice interactions. */
-  void lb_couple_particles(double time_step);
+  void lb_couple_particles();
 
   /** \name Hook procedures
    *  These procedures are called if several significant changes to
@@ -241,6 +241,7 @@ class System : public std::enable_shared_from_this<System> {
    *  initialized immediately (P3M etc.).
    */
   void on_observable_calc();
+  void veto_boxl_change(bool skip_particle_checks = false) const;
   /**@}*/
 
   /**
diff --git a/src/core/tuning.cpp b/src/core/tuning.cpp
index 89cd339ea35..c5ec28fa3e6 100644
--- a/src/core/tuning.cpp
+++ b/src/core/tuning.cpp
@@ -32,8 +32,6 @@
 
 #include <boost/mpi/collectives/all_reduce.hpp>
 #include <boost/mpi/collectives/broadcast.hpp>
-#include <boost/range/algorithm/max_element.hpp>
-#include <boost/range/algorithm/min_element.hpp>
 
 #include <mpi.h>
 
@@ -135,8 +133,8 @@ void System::tune_verlet_skin(double min_skin, double max_skin, double tol,
    * the maximal range that can be supported by the cell system, but
    * never larger than half the box size. */
   auto const max_permissible_skin = std::min(
-      *boost::min_element(cell_structure->max_cutoff()) - maximal_cutoff(),
-      0.5 * *boost::max_element(box_geo->length()));
+      std::ranges::min(cell_structure->max_cutoff()) - maximal_cutoff(),
+      0.5 * std::ranges::max(box_geo->length()));
 
   if (adjust_max_skin and max_skin > max_permissible_skin)
     b = max_permissible_skin;
diff --git a/src/core/unit_tests/BondList_test.cpp b/src/core/unit_tests/BondList_test.cpp
index 29b17360420..3651ff06fee 100644
--- a/src/core/unit_tests/BondList_test.cpp
+++ b/src/core/unit_tests/BondList_test.cpp
@@ -25,6 +25,7 @@
 
 #include <boost/archive/text_iarchive.hpp>
 #include <boost/archive/text_oarchive.hpp>
+#include <boost/range/algorithm/equal.hpp>
 
 #include <array>
 #include <iterator>
diff --git a/src/core/unit_tests/BoxGeometry_test.cpp b/src/core/unit_tests/BoxGeometry_test.cpp
index 5c7b13d4571..062e9d0c671 100644
--- a/src/core/unit_tests/BoxGeometry_test.cpp
+++ b/src/core/unit_tests/BoxGeometry_test.cpp
@@ -32,22 +32,22 @@ BOOST_AUTO_TEST_CASE(periodicity_test) {
   {
     auto const box = BoxGeometry{};
 
-    BOOST_CHECK(box.periodic(0));
-    BOOST_CHECK(box.periodic(1));
-    BOOST_CHECK(box.periodic(2));
+    BOOST_CHECK(box.periodic(0u));
+    BOOST_CHECK(box.periodic(1u));
+    BOOST_CHECK(box.periodic(2u));
   }
 
   /* setter */
   {
     auto box = BoxGeometry{};
 
-    box.set_periodic(0, false);
-    BOOST_CHECK(not box.periodic(0));
-    box.set_periodic(1, false);
-    BOOST_CHECK(not box.periodic(1));
-    box.set_periodic(2, false);
-    BOOST_CHECK(not box.periodic(2));
-    BOOST_CHECK_THROW(box.set_periodic(3, false), std::out_of_range);
+    box.set_periodic(0u, false);
+    BOOST_CHECK(not box.periodic(0u));
+    box.set_periodic(1u, false);
+    BOOST_CHECK(not box.periodic(1u));
+    box.set_periodic(2u, false);
+    BOOST_CHECK(not box.periodic(2u));
+    BOOST_CHECK_THROW(box.set_periodic(3u, false), std::out_of_range);
   }
 }
 
diff --git a/src/core/unit_tests/CMakeLists.txt b/src/core/unit_tests/CMakeLists.txt
index 95b6c389a20..92d56587c4b 100644
--- a/src/core/unit_tests/CMakeLists.txt
+++ b/src/core/unit_tests/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2010-2022 The ESPResSo project
+# Copyright (C) 2010-2024 The ESPResSo project
 # Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
 #   Max-Planck-Institute for Polymer Research, Theory Group
 #
@@ -19,69 +19,59 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-include(unit_test)
+include(espresso_unit_test)
 
-# Add tests here
-unit_test(NAME RuntimeError_test SRC RuntimeError_test.cpp DEPENDS
-          Boost::serialization)
-unit_test(NAME RuntimeErrorCollector_test SRC RuntimeErrorCollector_test.cpp
-          DEPENDS espresso::core Boost::mpi MPI::MPI_CXX NUM_PROC 2)
-unit_test(NAME EspressoSystemStandAlone_parallel_test SRC
-          EspressoSystemStandAlone_test.cpp DEPENDS espresso::core Boost::mpi
-          MPI::MPI_CXX NUM_PROC 4)
-unit_test(NAME EspressoSystemStandAlone_serial_test SRC
-          EspressoSystemStandAlone_test.cpp DEPENDS espresso::core Boost::mpi
-          MPI::MPI_CXX NUM_PROC 1)
-unit_test(NAME EspressoSystem_test SRC EspressoSystem_test.cpp DEPENDS
-          espresso::core Boost::mpi NUM_PROC 2)
-unit_test(NAME ResourceCleanup_test SRC ResourceCleanup_test.cpp DEPENDS
-          espresso::core Boost::mpi NUM_PROC 2)
-unit_test(NAME MpiCallbacks_test SRC MpiCallbacks_test.cpp DEPENDS
-          espresso::utils Boost::mpi MPI::MPI_CXX NUM_PROC 2)
-unit_test(NAME ParticleIterator_test SRC ParticleIterator_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME p3m_test SRC p3m_test.cpp DEPENDS espresso::utils espresso::core)
-unit_test(NAME link_cell_test SRC link_cell_test.cpp DEPENDS espresso::utils)
-unit_test(NAME Particle_test SRC Particle_test.cpp DEPENDS espresso::utils
-          Boost::serialization)
-unit_test(NAME Particle_serialization_test SRC Particle_serialization_test.cpp
-          DEPENDS espresso::utils Boost::serialization)
-unit_test(NAME rotation_test SRC rotation_test.cpp DEPENDS espresso::utils
-          espresso::core)
-unit_test(NAME field_coupling_couplings SRC field_coupling_couplings_test.cpp
-          DEPENDS espresso::utils)
-unit_test(NAME field_coupling_fields SRC field_coupling_fields_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME field_coupling_force_field SRC
-          field_coupling_force_field_test.cpp DEPENDS espresso::utils)
-unit_test(NAME periodic_fold_test SRC periodic_fold_test.cpp)
-unit_test(NAME grid_test SRC grid_test.cpp DEPENDS espresso::core)
-unit_test(NAME lees_edwards_test SRC lees_edwards_test.cpp DEPENDS
-          espresso::core)
-unit_test(NAME BoxGeometry_test SRC BoxGeometry_test.cpp DEPENDS espresso::core)
-unit_test(NAME LocalBox_test SRC LocalBox_test.cpp DEPENDS espresso::core)
-unit_test(NAME Verlet_list_test SRC Verlet_list_test.cpp DEPENDS espresso::core
-          NUM_PROC 4)
-unit_test(NAME VerletCriterion_test SRC VerletCriterion_test.cpp DEPENDS
-          espresso::core)
-unit_test(NAME thermostats_test SRC thermostats_test.cpp DEPENDS espresso::core)
-unit_test(NAME random_test SRC random_test.cpp DEPENDS espresso::utils
-          Random123)
-unit_test(NAME BondList_test SRC BondList_test.cpp DEPENDS espresso::core)
-unit_test(NAME energy_test SRC energy_test.cpp DEPENDS espresso::core)
-unit_test(NAME bonded_interactions_map_test SRC
-          bonded_interactions_map_test.cpp DEPENDS espresso::core)
-unit_test(NAME bond_breakage_test SRC bond_breakage_test.cpp DEPENDS
-          espresso::core)
+espresso_unit_test(SRC RuntimeError_test.cpp DEPENDS Boost::serialization)
+espresso_unit_test(SRC RuntimeErrorCollector_test.cpp DEPENDS espresso::core
+                   Boost::mpi MPI::MPI_CXX NUM_PROC 2)
+espresso_unit_test(
+  NAME EspressoSystemStandAlone_parallel_test SRC
+  EspressoSystemStandAlone_test.cpp DEPENDS espresso::core Boost::mpi
+  MPI::MPI_CXX NUM_PROC 4)
+espresso_unit_test(
+  NAME EspressoSystemStandAlone_serial_test SRC
+  EspressoSystemStandAlone_test.cpp DEPENDS espresso::core Boost::mpi
+  MPI::MPI_CXX NUM_PROC 1)
+espresso_unit_test(SRC EspressoSystem_test.cpp DEPENDS espresso::core
+                   Boost::mpi NUM_PROC 2)
+espresso_unit_test(SRC ResourceCleanup_test.cpp DEPENDS espresso::core
+                   Boost::mpi NUM_PROC 2)
+espresso_unit_test(SRC MpiCallbacks_test.cpp DEPENDS espresso::utils Boost::mpi
+                   MPI::MPI_CXX NUM_PROC 2)
+espresso_unit_test(SRC ParticleIterator_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC link_cell_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC Particle_test.cpp DEPENDS espresso::utils
+                   Boost::serialization)
+espresso_unit_test(SRC Particle_serialization_test.cpp DEPENDS espresso::utils
+                   Boost::serialization)
+espresso_unit_test(SRC rotation_test.cpp DEPENDS espresso::utils espresso::core)
+espresso_unit_test(SRC field_coupling_couplings_test.cpp DEPENDS
+                   espresso::utils)
+espresso_unit_test(SRC field_coupling_fields_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC field_coupling_force_field_test.cpp DEPENDS
+                   espresso::utils)
+espresso_unit_test(SRC periodic_fold_test.cpp)
+espresso_unit_test(SRC grid_test.cpp DEPENDS espresso::core)
+espresso_unit_test(SRC lees_edwards_test.cpp DEPENDS espresso::core)
+espresso_unit_test(SRC BoxGeometry_test.cpp DEPENDS espresso::core)
+espresso_unit_test(SRC LocalBox_test.cpp DEPENDS espresso::core)
+espresso_unit_test(SRC Verlet_list_test.cpp DEPENDS espresso::core NUM_PROC 4)
+espresso_unit_test(SRC VerletCriterion_test.cpp DEPENDS espresso::core)
+espresso_unit_test(SRC thermostats_test.cpp DEPENDS espresso::core)
+espresso_unit_test(SRC random_test.cpp DEPENDS espresso::utils Random123)
+espresso_unit_test(SRC BondList_test.cpp DEPENDS espresso::core)
+espresso_unit_test(SRC energy_test.cpp DEPENDS espresso::core)
+espresso_unit_test(SRC bonded_interactions_map_test.cpp DEPENDS espresso::core)
+espresso_unit_test(SRC bond_breakage_test.cpp DEPENDS espresso::core)
 if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
   # AppleClang doesn't implement C++17's mathematical special functions
-  unit_test(NAME specfunc_test SRC specfunc_test.cpp DEPENDS espresso::utils
-            espresso::core)
+  espresso_unit_test(SRC specfunc_test.cpp DEPENDS espresso::utils
+                     espresso::core)
 endif()
-unit_test(NAME lb_particle_coupling_test SRC lb_particle_coupling_test.cpp
-          DEPENDS espresso::core Boost::mpi MPI::MPI_CXX NUM_PROC 2)
-unit_test(NAME ek_interface_test SRC ek_interface_test.cpp DEPENDS
-          espresso::core Boost::mpi MPI::MPI_CXX NUM_PROC 2)
+espresso_unit_test(SRC lb_particle_coupling_test.cpp DEPENDS espresso::core
+                   Boost::mpi MPI::MPI_CXX NUM_PROC 2)
+espresso_unit_test(SRC ek_interface_test.cpp DEPENDS espresso::core Boost::mpi
+                   MPI::MPI_CXX NUM_PROC 2)
 if(ESPRESSO_BUILD_WITH_WALBERLA)
   target_link_libraries(
     lb_particle_coupling_test PRIVATE espresso::walberla
@@ -89,3 +79,14 @@ if(ESPRESSO_BUILD_WITH_WALBERLA)
   target_link_libraries(ek_interface_test PRIVATE espresso::walberla
                                                   espresso::walberla::cpp_flags)
 endif()
+
+if(ESPRESSO_BUILD_WITH_FFTW)
+  espresso_unit_test(SRC p3m_test.cpp DEPENDS espresso::utils espresso::core)
+  espresso_unit_test(SRC fft_test.cpp DEPENDS espresso::utils espresso::core)
+  espresso_unit_test(SRC math_test.cpp DEPENDS espresso::utils espresso::core)
+endif()
+
+if(ESPRESSO_BUILD_WITH_CUDA)
+  espresso_unit_test(SRC cuda_test.cu DEPENDS espresso::utils espresso::core
+                     espresso::cuda)
+endif()
diff --git a/src/core/unit_tests/EspressoSystemStandAlone_test.cpp b/src/core/unit_tests/EspressoSystemStandAlone_test.cpp
index 19836a57bb7..ce9258b733c 100644
--- a/src/core/unit_tests/EspressoSystemStandAlone_test.cpp
+++ b/src/core/unit_tests/EspressoSystemStandAlone_test.cpp
@@ -49,6 +49,7 @@ namespace utf = boost::unit_test;
 #include "nonbonded_interactions/nonbonded_interaction_data.hpp"
 #include "observables/ParticleVelocities.hpp"
 #include "observables/PidObservable.hpp"
+#include "p3m/FFTBackendLegacy.hpp"
 #include "particle_node.hpp"
 #include "system/System.hpp"
 
@@ -59,8 +60,7 @@ namespace utf = boost::unit_test;
 
 #include <boost/mpi.hpp>
 #include <boost/mpi/collectives/all_reduce.hpp>
-#include <boost/optional.hpp>
-#include <boost/range/numeric.hpp>
+#include <boost/variant.hpp>
 
 #include <cassert>
 #include <cmath>
@@ -314,6 +314,7 @@ BOOST_FIXTURE_TEST_CASE(espresso_system_stand_alone, ParticleFactory) {
                              1e-3};
     auto solver =
         std::make_shared<CoulombP3M>(std::move(p3m), prefactor, 1, false, true);
+    solver->p3m.make_fft_instance<FFTBackendLegacy>(false);
     add_actor(comm, espresso::system, system.coulomb.impl->solver, solver,
               [&system]() { system.on_coulomb_change(); });
 
diff --git a/src/core/unit_tests/EspressoSystem_test.cpp b/src/core/unit_tests/EspressoSystem_test.cpp
index 7b66a3827bc..424744817dc 100644
--- a/src/core/unit_tests/EspressoSystem_test.cpp
+++ b/src/core/unit_tests/EspressoSystem_test.cpp
@@ -50,11 +50,10 @@
 /* Decorator to run a unit test depending on GPU availability. */
 boost::test_tools::assertion_result has_gpu(boost::unit_test::test_unit_id) {
   bool has_compatible_gpu = false;
-  try {
+  invoke_skip_cuda_exceptions([&]() {
     cuda_check_device();
     has_compatible_gpu = true;
-  } catch (cuda_runtime_error const &) {
-  }
+  });
   return has_compatible_gpu;
 }
 
diff --git a/src/core/unit_tests/LocalBox_test.cpp b/src/core/unit_tests/LocalBox_test.cpp
index a219e955d49..2247a154d5d 100644
--- a/src/core/unit_tests/LocalBox_test.cpp
+++ b/src/core/unit_tests/LocalBox_test.cpp
@@ -27,8 +27,7 @@
 #include <utils/Array.hpp>
 #include <utils/Vector.hpp>
 
-#include <boost/range/algorithm/equal.hpp>
-
+#include <algorithm>
 #include <limits>
 
 /* Check that the box corners and side length agree. */
@@ -59,7 +58,7 @@ BOOST_AUTO_TEST_CASE(constructors) {
 
     BOOST_CHECK(box.my_left() == lower_corner);
     BOOST_CHECK(box.length() == local_box_length);
-    BOOST_CHECK(boost::equal(boundaries, box.boundary()));
+    BOOST_CHECK(std::ranges::equal(boundaries, box.boundary()));
     BOOST_CHECK(box.cell_structure_type() == type);
     check_length(box);
   }
diff --git a/src/core/unit_tests/MpiCallbacks_test.cpp b/src/core/unit_tests/MpiCallbacks_test.cpp
index b189ddef224..ba11a619e8e 100644
--- a/src/core/unit_tests/MpiCallbacks_test.cpp
+++ b/src/core/unit_tests/MpiCallbacks_test.cpp
@@ -30,7 +30,6 @@
 
 #include <boost/mpi.hpp>
 #include <boost/mpi/environment.hpp>
-#include <boost/optional.hpp>
 
 #include <algorithm>
 #include <functional>
@@ -43,7 +42,7 @@ static bool called = false;
 BOOST_AUTO_TEST_CASE(invoke_test) {
   using Communication::detail::invoke;
 
-  auto f = [](int i, unsigned j) { return i + j; };
+  auto f = [](int i, unsigned j) { return i + static_cast<int>(j); };
 
   boost::mpi::communicator world;
   boost::mpi::packed_oarchive::buffer_type buff;
diff --git a/src/core/unit_tests/Particle_serialization_test.cpp b/src/core/unit_tests/Particle_serialization_test.cpp
index 0e6ebec1658..07efc2753aa 100644
--- a/src/core/unit_tests/Particle_serialization_test.cpp
+++ b/src/core/unit_tests/Particle_serialization_test.cpp
@@ -31,6 +31,7 @@
 #include <boost/archive/text_oarchive.hpp>
 #include <boost/hana.hpp>
 #include <boost/mpl/list.hpp>
+#include <boost/utility/identity_type.hpp>
 
 #include <cstddef>
 #include <regex>
@@ -145,14 +146,14 @@ class BitwiseSerializable {
 
   friend boost::serialization::access;
   template <class Archive> void serialize(Archive &ar, long int) {
-    ar &a &b;
+    ar & a & b;
     ar << c << d;
   }
 };
 
 class NotBitwiseSerializable {
   friend boost::serialization::access;
-  template <class Archive> void serialize(Archive &ar, long int) {}
+  template <class Archive> void serialize(Archive &, long int) {}
 };
 
 class MixedSerializable {
@@ -184,7 +185,7 @@ BOOST_AUTO_TEST_CASE(TraitChecker_test) {
   Checker::buffer_type buffer;
   Checker oa{buffer};
   Testing::BitwiseSerializable serializable;
-  oa &serializable;
+  oa & serializable;
   BOOST_REQUIRE_EQUAL(buffer.size(), 0);
   Testing::MixedSerializable mixed;
   oa | mixed;
@@ -216,7 +217,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(
     typename Checker::buffer_type buffer = {};
     Checker oa{buffer};
     Particle p;
-    oa &p;
+    oa & p;
     BOOST_TEST(buffer == buffer_ref, boost::test_tools::per_element());
   }
 
diff --git a/src/core/unit_tests/Particle_test.cpp b/src/core/unit_tests/Particle_test.cpp
index a03c2c2e514..c13fb55dd56 100644
--- a/src/core/unit_tests/Particle_test.cpp
+++ b/src/core/unit_tests/Particle_test.cpp
@@ -27,7 +27,6 @@
 #include "PropagationMode.hpp"
 #include "config/config.hpp"
 
-#include <utils/Span.hpp>
 #include <utils/compact_vector.hpp>
 #include <utils/serialization/memcpy_archive.hpp>
 
@@ -122,7 +121,7 @@ BOOST_AUTO_TEST_CASE(properties_serialization) {
   prop.identity = 1234;
 
   {
-    auto oa = Utils::MemcpyOArchive{Utils::make_span(buf)};
+    auto oa = Utils::MemcpyOArchive{buf};
 
     oa << prop;
 
@@ -130,7 +129,7 @@ BOOST_AUTO_TEST_CASE(properties_serialization) {
   }
 
   {
-    auto ia = Utils::MemcpyIArchive{Utils::make_span(buf)};
+    auto ia = Utils::MemcpyIArchive{buf};
     ParticleProperties out;
 
     ia >> out;
@@ -158,7 +157,7 @@ BOOST_AUTO_TEST_CASE(force_serialization) {
 #endif
 
   {
-    auto oa = Utils::MemcpyOArchive{Utils::make_span(buf)};
+    auto oa = Utils::MemcpyOArchive{buf};
 
     oa << pf;
 
@@ -166,7 +165,7 @@ BOOST_AUTO_TEST_CASE(force_serialization) {
   }
 
   {
-    auto ia = Utils::MemcpyIArchive{Utils::make_span(buf)};
+    auto ia = Utils::MemcpyIArchive{buf};
     ParticleForce out;
 
     ia >> out;
@@ -216,7 +215,7 @@ BOOST_AUTO_TEST_CASE(rattle_serialization) {
   auto pr = ParticleRattle{{1, 2, 3}};
 
   {
-    auto oa = Utils::MemcpyOArchive{Utils::make_span(buf)};
+    auto oa = Utils::MemcpyOArchive{buf};
 
     oa << pr;
 
@@ -224,7 +223,7 @@ BOOST_AUTO_TEST_CASE(rattle_serialization) {
   }
 
   {
-    auto ia = Utils::MemcpyIArchive{Utils::make_span(buf)};
+    auto ia = Utils::MemcpyIArchive{buf};
     ParticleRattle out;
 
     ia >> out;
diff --git a/src/core/unit_tests/Verlet_list_test.cpp b/src/core/unit_tests/Verlet_list_test.cpp
index ae2bcdc1cba..4be59cbbeaa 100644
--- a/src/core/unit_tests/Verlet_list_test.cpp
+++ b/src/core/unit_tests/Verlet_list_test.cpp
@@ -77,8 +77,8 @@ struct IntegratorHelper : public ParticleFactory {
   /** Set particle to move along the x-axis. */
   virtual void set_particle_properties(int) const = 0;
   virtual char const *name() const = 0;
-  friend auto operator<<(std::ostream &os, IntegratorHelper const &obj)
-      -> std::ostream & {
+  friend auto operator<<(std::ostream &os,
+                         IntegratorHelper const &obj) -> std::ostream & {
     return os << obj.name();
   }
 };
diff --git a/src/core/unit_tests/cuda_test.cu b/src/core/unit_tests/cuda_test.cu
new file mode 100644
index 00000000000..812638cfa96
--- /dev/null
+++ b/src/core/unit_tests/cuda_test.cu
@@ -0,0 +1,280 @@
+/*
+ * Copyright (C) 2024 The ESPResSo project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define BOOST_TEST_MODULE "CUDA interface tests"
+#define BOOST_TEST_DYN_LINK
+#include <boost/test/unit_test.hpp>
+
+#include "config/config.hpp"
+
+#include "cuda/init.hpp"
+#include "cuda/utils.cuh"
+#include "cuda/utils.hpp"
+#include "errorhandling.hpp"
+
+#include "cuda/CudaHostAllocator.hpp"
+
+#include <cuda.h>
+
+#include <boost/test/unit_test.hpp>
+
+#include <cstddef>
+#include <initializer_list>
+#include <optional>
+#include <string>
+
+/** @brief Test-suite precondition: check that CUDA device 0 exists and
+ *  has a compute capability major version of at least 3. */
+boost::test_tools::assertion_result has_gpu(boost::unit_test::test_unit_id) {
+  int n_devices = 0;
+  // a failed query counts as "no usable GPU", i.e. the precondition is false
+  if (cudaGetDeviceCount(&n_devices) != cudaSuccess or n_devices <= 0) {
+    return false;
+  }
+  // zero-initialized so that a failed query cannot look like a capable device
+  cudaDeviceProp prop{};
+  auto const status = cudaGetDeviceProperties(&prop, 0);
+  return status == cudaSuccess and prop.major >= 3;
+}
+
+std::optional<std::string> read_pending_cuda_errors() {
+  auto const CU_err = cudaGetLastError();
+  if (CU_err != cudaSuccess) {
+    auto const message = std::string(cudaGetErrorString(CU_err));
+    return {"There is a pending CUDA error: \"" + message + "\""};
+  }
+  return std::nullopt;
+}
+
+void setup() {}
+void teardown() {
+  auto error = read_pending_cuda_errors();
+  BOOST_REQUIRE_MESSAGE(not error.has_value(), error.value_or(""));
+}
+
+namespace Testing::non_sticky_cuda_error {
+
+/** @brief Trigger a non-sticky CUDA error for testing purposes. */
+void trigger() { cudaSetDevice(-1); }
+
+/** @brief Clear a non-sticky CUDA error raised by @ref trigger. */
+void clear() {
+  auto const error_code = cudaGetLastError();
+  BOOST_REQUIRE_MESSAGE(error_code == cudaSuccess or
+                            error_code == cudaErrorInvalidDevice,
+                        "An unexpected CUDA error was pending!");
+}
+
+} // namespace Testing::non_sticky_cuda_error
+
+#ifdef P3M
+dim3 p3m_make_grid(unsigned int n_blocks);
+#endif
+
+static auto fixture = boost::unit_test::fixture(&setup, &teardown);
+
+BOOST_AUTO_TEST_SUITE(suite, *boost::unit_test::precondition(has_gpu))
+
+BOOST_AUTO_TEST_CASE(gpu_fixture, *fixture) {
+  auto error1 = read_pending_cuda_errors();
+  BOOST_REQUIRE(not error1.has_value());
+
+  // check we can raise and clear non-sticky CUDA errors
+  Testing::non_sticky_cuda_error::trigger();
+  Testing::non_sticky_cuda_error::clear();
+  auto error2 = read_pending_cuda_errors();
+  BOOST_REQUIRE(not error2.has_value());
+
+  // check fixture can handle the default non-sticky CUDA error
+  Testing::non_sticky_cuda_error::trigger();
+  auto ref_what3{"There is a pending CUDA error: \"invalid device ordinal\""};
+  auto error3 = read_pending_cuda_errors();
+  BOOST_REQUIRE(error3.has_value());
+  BOOST_REQUIRE_EQUAL(error3.value(), ref_what3);
+  // non-sticky error should have been cleared by the previous read
+  error3 = read_pending_cuda_errors();
+  BOOST_REQUIRE(not error3.has_value());
+
+  // check fixture can handle a custom non-sticky CUDA error
+  cudaMallocHost(nullptr, std::size_t(0u));
+  auto ref_what4{"There is a pending CUDA error: \"invalid argument\""};
+  auto error4 = read_pending_cuda_errors();
+  BOOST_REQUIRE(error4.has_value());
+  BOOST_REQUIRE_EQUAL(error4.value(), ref_what4);
+  // non-sticky error should have been cleared by the previous read
+  error4 = read_pending_cuda_errors();
+  BOOST_REQUIRE(not error4.has_value());
+}
+
+static int fatal_error_counter = 0;
+static void increment_counter() noexcept { ++fatal_error_counter; }
+
+BOOST_AUTO_TEST_CASE(gpu_interface, *fixture) {
+  fatal_error_counter = 0;
+  auto local_error_counter = 0;
+
+  try {
+    throw cuda_fatal_error("message");
+  } catch (cuda_fatal_error &err) {
+    std::string const what = "message";
+    BOOST_CHECK_EQUAL(err.what(), what);
+    BOOST_CHECK_EQUAL(err.get_terminate(), &errexit);
+    err.set_terminate(nullptr);
+    BOOST_CHECK_EQUAL(err.get_terminate(), nullptr);
+    err.set_terminate(increment_counter);
+    BOOST_CHECK_EQUAL(err.get_terminate(), &increment_counter);
+    BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter);
+  }
+  ++local_error_counter;
+  BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter);
+
+  // -----------------------
+
+  auto error_caught = false;
+  auto const block = dim3{1, 2, 3};
+  auto const grid = dim3{4, 5, 6};
+  cuda_check_errors_exit(block, grid, "", "", 0u); // should not throw
+  try {
+    Testing::non_sticky_cuda_error::trigger();
+    // should clear the CUDA error flag and throw a fatal error
+    cuda_check_errors_exit(block, grid, "cudaSetDevice()", "filename.cu", 4u);
+  } catch (cuda_fatal_error &err) {
+    error_caught = true;
+    err.set_terminate(increment_counter);
+    std::string const what =
+        "CUDA error: \"invalid device ordinal\" while calling "
+        "cudaSetDevice() with block: <1,2,3>, grid: <4,5,6> in filename.cu:4";
+    BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter);
+    BOOST_CHECK_EQUAL(err.what(), what);
+    BOOST_CHECK_EQUAL(cudaGetLastError(), cudaSuccess);
+  }
+  ++local_error_counter;
+  BOOST_REQUIRE(error_caught);
+  BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter);
+
+  // -----------------------
+
+  error_caught = false;
+  cuda_safe_mem_exit(cudaSuccess, "", 0u); // should not throw
+  try {
+    Testing::non_sticky_cuda_error::trigger();
+    cuda_safe_mem_exit(cudaSuccess, "filename.cu", 4u); // should throw
+  } catch (cuda_fatal_error &err) {
+    error_caught = true;
+    err.set_terminate(increment_counter);
+    std::string const what =
+        "CUDA error: \"invalid device ordinal\" in filename.cu:4. Error "
+        "found during memory operation. Possibly however from a failed "
+        "operation before the memory operation";
+    BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter);
+    BOOST_CHECK_EQUAL(err.what(), what);
+  }
+  ++local_error_counter;
+  BOOST_REQUIRE(error_caught);
+  BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter);
+
+  // -----------------------
+
+  error_caught = false;
+  try {
+    cuda_safe_mem_exit(cudaErrorNotPermitted, "filename.cu", 4u);
+  } catch (cuda_fatal_error &err) {
+    error_caught = true;
+    err.set_terminate(increment_counter);
+    std::string const what = "CUDA error: \"operation not permitted\" during "
+                             "memory operation in filename.cu:4";
+    BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter);
+    BOOST_CHECK_EQUAL(err.what(), what);
+  }
+  ++local_error_counter;
+  BOOST_REQUIRE(error_caught);
+  BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter);
+
+  // -----------------------
+
+  error_caught = false;
+  try {
+    cuda_safe_mem_exit(cudaErrorInvalidValue, "function_name()", 4u);
+  } catch (cuda_fatal_error &err) {
+    error_caught = true;
+    err.set_terminate(increment_counter);
+    std::string const what =
+        "CUDA error: \"invalid argument\" during memory operation in "
+        "function_name():4. You may have tried to allocate zero memory";
+    BOOST_CHECK_EQUAL(fatal_error_counter, local_error_counter);
+    BOOST_CHECK_EQUAL(err.what(), what);
+  }
+  ++local_error_counter;
+  BOOST_REQUIRE(error_caught);
+  BOOST_REQUIRE_EQUAL(fatal_error_counter, local_error_counter);
+
+  // -----------------------
+
+  error_caught = false;
+  BOOST_REQUIRE_EQUAL(stream[0], nullptr);
+  cuda_init(); // allocate
+  BOOST_REQUIRE_NE(stream[0], nullptr);
+  cuda_set_device(0); // reallocate, may or may not result in the same pointer
+  BOOST_REQUIRE_NE(stream[0], nullptr);
+  auto const old_stream = stream[0];
+  try {
+    cuda_set_device(-1); // fail to reallocate, pointer remains the same
+  } catch (cuda_runtime_error_cuda const &err) {
+    error_caught = true;
+    std::string const what = "CUDA error: invalid device ordinal";
+    BOOST_CHECK_EQUAL(err.what(), what);
+  }
+  BOOST_REQUIRE(error_caught);
+  BOOST_REQUIRE_EQUAL(stream[0], old_stream);
+
+  // -----------------------
+
+  BOOST_REQUIRE_GE(cuda_get_n_gpus(), 1);
+  char gpu_name_buffer[260] = {'\0'};
+  cuda_get_gpu_name(0, gpu_name_buffer);
+  for (int i = 255; i < 260; ++i) {
+    BOOST_REQUIRE_EQUAL(gpu_name_buffer[i], '\0');
+  }
+}
+
+#ifdef P3M
+
+BOOST_AUTO_TEST_CASE(p3m_reshape_grid_test, *fixture) {
+  auto constexpr optimal_size = 65536u;
+
+  for (auto cao = 1u; cao <= 3u; ++cao) {
+    auto const n_blocks = cao * optimal_size;
+    auto const grid = p3m_make_grid(n_blocks);
+    BOOST_CHECK_EQUAL(grid.x, optimal_size);
+    BOOST_CHECK_EQUAL(grid.y, cao);
+    BOOST_CHECK_EQUAL(grid.z, 1u);
+  }
+
+  for (auto mul : {2u, 3u, 6u, 12u}) {
+    auto const n_blocks = mul * optimal_size + 1u;
+    auto const grid = p3m_make_grid(n_blocks);
+    BOOST_CHECK_EQUAL(grid.x, n_blocks / (mul + 1u) + ((mul == 2u) ? 0u : 1u));
+    BOOST_CHECK_EQUAL(grid.y, (mul + 1u));
+    BOOST_CHECK_EQUAL(grid.z, 1u);
+  }
+}
+
+#endif
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/core/unit_tests/ek_interface_test.cpp b/src/core/unit_tests/ek_interface_test.cpp
index d70f9a714fa..a151345bcfb 100644
--- a/src/core/unit_tests/ek_interface_test.cpp
+++ b/src/core/unit_tests/ek_interface_test.cpp
@@ -188,6 +188,7 @@ BOOST_AUTO_TEST_CASE(ek_interface_walberla) {
       espresso::ek_container->add(ek_species);
       BOOST_CHECK_THROW(ek.veto_kT(params.kT + 1.), std::runtime_error);
       BOOST_CHECK_THROW(ek.on_boxl_change(), std::runtime_error);
+      BOOST_CHECK_THROW(ek.veto_boxl_change(), std::runtime_error);
       BOOST_CHECK_THROW(ek.veto_time_step(ek.get_tau() * 2.),
                         std::invalid_argument);
       BOOST_CHECK_THROW(ek.veto_time_step(ek.get_tau() / 2.5),
@@ -215,6 +216,7 @@ BOOST_AUTO_TEST_CASE(ek_interface_none) {
     BOOST_CHECK_THROW(ek.propagate(), NoEKActive);
     BOOST_CHECK_THROW(ek.get_tau(), NoEKActive);
     BOOST_CHECK_THROW(ek.sanity_checks(), NoEKActive);
+    BOOST_CHECK_THROW(ek.veto_boxl_change(), NoEKActive);
     BOOST_CHECK_THROW(ek.veto_time_step(0.), NoEKActive);
     BOOST_CHECK_THROW(ek.veto_kT(0.), NoEKActive);
     BOOST_CHECK_THROW(ek.on_cell_structure_change(), NoEKActive);
diff --git a/src/core/unit_tests/fft_test.cpp b/src/core/unit_tests/fft_test.cpp
new file mode 100644
index 00000000000..94cfc33f7fc
--- /dev/null
+++ b/src/core/unit_tests/fft_test.cpp
@@ -0,0 +1,205 @@
+/*
+ * Copyright (C) 2024 The ESPResSo project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define BOOST_TEST_MODULE "FFT utility functions"
+#define BOOST_TEST_DYN_LINK
+
+#include "config/config.hpp"
+
+#if defined(P3M) or defined(DP3M)
+
+#include <boost/test/unit_test.hpp>
+
+#include "fft/fft.hpp"
+#include "fft/vector.hpp"
+#include "p3m/for_each_3d.hpp"
+
+#include <utils/Vector.hpp>
+
+#include <array>
+#include <cstddef>
+#include <limits>
+#include <optional>
+#include <span>
+#include <stdexcept>
+#include <vector>
+
+BOOST_AUTO_TEST_CASE(fft_find_comm_groups_mismatch) {
+  using fft::find_comm_groups;
+  int my_pos[3] = {0};
+  int nodelist[4] = {0};
+  int nodepos[12] = {0};
+  int rank = 0;
+  {
+    auto const optional = find_comm_groups({0, 1, 2}, {1, 2, 3}, nodelist,
+                                           nodelist, nodepos, my_pos, rank);
+    BOOST_CHECK(not optional.has_value());
+  }
+  {
+    auto const optional = find_comm_groups({3, 2, 1}, {2, 3, 1}, nodelist,
+                                           nodelist, nodepos, my_pos, rank);
+    BOOST_CHECK(not optional.has_value());
+  }
+  {
+    auto const optional = find_comm_groups({2, 3, 1}, {3, 2, 1}, nodelist,
+                                           nodelist, nodepos, my_pos, rank);
+    BOOST_CHECK(not optional.has_value());
+  }
+}
+
+BOOST_AUTO_TEST_CASE(fft_map_grid) {
+  using fft::map_3don2d_grid;
+  {
+    auto g3d = Utils::Vector3i{{3, 2, 1}};
+    auto g2d = Utils::Vector3i{{3, 2, 1}};
+    auto ref = g2d;
+    auto dir = map_3don2d_grid(g3d.data(), g2d.data());
+    BOOST_CHECK_EQUAL(dir, 2);
+    BOOST_CHECK_EQUAL(g2d, ref);
+  }
+  {
+    auto g3d = Utils::Vector3i{{2, 1, 6}};
+    auto g2d = Utils::Vector3i{{6, 2, 1}};
+    auto ref = g2d;
+    auto dir = map_3don2d_grid(g3d.data(), g2d.data());
+    BOOST_CHECK_EQUAL(dir, 2);
+    BOOST_CHECK_EQUAL(g2d, ref);
+  }
+  {
+    auto g3d = Utils::Vector3i{{2, 6, 2}};
+    auto g2d = Utils::Vector3i{{6, 2, 6}};
+    auto ref = Utils::Vector3i{{6, 1, 2}};
+    auto dir = map_3don2d_grid(g3d.data(), g2d.data());
+    BOOST_CHECK_EQUAL(dir, 1);
+    BOOST_CHECK_EQUAL(g2d, ref);
+  }
+  {
+    auto g3d = Utils::Vector3i{{3, 6, 6}};
+    auto g2d = Utils::Vector3i{{6, 3, 6}};
+    auto ref = g2d;
+    auto dir = map_3don2d_grid(g3d.data(), g2d.data());
+    BOOST_CHECK_EQUAL(dir, -1);
+    BOOST_CHECK_EQUAL(g2d, ref);
+  }
+  {
+    auto g3d = Utils::Vector3i{{4, 1, 6}};
+    auto g2d = Utils::Vector3i{{6, 4, 1}};
+    auto ref = Utils::Vector3i{{4, 6, 1}};
+    auto dir = map_3don2d_grid(g3d.data(), g2d.data());
+    BOOST_CHECK_EQUAL(dir, 2);
+    BOOST_CHECK_EQUAL(g2d, ref);
+  }
+  {
+    auto g3d = Utils::Vector3i{{5, 7, 7}};
+    auto g2d = Utils::Vector3i{{7, 7, 5}};
+    auto ref = Utils::Vector3i{{1, 7, 7}};
+    auto dir = map_3don2d_grid(g3d.data(), g2d.data());
+    BOOST_CHECK_EQUAL(dir, 0);
+    BOOST_CHECK_EQUAL(g2d, ref);
+  }
+  {
+    auto g3d = Utils::Vector3i{{5, 7, 5}};
+    auto g2d = Utils::Vector3i{{7, 7, 5}};
+    auto ref = g2d;
+    auto dir = map_3don2d_grid(g3d.data(), g2d.data());
+    BOOST_CHECK_EQUAL(dir, -1);
+    BOOST_CHECK_EQUAL(g2d, ref);
+  }
+  {
+    auto g3d = Utils::Vector3i{{4, 5, 6}};
+    auto g2d = Utils::Vector3i{{6, 4, 5}};
+    auto ref = Utils::Vector3i{{4, 1, 6}};
+    auto dir = map_3don2d_grid(g3d.data(), g2d.data());
+    BOOST_CHECK_EQUAL(dir, 1);
+    BOOST_CHECK_EQUAL(g2d, ref);
+  }
+  {
+    auto g3d = Utils::Vector3i{{5, 4, 6}};
+    auto g2d = Utils::Vector3i{{6, 4, 5}};
+    auto ref = Utils::Vector3i{{1, 4, 6}};
+    auto dir = map_3don2d_grid(g3d.data(), g2d.data());
+    BOOST_CHECK_EQUAL(dir, 0);
+    BOOST_CHECK_EQUAL(g2d, ref);
+  }
+  {
+    auto g3d = Utils::Vector3i{{5, 6, 8}};
+    auto g2d = Utils::Vector3i{{8, 7, 5}};
+    auto ref = g2d;
+    auto dir = map_3don2d_grid(g3d.data(), g2d.data());
+    BOOST_CHECK_EQUAL(dir, -1);
+    BOOST_CHECK_EQUAL(g2d, ref);
+  }
+  {
+    auto g3d = Utils::Vector3i{{5, 6, 9}};
+    auto g2d = Utils::Vector3i{{8, 7, 5}};
+    auto ref = g2d;
+    auto dir = map_3don2d_grid(g3d.data(), g2d.data());
+    BOOST_CHECK_EQUAL(dir, -1);
+    BOOST_CHECK_EQUAL(g2d, ref);
+  }
+}
+
+BOOST_AUTO_TEST_CASE(fft_exceptions) {
+  auto constexpr size_max = std::numeric_limits<std::size_t>::max();
+  auto constexpr bad_size = size_max / sizeof(int) + 1ul;
+  fft::allocator<int> allocator{};
+  BOOST_CHECK_EQUAL(allocator.allocate(0ul), nullptr);
+  BOOST_CHECK_THROW(allocator.allocate(bad_size), std::bad_array_new_length);
+}
+
+BOOST_AUTO_TEST_CASE(for_each_3d_test) {
+  auto const m_start = Utils::Vector3i{{0, -1, 3}};
+  auto const m_stop = Utils::Vector3i{{2, 2, 5}};
+  auto ref_loop_counters = m_start;
+  auto indices = Utils::Vector3i{};
+  // kernel: check indices, then advance the reference counters (last dim first)
+  auto const kernel = [&]() {
+    BOOST_REQUIRE_EQUAL(indices, ref_loop_counters);
+    if (++ref_loop_counters[2u] == m_stop[2u]) {
+      ref_loop_counters[2u] = m_start[2u];
+      if (++ref_loop_counters[1u] == m_stop[1u]) {
+        ref_loop_counters[1u] = m_start[1u];
+        if (++ref_loop_counters[0u] == m_stop[0u]) {
+          ref_loop_counters[0u] = m_start[0u];
+        }
+      }
+    }
+  };
+
+  {
+    for_each_3d(m_start, m_stop, indices, kernel, [&](unsigned dim, int n) {
+      BOOST_REQUIRE_GE(dim, 0u);
+      BOOST_REQUIRE_LE(dim, 2u);
+      BOOST_REQUIRE_EQUAL(n, ref_loop_counters[dim]);
+    });
+
+    BOOST_REQUIRE_EQUAL(indices, m_stop);
+    BOOST_REQUIRE_EQUAL(ref_loop_counters, m_start);
+  }
+  {
+    for_each_3d(m_start, m_stop, indices, kernel);
+
+    BOOST_REQUIRE_EQUAL(indices, m_stop);
+    BOOST_REQUIRE_EQUAL(ref_loop_counters, m_start);
+  }
+}
+
+#else  // defined(P3M) or defined(DP3M)
+int main(int argc, char **argv) {}
+#endif // defined(P3M) or defined(DP3M)
diff --git a/src/core/unit_tests/grid_test.cpp b/src/core/unit_tests/grid_test.cpp
index 1694ba57784..41b7a10b215 100644
--- a/src/core/unit_tests/grid_test.cpp
+++ b/src/core/unit_tests/grid_test.cpp
@@ -38,59 +38,35 @@ BOOST_AUTO_TEST_CASE(get_mi_coord_test) {
   auto const box_l = 3.1415;
 
   /* Non-periodic */
-  {{auto const a = 1.;
-  auto const b = 2.;
-
-  BOOST_CHECK_EQUAL(get_mi_coord(a, b, box_l, /* periodic */ false), a - b);
-}
-
-{
-  auto const a = 1.;
-  auto const b = 3.;
-
-  BOOST_CHECK_EQUAL(get_mi_coord(a, b, box_l, /* periodic */ false), a - b);
-}
-}
+  {
+    {
+      auto const a = 1.;
+      auto const b = 2.;
 
-/* Regular distance */
-{
-  auto const a = -0.5;
-  auto const b = +1.0;
+      BOOST_CHECK_EQUAL(get_mi_coord(a, b, box_l, /* periodic */ false), a - b);
+    }
 
-  BOOST_CHECK_EQUAL(get_mi_coord(a, b, box_l, /* periodic */ true), a - b);
-  BOOST_CHECK_EQUAL(get_mi_coord(b, a, box_l, /* periodic */ true), b - a);
-}
+    {
+      auto const a = 1.;
+      auto const b = 3.;
 
-/* Wrapped */
-{
-  auto const a = 1.;
-  auto const b = 3.;
-
-  BOOST_CHECK_SMALL(std::abs(get_mi_coord(a, b, box_l, /* periodic */ true) -
-                             (a - b) - box_l),
-                    epsilon<double>);
-  BOOST_CHECK_SMALL(std::abs(get_mi_coord(b, a, box_l, /* periodic */ true) -
-                             (b - a) + box_l),
-                    epsilon<double>);
-}
+      BOOST_CHECK_EQUAL(get_mi_coord(a, b, box_l, /* periodic */ false), a - b);
+    }
+  }
 
-/* Corner cases */
-{
+  /* Regular distance */
   {
-    auto const a = 0.4;
-    auto const b = a + 0.5 * box_l;
+    auto const a = -0.5;
+    auto const b = +1.0;
 
-    BOOST_CHECK_SMALL(std::abs(get_mi_coord(a, b, box_l, /* periodic */ true) -
-                               (a - b) - box_l),
-                      epsilon<double>);
-    BOOST_CHECK_SMALL(std::abs(get_mi_coord(b, a, box_l, /* periodic */ true) -
-                               (b - a) + box_l),
-                      epsilon<double>);
+    BOOST_CHECK_EQUAL(get_mi_coord(a, b, box_l, /* periodic */ true), a - b);
+    BOOST_CHECK_EQUAL(get_mi_coord(b, a, box_l, /* periodic */ true), b - a);
   }
 
+  /* Wrapped */
   {
-    auto const a = 0.4;
-    auto const b = std::nextafter(a + 0.5 * box_l, box_l);
+    auto const a = 1.;
+    auto const b = 3.;
 
     BOOST_CHECK_SMALL(std::abs(get_mi_coord(a, b, box_l, /* periodic */ true) -
                                (a - b) - box_l),
@@ -100,19 +76,49 @@ BOOST_AUTO_TEST_CASE(get_mi_coord_test) {
                       epsilon<double>);
   }
 
+  /* Corner cases */
   {
-    auto const a = 0.4;
-    auto const b = std::nextafter(a + 0.5 * box_l, 0.);
-
-    BOOST_CHECK_SMALL(
-        std::abs(get_mi_coord(a, b, box_l, /* periodic */ true) - (a - b)),
-        epsilon<double>);
-    BOOST_CHECK_SMALL(
-        std::abs(get_mi_coord(b, a, box_l, /* periodic */ true) - (b - a)),
-        epsilon<double>);
+    {
+      auto const a = 0.4;
+      auto const b = a + 0.5 * box_l;
+
+      BOOST_CHECK_SMALL(
+          std::abs(get_mi_coord(a, b, box_l, /* periodic */ true) - (a - b) -
+                   box_l),
+          epsilon<double>);
+      BOOST_CHECK_SMALL(
+          std::abs(get_mi_coord(b, a, box_l, /* periodic */ true) - (b - a) +
+                   box_l),
+          epsilon<double>);
+    }
+
+    {
+      auto const a = 0.4;
+      auto const b = std::nextafter(a + 0.5 * box_l, box_l);
+
+      BOOST_CHECK_SMALL(
+          std::abs(get_mi_coord(a, b, box_l, /* periodic */ true) - (a - b) -
+                   box_l),
+          epsilon<double>);
+      BOOST_CHECK_SMALL(
+          std::abs(get_mi_coord(b, a, box_l, /* periodic */ true) - (b - a) +
+                   box_l),
+          epsilon<double>);
+    }
+
+    {
+      auto const a = 0.4;
+      auto const b = std::nextafter(a + 0.5 * box_l, 0.);
+
+      BOOST_CHECK_SMALL(
+          std::abs(get_mi_coord(a, b, box_l, /* periodic */ true) - (a - b)),
+          epsilon<double>);
+      BOOST_CHECK_SMALL(
+          std::abs(get_mi_coord(b, a, box_l, /* periodic */ true) - (b - a)),
+          epsilon<double>);
+    }
   }
 }
-}
 
 BOOST_AUTO_TEST_CASE(get_mi_vector_test) {
   using detail::get_mi_coord;
@@ -121,14 +127,14 @@ BOOST_AUTO_TEST_CASE(get_mi_vector_test) {
   Vector3d box_l = {1., 2., 3.};
   BoxGeometry box;
   box.set_length(box_l);
-  box.set_periodic(1, false);
+  box.set_periodic(1u, false);
 
   auto const a = Vector3d{1.1, 12.2, -13.4};
   auto const b = Vector3d{-0.9, 8.8, 21.1};
 
   auto const result = box.get_mi_vector(a, b);
 
-  for (int i = 0; i < 3; i++) {
+  for (auto i = 0u; i < 3u; i++) {
     auto const expected = get_mi_coord(a[i], b[i], box_l[i], box.periodic(i));
 
     BOOST_CHECK_SMALL(std::abs(expected - result[i]), epsilon<double>);
@@ -142,7 +148,7 @@ BOOST_AUTO_TEST_CASE(lees_edwards_mi_vector) {
   Vector3d box_l = {5., 2., 3.};
   BoxGeometry box;
   box.set_length(box_l);
-  box.set_periodic(1, false);
+  box.set_periodic(1u, false);
   box.set_type(BoxType::LEES_EDWARDS);
   LeesEdwardsBC le{0., 0., 2, 0};
   box.set_lees_edwards_bc(le);
@@ -170,7 +176,7 @@ BOOST_AUTO_TEST_CASE(lees_edwards_mi_vector) {
 
     auto const result = box.get_mi_vector(a, b);
 
-    for (int i = 0; i < 3; i++) {
+    for (auto i = 0u; i < 3u; i++) {
       auto expected = get_mi_coord(a[i], b[i], box_l[i], box.periodic(i));
       if (i == le.shear_direction) {
         expected -= le.pos_offset = 1.;
@@ -198,9 +204,9 @@ BOOST_AUTO_TEST_CASE(lees_edwards_mi_vector) {
 
   // Test a case where coordinate different + LE offset in shift dir > box_l/2
   box.set_type(BoxType::LEES_EDWARDS);
-  box.set_periodic(0, true);
-  box.set_periodic(1, true);
-  box.set_periodic(2, true);
+  box.set_periodic(0u, true);
+  box.set_periodic(1u, true);
+  box.set_periodic(2u, true);
   box.set_length(Vector3d{5., 5., 5.});
   box.set_lees_edwards_bc(LeesEdwardsBC{2.98, 0., 0, 1});
   auto const result =
@@ -248,9 +254,9 @@ BOOST_AUTO_TEST_CASE(fold_position_test) {
   auto const box_l = Utils::Vector3d{2., 4., 6.};
   auto box = BoxGeometry();
   box.set_length(box_l);
-  box.set_periodic(0, true);
-  box.set_periodic(1, true);
-  box.set_periodic(2, false);
+  box.set_periodic(0u, true);
+  box.set_periodic(1u, true);
+  box.set_periodic(2u, false);
 
   /* Wrapped */
   {
diff --git a/src/core/unit_tests/lb_particle_coupling_test.cpp b/src/core/unit_tests/lb_particle_coupling_test.cpp
index 536d9101a39..95c42441916 100644
--- a/src/core/unit_tests/lb_particle_coupling_test.cpp
+++ b/src/core/unit_tests/lb_particle_coupling_test.cpp
@@ -56,7 +56,6 @@ namespace utf = boost::unit_test;
 #include <utils/math/sqr.hpp>
 
 #include <boost/mpi.hpp>
-#include <boost/serialization/optional.hpp>
 
 #include <array>
 #include <cassert>
@@ -104,8 +103,8 @@ static auto make_lb_actor() {
   lb_params = std::make_shared<LB::LBWalberlaParams>(params.agrid, params.tau);
   lb_lattice = std::make_shared<LatticeWalberla>(
       params.grid_dimensions, ::communicator.node_grid, n_ghost_layers);
-  lb_fluid = new_lb_walberla(lb_lattice, params.viscosity, params.density,
-                             single_precision);
+  lb_fluid = new_lb_walberla_cpu(lb_lattice, params.viscosity, params.density,
+                                 single_precision);
   lb_fluid->set_collision_model(params.kT, params.seed);
   lb_fluid->ghost_communication();
 }
@@ -175,13 +174,12 @@ BOOST_AUTO_TEST_CASE(rng) {
   auto &thermostat = *espresso::system->thermostat->lb;
   auto const &box_geo = *espresso::system->box_geo;
   auto const &local_box = *espresso::system->local_geo;
-  auto const tau = params.time_step;
   thermostat.rng_initialize(17u);
   thermostat.set_rng_counter(11ul);
   thermostat.gamma = 0.2;
   espresso::set_lb_kT(1.);
 
-  LB::ParticleCoupling coupling{thermostat, lb, box_geo, local_box, tau};
+  LB::ParticleCoupling coupling{thermostat, lb, box_geo, local_box};
   BOOST_CHECK_EQUAL(thermostat.rng_seed(), 17u);
   BOOST_CHECK_EQUAL(thermostat.rng_counter(), 11ul);
   BOOST_CHECK(not thermostat.is_seed_required());
@@ -209,30 +207,12 @@ BOOST_AUTO_TEST_CASE(rng) {
 
   espresso::set_lb_kT(0.);
   LB::ParticleCoupling coupling_unthermalized{thermostat, lb, box_geo,
-                                              local_box, tau};
+                                              local_box};
   auto const step3_norandom =
       coupling_unthermalized.get_noise_term(test_partcl_2);
   BOOST_CHECK((step3_norandom == Utils::Vector3d{0., 0., 0.}));
 }
 
-BOOST_AUTO_TEST_CASE(drift_vel_offset) {
-  Particle p{};
-  auto &lb = espresso::system->lb;
-  auto &thermostat = *espresso::system->thermostat->lb;
-  auto const &box_geo = *espresso::system->box_geo;
-  auto const &local_box = *espresso::system->local_geo;
-  LB::ParticleCoupling coupling{thermostat, lb, box_geo, local_box,
-                                params.time_step};
-  BOOST_CHECK_EQUAL(coupling.lb_drift_velocity_offset(p).norm(), 0);
-  Utils::Vector3d expected{};
-#ifdef LB_ELECTROHYDRODYNAMICS
-  p.mu_E() = Utils::Vector3d{-2., 1.5, 1.};
-  expected += p.mu_E();
-#endif
-  BOOST_CHECK_SMALL((coupling.lb_drift_velocity_offset(p) - expected).norm(),
-                    eps);
-}
-
 BOOST_DATA_TEST_CASE(drag_force, bdata::make(kTs), kT) {
   espresso::set_lb_kT(kT);
   auto &lb = espresso::system->lb;
@@ -241,12 +221,17 @@ BOOST_DATA_TEST_CASE(drag_force, bdata::make(kTs), kT) {
   p.v() = {-2.5, 1.5, 2.};
   p.pos() = espresso::lb_fluid->get_lattice().get_local_domain().first;
   thermostat.gamma = 0.2;
-  Utils::Vector3d drift_offset{-1., 1., 1.};
+#ifdef LB_ELECTROHYDRODYNAMICS
+  p.mu_E() = Utils::Vector3d{-1., 1., 1.};
+#endif
 
   // Drag force in quiescent fluid
   {
-    auto const observed = lb_drag_force(lb, 0.2, p, p.pos(), drift_offset);
-    const Utils::Vector3d expected{0.3, -0.1, -.2};
+    auto const observed = lb_drag_force(lb, 0.2, p, p.pos());
+    Utils::Vector3d expected{0.5, -0.3, -0.4};
+#ifdef LB_ELECTROHYDRODYNAMICS
+    expected += thermostat.gamma * p.mu_E();
+#endif
     BOOST_CHECK_SMALL((observed - expected).norm(), eps);
   }
 }
@@ -269,9 +254,8 @@ BOOST_DATA_TEST_CASE(swimmer_force, bdata::make(kTs), kT) {
   // swimmer coupling
   {
     if (in_local_halo(local_box, p.pos(), params.agrid)) {
-      LB::ParticleCoupling coupling{thermostat, lb, box_geo, local_box,
-                                    params.time_step};
-      coupling.kernel(p);
+      LB::ParticleCoupling coupling{thermostat, lb, box_geo, local_box};
+      coupling.kernel({&p});
       auto const interpolated = LB::get_force_to_be_applied(p.pos());
       auto const expected =
           params.force_md_to_lb(Utils::Vector3d{0., 0., p.swimming().f_swim});
@@ -303,8 +287,8 @@ BOOST_DATA_TEST_CASE(swimmer_force, bdata::make(kTs), kT) {
   // remove force of the particle from the fluid
   {
     if (in_local_halo(local_box, p.pos(), params.agrid)) {
-      add_md_force(lb, p.pos(), -Utils::Vector3d{0., 0., p.swimming().f_swim},
-                   params.time_step);
+      lb.add_force_density(p.pos(),
+                           -Utils::Vector3d{0., 0., p.swimming().f_swim});
       auto const reset = LB::get_force_to_be_applied(p.pos());
       BOOST_REQUIRE_SMALL(reset.norm(), eps);
     }
@@ -323,8 +307,7 @@ BOOST_DATA_TEST_CASE(particle_coupling, bdata::make(kTs), kT) {
       espresso::lb_fluid->get_lattice().get_local_domain().first;
   thermostat.gamma = 0.2;
   Particle p{};
-  LB::ParticleCoupling coupling{thermostat, lb, box_geo, local_box,
-                                params.time_step};
+  LB::ParticleCoupling coupling{thermostat, lb, box_geo, local_box};
   auto expected = coupling.get_noise_term(p);
 #ifdef LB_ELECTROHYDRODYNAMICS
   p.mu_E() = Utils::Vector3d{-2., 1.5, 1.};
@@ -335,7 +318,7 @@ BOOST_DATA_TEST_CASE(particle_coupling, bdata::make(kTs), kT) {
   // coupling
   {
     if (in_local_halo(local_box, p.pos(), params.agrid)) {
-      coupling.kernel(p);
+      coupling.kernel({&p});
       BOOST_CHECK_SMALL((p.force() - expected).norm(), eps);
 
       auto const interpolated = -LB::get_force_to_be_applied(p.pos());
@@ -347,7 +330,7 @@ BOOST_DATA_TEST_CASE(particle_coupling, bdata::make(kTs), kT) {
   // remove force of the particle from the fluid
   {
     if (in_local_halo(local_box, p.pos(), params.agrid)) {
-      add_md_force(lb, p.pos(), -expected, params.time_step);
+      lb.add_force_density(p.pos(), -expected);
     }
   }
 }
@@ -373,6 +356,7 @@ BOOST_DATA_TEST_CASE_F(CleanupActorLB, coupling_particle_lattice_ia,
   create_particle({box_l[0] / 2. - skin * 2., skin * 2., skin * 2.}, 0, 0);
 
   // sanity checks
+  BOOST_REQUIRE_EQUAL(lb.is_gpu(), false);
   BOOST_REQUIRE_EQUAL(get_particle_node_parallel(pid), rank ? -1 : 0);
   BOOST_REQUIRE_EQUAL(
       ErrorHandling::mpi_gather_runtime_errors_all(rank == 0).size(), 0);
@@ -385,8 +369,7 @@ BOOST_DATA_TEST_CASE_F(CleanupActorLB, coupling_particle_lattice_ia,
   set_particle_property(pid, &Particle::mu_E, Utils::Vector3d{-2., 1.5, 1.});
 #endif
 
-  LB::ParticleCoupling coupling{thermostat, lb, box_geo, local_box,
-                                params.time_step};
+  LB::ParticleCoupling coupling{thermostat, lb, box_geo, local_box};
   auto const p_opt = copy_particle_to_head_node(comm, system, pid);
   auto expected = Utils::Vector3d{};
   if (rank == 0) {
@@ -459,7 +442,7 @@ BOOST_DATA_TEST_CASE_F(CleanupActorLB, coupling_particle_lattice_ia,
     // check without LB coupling
     {
       system.thermostat->lb_coupling_deactivate();
-      system.lb_couple_particles(params.time_step);
+      system.lb_couple_particles();
       auto const p_opt = copy_particle_to_head_node(comm, system, pid);
       if (rank == 0) {
         auto const &p = *p_opt;
@@ -480,7 +463,7 @@ BOOST_DATA_TEST_CASE_F(CleanupActorLB, coupling_particle_lattice_ia,
         }
       }
       // couple particle to LB
-      system.lb_couple_particles(params.time_step);
+      system.lb_couple_particles();
       {
         auto const p_opt = copy_particle_to_head_node(comm, system, pid);
         if (rank == 0) {
@@ -495,7 +478,7 @@ BOOST_DATA_TEST_CASE_F(CleanupActorLB, coupling_particle_lattice_ia,
       }
       // remove force of the particle from the fluid
       set_particle_property(pid, &Particle::force, Utils::Vector3d{});
-      add_md_force(lb, p_pos, -expected, params.time_step);
+      lb.add_force_density(p_pos, -expected);
     }
   }
 
@@ -520,6 +503,7 @@ BOOST_AUTO_TEST_CASE(runtime_exceptions) {
   // LB prevents changing most of the system state
   {
     BOOST_CHECK_THROW(lb.on_boxl_change(), std::runtime_error);
+    BOOST_CHECK_THROW(lb.veto_boxl_change(), std::runtime_error);
     BOOST_CHECK_THROW(lb.veto_time_step(lb.get_tau() * 2.),
                       std::invalid_argument);
     BOOST_CHECK_THROW(lb.veto_time_step(lb.get_tau() / 2.5),
@@ -552,7 +536,7 @@ bool test_lb_domain_mismatch_local() {
   ::communicator.node_grid = node_grid_reversed;
   auto const lattice = std::make_shared<LatticeWalberla>(
       Utils::Vector3i{12, 12, 12}, node_grid_original, n_ghost_layers);
-  auto const ptr = new_lb_walberla(lattice, 1.0, 1.0, false);
+  auto const ptr = new_lb_walberla_cpu(lattice, 1.0, 1.0, false);
   ptr->set_collision_model(0.0, 0);
   ::communicator.node_grid = node_grid_original;
   auto lb_instance = std::make_shared<LB::LBWalberla>(ptr, params);
@@ -573,16 +557,16 @@ BOOST_AUTO_TEST_CASE(lb_exceptions) {
   auto &lb = espresso::system->lb;
   // LB exceptions mechanism
   {
-    using std::exception;
     // getters and setters
-    BOOST_CHECK_THROW(lb.get_agrid(), exception);
-    BOOST_CHECK_THROW(lb.get_tau(), exception);
-    BOOST_CHECK_THROW(lb.get_kT(), exception);
-    BOOST_CHECK_THROW(lb.get_pressure_tensor(), exception);
+    BOOST_CHECK_THROW(lb.is_gpu(), std::exception);
+    BOOST_CHECK_THROW(lb.get_agrid(), std::exception);
+    BOOST_CHECK_THROW(lb.get_tau(), std::exception);
+    BOOST_CHECK_THROW(lb.get_kT(), std::exception);
+    BOOST_CHECK_THROW(lb.get_pressure_tensor(), std::exception);
     BOOST_CHECK_THROW(LB::get_force_to_be_applied({-10., -10., -10.}),
                       std::runtime_error);
     // coupling, interpolation, boundaries
-    BOOST_CHECK_THROW(lb.get_momentum(), exception);
+    BOOST_CHECK_THROW(lb.get_momentum(), std::exception);
   }
 
   // waLBerla and ESPResSo must agree on domain decomposition
@@ -612,12 +596,14 @@ BOOST_AUTO_TEST_CASE(lb_exceptions) {
     auto const vec = Utils::Vector3d{};
     auto lb_impl = std::make_shared<LB::LBNone>();
     lb.set<LB::LBNone>(lb_impl);
+    BOOST_CHECK_THROW(lb.is_gpu(), NoLBActive);
     BOOST_CHECK_THROW(lb.get_agrid(), NoLBActive);
     BOOST_CHECK_THROW(lb.get_tau(), NoLBActive);
     BOOST_CHECK_THROW(lb.get_kT(), NoLBActive);
     BOOST_CHECK_THROW(lb.get_pressure_tensor(), NoLBActive);
     BOOST_CHECK_THROW(lb.get_momentum(), NoLBActive);
     BOOST_CHECK_THROW(lb.sanity_checks(), NoLBActive);
+    BOOST_CHECK_THROW(lb.veto_boxl_change(), NoLBActive);
     BOOST_CHECK_THROW(lb.veto_time_step(0.), NoLBActive);
     BOOST_CHECK_THROW(lb.veto_kT(0.), NoLBActive);
     BOOST_CHECK_THROW(lb.lebc_sanity_checks(0u, 1u), NoLBActive);
@@ -630,6 +616,8 @@ BOOST_AUTO_TEST_CASE(lb_exceptions) {
     BOOST_CHECK_THROW(lb_impl->get_density_at_pos(vec, true), NoLBActive);
     BOOST_CHECK_THROW(lb_impl->get_velocity_at_pos(vec, true), NoLBActive);
     BOOST_CHECK_THROW(lb_impl->add_force_at_pos(vec, vec), NoLBActive);
+    BOOST_CHECK_THROW(lb_impl->add_forces_at_pos({}, {}), NoLBActive);
+    BOOST_CHECK_THROW(lb_impl->get_velocities_at_pos({}), NoLBActive);
     lb.reset();
   }
 }
diff --git a/src/core/unit_tests/lees_edwards_test.cpp b/src/core/unit_tests/lees_edwards_test.cpp
index c82c74c1e06..e771d8e1e3a 100644
--- a/src/core/unit_tests/lees_edwards_test.cpp
+++ b/src/core/unit_tests/lees_edwards_test.cpp
@@ -28,8 +28,6 @@
 
 #include <utils/Vector.hpp>
 
-#include <boost/range/algorithm/equal.hpp>
-
 #include <algorithm>
 #include <cmath>
 #include <limits>
diff --git a/src/core/unit_tests/link_cell_test.cpp b/src/core/unit_tests/link_cell_test.cpp
index 475580a948e..413c7a27f1a 100644
--- a/src/core/unit_tests/link_cell_test.cpp
+++ b/src/core/unit_tests/link_cell_test.cpp
@@ -30,9 +30,9 @@
 #include <vector>
 
 BOOST_AUTO_TEST_CASE(link_cell) {
-  const unsigned n_cells = 10;
-  const auto n_part_per_cell = 10;
-  const auto n_part = n_cells * n_part_per_cell;
+  auto const n_cells = 10u;
+  auto const n_part_per_cell = 10u;
+  auto const n_part = n_cells * n_part_per_cell;
 
   std::vector<Cell> cells(n_cells);
 
@@ -55,7 +55,7 @@ BOOST_AUTO_TEST_CASE(link_cell) {
   }
 
   std::vector<std::pair<int, int>> lc_pairs;
-  lc_pairs.reserve((n_part * (n_part - 1)) / 2);
+  lc_pairs.reserve((n_part * (n_part - 1u)) / 2u);
 
   Algorithm::link_cell(cells.begin(), cells.end(),
                        [&lc_pairs](Particle const &p1, Particle const &p2) {
@@ -63,11 +63,11 @@ BOOST_AUTO_TEST_CASE(link_cell) {
                            lc_pairs.emplace_back(p1.id(), p2.id());
                        });
 
-  BOOST_CHECK(lc_pairs.size() == (n_part * (n_part - 1) / 2));
+  BOOST_CHECK(lc_pairs.size() == (n_part * (n_part - 1u) / 2u));
 
   auto it = lc_pairs.begin();
-  for (int i = 0; i < n_part; i++)
-    for (int j = i + 1; j < n_part; j++) {
+  for (auto i = 0; i < static_cast<int>(n_part); i++)
+    for (auto j = i + 1; j < static_cast<int>(n_part); j++) {
       BOOST_CHECK((it->first == i) && (it->second == j));
       ++it;
     }
diff --git a/src/core/unit_tests/math_test.cpp b/src/core/unit_tests/math_test.cpp
new file mode 100644
index 00000000000..46a638800f1
--- /dev/null
+++ b/src/core/unit_tests/math_test.cpp
@@ -0,0 +1,107 @@
+/*
+ * Copyright (C) 2017-2024 The ESPResSo project
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define BOOST_TEST_MODULE "Special mathematical functions"
+#define BOOST_TEST_DYN_LINK
+#include <boost/test/unit_test.hpp>
+
+#include "p3m/math.hpp"
+
+#include <utils/math/sqr.hpp>
+
+#include <boost/math/special_functions/bernoulli.hpp>
+#include <boost/math/special_functions/factorials.hpp>
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <numbers>
+#include <numeric>
+#include <ranges>
+#include <stdexcept>
+#include <type_traits>
+
+/** @brief n-th non-zero Taylor coefficient of @c tan(x), i.e. of x^(2n-1). */
+static auto taylor_series_tangent(int n) {
+  auto const pow4n = std::pow(2., 2 * n);
+  auto const prefactor = std::pow(-1., n - 1) * pow4n * (pow4n - 1.);
+  auto const bernoulli = boost::math::bernoulli_b2n<double>(n);
+  return prefactor * bernoulli / boost::math::factorial<double>(2 * n);
+}
+
+/** @brief n-th Taylor coefficient of @c sinc(x), in powers of (pi x)^2. */
+static auto taylor_series_sinc(int n) {
+  return ((n % 2 == 0) ? 1. : -1.) / std::tgamma(2 * (n + 1));
+}
+
+BOOST_AUTO_TEST_CASE(abs_test) {
+  static_assert(std::is_same_v<decltype(math::abs(1.f)), float>);
+  static_assert(std::is_same_v<decltype(math::abs(1.)), double>);
+  // the argument type is preserved (see above); values must match std::abs
+  BOOST_CHECK_EQUAL(math::abs(-3.1415f), std::abs(-3.1415f));
+  BOOST_CHECK_EQUAL(math::abs(+3.1415f), std::abs(+3.1415f));
+  BOOST_CHECK_EQUAL(math::abs(-3.1415), std::abs(-3.1415));
+  BOOST_CHECK_EQUAL(math::abs(+3.1415), std::abs(+3.1415));
+}
+
+BOOST_AUTO_TEST_CASE(sinc_test) {
+  // edge case
+  BOOST_CHECK_EQUAL(math::sinc(0.0), 1.0);
+
+  // check against standard math functions on a fixed grid of 11 points;
+  // an integer counter avoids accumulating rounding errors in the abscissa
+  for (auto i = 0; i <= 10; ++i) {
+    auto const x = 0.001 + 0.01 * i;
+    auto const approx = math::sinc(x);
+    auto const pi_x = std::numbers::pi * x;
+    auto const exact = std::sin(pi_x) / (pi_x);
+    BOOST_CHECK_SMALL(approx - exact, 1e-13);
+  }
+
+  // check Taylor expansion (Horner scheme in (pi x)^2, highest order first)
+  auto const series = std::views::iota(0, 5) | std::views::reverse |
+                      std::views::transform(taylor_series_sinc);
+  for (auto const x : {1e-6, 1e-5, 1e-4, 1e-3, 1e-2}) {
+    auto const pix2 = Utils::sqr(std::numbers::pi * x);
+    auto const ref = std::accumulate(
+        series.begin(), series.end(), 0.,
+        [pix2](auto const &acc, auto const &c) { return acc * pix2 + c; });
+    BOOST_CHECK_SMALL(math::sinc(x) - ref, 1e-13);
+  }
+}
+
+BOOST_AUTO_TEST_CASE(analytic_cotangent_sum_test) {
+  // 8 machine epsilons, scaled by 100 since BOOST_CHECK_CLOSE takes percents
+  auto constexpr tol = 100. * (8. * std::numeric_limits<double>::epsilon());
+  // evaluate the kernel generated for a given cao
+  auto const alias_sum = [](int cao, int n, double mesh_i) {
+    return math::get_analytic_cotangent_sum_kernel(cao)(n, mesh_i);
+  };
+
+  for (auto cao = 1; cao < 8; ++cao) {
+    // edge case: at theta = 0, aliasing sums are unity
+    BOOST_CHECK_CLOSE(alias_sum(cao, 0, 0.), 1., tol);
+    // edge case: at theta = pi / 2, aliasing sums are equal to the cao-th
+    // term in the tangent Taylor series
+    BOOST_CHECK_CLOSE(alias_sum(cao, 1, 0.5), taylor_series_tangent(cao), tol);
+  }
+
+  // check assertion
+  for (auto const invalid_cao : {-1, 0, 8}) {
+    BOOST_CHECK_THROW(alias_sum(invalid_cao, 1, 0.), std::logic_error);
+  }
+}
diff --git a/src/core/unit_tests/mock/Cell.hpp b/src/core/unit_tests/mock/Cell.hpp
index 810895fcc31..61a6f121138 100644
--- a/src/core/unit_tests/mock/Cell.hpp
+++ b/src/core/unit_tests/mock/Cell.hpp
@@ -19,15 +19,15 @@
 #ifndef CORE_UNIT_TESTS_MOCK_CELL_HPP
 #define CORE_UNIT_TESTS_MOCK_CELL_HPP
 
-#include <utils/Span.hpp>
-
+#include <span>
+#include <utility>
 #include <vector>
 
 namespace Testing {
 template <typename Particle> class Cell {
 public:
-  auto particles() { return Utils::make_span(part); }
-  auto particles() const { return Utils::make_const_span(part); }
+  auto particles() { return std::span(part); }
+  auto particles() const { return std::span(std::as_const(part)); }
 
   std::vector<Particle> part;
 };
diff --git a/src/core/unit_tests/p3m_test.cpp b/src/core/unit_tests/p3m_test.cpp
index 0bc343de094..3f091d565ce 100644
--- a/src/core/unit_tests/p3m_test.cpp
+++ b/src/core/unit_tests/p3m_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 The ESPResSo project
+ * Copyright (C) 2020-2022 The ESPResSo project
  *
  * This file is part of ESPResSo.
  *
@@ -17,7 +17,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#define BOOST_TEST_MODULE p3m test
+#define BOOST_TEST_MODULE "P3M utility functions"
 #define BOOST_TEST_DYN_LINK
 #include <boost/test/unit_test.hpp>
 
@@ -27,8 +27,6 @@
 
 #include <array>
 #include <cstddef>
-#include <limits>
-#include <stdexcept>
 #include <vector>
 
 BOOST_AUTO_TEST_CASE(calc_meshift_false) {
@@ -39,8 +37,8 @@ BOOST_AUTO_TEST_CASE(calc_meshift_false) {
   auto const mesh = Utils::Vector3i{{1, 4, 7}};
   auto const val = detail::calc_meshift(mesh, false);
 
-  for (std::size_t i = 0; i < 3; ++i) {
-    for (std::size_t j = 0; j < ref[i].size(); ++j) {
+  for (std::size_t i = 0u; i < 3u; ++i) {
+    for (std::size_t j = 0u; j < ref[i].size(); ++j) {
       BOOST_CHECK_EQUAL(val[i][j], ref[i][j]);
     }
   }
@@ -54,33 +52,9 @@ BOOST_AUTO_TEST_CASE(calc_meshift_true) {
   auto const mesh = Utils::Vector3i{{1, 4, 7}};
   auto const val = detail::calc_meshift(mesh, true);
 
-  for (std::size_t i = 0; i < 3; ++i) {
-    for (std::size_t j = 0; j < ref[i].size(); ++j) {
+  for (std::size_t i = 0u; i < 3u; ++i) {
+    for (std::size_t j = 0u; j < ref[i].size(); ++j) {
       BOOST_CHECK_EQUAL(val[i][j], ref[i][j]);
     }
   }
 }
-
-#if defined(P3M) || defined(DP3M)
-BOOST_AUTO_TEST_CASE(analytic_cotangent_sum) {
-  auto constexpr kernel = p3m_analytic_cotangent_sum;
-  auto constexpr tol = 8. * 100. * std::numeric_limits<double>::epsilon();
-
-  // check only trivial cases
-  for (auto const cao : {1, 2, 3, 4, 5, 6, 7}) {
-    BOOST_CHECK_CLOSE(kernel(0, 0., cao), 1., tol);
-  }
-  BOOST_CHECK_CLOSE(kernel(1, 0.5, 1), 1., tol);
-  BOOST_CHECK_CLOSE(kernel(1, 0.5, 2), 1. / 3., tol);
-  BOOST_CHECK_CLOSE(kernel(1, 0.5, 3), 2. / 15., tol);
-  BOOST_CHECK_CLOSE(kernel(1, 0.5, 4), 17. / 315., tol);
-  BOOST_CHECK_CLOSE(kernel(1, 0.5, 5), 62. / 2835., tol);
-  BOOST_CHECK_CLOSE(kernel(1, 0.5, 6), 1382. / 155925., tol);
-  BOOST_CHECK_CLOSE(kernel(1, 0.5, 7), 21844. / 6081075., tol);
-
-  // check assertion
-  for (auto const invalid_cao : {-1, 0, 8}) {
-    BOOST_CHECK_THROW(kernel(1, 0., invalid_cao), std::logic_error);
-  }
-}
-#endif // defined(P3M) || defined(DP3M)
diff --git a/src/core/unit_tests/particle_management.hpp b/src/core/unit_tests/particle_management.hpp
index 2bac8e0d1e8..a104cca5cc3 100644
--- a/src/core/unit_tests/particle_management.hpp
+++ b/src/core/unit_tests/particle_management.hpp
@@ -25,11 +25,12 @@
 #include "system/System.hpp"
 
 #include <boost/mpi/communicator.hpp>
-#include <boost/optional.hpp>
+
+#include <optional>
 
 inline auto copy_particle_to_head_node(boost::mpi::communicator const &comm,
                                        System::System &system, int p_id) {
-  boost::optional<Particle> result{};
+  std::optional<Particle> result{};
   auto p = system.cell_structure->get_local_particle(p_id);
   if (p and not p->is_ghost()) {
     if (comm.rank() == 0) {
diff --git a/src/core/unit_tests/random_test.hpp b/src/core/unit_tests/random_test.hpp
index 93912c7b926..bd03e6b7792 100644
--- a/src/core/unit_tests/random_test.hpp
+++ b/src/core/unit_tests/random_test.hpp
@@ -16,8 +16,9 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef CORE_UNIT_TESTS_RANDOM_TEST_HPP
-#define CORE_UNIT_TESTS_RANDOM_TEST_HPP
+
+#pragma once
+
 #include <boost/test/unit_test.hpp>
 
 /* Helper functions to compute random numbers covariance in a single pass */
@@ -57,8 +58,8 @@ class visitor_size : public boost::static_visitor<std::size_t> {
   std::size_t operator()(Vector<double, N> const &v) const {
     return v.size();
   }
-  std::size_t operator()(Utils::Quaternion<double> const &q) const { return 4; }
-  std::size_t operator()(double v) const { return 1; }
+  std::size_t operator()(Utils::Quaternion<double> const &) const { return 4u; }
+  std::size_t operator()(double) const { return 1u; }
 };
 
 class visitor_get : public boost::static_visitor<double> {
@@ -71,7 +72,7 @@ class visitor_get : public boost::static_visitor<double> {
     return q[i];
   }
   double operator()(double v, std::size_t i) const {
-    assert(i == 0);
+    assert(i == 0u);
     return v;
   }
 };
@@ -110,7 +111,7 @@ noise_statistics(NoiseKernel &&noise_function, std::size_t sample_size) {
   std::transform(first_value.begin(), first_value.end(), dimensions.begin(),
                  [](auto const &element) { return get_size(element); });
   auto const matrix_dim = std::accumulate(dimensions.begin(), dimensions.end(),
-                                          0, std::plus<std::size_t>());
+                                          std::size_t{0u}, std::plus<>());
 
   // set up boost accumulators
   namespace ba = boost::accumulators;
@@ -123,21 +124,21 @@ noise_statistics(NoiseKernel &&noise_function, std::size_t sample_size) {
   auto acc_covariance = ::square_matrix<boost_covariance>(matrix_dim);
 
   // accumulate
-  for (std::size_t step = 0; step < sample_size; ++step) {
+  for (std::size_t step = 0u; step < sample_size; ++step) {
     auto const noise_tuple = noise_function();
     // for each vector, pool the random numbers of all columns
-    for (std::size_t vec1 = 0; vec1 < dimensions.size(); ++vec1) {
-      for (std::size_t col1 = 0; col1 < dimensions[vec1]; ++col1) {
+    for (std::size_t vec1 = 0u; vec1 < dimensions.size(); ++vec1) {
+      for (std::size_t col1 = 0u; col1 < dimensions[vec1]; ++col1) {
         acc_variance[vec1](::get_value(noise_tuple[vec1], col1));
       }
     }
     // fill the covariance matrix (upper triangle)
-    std::size_t index1 = 0;
-    for (std::size_t vec1 = 0; vec1 < dimensions.size(); ++vec1) {
-      for (std::size_t col1 = 0; col1 < dimensions[vec1]; ++col1) {
+    std::size_t index1 = 0u;
+    for (std::size_t vec1 = 0u; vec1 < dimensions.size(); ++vec1) {
+      for (std::size_t col1 = 0u; col1 < dimensions[vec1]; ++col1) {
         std::size_t index2 = index1;
         for (std::size_t vec2 = vec1; vec2 < dimensions.size(); ++vec2) {
-          for (std::size_t col2 = (vec2 == vec1) ? col1 : 0;
+          for (std::size_t col2 = (vec2 == vec1) ? col1 : std::size_t{0u};
                col2 < dimensions[vec2]; ++col2) {
             acc_covariance[index1][index2](
                 ::get_value(noise_tuple[vec1], col1),
@@ -153,19 +154,19 @@ noise_statistics(NoiseKernel &&noise_function, std::size_t sample_size) {
   // compute statistics
   std::vector<double> means(n_vectors);
   std::vector<double> variances(n_vectors);
-  for (std::size_t i = 0; i < n_vectors; ++i) {
+  for (std::size_t i = 0u; i < n_vectors; ++i) {
     means[i] = ba::mean(acc_variance[i]);
     variances[i] = ba::variance(acc_variance[i]);
   }
   auto covariance = ::square_matrix<double>(matrix_dim);
-  for (std::size_t i = 0; i < matrix_dim; ++i) {
+  for (std::size_t i = 0u; i < matrix_dim; ++i) {
     for (std::size_t j = i; j < matrix_dim; ++j) {
       covariance[i][j] = covariance[j][i] =
           ba::covariance(acc_covariance[i][j]);
     }
   }
   auto correlation = ::square_matrix<double>(matrix_dim);
-  for (std::size_t i = 0; i < matrix_dim; ++i) {
+  for (std::size_t i = 0u; i < matrix_dim; ++i) {
     for (std::size_t j = i; j < matrix_dim; ++j) {
       correlation[i][j] = correlation[j][i] =
           covariance[i][j] / sqrt(covariance[i][i] * covariance[j][j]);
@@ -189,4 +190,3 @@ boost::test_tools::predicate_result correlation_almost_equal(
   }
   return true;
 }
-#endif
diff --git a/src/core/unit_tests/rotation_test.cpp b/src/core/unit_tests/rotation_test.cpp
index 278a5232118..eea465459f3 100644
--- a/src/core/unit_tests/rotation_test.cpp
+++ b/src/core/unit_tests/rotation_test.cpp
@@ -30,11 +30,11 @@
 #include "rotation.hpp"
 
 #include <utils/Vector.hpp>
-#include <utils/constants.hpp>
 #include <utils/quaternion.hpp>
 
 #include <initializer_list>
 #include <limits>
+#include <numbers>
 #include <stdexcept>
 #include <tuple>
 
@@ -137,7 +137,7 @@ BOOST_AUTO_TEST_CASE(rotate_particle_body_test) {
   {
     // an angle of pi around the z-axis flips the quaternion sequence
     p.set_can_rotate_all_axes();
-    auto const phi = Utils::pi<double>();
+    auto const phi = std::numbers::pi;
     auto const quat = local_rotate_particle_body(p, {0., 0., 1.}, phi);
     auto const quat_ref = Utils::Vector4d{{-4., 3., -2., 1.}};
     for (unsigned int i : {0u, 1u, 2u, 3u}) {
@@ -147,7 +147,7 @@ BOOST_AUTO_TEST_CASE(rotate_particle_body_test) {
   {
     // an angle of 2 pi around the z-axis flips the quaternion sign
     p.set_can_rotate_all_axes();
-    auto const phi = 2. * Utils::pi<double>();
+    auto const phi = 2. * std::numbers::pi;
     auto const quat = local_rotate_particle_body(p, {0., 0., 1.}, phi);
     auto const quat_ref = Utils::Vector4d{{-1., -2., -3., -4.}};
     for (unsigned int i : {0u, 1u, 2u, 3u}) {
@@ -191,7 +191,7 @@ BOOST_AUTO_TEST_CASE(propagate_omega_quat_particle_test) {
 }
 
 BOOST_AUTO_TEST_CASE(convert_operator_body_to_space_test) {
-  auto constexpr sqrt_2_half = Utils::sqrt_2() / 2.0;
+  auto constexpr sqrt_2_half = std::numbers::sqrt2 / 2.0;
   // rotation around y-axis by pi/2
   Utils::Quaternion<double> const quat = {sqrt_2_half, 0.0, sqrt_2_half, 0.0};
   // rotation around z-axis by pi/4
diff --git a/src/core/virtual_sites/lb_tracers.cpp b/src/core/virtual_sites/lb_tracers.cpp
index 9edea02a1d7..4844472983e 100644
--- a/src/core/virtual_sites/lb_tracers.cpp
+++ b/src/core/virtual_sites/lb_tracers.cpp
@@ -39,12 +39,11 @@ static bool lb_sanity_checks(LB::Solver const &lb) {
 void lb_tracers_add_particle_force_to_fluid(CellStructure &cell_structure,
                                             BoxGeometry const &box_geo,
                                             LocalBox const &local_box,
-                                            LB::Solver &lb, double time_step) {
+                                            LB::Solver &lb) {
   if (lb_sanity_checks(lb)) {
     return;
   }
   auto const agrid = lb.get_agrid();
-  auto const to_lb_units = 1. / agrid;
 
   // Distribute summed-up forces from physical particles to ghosts
   init_forces_ghosts(cell_structure.ghost_particles());
@@ -62,7 +61,7 @@ void lb_tracers_add_particle_force_to_fluid(CellStructure &cell_structure,
       if (bookkeeping.should_be_coupled(p)) {
         for (auto const &pos :
              positions_in_halo(p.pos(), box_geo, local_box, agrid)) {
-          add_md_force(lb, pos * to_lb_units, p.force(), time_step);
+          lb.add_force_density(pos, p.force());
         }
       }
     }
@@ -85,7 +84,7 @@ void lb_tracers_propagate(CellStructure &cell_structure, LB::Solver const &lb,
     if (!LB::is_tracer(p))
       continue;
     p.v() = lb.get_coupling_interpolated_velocity(p.pos());
-    for (unsigned int i = 0u; i < 3u; i++) {
+    for (auto i = 0u; i < 3u; i++) {
       if (!p.is_fixed_along(i)) {
         p.pos()[i] += p.v()[i] * time_step;
       }
diff --git a/src/core/virtual_sites/lb_tracers.hpp b/src/core/virtual_sites/lb_tracers.hpp
index bbc9b4ce90d..8a1e9cd7eae 100644
--- a/src/core/virtual_sites/lb_tracers.hpp
+++ b/src/core/virtual_sites/lb_tracers.hpp
@@ -31,7 +31,7 @@
 void lb_tracers_add_particle_force_to_fluid(CellStructure &cell_structure,
                                             BoxGeometry const &box_geo,
                                             LocalBox const &local_box,
-                                            LB::Solver &lb, double time_step);
+                                            LB::Solver &lb);
 void lb_tracers_propagate(CellStructure &cell_structure, LB::Solver const &lb,
                           double time_step);
 
diff --git a/src/particle_observables/include/particle_observables/properties.hpp b/src/particle_observables/include/particle_observables/properties.hpp
index 3e3510c536e..970ed8157f3 100644
--- a/src/particle_observables/include/particle_observables/properties.hpp
+++ b/src/particle_observables/include/particle_observables/properties.hpp
@@ -27,7 +27,9 @@ namespace ParticleObservables {
 template <class DoF> struct traits;
 
 namespace detail {
-template <class T> struct decay { using type = typename std::decay_t<T>; };
+template <class T> struct decay {
+  using type = typename std::decay_t<T>;
+};
 
 template <class U> struct decay<std::reference_wrapper<U>> {
   using type = std::decay_t<U>;
diff --git a/src/particle_observables/tests/CMakeLists.txt b/src/particle_observables/tests/CMakeLists.txt
index 244c47bfabd..241c6d12a60 100644
--- a/src/particle_observables/tests/CMakeLists.txt
+++ b/src/particle_observables/tests/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2020-2022 The ESPResSo project
+# Copyright (C) 2020-2024 The ESPResSo project
 #
 # This file is part of ESPResSo.
 #
@@ -17,11 +17,11 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-include(unit_test)
+include(espresso_unit_test)
 
-unit_test(NAME properties_test SRC properties.cpp DEPENDS
-          espresso::particle_observables)
-unit_test(NAME algorithms_test SRC algorithms.cpp DEPENDS
-          espresso::particle_observables)
-unit_test(NAME observables_test SRC observables.cpp DEPENDS
-          espresso::particle_observables)
+espresso_unit_test(SRC properties_test.cpp DEPENDS
+                   espresso::particle_observables)
+espresso_unit_test(SRC algorithms_test.cpp DEPENDS
+                   espresso::particle_observables)
+espresso_unit_test(SRC observables_test.cpp DEPENDS
+                   espresso::particle_observables)
diff --git a/src/particle_observables/tests/algorithms.cpp b/src/particle_observables/tests/algorithms_test.cpp
similarity index 100%
rename from src/particle_observables/tests/algorithms.cpp
rename to src/particle_observables/tests/algorithms_test.cpp
diff --git a/src/particle_observables/tests/observables.cpp b/src/particle_observables/tests/observables_test.cpp
similarity index 100%
rename from src/particle_observables/tests/observables.cpp
rename to src/particle_observables/tests/observables_test.cpp
diff --git a/src/particle_observables/tests/properties.cpp b/src/particle_observables/tests/properties_test.cpp
similarity index 100%
rename from src/particle_observables/tests/properties.cpp
rename to src/particle_observables/tests/properties_test.cpp
diff --git a/src/python/espressomd/CMakeLists.txt b/src/python/espressomd/CMakeLists.txt
index 4c188ce6a14..bd4b5f9eb63 100644
--- a/src/python/espressomd/CMakeLists.txt
+++ b/src/python/espressomd/CMakeLists.txt
@@ -17,21 +17,31 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
+include(espresso_resource_files)
+
 add_custom_target(espressomd)
 
-# Make the cython_SRC, cython_HEADER and cython_AUX a cached variable to be able
-# to extend it in the subdirectories.
 file(GLOB cython_SRC *.pyx)
-set(cython_SRC "${cython_SRC}" CACHE INTERNAL "cython_SRC")
 file(GLOB cython_HEADER *.pxd)
-set(cython_HEADER "${cython_HEADER}" CACHE INTERNAL "cython_HEADER")
-file(GLOB cython_AUX *.py)
-set(cython_AUX "${cython_AUX}" CACHE INTERNAL "cython_AUX")
+file(GLOB python_SRC RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" *.py)
+
+espresso_target_resources(espressomd ${python_SRC})
 
 add_subdirectory(io)
 add_subdirectory(detail)
+add_subdirectory(plugins)
 
-list(REMOVE_DUPLICATES cython_SRC)
+# Install resource files (Python files, text files, etc.)
+get_property(ESPRESSOMD_RESOURCE_FILES TARGET espressomd
+             PROPERTY EspressoResourceFiles)
+foreach(RESOURCE_ABSPATH ${ESPRESSOMD_RESOURCE_FILES})
+  cmake_path(RELATIVE_PATH RESOURCE_ABSPATH BASE_DIRECTORY
+             ${CMAKE_CURRENT_BINARY_DIR} OUTPUT_VARIABLE RESOURCE_RELPATH)
+  cmake_path(GET RESOURCE_RELPATH PARENT_PATH RESOURCE_RELPARENT)
+  install(
+    FILES "${RESOURCE_ABSPATH}"
+    DESTINATION "${ESPRESSO_INSTALL_PYTHON}/espressomd/${RESOURCE_RELPARENT}")
+endforeach()
 
 add_library(espresso_pyx_flags INTERFACE)
 add_library(espresso::pyx_flags ALIAS espresso_pyx_flags)
@@ -47,74 +57,47 @@ target_compile_options(
     $<$<CXX_COMPILER_ID:GNU>:-Wno-volatile>
     $<$<CXX_COMPILER_ID:Clang,AppleClang,IntelLLVM>:-Wno-sometimes-uninitialized>
     $<$<CXX_COMPILER_ID:Clang,AppleClang,IntelLLVM>:-Wno-\#warnings>
+    $<$<CXX_COMPILER_ID:Clang,AppleClang,IntelLLVM>:-Wno-deprecated-volatile>
+    -Wno-missing-field-initializers
     -Wno-unused-variable)
 
 # Configure, compile and install Cython files
 foreach(cython_file ${cython_SRC})
-  get_filename_component(basename ${cython_file} NAME_WE)
-  file(RELATIVE_PATH relpath ${CMAKE_CURRENT_SOURCE_DIR} ${cython_file})
-  if(basename STREQUAL "code_info")
-    file(RELATIVE_PATH relpath ${CMAKE_CURRENT_BINARY_DIR} ${cython_file})
-  endif()
-  get_filename_component(relpath ${relpath} DIRECTORY)
-  if(relpath STREQUAL "")
-    string(CONCAT outputpath ${CMAKE_CURRENT_BINARY_DIR} "/" ${basename} ".cpp")
+  cmake_path(GET cython_file STEM basename)
+  cmake_path(RELATIVE_PATH cython_file BASE_DIRECTORY
+             ${CMAKE_CURRENT_SOURCE_DIR} OUTPUT_VARIABLE relpath)
+  cmake_path(GET relpath PARENT_PATH subfolder)
+  cmake_path(APPEND CMAKE_CURRENT_BINARY_DIR ${subfolder} "${basename}.cpp"
+             OUTPUT_VARIABLE outputpath)
+  add_custom_command(
+    OUTPUT ${outputpath}
+    COMMAND
+      ${CYTHON_EXECUTABLE} -3 --cplus --directive embedsignature=True
+      --directive binding=True -I ${CMAKE_CURRENT_SOURCE_DIR} -I
+      ${CMAKE_CURRENT_BINARY_DIR} ${cython_file} -o ${outputpath}
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/..
+    DEPENDS ${cython_file} ${cython_HEADER})
+  set(target "espressomd_${basename}")
+  add_library(${target} SHARED ${outputpath})
+  if(NOT "${subfolder}" STREQUAL "")
+    set_target_properties(${target} PROPERTIES PREFIX "${subfolder}/")
   else()
-    string(CONCAT outputpath ${CMAKE_CURRENT_BINARY_DIR} "/" ${relpath} "/"
-                  ${basename} ".cpp")
+    set_target_properties(${target} PROPERTIES PREFIX "")
   endif()
-  if(basename STREQUAL "")
-    message(FATAL_ERROR "Internal error empty basename of file ${cython_file}")
-  else()
-    add_custom_command(
-      OUTPUT ${outputpath}
-      COMMAND
-        ${CYTHON_EXECUTABLE}
-        $<$<BOOL:${ESPRESSO_WARNINGS_ARE_ERRORS}>:--warning-errors> -3 --cplus
-        --directive embedsignature=True --directive binding=True -I
-        ${CMAKE_CURRENT_SOURCE_DIR} -I ${CMAKE_CURRENT_BINARY_DIR}
-        ${cython_file} -o ${outputpath}
-      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/..
-      DEPENDS ${cython_file} ${cython_HEADER})
-    set(target "espressomd_${basename}")
-    add_library(${target} SHARED ${outputpath})
-    if(NOT "${relpath}" STREQUAL "")
-      set_target_properties(${target} PROPERTIES PREFIX "${relpath}/")
-    else()
-      set_target_properties(${target} PROPERTIES PREFIX "")
-    endif()
-    set_target_properties(${target} PROPERTIES OUTPUT_NAME ${basename})
-    if(APPLE)
-      set_target_properties(
-        ${target} PROPERTIES SUFFIX ".so" LINK_FLAGS
-                                          "-undefined dynamic_lookup")
-    endif()
-    set_target_properties(${target} PROPERTIES CXX_CLANG_TIDY "")
-    target_link_libraries(${target} PRIVATE espresso::config espresso::core
-                                            espresso::script_interface)
-    target_link_libraries(${target} PRIVATE espresso::cpp_flags)
-    target_link_libraries(${target} PRIVATE espresso::pyx_flags)
-    target_include_directories(
-      ${target} SYSTEM PRIVATE ${Python_INCLUDE_DIRS}
-                               ${Python_NumPy_INCLUDE_DIRS})
-    add_dependencies(espressomd ${target})
-    install(TARGETS ${target}
-            LIBRARY DESTINATION ${ESPRESSO_INSTALL_PYTHON}/espressomd)
+  set_target_properties(${target} PROPERTIES OUTPUT_NAME ${basename})
+  if(APPLE)
+    set_target_properties(
+      ${target} PROPERTIES SUFFIX ".so" LINK_FLAGS "-undefined dynamic_lookup")
   endif()
+  set_target_properties(${target} PROPERTIES CXX_CLANG_TIDY "")
+  target_link_libraries(${target} PRIVATE espresso::config espresso::core
+                                          espresso::script_interface)
+  target_link_libraries(${target} PRIVATE espresso::cpp_flags)
+  target_link_libraries(${target} PRIVATE espresso::pyx_flags)
+  target_include_directories(
+    ${target} SYSTEM PRIVATE ${Python_INCLUDE_DIRS}
+                             ${Python_NumPy_INCLUDE_DIRS})
+  add_dependencies(espressomd ${target})
+  install(TARGETS "${target}"
+          LIBRARY DESTINATION "${ESPRESSO_INSTALL_PYTHON}/espressomd")
 endforeach()
-
-# Configure Python files
-foreach(auxfile ${cython_AUX})
-  get_filename_component(filename ${auxfile} NAME)
-  file(RELATIVE_PATH relpath ${CMAKE_CURRENT_SOURCE_DIR} ${auxfile})
-  get_filename_component(relpath ${relpath} DIRECTORY)
-  string(CONCAT outputpath ${CMAKE_CURRENT_BINARY_DIR} "/" ${relpath} "/"
-                ${filename})
-  add_custom_command(TARGET espressomd COMMAND ${CMAKE_COMMAND} -E copy
-                                               ${auxfile} ${outputpath})
-endforeach(auxfile)
-
-# Install Python files
-install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
-        DESTINATION ${ESPRESSO_INSTALL_PYTHON} FILES_MATCHING PATTERN "*.py"
-        PATTERN "CMakeFiles" EXCLUDE)
diff --git a/src/python/espressomd/analyze.py b/src/python/espressomd/analyze.py
index 42d0b229df0..7345c739a04 100644
--- a/src/python/espressomd/analyze.py
+++ b/src/python/espressomd/analyze.py
@@ -285,14 +285,17 @@ class Analysis(ScriptInterfaceHelper):
             and [1] contains the structure factors S(q)
 
     distribution()
-        Calculates the distance distribution of particles (probability of
-        finding a particle of type A at a certain distance around a particle of
+        Calculate the minimal distance distribution of particles (probability of
+        finding a particle of type A at a certain distance to the nearest particle of
         type B, disregarding the fact that a spherical shell of a larger radius
         covers a larger volume). The distance is defined as the minimal distance
         between a particle of group ``type_list_a`` to any of the group
         ``type_list_b``. Returns two arrays, the bins and the (normalized)
         distribution.
 
+        For the radial distribution function,
+        use :class:`espressomd.observables.RDF` instead.
+
         Parameters
         ----------
         type_list_a : list of :obj:`int`
@@ -319,7 +322,7 @@ class Analysis(ScriptInterfaceHelper):
         -------
         :obj:`ndarray`
             Where [0] contains the midpoints of the bins,
-            and [1] contains the values of the rdf.
+            and [1] contains the values of the minimal distance distribution function.
 
     """
     _so_name = "Analysis::Analysis"
diff --git a/src/python/espressomd/cell_system.py b/src/python/espressomd/cell_system.py
index 6450bd55d11..17a5364337a 100644
--- a/src/python/espressomd/cell_system.py
+++ b/src/python/espressomd/cell_system.py
@@ -115,6 +115,12 @@ def set_regular_decomposition(self, **kwargs):
         use_verlet_lists : :obj:`bool`, optional
             Activates or deactivates the usage of Verlet lists.
             Defaults to ``True``.
+        fully_connected_boundary : :obj:`dict`, optional
+            If set, connects all cells on a given boundary along the given direction.
+            Example: ``{"boundary": "z", "direction": "x"}`` connects all
+            cells on the boundary normal to the z-direction along the x-axis.
+            This corresponds to z-axis as shear plane normal and x-axis as
+            shear direction in Lees-Edwards boundary conditions.
 
         """
         self.call_method("initialize", name="regular_decomposition", **kwargs)
diff --git a/src/python/espressomd/collision_detection.py b/src/python/espressomd/collision_detection.py
index d7470e5b150..24dde5de809 100644
--- a/src/python/espressomd/collision_detection.py
+++ b/src/python/espressomd/collision_detection.py
@@ -63,7 +63,7 @@ def set_params(self, **kwargs):
 
         Parameters
         ----------
-        mode : :obj:`str`, {"off", "bind_centers", "bind_at_point_of_collision", "bind_three_particles", "glue_to_surface"}
+        mode : :obj:`str`, {"off", "bind_centers", "bind_at_point_of_collision", "glue_to_surface"}
             Collision detection mode
 
         distance : :obj:`float`
@@ -92,22 +92,13 @@ def set_params(self, **kwargs):
         distance_glued_particle_to_vs : :obj:`float`
             Distance for ``"glue_to_surface"`` mode. See user guide.
 
-        bond_three_particles : :obj:`espressomd.interactions.BondedInteraction`
-            First angular bond for the ``"bind_three_particles"`` mode. See
-            user guide
-
-        three_particle_binding_angle_resolution : :obj:`int`
-            Resolution for the angular bonds (mode ``"bind_three_particles"``).
-            Resolution+1 bonds are needed to accommodate the case of 180 degrees
-            angles
-
         """
 
         if "mode" not in kwargs:
             raise ValueError(
                 "Collision mode must be specified via the 'mode' argument")
         # Convert bonds to bond ids
-        for name in ["bond_centers", "bond_vs", "bond_three_particle_binding"]:
+        for name in ["bond_centers", "bond_vs"]:
             if name in kwargs:
                 if isinstance(kwargs[name], BondedInteraction):
                     kwargs[name] = kwargs[name]._bond_id
@@ -117,7 +108,7 @@ def get_parameter(self, name):
         """Gets a single parameter from the collision detection."""
 
         value = super().get_parameter(name)
-        if name in ["bond_centers", "bond_vs", "bond_three_particle_binding"]:
+        if name in ["bond_centers", "bond_vs"]:
             if value == -1:  # Not defined
                 value = None
             else:
diff --git a/src/python/espressomd/constraints.py b/src/python/espressomd/constraints.py
index eb0dba268c9..79217173de2 100644
--- a/src/python/espressomd/constraints.py
+++ b/src/python/espressomd/constraints.py
@@ -183,8 +183,19 @@ def total_normal_force(self):
 
 @script_interface_register
 class HomogeneousMagneticField(Constraint):
-
     """
+    Homogeneous magnetic field :math:`\\vec{H}`.
+    The resulting force :math:`\\vec{F}`, torque :math:`\\vec{\\tau}`
+    and energy :math:`U` on the particles are then
+
+    :math:`\\vec{F} = \\vec{0}`
+
+    :math:`\\vec{\\tau} = \\vec{\\mu} \\times \\vec{H}`
+
+    :math:`U = -\\vec{\\mu} \\cdot \\vec{H}`
+
+    where :math:`\\vec{\\mu}` is the particle dipole moment.
+
     Attributes
     ----------
     H : (3,) array_like of :obj:`float`
@@ -393,12 +404,12 @@ class Gravity(Constraint):
     """
     Gravity force
 
-    :math:`F = m \\cdot g`
+    :math:`\\vec{F} = m \\cdot \\vec{g}`
 
     Arguments
     ----------
     g : (3,) array_like of :obj:`float`
-        The gravitational acceleration.
+        The gravitational acceleration.
 
     """
 
@@ -420,21 +431,21 @@ class LinearElectricPotential(Constraint):
     """
     Electric potential of the form
 
-    :math:`\\phi = -E \\cdot x + \\phi_0`,
+    :math:`\\phi = -\\vec{E} \\cdot \\vec{x} + \\phi_0`,
 
-    resulting in the electric field E
-    everywhere. (E.g. in a plate capacitor).
+    resulting in the electric field :math:`\\vec{E}` everywhere.
     The resulting force on the particles are then
 
-    :math:`F = q \\cdot E`
+    :math:`\\vec{F} = q \\cdot \\vec{E}`
 
-    where :math:`q` is the charge of the particle.
+    where :math:`q` and :math:`\\vec{x}` are the particle charge and position
+    in folded coordinates.
+    This can be used to model a plate capacitor.
 
     Arguments
     ----------
     E : array_like of :obj:`float`
         The electric field.
-
     phi0 : :obj:`float`
         The potential at the origin
 
@@ -463,15 +474,18 @@ class ElectricPlaneWave(Constraint):
     """
     Electric field of the form
 
-    :math:`E = E0 \\cdot \\sin(k \\cdot x + \\omega \\cdot t + \\phi)`
+    :math:`\\vec{E} = \\vec{E_0} \\cdot \\sin(\\vec{k} \\cdot \\vec{x} + \\omega \\cdot t + \\phi)`
 
     The resulting force on the particles are then
 
-    :math:`F = q \\cdot E`
+    :math:`\\vec{F} = q \\cdot \\vec{E}`
 
-    where :math:`q` is the charge of the particle.
+    where :math:`q` and :math:`\\vec{x}` are the particle charge and position
+    in folded coordinates.
     This can be used to generate a homogeneous AC
-    field by setting k to zero.
+    field by setting :math:`\\vec{k}` to the null vector.
+    For periodic systems, :math:`\\vec{k}` must be an integer multiple
+    of :math:`2\\pi \\vec{L}^{-1}` with :math:`\\vec{L}` the box length.
 
     Arguments
     ----------
@@ -482,7 +496,7 @@ class ElectricPlaneWave(Constraint):
     omega : :obj:`float`
         Frequency of the wave
     phi : :obj:`float`, optional
-        Phase shift
+        Phase
 
     """
 
@@ -520,9 +534,10 @@ class FlowField(_Interpolated):
     Viscous coupling to a flow field that is
     interpolated from tabulated data like
 
-    :math:`F = -\\gamma \\cdot \\left( u(r) - v \\right)`
+    :math:`\\vec{F} = -\\gamma \\cdot \\left( \\vec{u}(\\vec{x}) - \\vec{v} \\right)`
 
-    where :math:`v` is the velocity of the particle.
+    where :math:`\\vec{v}` and :math:`\\vec{x}` are the particle velocity and position
+    in folded coordinates, and :math:`\\vec{u}(\\vec{x})` is a 3D flow field on a grid.
 
     Arguments
     ----------
@@ -549,9 +564,10 @@ class HomogeneousFlowField(Constraint):
     Viscous coupling to a flow field that is
     constant in space with the force
 
-    :math:`F = -\\gamma \\cdot (u - v)`
+    :math:`\\vec{F} = -\\gamma \\cdot (\\vec{u} - \\vec{v})`
 
-    where :math:`v` is the velocity of the particle.
+    where :math:`\\vec{v}` is the velocity of the particle
+    and :math:`\\vec{u}` is the constant flow field.
 
     Attributes
     ----------
@@ -580,11 +596,11 @@ class ElectricPotential(_Interpolated):
 
     """
     Electric potential interpolated from
-    provided data. The electric field E is
+    provided data. The electric field :math:`\\vec{E}` is
     calculated numerically from the potential,
     and the resulting force on the particles are
 
-    :math:`F = q \\cdot E`
+    :math:`\\vec{F} = q \\cdot \\vec{E}`
 
     where :math:`q` is the charge of the particle.
 
diff --git a/src/python/espressomd/detail/CMakeLists.txt b/src/python/espressomd/detail/CMakeLists.txt
index 926a01b5f2e..48140e63b88 100644
--- a/src/python/espressomd/detail/CMakeLists.txt
+++ b/src/python/espressomd/detail/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2023 The ESPResSo project
+# Copyright (C) 2023-2024 The ESPResSo project
 #
 # This file is part of ESPResSo.
 #
@@ -17,5 +17,4 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-configure_file(__init__.py __init__.py COPYONLY)
-configure_file(walberla.py walberla.py COPYONLY)
+espresso_target_resources(espressomd __init__.py walberla.py)
diff --git a/src/python/espressomd/electrostatics.py b/src/python/espressomd/electrostatics.py
index 7be00eac153..a197c56528f 100644
--- a/src/python/espressomd/electrostatics.py
+++ b/src/python/espressomd/electrostatics.py
@@ -393,40 +393,6 @@ def required_keys(self):
         return {"prefactor", "maxPWerror"}
 
 
-@script_interface_register
-class MMM1DGPU(ElectrostaticInteraction):
-    """
-    Electrostatics solver with GPU support for systems with one periodic
-    direction. See :ref:`MMM1D on GPU` for more details.
-
-    Parameters
-    ----------
-    prefactor : :obj:`float`
-        Electrostatics prefactor (see :eq:`coulomb_prefactor`).
-    maxWPerror : :obj:`float`
-        Maximal pairwise error.
-    far_switch_radius : :obj:`float`, optional
-        Radius where near-field and far-field calculation are switched
-    bessel_cutoff : :obj:`int`, optional
-    timings : :obj:`int`, optional
-        Number of force calculations during tuning.
-    check_neutrality : :obj:`bool`, optional
-        Raise a warning if the system is not electrically neutral when
-        set to ``True`` (default).
-    """
-    _so_name = "Coulomb::CoulombMMM1DGpu"
-    _so_creation_policy = "GLOBAL"
-    _so_features = ("MMM1D_GPU",)
-
-    def default_params(self):
-        return {"far_switch_radius": -1.,
-                "bessel_cutoff": -1,
-                "check_neutrality": True}
-
-    def required_keys(self):
-        return {"prefactor", "maxPWerror"}
-
-
 @script_interface_register
 class Scafacos(ElectrostaticInteraction):
 
diff --git a/src/python/espressomd/integrate.py b/src/python/espressomd/integrate.py
index 519dff9f702..9e2ac99f283 100644
--- a/src/python/espressomd/integrate.py
+++ b/src/python/espressomd/integrate.py
@@ -36,7 +36,7 @@ def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
     def __str__(self):
-        return f'{self.__class__.__name__}({self.integrator.__class__.__name__})'
+        return f'{self.__class__.__name__}({self.integrator.__class__.__name__})'  # nopep8
 
     def run(self, *args, **kwargs):
         """
diff --git a/src/python/espressomd/io/CMakeLists.txt b/src/python/espressomd/io/CMakeLists.txt
index f6d2f170963..89a182350e5 100644
--- a/src/python/espressomd/io/CMakeLists.txt
+++ b/src/python/espressomd/io/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2016-2022 The ESPResSo project
+# Copyright (C) 2016-2024 The ESPResSo project
 #
 # This file is part of ESPResSo.
 #
@@ -17,9 +17,6 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-configure_file(mpiio.py mpiio.py COPYONLY)
-configure_file(vtk.py vtk.py COPYONLY)
+espresso_target_resources(espressomd __init__.py vtk.py mpiio.py)
+
 add_subdirectory(writer)
-set(cython_AUX ${cython_AUX}
-               "${CMAKE_SOURCE_DIR}/src/python/espressomd/io/__init__.py"
-    CACHE INTERNAL "cython_AUX" FORCE)
diff --git a/src/python/espressomd/io/writer/CMakeLists.txt b/src/python/espressomd/io/writer/CMakeLists.txt
index 1d08dbb21fd..e211856d65d 100644
--- a/src/python/espressomd/io/writer/CMakeLists.txt
+++ b/src/python/espressomd/io/writer/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2016-2022 The ESPResSo project
+# Copyright (C) 2016-2024 The ESPResSo project
 #
 # This file is part of ESPResSo.
 #
@@ -17,6 +17,4 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-configure_file(__init__.py __init__.py COPYONLY)
-configure_file(vtf.py vtf.py COPYONLY)
-configure_file(h5md.py h5md.py COPYONLY)
+espresso_target_resources(espressomd __init__.py vtf.py h5md.py)
diff --git a/src/python/espressomd/lb.py b/src/python/espressomd/lb.py
index b805a0307f3..27f2f851de1 100644
--- a/src/python/espressomd/lb.py
+++ b/src/python/espressomd/lb.py
@@ -211,7 +211,7 @@ class LBFluidWalberla(HydrodynamicInteraction,
 
     """
 
-    _so_name = "walberla::LBFluid"
+    _so_name = "walberla::LBFluidCPU"
     _so_features = ("WALBERLA",)
     _so_creation_policy = "GLOBAL"
     _so_bind_methods = (
@@ -306,18 +306,22 @@ def add_boundary_from_shape(self, shape,
             values=array_variant(velocity.flatten()))
 
 
-class LBFluidWalberlaGPU(HydrodynamicInteraction):
+@script_interface_register
+class LBFluidWalberlaGPU(LBFluidWalberla):
     """
     Initialize the lattice-Boltzmann method for hydrodynamic flow using
     waLBerla for the GPU. See :class:`HydrodynamicInteraction` for the
     list of parameters.
 
     """
+    _so_name = "walberla::LBFluidGPU"
+    _so_creation_policy = "GLOBAL"
     _so_features = ("WALBERLA", "CUDA")
 
-    # pylint: disable=unused-argument
-    def __init__(self, *args, **kwargs):
-        raise NotImplementedError("Not implemented yet")
+    def default_params(self):
+        params = super().default_params()
+        params["single_precision"] = True
+        return params
 
 
 @script_interface_register
diff --git a/src/python/espressomd/magnetostatics.py b/src/python/espressomd/magnetostatics.py
index f3787d8dbce..ca611d6b461 100644
--- a/src/python/espressomd/magnetostatics.py
+++ b/src/python/espressomd/magnetostatics.py
@@ -255,38 +255,6 @@ def required_keys(self):
         return {"prefactor"}
 
 
-@script_interface_register
-class DipolarBarnesHutGpu(MagnetostaticInteraction):
-
-    """
-    Calculates magnetostatic interactions by direct summation over all
-    pairs. See :ref:`Barnes-Hut octree sum on GPU` for more details.
-
-    TODO: If the system has periodic boundaries, the minimum image
-    convention is applied.
-
-    Requires feature ``DIPOLAR_BARNES_HUT``, which depends on
-    ``DIPOLES`` and ``CUDA``.
-
-    Parameters
-    ----------
-    epssq : :obj:`float`, optional
-        Squared skin of the octant cells.
-    itolsq : :obj:`float`, optional
-        Squared inverse fraction of the octant cells.
-
-    """
-    _so_name = "Dipoles::DipolarBarnesHutGpu"
-    _so_creation_policy = "GLOBAL"
-    _so_features = ("DIPOLAR_BARNES_HUT", "CUDA")
-
-    def default_params(self):
-        return {"epssq": 100.0, "itolsq": 4.0}
-
-    def required_keys(self):
-        return {"prefactor"}
-
-
 @script_interface_register
 class DLC(MagnetostaticInteraction):
 
diff --git a/src/python/espressomd/observables.py b/src/python/espressomd/observables.py
index 48b5614b4c6..c13597e587a 100644
--- a/src/python/espressomd/observables.py
+++ b/src/python/espressomd/observables.py
@@ -28,7 +28,7 @@ class Observable(ScriptInterfaceHelper):
     Methods
     -------
     shape()
-        Return the shape of the observable.
+        Get the shape of the numpy array returned by the observable.
     """
     _so_name = "Observables::Observable"
     _so_bind_methods = ("shape",)
@@ -96,9 +96,14 @@ class ComPosition(Observable):
     ids : array_like of :obj:`int`
         The ids of (existing) particles to take into account.
 
-    Returns
+    Methods
     -------
-    (3,) :obj:`ndarray` of :obj:`float`
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (3,) :obj:`ndarray` of :obj:`float`
 
     """
     _so_name = "Observables::ComPosition"
@@ -118,9 +123,14 @@ class ComVelocity(Observable):
     ids : array_like of :obj:`int`
         The ids of (existing) particles to take into account.
 
-    Returns
+    Methods
     -------
-    (3,) :obj:`ndarray` of :obj:`float`
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (3,) :obj:`ndarray` of :obj:`float`
 
     """
     _so_name = "Observables::ComVelocity"
@@ -154,9 +164,14 @@ class DensityProfile(ProfileObservable):
     max_z : :obj:`float`
         Maximum ``z`` to consider.
 
-    Returns
+    Methods
     -------
-    (3,) :obj:`ndarray` of :obj:`float`
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (``n_x_bins``, ``n_y_bins``, ``n_z_bins``) :obj:`ndarray` of :obj:`float`
 
     """
     _so_name = "Observables::DensityProfile"
@@ -165,7 +180,7 @@ class DensityProfile(ProfileObservable):
 @script_interface_register
 class DipoleMoment(Observable):
 
-    """Calculates the dipole moment for particles with given ids.
+    """Calculates the electric dipole moment for particles with given ids.
 
     Output format: :math:`\\left(\\sum_i q_i r^x_i, \\sum_i q_i r^y_i, \\sum_i q_i r^z_i\\right)`
 
@@ -174,9 +189,14 @@ class DipoleMoment(Observable):
     ids : array_like of :obj:`int`
         The ids of (existing) particles to take into account.
 
-    Returns
+    Methods
     -------
-    (3,) :obj:`ndarray` of :obj:`float`
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (3,) :obj:`ndarray` of :obj:`float`
 
     """
     _so_name = "Observables::DipoleMoment"
@@ -210,11 +230,16 @@ class FluxDensityProfile(ProfileObservable):
     max_z : :obj:`float`
         Maximum ``z`` to consider.
 
-    Returns
+    Methods
     -------
-    (``n_x_bins``, ``n_y_bins``, ``n_z_bins``, 3) :obj:`ndarray` of :obj:`float`
-        The fourth component contains the histogram for the x, y and z
-        components of the flux density.
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (``n_x_bins``, ``n_y_bins``, ``n_z_bins``, 3) :obj:`ndarray` of :obj:`float`
+            The fourth dimension of the array stores the histogram for the x,
+            y and z components of the flux density, respectively.
 
     """
     _so_name = "Observables::FluxDensityProfile"
@@ -248,11 +273,16 @@ class ForceDensityProfile(ProfileObservable):
     max_z : :obj:`float`
         Maximum ``z`` to consider.
 
-    Returns
+    Methods
     -------
-    (``n_x_bins``, ``n_y_bins``, ``n_z_bins``, 3) :obj:`ndarray` of :obj:`float`
-        The fourth component contains the histogram for the x, y and z
-        components of the force.
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (``n_x_bins``, ``n_y_bins``, ``n_z_bins``, 3) :obj:`ndarray` of :obj:`float`
+            The fourth dimension of the array stores the histogram for the x,
+            y and z components of the force, respectively.
 
     """
     _so_name = "Observables::ForceDensityProfile"
@@ -302,11 +332,16 @@ class LBVelocityProfile(ProfileObservable):
     allow_empty_bins : :obj:`bool`, default=False
         Whether or not to allow bins that will not be sampled at all.
 
-    Returns
+    Methods
     -------
-    (``n_x_bins``, ``n_y_bins``, ``n_z_bins``, 3) :obj:`ndarray` of :obj:`float`
-        The fourth component contains the histogram for the x, y and z
-        components of the LB velocity.
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (``n_x_bins``, ``n_y_bins``, ``n_z_bins``, 3) :obj:`ndarray` of :obj:`float`
+            The fourth dimension of the array stores the histogram for the x,
+            y and z components of the LB velocity, respectively.
 
     """
     _so_name = "Observables::LBVelocityProfile"
@@ -321,9 +356,14 @@ class LBFluidPressureTensor(Observable):
     ----------
     None
 
-    Returns
+    Methods
     -------
-    (3, 3) :obj:`ndarray` of :obj:`float`
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (3, 3) :obj:`ndarray` of :obj:`float`
 
     """
     _so_name = "Observables::LBFluidPressureTensor"
@@ -341,9 +381,14 @@ class MagneticDipoleMoment(Observable):
     ids : array_like of :obj:`int`
         The ids of (existing) particles to take into account.
 
-    Returns
+    Methods
     -------
-    (3,) :obj:`ndarray` of :obj:`float`
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (3,) :obj:`ndarray` of :obj:`float`
 
     """
     _so_name = "Observables::MagneticDipoleMoment"
@@ -363,9 +408,14 @@ class ParticleAngularVelocities(Observable):
     ids : array_like of :obj:`int`
         The ids of (existing) particles to take into account.
 
-    Returns
+    Methods
     -------
-    (N, 3) :obj:`ndarray` of :obj:`float`
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (N, 3) :obj:`ndarray` of :obj:`float`
 
     """
     _so_name = "Observables::ParticleAngularVelocities"
@@ -388,9 +438,14 @@ class ParticleBodyAngularVelocities(Observable):
     ids : array_like of :obj:`int`
         The ids of (existing) particles to take into account.
 
-    Returns
+    Methods
     -------
-    (N, 3) :obj:`ndarray` of :obj:`float`
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (N, 3) :obj:`ndarray` of :obj:`float`
 
     """
     _so_name = "Observables::ParticleBodyAngularVelocities"
@@ -413,9 +468,14 @@ class ParticleBodyVelocities(Observable):
     ids : array_like of :obj:`int`
         The ids of (existing) particles to take into account.
 
-    Returns
+    Methods
     -------
-    (N, 3) :obj:`ndarray` of :obj:`float`
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (N, 3) :obj:`ndarray` of :obj:`float`
 
     """
     _so_name = "Observables::ParticleBodyVelocities"
@@ -435,9 +495,14 @@ class ParticleForces(Observable):
     ids : array_like of :obj:`int`
         The ids of (existing) particles to take into account.
 
-    Returns
+    Methods
     -------
-    (N, 3) :obj:`ndarray` of :obj:`float`
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (N, 3) :obj:`ndarray` of :obj:`float`
 
     """
     _so_name = "Observables::ParticleForces"
@@ -457,9 +522,14 @@ class ParticlePositions(Observable):
     ids : array_like of :obj:`int`
         The ids of (existing) particles to take into account.
 
-    Returns
+    Methods
     -------
-    (N, 3) :obj:`ndarray` of :obj:`float`
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (N, 3) :obj:`ndarray` of :obj:`float`
 
     """
     _so_name = "Observables::ParticlePositions"
@@ -479,9 +549,14 @@ class ParticleVelocities(Observable):
     ids : array_like of :obj:`int`
         The ids of (existing) particles to take into account.
 
-    Returns
+    Methods
     -------
-    (N, 3) :obj:`ndarray` of :obj:`float`
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (N, 3) :obj:`ndarray` of :obj:`float`
 
     """
     _so_name = "Observables::ParticleVelocities"
@@ -492,7 +567,7 @@ class ParticleDirectors(Observable):
 
     """Calculates the particle directors for particles with given ids.
 
-    Output format: :math:`(d1_x,\\ d1_y,\\ d1_z),\\ (d2_x,\\ d2_y,\\ d2_z),\\ \\dots,\\ (dn_x,\\ dn_y,\\ dn_z)`.
+    Output format: :math:`(d^x_1,\\ d^y_1,\\ d^z_1),\\ (d^x_2,\\ d^y_2,\\ d^z_2),\\ \\dots,\\ (d^x_n,\\ d^y_n,\\ d^z_n)`.
 
     The particles are ordered according to the list of ids passed to the observable.
 
@@ -501,9 +576,14 @@ class ParticleDirectors(Observable):
     ids : array_like of :obj:`int`
         The ids of (existing) particles to take into account.
 
-    Returns
+    Methods
     -------
-    (N, 3) :obj:`ndarray` of :obj:`float`
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (N, 3) :obj:`ndarray` of :obj:`float`
 
     """
     _so_name = "Observables::ParticleDirectors"
@@ -514,7 +594,7 @@ class ParticleDipoleFields(Observable):
 
     """Calculates the particle dipole fields for particles with given ids.
 
-    Output format: :math:`(h_d1_x,\\ h_d1_y,\\ h_d1_z),\\ (h_d2_x,\\ h_d2_y,\\ h_d2_z),\\ \\dots,\\ (h_dn_x,\\ h_dn_y,\\ h_dn_z)`.
+    Output format: :math:`(h^x_1,\\ h^y_1,\\ h^z_1),\\ (h^x_2,\\ h^y_2,\\ h^z_2),\\ \\dots,\\ (h^x_n,\\ h^y_n,\\ h^z_n)`.
 
     The particles are ordered according to the list of ids passed to the observable.
 
@@ -523,9 +603,14 @@ class ParticleDipoleFields(Observable):
     ids : array_like of :obj:`int`
         The ids of (existing) particles to take into account.
 
-    Returns
+    Methods
     -------
-    (N, 3) :obj:`ndarray` of :obj:`float`
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (N, 3) :obj:`ndarray` of :obj:`float`
 
     """
     _so_name = "Observables::ParticleDipoleFields"
@@ -542,9 +627,14 @@ class ParticleDistances(Observable):
     ids : array_like of :obj:`int`
         The ids of (existing) particles to take into account.
 
-    Returns
+    Methods
     -------
-    (N - 1,) :obj:`ndarray` of :obj:`float`
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (N - 1,) :obj:`ndarray` of :obj:`float`
 
     """
     _so_name = "Observables::ParticleDistances"
@@ -564,9 +654,14 @@ class TotalForce(Observable):
     ids : array_like of :obj:`int`
         The ids of (existing) particles to take into account.
 
-    Returns
+    Methods
     -------
-    (3,) :obj:`ndarray` of :obj:`float`
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (3,) :obj:`ndarray` of :obj:`float`
 
     """
     _so_name = "Observables::TotalForce"
@@ -583,9 +678,14 @@ class BondAngles(Observable):
     ids : array_like of :obj:`int`
         The ids of (existing) particles to take into account.
 
-    Returns
+    Methods
     -------
-    (N - 2,) :obj:`ndarray` of :obj:`float`
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (N - 2,) :obj:`ndarray` of :obj:`float`
 
     """
     _so_name = "Observables::BondAngles"
@@ -604,9 +704,14 @@ class CosPersistenceAngles(Observable):
     ids : array_like of :obj:`int`
         The ids of (existing) particles to take into account.
 
-    Returns
+    Methods
     -------
-    (N - 2,) :obj:`ndarray` of :obj:`float`
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (N - 2,) :obj:`ndarray` of :obj:`float`
 
     """
     _so_name = "Observables::CosPersistenceAngles"
@@ -623,9 +728,14 @@ class BondDihedrals(Observable):
     ids : array_like of :obj:`int`
         The ids of (existing) particles to take into account.
 
-    Returns
+    Methods
     -------
-    (N - 3,) :obj:`ndarray` of :obj:`float`
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (N - 3,) :obj:`ndarray` of :obj:`float`
 
     """
     _so_name = "Observables::BondDihedrals"
@@ -636,9 +746,14 @@ class Energy(Observable):
 
     """Calculates the total energy.
 
-    Returns
+    Methods
     -------
-    :obj:`float`
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        :obj:`float`
 
     """
     _so_name = "Observables::Energy"
@@ -649,9 +764,14 @@ class Pressure(Observable):
 
     """Calculates the total scalar pressure.
 
-    Returns
+    Methods
     -------
-    :obj:`float`
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        :obj:`float`
 
     """
     _so_name = "Observables::Pressure"
@@ -662,9 +782,14 @@ class PressureTensor(Observable):
 
     """Calculates the total pressure tensor.
 
-    Returns
+    Methods
     -------
-    (3, 3) :obj:`ndarray` of :obj:`float`
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (3, 3) :obj:`ndarray` of :obj:`float`
 
     """
     _so_name = "Observables::PressureTensor"
@@ -680,9 +805,14 @@ class DPDStress(Observable):
     ----------
     None
 
-    Returns
+    Methods
     -------
-    (3, 3) :obj:`ndarray` of :obj:`float`
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (3, 3) :obj:`ndarray` of :obj:`float`
 
     """
     _so_name = "Observables::DPDStress"
@@ -707,20 +837,25 @@ class CylindricalDensityProfile(CylindricalProfileObservable):
         Number of bins in ``z`` direction.
     min_r : :obj:`float`, default = 0
         Minimum ``r`` to consider.
-    min_phi : :obj:`float`, default = -pi
-        Minimum ``phi`` to consider. Must be in [-pi,pi).
+    min_phi : :obj:`float`, default = :math:`-\\pi`
+        Minimum ``phi`` to consider. Must be in :math:`[-\\pi,\\pi)`.
     min_z : :obj:`float`
         Minimum ``z`` to consider.
     max_r : :obj:`float`
         Maximum ``r`` to consider.
-    max_phi : :obj:`float`, default = pi
-        Maximum ``phi`` to consider. Must be in (-pi,pi].
+    max_phi : :obj:`float`, default = :math:`\\pi`
+        Maximum ``phi`` to consider. Must be in :math:`(-\\pi,\\pi]`.
     max_z : :obj:`float`
         Maximum ``z`` to consider.
 
-    Returns
+    Methods
     -------
-    (``n_r_bins``, ``n_phi_bins``, ``n_z_bins``) :obj:`ndarray` of :obj:`float`
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (``n_r_bins``, ``n_phi_bins``, ``n_z_bins``) :obj:`ndarray` of :obj:`float`
 
     """
     _so_name = "Observables::CylindricalDensityProfile"
@@ -745,22 +880,28 @@ class CylindricalFluxDensityProfile(CylindricalProfileObservable):
         Number of bins in ``z`` direction.
     min_r : :obj:`float`, default = 0
         Minimum ``r`` to consider.
-    min_phi : :obj:`float`, default = -pi
-        Minimum ``phi`` to consider. Must be in [-pi,pi).
+    min_phi : :obj:`float`, default = :math:`-\\pi`
+        Minimum ``phi`` to consider. Must be in :math:`[-\\pi,\\pi)`.
     min_z : :obj:`float`
         Minimum ``z`` to consider.
     max_r : :obj:`float`
         Maximum ``r`` to consider.
-    max_phi : :obj:`float`, default = pi
-        Maximum ``phi`` to consider. Must be in (-pi,pi].
+    max_phi : :obj:`float`, default = :math:`\\pi`
+        Maximum ``phi`` to consider. Must be in :math:`(-\\pi,\\pi]`.
     max_z : :obj:`float`
         Maximum ``z`` to consider.
 
-    Returns
+    Methods
     -------
-    (``n_r_bins``, ``n_phi_bins``, ``n_z_bins``, 3) :obj:`ndarray` of :obj:`float`
-        The fourth component contains the histogram for the radial distance,
-        azimuth and axial coordinate of the particle flux density field.
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (``n_r_bins``, ``n_phi_bins``, ``n_z_bins``, 3) :obj:`ndarray` of :obj:`float`
+            The fourth dimension of the array stores the histogram for the
+            radial distance, azimuth and axial coordinate of the particle
+            flux density field, respectively.
 
     """
     _so_name = "Observables::CylindricalFluxDensityProfile"
@@ -787,22 +928,28 @@ class CylindricalLBFluxDensityProfileAtParticlePositions(
         Number of bins in ``z`` direction.
     min_r : :obj:`float`, default = 0
         Minimum ``r`` to consider.
-    min_phi : :obj:`float`, default = -pi
-        Minimum ``phi`` to consider. Must be in [-pi,pi).
+    min_phi : :obj:`float`, default = :math:`-\\pi`
+        Minimum ``phi`` to consider. Must be in :math:`[-\\pi,\\pi)`.
     min_z : :obj:`float`
         Minimum ``z`` to consider.
     max_r : :obj:`float`
         Maximum ``r`` to consider.
-    max_phi : :obj:`float`, default = pi
-        Maximum ``phi`` to consider. Must be in (-pi,pi].
+    max_phi : :obj:`float`, default = :math:`\\pi`
+        Maximum ``phi`` to consider. Must be in :math:`(-\\pi,\\pi]`.
     max_z : :obj:`float`
         Maximum ``z`` to consider.
 
-    Returns
+    Methods
     -------
-    (``n_r_bins``, ``n_phi_bins``, ``n_z_bins``, 3) :obj:`ndarray` of :obj:`float`
-        The fourth component contains the histogram for the radial distance,
-        azimuth and axial coordinate of the LB flux density field.
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (``n_r_bins``, ``n_phi_bins``, ``n_z_bins``, 3) :obj:`ndarray` of :obj:`float`
+            The fourth dimension of the array stores the histogram for the
+            radial distance, azimuth and axial coordinate of the LB flux
+            density field, respectively.
 
     """
     _so_name = "Observables::CylindricalLBFluxDensityProfileAtParticlePositions"
@@ -829,22 +976,28 @@ class CylindricalLBVelocityProfileAtParticlePositions(
         Number of bins in ``z`` direction.
     min_r : :obj:`float`, default = 0
         Minimum ``r`` to consider.
-    min_phi : :obj:`float`, default = -pi
-        Minimum ``phi`` to consider. Must be in [-pi,pi).
+    min_phi : :obj:`float`, default = :math:`-\\pi`
+        Minimum ``phi`` to consider. Must be in :math:`[-\\pi,\\pi)`.
     min_z : :obj:`float`
         Minimum ``z`` to consider.
     max_r : :obj:`float`
         Maximum ``r`` to consider.
-    max_phi : :obj:`float`, default = pi
-        Maximum ``phi`` to consider. Must be in (-pi,pi].
+    max_phi : :obj:`float`, default = :math:`\\pi`
+        Maximum ``phi`` to consider. Must be in :math:`(-\\pi,\\pi]`.
     max_z : :obj:`float`
         Maximum ``z`` to consider.
 
-    Returns
+    Methods
     -------
-    (``n_r_bins``, ``n_phi_bins``, ``n_z_bins``, 3) :obj:`ndarray` of :obj:`float`
-        The fourth component contains the histogram for the radial distance,
-        azimuth and axial coordinate of the LB velocity field.
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (``n_r_bins``, ``n_phi_bins``, ``n_z_bins``, 3) :obj:`ndarray` of :obj:`float`
+            The fourth dimension of the array stores the histogram for the
+            radial distance, azimuth and axial coordinate of the LB velocity
+            field, respectively.
 
     """
     _so_name = "Observables::CylindricalLBVelocityProfileAtParticlePositions"
@@ -869,22 +1022,28 @@ class CylindricalVelocityProfile(CylindricalProfileObservable):
         Number of bins in ``z`` direction.
     min_r : :obj:`float`, default = 0
         Minimum ``r`` to consider.
-    min_phi : :obj:`float`, default = -pi
-        Minimum ``phi`` to consider. Must be in [-pi,pi).
+    min_phi : :obj:`float`, default = :math:`-\\pi`
+        Minimum ``phi`` to consider. Must be in :math:`[-\\pi,\\pi)`.
     min_z : :obj:`float`
         Minimum ``z`` to consider.
     max_r : :obj:`float`
         Maximum ``r`` to consider.
-    max_phi : :obj:`float`, default = pi
-        Maximum ``phi`` to consider. Must be in (-pi,pi].
+    max_phi : :obj:`float`, default = :math:`\\pi`
+        Maximum ``phi`` to consider. Must be in :math:`(-\\pi,\\pi]`.
     max_z : :obj:`float`
         Maximum ``z`` to consider.
 
-    Returns
+    Methods
     -------
-    (``n_r_bins``, ``n_phi_bins``, ``n_z_bins``, 3) :obj:`ndarray` of :obj:`float`
-        The fourth component contains the histogram for the radial distance,
-        azimuth and axial coordinate of the particle velocity field.
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (``n_r_bins``, ``n_phi_bins``, ``n_z_bins``, 3) :obj:`ndarray` of :obj:`float`
+            The fourth dimension of the array stores the histogram for the
+            radial distance, azimuth and axial coordinate of the particle
+            velocity field, respectively.
 
     """
     _so_name = "Observables::CylindricalVelocityProfile"
@@ -911,24 +1070,30 @@ class CylindricalLBVelocityProfile(CylindricalProfileObservable):
         Number of bins in ``z`` direction.
     min_r : :obj:`float`, default = 0
         Minimum ``r`` to consider.
-    min_phi : :obj:`float`, default = -pi
-        Minimum ``phi`` to consider. Must be in [-pi,pi).
+    min_phi : :obj:`float`, default = :math:`-\\pi`
+        Minimum ``phi`` to consider. Must be in :math:`[-\\pi,\\pi)`.
     min_z : :obj:`float`
         Minimum ``z`` to consider.
     max_r : :obj:`float`
         Maximum ``r`` to consider.
-    max_phi : :obj:`float`, default = pi
-        Maximum ``phi`` to consider. Must be in (-pi,pi].
+    max_phi : :obj:`float`, default = :math:`\\pi`
+        Maximum ``phi`` to consider. Must be in :math:`(-\\pi,\\pi]`.
     max_z : :obj:`float`
         Maximum ``z`` to consider.
     sampling_density : :obj:`float`
         Samples per unit volume for the LB velocity interpolation.
 
-    Returns
+    Methods
     -------
-    (``n_r_bins``, ``n_phi_bins``, ``n_z_bins``, 3) :obj:`ndarray` of :obj:`float`
-        The fourth component contains the histogram for the radial distance,
-        azimuth and axial coordinate of the LB velocity field.
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (``n_r_bins``, ``n_phi_bins``, ``n_z_bins``, 3) :obj:`ndarray` of :obj:`float`
+            The fourth dimension of the array stores the histogram for the
+            radial distance, azimuth and axial coordinate of the LB velocity
+            field, respectively.
 
     """
     _so_name = "Observables::CylindricalLBVelocityProfile"
@@ -954,10 +1119,15 @@ class RDF(Observable):
     max_r : :obj:`float`
         Maximum ``r`` to consider.
 
-    Returns
+    Methods
     -------
-    (``n_r_bins``,) :obj:`ndarray` of :obj:`float`
-        The RDF.
+    calculate()
+        Run the observable.
+
+        Returns
+        -------
+        (``n_r_bins``,) :obj:`ndarray` of :obj:`float`
+            The RDF.
 
     """
     _so_name = "Observables::RDF"
diff --git a/src/python/espressomd/particle_data.py b/src/python/espressomd/particle_data.py
index 8272834e47a..4880dc4e3ef 100644
--- a/src/python/espressomd/particle_data.py
+++ b/src/python/espressomd/particle_data.py
@@ -279,8 +279,8 @@ class ParticleHandle(ScriptInterfaceHelper):
             * This needs features ``EXTERNAL_FORCES`` and ``ROTATION``.
 
     gamma: :obj:`float` or (3,) array_like of :obj:`float`
-        The translational frictional coefficient used in the Langevin
-        and Brownian thermostats.
+        The translational frictional coefficient used in the Langevin,
+        Brownian and LB thermostats.
 
         .. note::
             This needs feature ``THERMOSTAT_PER_PARTICLE`` and
@@ -594,10 +594,8 @@ def vs_auto_relate_to(self, rel_to, override_cutoff_check=False,
         else:
             check_type_or_throw_except(
                 rel_to, 1, int, "Argument of 'vs_auto_relate_to' has to be of type ParticleHandle or int")
-        self.call_method(
-            "vs_relate_to",
-            pid=rel_to,
-            override_cutoff_check=override_cutoff_check)
+        self.call_method("vs_auto_relate_to", pid=rel_to,
+                         override_cutoff_check=override_cutoff_check)
         if self.propagation != Propagation.NONE:
             if couple_to_lb:
                 self.propagation |= Propagation.TRANS_LB_MOMENTUM_EXCHANGE
diff --git a/src/python/espressomd/plugins/CMakeLists.txt b/src/python/espressomd/plugins/CMakeLists.txt
new file mode 100644
index 00000000000..b99a7d53aab
--- /dev/null
+++ b/src/python/espressomd/plugins/CMakeLists.txt
@@ -0,0 +1,20 @@
+#
+# Copyright (C) 2024 The ESPResSo project
+#
+# This file is part of ESPResSo.
+#
+# ESPResSo is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ESPResSo is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+espresso_target_resources(espressomd __init__.py ase.py)
diff --git a/src/python/espressomd/plugins/__init__.py b/src/python/espressomd/plugins/__init__.py
new file mode 100644
index 00000000000..c893459dbae
--- /dev/null
+++ b/src/python/espressomd/plugins/__init__.py
@@ -0,0 +1,18 @@
+#
+# Copyright (C) 2024 The ESPResSo project
+#
+# This file is part of ESPResSo.
+#
+# ESPResSo is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ESPResSo is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
diff --git a/src/python/espressomd/plugins/ase.py b/src/python/espressomd/plugins/ase.py
new file mode 100644
index 00000000000..7280a0b639c
--- /dev/null
+++ b/src/python/espressomd/plugins/ase.py
@@ -0,0 +1,69 @@
+#
+# Copyright (C) 2024 The ESPResSo project
+#
+# This file is part of ESPResSo.
+#
+# ESPResSo is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ESPResSo is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+import dataclasses
+import typing
+import ase
+from ase.calculators.singlepoint import SinglePointCalculator
+import numpy as np
+if typing.TYPE_CHECKING:
+    from espressomd.system import System
+
+
+@dataclasses.dataclass
+class ASEInterface:
+    """
+    ASE interface for ESPResSo.
+    """
+
+    type_mapping: dict
+    """
+    Mapping of ESPResSo particle types to ASE symbols. E.g. ``{0: "H", 1: "O"}``.
+    """
+    _system: typing.Union["System", None] = None
+
+    def register_system(self, system):
+        """Register the system."""
+        self._system = system
+
+    def __getstate__(self):
+        return {"type_mapping": self.type_mapping}
+
+    def get(self) -> ase.Atoms:
+        """Export the ESPResSo system particle data to an ASE atoms object."""
+        particles = self._system.part.all()
+        positions = np.copy(particles.pos)
+        types = np.copy(particles.type)
+        forces = np.copy(particles.f)
+        unknown_types = set(types) - set(self.type_mapping)
+        if unknown_types:
+            raise RuntimeError(
+                f"Particle types '{unknown_types}' haven't been registered in the ASE type map"  # nopep8
+            )
+        if any(p.is_virtual() for p in particles):
+            raise RuntimeError("ASE doesn't support virtual sites")
+
+        atoms = ase.Atoms(
+            positions=positions,
+            symbols=[self.type_mapping[t] for t in types],
+            pbc=np.copy(self._system.periodicity),
+            cell=np.copy(self._system.box_l),
+        )
+        atoms.calc = SinglePointCalculator(atoms, forces=forces)
+        return atoms
diff --git a/src/python/espressomd/reaction_methods.py b/src/python/espressomd/reaction_methods.py
index 7cf5eb9ca17..14bef13c3fd 100644
--- a/src/python/espressomd/reaction_methods.py
+++ b/src/python/espressomd/reaction_methods.py
@@ -598,7 +598,8 @@ def add_reaction(self, *args, **kwargs):
         if warn_user:
             warnings.warn(warn_msg, FutureWarning)
 
-        if(len(kwargs["product_types"]) != 2 or len(kwargs["reactant_types"]) != 1):
+        if (len(kwargs["product_types"]) != 2 or len(
+                kwargs["reactant_types"]) != 1):
             raise ValueError(
                 "The constant pH method is only implemented for reactions "
                 "with two product types and one adduct type.")
diff --git a/src/python/espressomd/script_interface.pyx b/src/python/espressomd/script_interface.pyx
index 13469ab34b4..338b21a0148 100644
--- a/src/python/espressomd/script_interface.pyx
+++ b/src/python/espressomd/script_interface.pyx
@@ -419,7 +419,8 @@ def _unpickle_so_class(so_name, state):
     so_ptr.sip = _om.get().deserialize(state)
 
     assert so_name in _python_class_by_so_name, \
-        f"C++ class '{so_name}' is not associated to any Python class (hint: the corresponding 'import espressomd.*' may be missing)"
+        f"C++ class '{so_name}' is not associated to any Python class " \
+        "(hint: the corresponding 'import espressomd.*' may be missing)"
     so = _python_class_by_so_name[so_name](sip=so_ptr)
     so.define_bound_methods()
 
diff --git a/src/python/espressomd/system.py b/src/python/espressomd/system.py
index b4d0dd5f18e..dc0e3151cda 100644
--- a/src/python/espressomd/system.py
+++ b/src/python/espressomd/system.py
@@ -200,6 +200,7 @@ def __init__(self, **kwargs):
         self.non_bonded_inter = interactions.NonBondedInteractions()
         self.part = particle_data.ParticleList()
         self.thermostat = thermostat.Thermostat()
+        self._ase_interface = None
 
         # lock class
         self.call_method("lock_system_creation")
@@ -238,11 +239,17 @@ def __getstate__(self):
         odict["_system_handle"] = self.call_method("get_system_handle")
         for property_name in checkpointable_properties:
             odict[property_name] = System.__getattribute__(self, property_name)
+        if self._ase_interface is not None:
+            odict["_ase_interface"] = self._ase_interface.__getstate__()
         return odict
 
     def __setstate__(self, params):
         # note: this class is initialized twice by pickle
         self.call_method("set_system_handle", obj=params.pop("_system_handle"))
+        # initialize Python-only members
+        if "_ase_interface" in params:
+            from espressomd.plugins.ase import ASEInterface
+            self.ase = ASEInterface(**params.pop("_ase_interface"))
         for property_name in params.keys():
             System.__setattr__(self, property_name, params[property_name])
         # note: several members can only be instantiated once
@@ -338,6 +345,15 @@ def lb(self, lb):
                 lb.call_method("activate")
                 self._lb = lb
 
+    @property
+    def ase(self):  # attached ASE interface, ``None`` if unset
+        return self._ase_interface
+
+    @ase.setter
+    def ase(self, ase):
+        ase.register_system(self)  # give the interface access to this system
+        self._ase_interface = ase
+
     @property
     def ekcontainer(self):
         """
diff --git a/src/python/espressomd/utils.pyx b/src/python/espressomd/utils.pyx
index 2538be38f6d..323a5455db4 100644
--- a/src/python/espressomd/utils.pyx
+++ b/src/python/espressomd/utils.pyx
@@ -16,7 +16,6 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
-cimport numpy as np
 import numpy as np
 
 
diff --git a/src/python/object_in_fluid/CMakeLists.txt b/src/python/object_in_fluid/CMakeLists.txt
index fbe81d577f2..200c9f810c1 100644
--- a/src/python/object_in_fluid/CMakeLists.txt
+++ b/src/python/object_in_fluid/CMakeLists.txt
@@ -17,16 +17,22 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
+include(espresso_resource_files)
+
 add_custom_target(object_in_fluid)
-file(GLOB python_AUX *.py)
-set(python_AUX "${python_AUX}" CACHE INTERNAL "python_AUX")
+file(GLOB python_SRC RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" *.py)
+
+espresso_target_resources(object_in_fluid ${python_SRC})
 
-foreach(auxfile ${python_AUX})
-  get_filename_component(filename ${auxfile} NAME)
-  file(RELATIVE_PATH relpath ${CMAKE_CURRENT_SOURCE_DIR} ${auxfile})
-  get_filename_component(relpath ${relpath} DIRECTORY)
-  string(CONCAT outputpath ${CMAKE_CURRENT_BINARY_DIR} "/" ${relpath} "/"
-                ${filename})
-  add_custom_command(TARGET object_in_fluid COMMAND ${CMAKE_COMMAND} -E copy
-                                                    ${auxfile} ${outputpath})
-endforeach(auxfile)
+# Install resource files (Python files, text files, etc.)
+get_property(OIF_RESOURCE_FILES TARGET object_in_fluid
+             PROPERTY EspressoResourceFiles)
+foreach(RESOURCE_ABSPATH ${OIF_RESOURCE_FILES})
+  cmake_path(RELATIVE_PATH RESOURCE_ABSPATH BASE_DIRECTORY
+             ${CMAKE_CURRENT_BINARY_DIR} OUTPUT_VARIABLE RESOURCE_RELPATH)
+  cmake_path(GET RESOURCE_RELPATH PARENT_PATH RESOURCE_RELPARENT)
+  install(
+    FILES "${RESOURCE_ABSPATH}"
+    DESTINATION
+      "${ESPRESSO_INSTALL_PYTHON}/object_in_fluid/${RESOURCE_RELPARENT}")
+endforeach()
diff --git a/src/python/pypresso.cmakein b/src/python/pypresso.cmakein
index c73c4fe1f67..70856604d52 100755
--- a/src/python/pypresso.cmakein
+++ b/src/python/pypresso.cmakein
@@ -1,5 +1,5 @@
 #!/usr/bin/env sh
-# Copyright (C) 2010-2022 The ESPResSo project
+# Copyright (C) 2010-2024 The ESPResSo project
 #
 # Copying and distribution of this file, with or without modification,
 # are permitted in any medium without royalty provided the copyright
@@ -14,8 +14,30 @@ else
 fi
 export PYTHONPATH
 
+espresso_parse_cmake_boolean_value() {  # usage: <func> "#define" NAME 0|1 -> sets NAME to OFF|ON
+  local varname="${2}"  # name of the shell variable to assign
+  local value=""  # remains empty if the third argument is neither 0 nor 1
+  if [ "${3}" = "1" ]; then
+    value="ON"
+  elif [ "${3}" = "0" ]; then
+    value="OFF"
+  fi
+  if [ "${1}" != "#define" ] || [ "${value}" = "" ]; then  # arguments don't have the expected form
+    echo "error: failed to parse CMake variable ${varname} (using '$(ps -hp $$ | awk '{print $5}')' and CMake @CMAKE_VERSION@)" >&2
+    exit 2  # abort the whole script
+  fi
+  eval "${varname}='${value}'"  # indirect assignment: the variable name is only known at run time
+}
+
+espresso_parse_cmake_boolean_value \#cmakedefine01 ESPRESSO_WARNINGS_ARE_ERRORS
+espresso_parse_cmake_boolean_value \#cmakedefine01 ESPRESSO_MPIEXEC_GUARD_SINGLETON_NUMA
+espresso_parse_cmake_boolean_value \#cmakedefine01 ESPRESSO_ADD_OMPI_SINGLETON_WARNING
+espresso_parse_cmake_boolean_value \#cmakedefine01 ESPRESSO_BUILD_WITH_UBSAN
+espresso_parse_cmake_boolean_value \#cmakedefine01 ESPRESSO_BUILD_WITH_ASAN
+espresso_parse_cmake_boolean_value \#cmakedefine01 ESPRESSO_BUILD_WITH_MSAN
+
 # Open MPI 4.x cannot run in singleton mode on some NUMA systems
-if [ "@ESPRESSO_ADD_OMPI_SINGLETON_WARNING@" = "ON" ] && [ "@ESPRESSO_MPIEXEC_GUARD_SINGLETON_NUMA@" = "ON" ]; then
+if [ "${ESPRESSO_ADD_OMPI_SINGLETON_WARNING}" = "ON" ] && [ "${ESPRESSO_MPIEXEC_GUARD_SINGLETON_NUMA}" = "ON" ]; then
   if [ -z "${OMPI_COMM_WORLD_SIZE}" ] && [ "${OMPI_MCA_hwloc_base_binding_policy}" = "numa" ]; then
     if test -f /proc/cpuinfo && grep --quiet -P "^[Mm]odel name[ \t]*:[ \t]+@ESPRESSO_CPU_MODEL_NAME_OMPI_SINGLETON_NUMA_PATTERN@( |$)" /proc/cpuinfo; then
       echo "warning: if Open MPI fails to set processor affinity, set environment variable OMPI_MCA_hwloc_base_binding_policy to \"none\" or \"l3cache\""
@@ -23,29 +45,29 @@ if [ "@ESPRESSO_ADD_OMPI_SINGLETON_WARNING@" = "ON" ] && [ "@ESPRESSO_MPIEXEC_GU
   fi
 fi
 
-if [ "@CMAKE_CXX_COMPILER_ID@" != "GNU" ] && [ "@ESPRESSO_BUILD_WITH_ASAN@" = "ON" ]; then
+if [ "@CMAKE_CXX_COMPILER_ID@" != "GNU" ] && [ "${ESPRESSO_BUILD_WITH_ASAN}" = "ON" ]; then
   asan_lib=$("@CMAKE_CXX_COMPILER@" /dev/null -### -o /dev/null -fsanitize=address 2>&1 | grep -o '[" ][^" ]*libclang_rt.asan[^" ]*[^s][" ]' | sed 's/[" ]//g' | sed 's/\.a$/.so/g')
   export DYLD_INSERT_LIBRARIES="$asan_lib"
   for lib in $asan_lib; do
       test -f $lib && LD_PRELOAD="$lib $LD_PRELOAD"
   done
 fi
-if [ "@CMAKE_CXX_COMPILER_ID@" != "GNU" ] && [ "@ESPRESSO_BUILD_WITH_UBSAN@" = "ON" ] && [ "@ESPRESSO_BUILD_WITH_ASAN@" != "ON" ]; then
+if [ "@CMAKE_CXX_COMPILER_ID@" != "GNU" ] && [ "${ESPRESSO_BUILD_WITH_UBSAN}" = "ON" ] && [ "${ESPRESSO_BUILD_WITH_ASAN}" != "ON" ]; then
   ubsan_lib=$("@CMAKE_CXX_COMPILER@" /dev/null -### -o /dev/null -fsanitize=undefined 2>&1 | grep -o '[" ][^" ]*libclang_rt.ubsan[^" ]*[^s][" ]' | sed 's/[" ]//g' | sed 's/\.a$/.so/g')
   for lib in $ubsan_lib; do
     test -f $lib && LD_PRELOAD="$lib $LD_PRELOAD"
   done
 fi
 export LD_PRELOAD
-if [ "@ESPRESSO_BUILD_WITH_UBSAN@" = "ON" ]; then
+if [ "${ESPRESSO_BUILD_WITH_UBSAN}" = "ON" ]; then
   export UBSAN_OPTIONS="print_stacktrace=1 suppressions=\"@CMAKE_SOURCE_DIR@/maintainer/CI/ubsan.supp\" $UBSAN_OPTIONS"
-  if [ "@ESPRESSO_WARNINGS_ARE_ERRORS@" = "ON" ]; then
+  if [ "${ESPRESSO_WARNINGS_ARE_ERRORS}" = "ON" ]; then
     export UBSAN_OPTIONS="halt_on_error=1 $UBSAN_OPTIONS"
   fi
 fi
-if [ "@ESPRESSO_BUILD_WITH_ASAN@" = "ON" ]; then
+if [ "${ESPRESSO_BUILD_WITH_ASAN}" = "ON" ]; then
   ASAN_OPTIONS="protect_shadow_gap=0 allocator_may_return_null=1 $ASAN_OPTIONS"
-  if [ "@ESPRESSO_WARNINGS_ARE_ERRORS@" = "ON" ]; then
+  if [ "${ESPRESSO_WARNINGS_ARE_ERRORS}" = "ON" ]; then
     ASAN_OPTIONS="halt_on_error=1 $ASAN_OPTIONS"
   fi
   if [ "$1" = "--leaks" ]; then
@@ -55,7 +77,7 @@ if [ "@ESPRESSO_BUILD_WITH_ASAN@" = "ON" ]; then
   fi
   export ASAN_OPTIONS
 fi
-if [ "@ESPRESSO_BUILD_WITH_MSAN@" = "ON" ] && [ "@ESPRESSO_WARNINGS_ARE_ERRORS@" = "ON" ]; then
+if [ "${ESPRESSO_BUILD_WITH_MSAN}" = "ON" ] && [ "${ESPRESSO_WARNINGS_ARE_ERRORS}" = "ON" ]; then
   export MSAN_OPTIONS="halt_on_error=1 $MSAN_OPTIONS"
 fi
 
diff --git a/src/scafacos/CMakeLists.txt b/src/scafacos/CMakeLists.txt
index 222f05c4944..b6e4350a37d 100644
--- a/src/scafacos/CMakeLists.txt
+++ b/src/scafacos/CMakeLists.txt
@@ -22,12 +22,6 @@ add_library(espresso_scafacos SHARED src/Scafacos.cpp src/Coulomb.cpp
 add_library(espresso::scafacos ALIAS espresso_scafacos)
 set_target_properties(espresso_scafacos PROPERTIES CXX_CLANG_TIDY
                                                    "${ESPRESSO_CXX_CLANG_TIDY}")
-if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 10
-   AND ESPRESSO_INSIDE_DOCKER AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS_EQUAL
-                                  11 AND ESPRESSO_BUILD_WITH_COVERAGE)
-  target_link_libraries(espresso_scafacos
-                        PRIVATE "-L/usr/lib/gcc/x86_64-linux-gnu/10")
-endif()
 target_link_libraries(espresso_scafacos PUBLIC MPI::MPI_CXX
                       PRIVATE ${SCAFACOS_LDFLAGS} espresso::cpp_flags)
 
diff --git a/src/script_interface/CMakeLists.txt b/src/script_interface/CMakeLists.txt
index 14a9f75bd9f..3e795b23656 100644
--- a/src/script_interface/CMakeLists.txt
+++ b/src/script_interface/CMakeLists.txt
@@ -64,10 +64,14 @@ target_link_libraries(
 
 set_source_files_properties(
   ${CMAKE_CURRENT_SOURCE_DIR}/particle_data/ParticleHandle.cpp
-  PROPERTIES COMPILE_FLAGS -fno-finite-math-only)
+  PROPERTIES COMPILE_OPTIONS -fno-finite-math-only)
 
 target_include_directories(espresso_script_interface
                            PUBLIC ${CMAKE_SOURCE_DIR}/src)
+if(ESPRESSO_BUILD_WITH_COVERAGE AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+  target_compile_options(espresso_script_interface
+                         PRIVATE -fno-default-inline -fno-elide-constructors)
+endif()
 
 if(ESPRESSO_BUILD_TESTS)
   add_subdirectory(tests)
diff --git a/src/script_interface/GlobalContext.cpp b/src/script_interface/GlobalContext.cpp
index c4859b71be9..604c0ff9257 100644
--- a/src/script_interface/GlobalContext.cpp
+++ b/src/script_interface/GlobalContext.cpp
@@ -50,7 +50,7 @@ void GlobalContext::make_handle(ObjectId id, const std::string &name,
         name, unpack(parameters, m_local_objects));
 
     m_local_objects[id] = std::move(so);
-  } catch (Exception const &) {
+  } catch (Exception const &) { // NOLINT(bugprone-empty-catch)
   }
 }
 
@@ -63,7 +63,7 @@ void GlobalContext::set_parameter(ObjectId id, std::string const &name,
                                   PackedVariant const &value) {
   try {
     m_local_objects.at(id)->set_parameter(name, unpack(value, m_local_objects));
-  } catch (Exception const &) {
+  } catch (Exception const &) { // NOLINT(bugprone-empty-catch)
   }
 }
 
@@ -78,7 +78,7 @@ void GlobalContext::call_method(ObjectId id, std::string const &name,
   try {
     m_local_objects.at(id)->call_method(name,
                                         unpack(arguments, m_local_objects));
-  } catch (Exception const &) {
+  } catch (Exception const &) { // NOLINT(bugprone-empty-catch)
   }
 }
 
diff --git a/src/script_interface/ObjectHandle.cpp b/src/script_interface/ObjectHandle.cpp
index 1ad9bedc7fd..224a174b6c4 100644
--- a/src/script_interface/ObjectHandle.cpp
+++ b/src/script_interface/ObjectHandle.cpp
@@ -26,6 +26,7 @@
 
 #include <utils/serialization/pack.hpp>
 
+#include <algorithm>
 #include <iterator>
 #include <memory>
 #include <string>
@@ -58,16 +59,18 @@ std::string ObjectHandle::serialize() const {
   PackVisitor visit;
 
   /* Pack parameters and keep track of ObjectRef parameters */
-  boost::transform(params, state.params.begin(),
-                   [&visit](auto const &kv) -> PackedMap::value_type {
-                     return {kv.first, boost::apply_visitor(visit, kv.second)};
-                   });
+  std::ranges::transform(params, state.params.begin(),
+                         [&visit](auto const &kv) -> PackedMap::value_type {
+                           return {kv.first,
+                                   boost::apply_visitor(visit, kv.second)};
+                         });
 
   /* Packed Object parameters */
   state.objects.resize(visit.objects().size());
-  boost::transform(visit.objects(), state.objects.begin(), [](auto const &kv) {
-    return std::make_pair(kv.first, kv.second->serialize());
-  });
+  std::ranges::transform(
+      visit.objects(), state.objects.begin(), [](auto const &kv) {
+        return std::make_pair(kv.first, kv.second->serialize());
+      });
 
   state.name = name().to_string();
   state.internal_state = get_internal_state();
@@ -80,11 +83,11 @@ ObjectRef ObjectHandle::deserialize(const std::string &packed_state,
   auto const state = Utils::unpack<ObjectState>(packed_state);
 
   std::unordered_map<ObjectId, ObjectRef> objects;
-  boost::transform(state.objects, std::inserter(objects, objects.end()),
-                   [&ctx](auto const &kv) {
-                     return std::make_pair(kv.first,
-                                           deserialize(kv.second, ctx));
-                   });
+  std::ranges::transform(state.objects, std::inserter(objects, objects.end()),
+                         [&ctx](auto const &kv) {
+                           return std::make_pair(kv.first,
+                                                 deserialize(kv.second, ctx));
+                         });
 
   VariantMap params;
   for (auto const &kv : state.params) {
diff --git a/src/script_interface/ObjectHandle.hpp b/src/script_interface/ObjectHandle.hpp
index 01e44be5774..c8258fcc09a 100644
--- a/src/script_interface/ObjectHandle.hpp
+++ b/src/script_interface/ObjectHandle.hpp
@@ -21,11 +21,10 @@
 
 #include "Variant.hpp"
 
-#include <utils/Span.hpp>
-
 #include <boost/utility/string_ref.hpp>
 
 #include <memory>
+#include <span>
 #include <string>
 #include <utility>
 #include <vector>
@@ -104,7 +103,7 @@ class ObjectHandle {
    * @brief Get required and optional parameters for class.
    * @return Expected parameters.
    */
-  virtual Utils::Span<const boost::string_ref> valid_parameters() const {
+  virtual std::span<const boost::string_ref> valid_parameters() const {
     return {};
   }
 
@@ -119,7 +118,10 @@ class ObjectHandle {
    * @param name Name of the parameter
    * @return Value of parameter @p name
    */
-  virtual Variant get_parameter(const std::string &name) const { return {}; }
+  virtual Variant get_parameter(std::string const &name) const {
+    static_cast<void>(name);
+    return {};
+  }
 
   /**
    * @brief Set single parameter.
@@ -169,6 +171,6 @@ class ObjectHandle {
 
 private:
   virtual std::string get_internal_state() const { return {}; }
-  virtual void set_internal_state(std::string const &state) {}
+  virtual void set_internal_state(std::string const &) {}
 };
 } /* namespace ScriptInterface */
diff --git a/src/script_interface/ObjectList.hpp b/src/script_interface/ObjectList.hpp
index 9bfc7895a83..ccfcad19f44 100644
--- a/src/script_interface/ObjectList.hpp
+++ b/src/script_interface/ObjectList.hpp
@@ -98,8 +98,7 @@ class ObjectList : public ObjectContainer<ObjectList, ManagedType, BaseType> {
       throw Exception("");
     }
     remove_in_core(element);
-    m_elements.erase(std::remove(m_elements.begin(), m_elements.end(), element),
-                     m_elements.end());
+    std::erase(m_elements, element);
   }
 
   /**
diff --git a/src/script_interface/ObjectState.hpp b/src/script_interface/ObjectState.hpp
index 267f5f460b5..0bf5e81ad9c 100644
--- a/src/script_interface/ObjectState.hpp
+++ b/src/script_interface/ObjectState.hpp
@@ -44,7 +44,7 @@ struct ObjectState {
   std::string internal_state;
 
   template <class Archive> void serialize(Archive &ar, long int) {
-    ar &name &params &objects &internal_state;
+    ar & name & params & objects & internal_state;
   }
 };
 } // namespace ScriptInterface
diff --git a/src/script_interface/Variant.hpp b/src/script_interface/Variant.hpp
index abaa7ad5833..f35144efb0b 100644
--- a/src/script_interface/Variant.hpp
+++ b/src/script_interface/Variant.hpp
@@ -22,26 +22,13 @@
 #include "None.hpp"
 
 #include <utils/Vector.hpp>
+#include <utils/serialization/unordered_map.hpp>
 
-#include <boost/variant.hpp>
-
-/* This <boost/serialization/library_version_type.hpp> include guards against
- * an issue in boost::serialization from boost 1.74.0 that leads to compiler
- * error "'library_version_type' is not a member of 'boost::serialization'"
- * when including <boost/serialization/unordered_map.hpp>. More details
- * in ticket https://github.com/boostorg/serialization/issues/219
- */
-#include <boost/serialization/version.hpp>
-#if BOOST_VERSION / 100000 == 1 && BOOST_VERSION / 100 % 1000 == 74
-#include <boost/serialization/library_version_type.hpp>
-#endif
-
-#include <boost/range/algorithm/transform.hpp>
 #include <boost/serialization/serialization.hpp>
 #include <boost/serialization/string.hpp>
-#include <boost/serialization/unordered_map.hpp>
 #include <boost/serialization/variant.hpp>
 #include <boost/serialization/vector.hpp>
+#include <boost/variant.hpp>
 
 #include <cstddef>
 #include <memory>
diff --git a/src/script_interface/accumulators/AccumulatorBase.hpp b/src/script_interface/accumulators/AccumulatorBase.hpp
index ab6172833a4..b5e7c88cd9c 100644
--- a/src/script_interface/accumulators/AccumulatorBase.hpp
+++ b/src/script_interface/accumulators/AccumulatorBase.hpp
@@ -16,8 +16,8 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef SCRIPTINTERFACE_ACCUMULATORS_ACCUMULATORBASE_HPP
-#define SCRIPTINTERFACE_ACCUMULATORS_ACCUMULATORBASE_HPP
+
+#pragma once
 
 #include "core/accumulators/AccumulatorBase.hpp"
 #include "script_interface/ScriptInterface.hpp"
@@ -53,5 +53,3 @@ class AccumulatorBase : public AutoParameters<AccumulatorBase> {
 
 } // namespace Accumulators
 } // namespace ScriptInterface
-
-#endif
diff --git a/src/script_interface/accumulators/AutoUpdateAccumulators.hpp b/src/script_interface/accumulators/AutoUpdateAccumulators.hpp
index a5ada2cbd98..79057a0e241 100644
--- a/src/script_interface/accumulators/AutoUpdateAccumulators.hpp
+++ b/src/script_interface/accumulators/AutoUpdateAccumulators.hpp
@@ -17,8 +17,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#ifndef SCRIPT_INTERFACE_ACCUMULATOR_AUTOUPDATEACCUMULATORS_HPP
-#define SCRIPT_INTERFACE_ACCUMULATOR_AUTOUPDATEACCUMULATORS_HPP
+#pragma once
 
 #include "AccumulatorBase.hpp"
 
@@ -46,9 +45,7 @@ class AutoUpdateAccumulators : public ObjectList<AccumulatorBase> {
 private:
   // disable serialization: pickling done by the python interface
   std::string get_internal_state() const override { return {}; }
-  void set_internal_state(std::string const &state) override {}
+  void set_internal_state(std::string const &) override {}
 };
 } /* namespace Accumulators */
 } /* namespace ScriptInterface */
-
-#endif // SCRIPT_INTERFACE_ACCUMULATOR_AUTOUPDATEACCUMULATORS_HPP
diff --git a/src/script_interface/accumulators/TimeSeries.hpp b/src/script_interface/accumulators/TimeSeries.hpp
index 1c113d68856..e9f83d8fe75 100644
--- a/src/script_interface/accumulators/TimeSeries.hpp
+++ b/src/script_interface/accumulators/TimeSeries.hpp
@@ -24,8 +24,7 @@
 
 #include "core/accumulators/TimeSeries.hpp"
 
-#include <boost/range/algorithm/transform.hpp>
-
+#include <algorithm>
 #include <memory>
 #include <string>
 #include <utility>
@@ -56,7 +55,7 @@ class TimeSeries : public AccumulatorBase {
       auto const &series = m_accumulator->time_series();
       std::vector<Variant> ret(series.size());
 
-      boost::transform(
+      std::ranges::transform(
           series, ret.begin(),
           [](std::vector<double> const &sample) { return sample; });
 
diff --git a/src/script_interface/analysis/ObservableStat.cpp b/src/script_interface/analysis/ObservableStat.cpp
index ea2ba04cd27..fc3dcd0298d 100644
--- a/src/script_interface/analysis/ObservableStat.cpp
+++ b/src/script_interface/analysis/ObservableStat.cpp
@@ -26,9 +26,8 @@
 
 #include "core/Observable_stat.hpp"
 
-#include <utils/Span.hpp>
-
 #include <cstddef>
+#include <span>
 #include <string>
 #include <vector>
 
@@ -46,17 +45,17 @@ static auto get_summary(::System::System const &system,
   auto const obs_dim = obs.get_chunk_size();
 
   auto const get_obs_contribs = [obs_dim,
-                                 calc_sp](Utils::Span<double> const views) {
-    if (obs_dim == 1) {
+                                 calc_sp](std::span<double> const &views) {
+    if (obs_dim == 1ul) {
       return std::vector<Variant>(views.begin(), views.end());
     }
     assert(obs_dim == 9ul);
     assert(views.size() % 9ul == 0ul);
     std::vector<Variant> out;
     for (std::size_t i = 0ul; i < views.size() / 9ul; ++i) {
-      auto const view = Utils::Span<double>{views.data() + i * 9ul, 9ul};
+      auto const view = views.subspan(i * 9ul, 9ul);
       if (calc_sp) {
-        auto const trace = view[0] + view[4] + view[8];
+        auto const trace = view[0ul] + view[4ul] + view[8ul];
         out.emplace_back(trace / 3.);
       } else {
         auto const flat_matrix = std::vector<double>(view.begin(), view.end());
@@ -67,8 +66,8 @@ static auto get_summary(::System::System const &system,
   };
 
   auto const get_obs_contrib =
-      [&get_obs_contribs](Utils::Span<double> const views) -> Variant {
-    return get_obs_contribs(views)[0];
+      [&get_obs_contribs](std::span<double> const &views) -> Variant {
+    return get_obs_contribs(views)[0ul];
   };
 
   std::unordered_map<std::string, Variant> dict;
@@ -77,7 +76,7 @@ static auto get_summary(::System::System const &system,
 
   {
     auto values = std::vector<double>(obs_dim);
-    for (std::size_t i = 0; i < obs_dim; ++i) {
+    for (std::size_t i = 0ul; i < obs_dim; ++i) {
       values[i] = obs.accumulate(0., i);
     }
     dict["total"] = get_obs_contrib({values.data(), obs_dim});
@@ -106,7 +105,7 @@ static auto get_summary(::System::System const &system,
 #ifdef ELECTROSTATICS
   {
     auto const values = get_obs_contribs(obs.coulomb);
-    for (std::size_t i = 0; i < values.size(); ++i) {
+    for (std::size_t i = 0ul; i < values.size(); ++i) {
       dict["coulomb," + std::to_string(i)] = values[i];
     }
   }
@@ -115,7 +114,7 @@ static auto get_summary(::System::System const &system,
 #ifdef DIPOLES
   {
     auto const values = get_obs_contribs(obs.dipolar);
-    for (std::size_t i = 0; i < values.size(); ++i) {
+    for (std::size_t i = 0ul; i < values.size(); ++i) {
       dict["dipolar," + std::to_string(i)] = values[i];
     }
   }
@@ -124,7 +123,7 @@ static auto get_summary(::System::System const &system,
 #ifdef VIRTUAL_SITES
   {
     auto const values = get_obs_contribs(obs.virtual_sites);
-    for (std::size_t i = 0; i < values.size(); ++i) {
+    for (std::size_t i = 0ul; i < values.size(); ++i) {
       dict["virtual_sites," + std::to_string(i)] = values[i];
     }
   }
diff --git a/src/script_interface/auto_parameters/AutoParameters.hpp b/src/script_interface/auto_parameters/AutoParameters.hpp
index 952935509cf..0769698faac 100644
--- a/src/script_interface/auto_parameters/AutoParameters.hpp
+++ b/src/script_interface/auto_parameters/AutoParameters.hpp
@@ -23,6 +23,8 @@
 #include "script_interface/ObjectHandle.hpp"
 #include "script_interface/auto_parameters/AutoParameter.hpp"
 
+#include <algorithm>
+#include <span>
 #include <stdexcept>
 #include <string>
 #include <type_traits>
@@ -114,11 +116,9 @@ class AutoParameters : public Base {
     for (auto const &p : params) {
       if (m_parameters.count(p.name)) {
         m_parameters.erase(p.name);
-        for (auto it = m_key_order.begin(); it != m_key_order.end(); ++it) {
-          if (*it == p.name) {
-            m_key_order.erase(it);
-            break;
-          }
+        if (auto const it = std::ranges::find(m_key_order, p.name);
+            it != m_key_order.end()) {
+          m_key_order.erase(it);
         }
       }
       m_key_order.emplace_back(p.name);
@@ -130,7 +130,7 @@ class AutoParameters : public Base {
 
 public:
   /* ObjectHandle implementation */
-  Utils::Span<const boost::string_ref> valid_parameters() const final {
+  std::span<const boost::string_ref> valid_parameters() const final {
     static std::vector<boost::string_ref> valid_params;
     valid_params.clear();
 
diff --git a/src/script_interface/cell_system/CellSystem.cpp b/src/script_interface/cell_system/CellSystem.cpp
index 42388dc9bce..8b88ed16a7d 100644
--- a/src/script_interface/cell_system/CellSystem.cpp
+++ b/src/script_interface/cell_system/CellSystem.cpp
@@ -40,7 +40,9 @@
 #include <boost/variant.hpp>
 
 #include <algorithm>
+#include <cassert>
 #include <iterator>
+#include <optional>
 #include <set>
 #include <sstream>
 #include <stdexcept>
@@ -49,6 +51,25 @@
 #include <utility>
 #include <vector>
 
+static int coord(std::string const &s) { // map axis name "x"/"y"/"z" to 0/1/2
+  if (s == "x")
+    return 0;
+  if (s == "y")
+    return 1;
+  if (s == "z")
+    return 2; // any other string falls through to the exception below
+  throw std::invalid_argument("Invalid Cartesian coordinate: '" + s + "'");
+}
+
+static std::string coord_letter(int c) { // map axis index 0/1/2 to "x"/"y"/"z"
+  if (c == 0)
+    return "x";
+  if (c == 1)
+    return "y";
+  assert(c == 2); // only range check; compiled out when NDEBUG is defined
+  return "z"; // also reached for any other c once the assert is disabled
+}
+
 namespace ScriptInterface {
 namespace CellSystem {
 
@@ -110,6 +131,20 @@ CellSystem::CellSystem() {
          auto const ns_types = hd.get_n_square_types();
          return Variant{std::vector<int>(ns_types.begin(), ns_types.end())};
        }},
+      {"fully_connected_boundary", AutoParameter::read_only,
+       [this]() {
+         if (get_cell_structure().decomposition_type() !=
+             CellStructureType::REGULAR) {
+           return Variant{none};
+         }
+         auto const rd = get_regular_decomposition();
+         auto const fcb = rd.fully_connected_boundary();
+         if (not fcb)
+           return Variant{none};
+         return Variant{std::unordered_map<std::string, Variant>{
+             {{"boundary", Variant{coord_letter((*fcb).first)}},
+              {"direction", Variant{coord_letter((*fcb).second)}}}}};
+       }},
       {"cutoff_regular", AutoParameter::read_only,
        [this]() {
          if (get_cell_structure().decomposition_type() !=
@@ -261,6 +296,21 @@ void CellSystem::initialize(CellStructureType const &cs_type,
         get_value_or<std::vector<int>>(params, "n_square_types", {});
     auto n_square_types = std::set<int>{ns_types.begin(), ns_types.end()};
     m_cell_structure->set_hybrid_decomposition(cutoff_regular, n_square_types);
+  } else if (cs_type == CellStructureType::REGULAR) {
+    std::optional<std::pair<int, int>> fcb_pair = std::nullopt;
+    if (params.contains("fully_connected_boundary") and
+        not is_none(params.at("fully_connected_boundary"))) {
+      auto const variant =
+          get_value<VariantMap>(params, "fully_connected_boundary");
+      context()->parallel_try_catch([&fcb_pair, &variant]() {
+        fcb_pair = {{coord(boost::get<std::string>(variant.at("boundary"))),
+                     coord(boost::get<std::string>(variant.at("direction")))}};
+      });
+    }
+    context()->parallel_try_catch([this, &fcb_pair]() {
+      m_cell_structure->set_regular_decomposition(
+          get_system().get_interaction_range(), fcb_pair);
+    });
   } else {
     system.set_cell_structure_topology(cs_type);
   }
diff --git a/src/script_interface/code_info/CodeInfo.cpp b/src/script_interface/code_info/CodeInfo.cpp
index eb2870a42d1..69b2c5e0094 100644
--- a/src/script_interface/code_info/CodeInfo.cpp
+++ b/src/script_interface/code_info/CodeInfo.cpp
@@ -25,8 +25,9 @@
 
 #include <boost/algorithm/string/join.hpp>
 
-#include <algorithm>
+#include <stdexcept>
 #include <string>
+#include <unordered_set>
 #include <vector>
 
 namespace ScriptInterface {
@@ -36,17 +37,18 @@ static auto get_feature_vector(char const *const ptr[], unsigned int len) {
   return std::vector<std::string>{ptr, ptr + len};
 }
 
-static Variant get_feature_list(char const *const ptr[], unsigned int len) {
-  return make_vector_of_variants(std::vector<std::string>{ptr, ptr + len});
+static auto get_feature_set(char const *const ptr[], unsigned int len) { // names -> set
+  return std::unordered_set<std::string>(ptr, ptr + len); // copies ptr[0..len) as strings
 }
 
 Variant CodeInfo::do_call_method(std::string const &name,
                                  VariantMap const &parameters) {
   if (name == "features") {
-    return get_feature_list(FEATURES, NUM_FEATURES);
+    return make_vector_of_variants(get_feature_vector(FEATURES, NUM_FEATURES));
   }
   if (name == "all_features") {
-    return get_feature_list(FEATURES_ALL, NUM_FEATURES_ALL);
+    return make_vector_of_variants(
+        get_feature_vector(FEATURES_ALL, NUM_FEATURES_ALL));
   }
   if (name == "build_type") {
     return std::string(ESPRESSO_BUILD_TYPE);
@@ -62,14 +64,14 @@ Variant CodeInfo::do_call_method(std::string const &name,
 }
 
 void check_features(std::vector<std::string> const &features) {
-  auto const allowed = get_feature_vector(FEATURES_ALL, NUM_FEATURES_ALL);
-  auto const built = get_feature_vector(FEATURES, NUM_FEATURES);
+  auto const allowed = get_feature_set(FEATURES_ALL, NUM_FEATURES_ALL);
+  auto const compiled_features = get_feature_set(FEATURES, NUM_FEATURES);
   std::vector<std::string> missing_features{};
   for (auto const &feature : features) {
-    if (std::find(allowed.begin(), allowed.end(), feature) == allowed.end()) {
+    if (not allowed.contains(feature)) {
       throw std::runtime_error("Unknown feature '" + feature + "'");
     }
-    if (std::find(built.begin(), built.end(), feature) == built.end()) {
+    if (not compiled_features.contains(feature)) {
       missing_features.emplace_back(feature);
     }
   }
diff --git a/src/script_interface/collision_detection/CollisionDetection.hpp b/src/script_interface/collision_detection/CollisionDetection.hpp
index 2ab4e93752b..b523c3d96f9 100644
--- a/src/script_interface/collision_detection/CollisionDetection.hpp
+++ b/src/script_interface/collision_detection/CollisionDetection.hpp
@@ -47,7 +47,6 @@ class CollisionDetection : public AutoParameters<CollisionDetection> {
       {CollisionModeType::BIND_CENTERS, "bind_centers"},
       {CollisionModeType::BIND_VS, "bind_at_point_of_collision"},
       {CollisionModeType::GLUE_TO_SURF, "glue_to_surface"},
-      {CollisionModeType::BIND_THREE_PARTICLES, "bind_three_particles"},
   };
   std::unordered_map<std::string, CollisionModeType> cd_name_to_mode;
   std::unordered_map<CollisionModeType,
@@ -62,9 +61,6 @@ class CollisionDetection : public AutoParameters<CollisionDetection> {
         "part_type_to_be_glued", "part_type_to_attach_vs_to",
         "part_type_after_glueing", "distance",
         "distance_glued_particle_to_vs"}},
-      {CollisionModeType::BIND_THREE_PARTICLES,
-       {"mode", "bond_centers", "distance", "bond_three_particles",
-        "three_particle_binding_angle_resolution"}},
   };
 
 public:
@@ -83,9 +79,6 @@ class CollisionDetection : public AutoParameters<CollisionDetection> {
 
          {"bond_centers", collision_params.bond_centers},
          {"bond_vs", collision_params.bond_vs},
-         {"bond_three_particles", collision_params.bond_three_particles},
-         {"three_particle_binding_angle_resolution",
-          collision_params.three_particle_angle_resolution},
 
          {"distance", collision_params.distance},
          {"distance_glued_particle_to_vs",
diff --git a/src/script_interface/constraints/Constraint.hpp b/src/script_interface/constraints/Constraint.hpp
index bcc75502d3e..2bc827bd82a 100644
--- a/src/script_interface/constraints/Constraint.hpp
+++ b/src/script_interface/constraints/Constraint.hpp
@@ -19,8 +19,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#ifndef SCRIPT_INTERFACE_CONSTRAINTS_CONSTRAINT_HPP
-#define SCRIPT_INTERFACE_CONSTRAINTS_CONSTRAINT_HPP
+#pragma once
 
 #include "core/constraints/Constraint.hpp"
 #include "script_interface/ScriptInterface.hpp"
@@ -37,5 +36,3 @@ class Constraint : public AutoParameters<Constraint> {
 
 } /* namespace Constraints */
 } /* namespace ScriptInterface */
-
-#endif
diff --git a/src/script_interface/constraints/Constraints.hpp b/src/script_interface/constraints/Constraints.hpp
index a8b304c872a..9e56946b411 100644
--- a/src/script_interface/constraints/Constraints.hpp
+++ b/src/script_interface/constraints/Constraints.hpp
@@ -19,8 +19,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#ifndef SCRIPT_INTERFACE_CONSTRAINTS_CONSTRAINTS_HPP
-#define SCRIPT_INTERFACE_CONSTRAINTS_CONSTRAINTS_HPP
+#pragma once
 
 #include "Constraint.hpp"
 
@@ -45,9 +44,7 @@ class Constraints : public ObjectList<Constraint> {
 private:
   // disable serialization: pickling done by the python interface
   std::string get_internal_state() const override { return {}; }
-  void set_internal_state(std::string const &state) override {}
+  void set_internal_state(std::string const &) override {}
 };
 } /* namespace Constraints */
 } /* namespace ScriptInterface */
-
-#endif
diff --git a/src/script_interface/constraints/ExternalField.hpp b/src/script_interface/constraints/ExternalField.hpp
index 1e3299ee0eb..b83bd74efb5 100644
--- a/src/script_interface/constraints/ExternalField.hpp
+++ b/src/script_interface/constraints/ExternalField.hpp
@@ -19,8 +19,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#ifndef SCRIPT_INTERFACE_CONSTRAINTS_EXTERNAL_FIELD_HPP
-#define SCRIPT_INTERFACE_CONSTRAINTS_EXTERNAL_FIELD_HPP
+#pragma once
 
 #include "couplings.hpp"
 #include "fields.hpp"
@@ -79,5 +78,3 @@ class ExternalField : public Constraint {
 };
 } /* namespace Constraints */
 } /* namespace ScriptInterface */
-
-#endif
diff --git a/src/script_interface/constraints/ExternalPotential.hpp b/src/script_interface/constraints/ExternalPotential.hpp
index 6a9840bff8e..c7a93f96a96 100644
--- a/src/script_interface/constraints/ExternalPotential.hpp
+++ b/src/script_interface/constraints/ExternalPotential.hpp
@@ -19,8 +19,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#ifndef SCRIPT_INTERFACE_CONSTRAINTS_EXTERNAL_POTENTIAL_HPP
-#define SCRIPT_INTERFACE_CONSTRAINTS_EXTERNAL_POTENTIAL_HPP
+#pragma once
 
 #include "core/constraints/Constraint.hpp"
 #include "core/constraints/ExternalPotential.hpp"
@@ -83,5 +82,3 @@ class ExternalPotential : public Constraint {
 };
 } // namespace Constraints
 } // namespace ScriptInterface
-
-#endif
diff --git a/src/script_interface/constraints/HomogeneousMagneticField.hpp b/src/script_interface/constraints/HomogeneousMagneticField.hpp
index d36a2dbfc92..f01aadddf1a 100644
--- a/src/script_interface/constraints/HomogeneousMagneticField.hpp
+++ b/src/script_interface/constraints/HomogeneousMagneticField.hpp
@@ -19,8 +19,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#ifndef SCRIPT_INTERFACE_CONSTRAINTS_HOMOGENEOUSMAGNETICFIELD_HPP
-#define SCRIPT_INTERFACE_CONSTRAINTS_HOMOGENEOUSMAGNETICFIELD_HPP
+#pragma once
 
 #include "core/constraints/Constraint.hpp"
 #include "core/constraints/HomogeneousMagneticField.hpp"
@@ -65,5 +64,3 @@ class HomogeneousMagneticField : public Constraint {
 
 } /* namespace Constraints */
 } /* namespace ScriptInterface */
-
-#endif
diff --git a/src/script_interface/constraints/couplings.hpp b/src/script_interface/constraints/couplings.hpp
index 1e41d66b8d4..0e41b72925a 100644
--- a/src/script_interface/constraints/couplings.hpp
+++ b/src/script_interface/constraints/couplings.hpp
@@ -16,8 +16,8 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef SCRIPT_INTERFACE_CONSTRAINTS_DETAIL_COUPLINGS_HPP
-#define SCRIPT_INTERFACE_CONSTRAINTS_DETAIL_COUPLINGS_HPP
+
+#pragma once
 
 /**
  * @file
@@ -98,5 +98,3 @@ template <> inline Scaled make_coupling<Scaled>(const VariantMap &params) {
 } // namespace detail
 } // namespace Constraints
 } // namespace ScriptInterface
-
-#endif
diff --git a/src/script_interface/constraints/fields.hpp b/src/script_interface/constraints/fields.hpp
index 650c12ce880..428b54180b8 100644
--- a/src/script_interface/constraints/fields.hpp
+++ b/src/script_interface/constraints/fields.hpp
@@ -16,8 +16,8 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef SCRIPT_INTERFACE_CONSTRAINTS_DETAIL_FIELDS_HPP
-#define SCRIPT_INTERFACE_CONSTRAINTS_DETAIL_FIELDS_HPP
+
+#pragma once
 
 #include "core/field_coupling/fields/AffineMap.hpp"
 #include "core/field_coupling/fields/Constant.hpp"
@@ -165,5 +165,3 @@ template <typename Field> Field make_field(const VariantMap &params) {
 } // namespace detail
 } // namespace Constraints
 } // namespace ScriptInterface
-
-#endif
diff --git a/src/script_interface/constraints/initialize.hpp b/src/script_interface/constraints/initialize.hpp
index 8d48333b26b..29b4396cbcb 100644
--- a/src/script_interface/constraints/initialize.hpp
+++ b/src/script_interface/constraints/initialize.hpp
@@ -17,8 +17,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#ifndef SCRIPT_INTERFACE_CONSTRAINTS_INITIALIZE_HPP
-#define SCRIPT_INTERFACE_CONSTRAINTS_INITIALIZE_HPP
+#pragma once
 
 #include <script_interface/ObjectHandle.hpp>
 
@@ -31,5 +30,3 @@ void initialize(Utils::Factory<ObjectHandle> *om);
 
 } /* namespace Constraints */
 } /* namespace ScriptInterface */
-
-#endif
diff --git a/src/script_interface/electrostatics/Actor.hpp b/src/script_interface/electrostatics/Actor.hpp
index 7ddae421df9..1839673b300 100644
--- a/src/script_interface/electrostatics/Actor.hpp
+++ b/src/script_interface/electrostatics/Actor.hpp
@@ -30,9 +30,7 @@
 #include "script_interface/system/Leaf.hpp"
 
 #include <memory>
-#include <stdexcept>
 #include <string>
-#include <vector>
 
 namespace ScriptInterface {
 namespace Coulomb {
diff --git a/src/script_interface/electrostatics/Actor_impl.hpp b/src/script_interface/electrostatics/Actor_impl.hpp
index 8204cc1e683..82b26354001 100644
--- a/src/script_interface/electrostatics/Actor_impl.hpp
+++ b/src/script_interface/electrostatics/Actor_impl.hpp
@@ -30,6 +30,10 @@
 
 #include "script_interface/auto_parameters/AutoParameter.hpp"
 
+#include <cassert>
+#include <stdexcept>
+#include <string>
+
 namespace ScriptInterface {
 namespace Coulomb {
 
@@ -83,6 +87,7 @@ template <class SIClass, class CoreClass> Actor<SIClass, CoreClass>::Actor() {
 template <class SIClass, class CoreClass>
 Variant Actor<SIClass, CoreClass>::do_call_method(std::string const &name,
                                                   VariantMap const &params) {
+  assert(params.empty());
   if (name == "activate") {
     context()->parallel_try_catch([this]() {
       auto &system = get_system();
diff --git a/src/script_interface/electrostatics/Container.hpp b/src/script_interface/electrostatics/Container.hpp
index e89291a29c7..98c26e06667 100644
--- a/src/script_interface/electrostatics/Container.hpp
+++ b/src/script_interface/electrostatics/Container.hpp
@@ -58,7 +58,7 @@ class Container : public AutoParameters<Container, System::Leaf> {
     system.on_coulomb_change();
   }
 
-  void on_bind_system(::System::System &system) override {
+  void on_bind_system(::System::System &) override {
     auto const &params = *m_params;
     for (auto const &key : get_parameter_insertion_order()) {
       if (params.count(key)) {
diff --git a/src/script_interface/electrostatics/CoulombMMM1DGpu.hpp b/src/script_interface/electrostatics/CoulombMMM1DGpu.hpp
deleted file mode 100644
index deadb926b6e..00000000000
--- a/src/script_interface/electrostatics/CoulombMMM1DGpu.hpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (C) 2022 The ESPResSo project
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "config/config.hpp"
-
-#ifdef MMM1D_GPU
-
-#include "Actor.hpp"
-
-#include "core/electrostatics/mmm1d_gpu.hpp"
-
-#include "script_interface/get_value.hpp"
-
-#include <memory>
-#include <string>
-
-namespace ScriptInterface {
-namespace Coulomb {
-
-class CoulombMMM1DGpu : public Actor<CoulombMMM1DGpu, ::CoulombMMM1DGpu> {
-
-public:
-  CoulombMMM1DGpu() {
-    add_parameters({
-        {"is_tuned", AutoParameter::read_only,
-         [this]() { return actor()->is_tuned(); }},
-        {"far_switch_radius", AutoParameter::read_only,
-         [this]() { return actor()->far_switch_radius; }},
-        {"maxPWerror", AutoParameter::read_only,
-         [this]() { return actor()->maxPWerror; }},
-        {"bessel_cutoff", AutoParameter::read_only,
-         [this]() { return actor()->bessel_cutoff; }},
-    });
-  }
-
-  void do_construct(VariantMap const &params) override {
-    context()->parallel_try_catch([this, &params]() {
-      m_actor = std::make_shared<CoreActorClass>(
-          get_value<double>(params, "prefactor"),
-          get_value<double>(params, "maxPWerror"),
-          get_value<double>(params, "far_switch_radius"),
-          get_value<int>(params, "bessel_cutoff"));
-    });
-    set_charge_neutrality_tolerance(params);
-  }
-};
-
-} // namespace Coulomb
-} // namespace ScriptInterface
-
-#endif // MMM1D_GPU
diff --git a/src/script_interface/electrostatics/CoulombP3M.hpp b/src/script_interface/electrostatics/CoulombP3M.hpp
index 38fc376835c..34ea03c20e1 100644
--- a/src/script_interface/electrostatics/CoulombP3M.hpp
+++ b/src/script_interface/electrostatics/CoulombP3M.hpp
@@ -26,6 +26,7 @@
 #include "Actor.hpp"
 
 #include "core/electrostatics/p3m.hpp"
+#include "core/p3m/FFTBackendLegacy.hpp"
 
 #include "script_interface/get_value.hpp"
 
@@ -88,6 +89,7 @@ class CoulombP3M : public Actor<CoulombP3M, ::CoulombP3M> {
           std::move(p3m), get_value<double>(params, "prefactor"),
           get_value<int>(params, "timings"), get_value<bool>(params, "verbose"),
           get_value<bool>(params, "check_complex_residuals"));
+      m_actor->p3m.make_fft_instance<FFTBackendLegacy>(false);
     });
     set_charge_neutrality_tolerance(params);
   }
diff --git a/src/script_interface/electrostatics/CoulombP3MGPU.hpp b/src/script_interface/electrostatics/CoulombP3MGPU.hpp
index 95ce143014c..187fde4308c 100644
--- a/src/script_interface/electrostatics/CoulombP3MGPU.hpp
+++ b/src/script_interface/electrostatics/CoulombP3MGPU.hpp
@@ -27,6 +27,7 @@
 #include "Actor.hpp"
 
 #include "core/electrostatics/p3m_gpu.hpp"
+#include "core/p3m/FFTBackendLegacy.hpp"
 
 #include "script_interface/get_value.hpp"
 
@@ -89,6 +90,7 @@ class CoulombP3MGPU : public Actor<CoulombP3MGPU, ::CoulombP3MGPU> {
           std::move(p3m), get_value<double>(params, "prefactor"),
           get_value<int>(params, "timings"), get_value<bool>(params, "verbose"),
           get_value<bool>(params, "check_complex_residuals"));
+      m_actor->p3m.make_fft_instance<FFTBackendLegacy>(false); // for CPU part
     });
     set_charge_neutrality_tolerance(params);
   }
diff --git a/src/script_interface/electrostatics/initialize.cpp b/src/script_interface/electrostatics/initialize.cpp
index de8e21cc710..476ea4ca9c5 100644
--- a/src/script_interface/electrostatics/initialize.cpp
+++ b/src/script_interface/electrostatics/initialize.cpp
@@ -27,7 +27,6 @@
 
 #include "Container.hpp"
 #include "CoulombMMM1D.hpp"
-#include "CoulombMMM1DGpu.hpp"
 #include "CoulombP3M.hpp"
 #include "CoulombP3MGPU.hpp"
 #include "CoulombScafacos.hpp"
@@ -59,9 +58,6 @@ void initialize(Utils::Factory<ObjectHandle> *om) {
       "Coulomb::ElectrostaticLayerCorrection");
 #endif // P3M
   om->register_new<ICCStar>("Coulomb::ICCStar");
-#ifdef MMM1D_GPU
-  om->register_new<CoulombMMM1DGpu>("Coulomb::CoulombMMM1DGpu");
-#endif
   om->register_new<CoulombMMM1D>("Coulomb::CoulombMMM1D");
 #ifdef SCAFACOS
   om->register_new<CoulombScafacos>("Coulomb::CoulombScafacos");
diff --git a/src/script_interface/get_value.hpp b/src/script_interface/get_value.hpp
index 8f1bc914ffd..9bf678b5a85 100644
--- a/src/script_interface/get_value.hpp
+++ b/src/script_interface/get_value.hpp
@@ -27,8 +27,8 @@
 #include <utils/demangle.hpp>
 
 #include <boost/algorithm/string/join.hpp>
-#include <boost/range/algorithm/transform.hpp>
 
+#include <algorithm>
 #include <cstddef>
 #include <memory>
 #include <set>
@@ -221,8 +221,7 @@ struct vector_conversion_visitor : boost::static_visitor<Utils::Vector<T, N>> {
     }
 
     Utils::Vector<T, N> ret{};
-    boost::transform(vv, ret.begin(),
-                     [](const Variant &v) { return get_value_helper<T>{}(v); });
+    std::ranges::transform(vv, ret.begin(), get_value_helper<T>{});
 
     return ret;
   }
@@ -264,8 +263,7 @@ struct GetVectorOrEmpty : boost::static_visitor<std::vector<T>> {
   std::vector<T> operator()(std::vector<Variant> const &vv) const {
     std::vector<T> ret(vv.size());
 
-    boost::transform(vv, ret.begin(),
-                     [](const Variant &v) { return get_value_helper<T>{}(v); });
+    std::ranges::transform(vv, ret.begin(), get_value_helper<T>{});
 
     return ret;
   }
diff --git a/src/script_interface/integrators/IntegratorHandle.cpp b/src/script_interface/integrators/IntegratorHandle.cpp
index 3b08badf6ea..109369935dc 100644
--- a/src/script_interface/integrators/IntegratorHandle.cpp
+++ b/src/script_interface/integrators/IntegratorHandle.cpp
@@ -97,6 +97,7 @@ void IntegratorHandle::on_bind_system(::System::System &system) {
   auto const &params = *m_params;
   for (auto const &key : get_parameter_insertion_order()) {
     if (params.count(key) != 0ul) {
+      // NOLINTNEXTLINE(readability-simplify-boolean-expr)
       if (not(key == "time_step" and
               system.propagation->integ_switch == INTEG_METHOD_NVT and
               system.get_time_step() == -1. and
diff --git a/src/script_interface/interactions/NonBondedInteraction.hpp b/src/script_interface/interactions/NonBondedInteraction.hpp
index 4f8413149ee..a93ae994b1f 100644
--- a/src/script_interface/interactions/NonBondedInteraction.hpp
+++ b/src/script_interface/interactions/NonBondedInteraction.hpp
@@ -86,7 +86,7 @@ class InteractionPotentialInterface
       }
     }
     for (auto const &kv : params) {
-      if (std::find(keys.begin(), keys.end(), kv.first) == keys.end()) {
+      if (std::ranges::find(keys, kv.first) == keys.end()) {
         throw std::runtime_error("Parameter '" + kv.first +
                                  "' is not recognized");
       }
@@ -878,6 +878,7 @@ class NonBondedInteractionHandle
 public:
   Variant do_call_method(std::string const &name,
                          VariantMap const &params) override {
+    assert(params.empty());
     if (name == "get_types") {
       return std::vector<int>{{m_types[0], m_types[1]}};
     }
diff --git a/src/script_interface/magnetostatics/Actor.hpp b/src/script_interface/magnetostatics/Actor.hpp
index 0ba9a0ec974..393bd290360 100644
--- a/src/script_interface/magnetostatics/Actor.hpp
+++ b/src/script_interface/magnetostatics/Actor.hpp
@@ -30,7 +30,6 @@
 #include "script_interface/system/Leaf.hpp"
 
 #include <memory>
-#include <stdexcept>
 #include <string>
 
 namespace ScriptInterface {
diff --git a/src/script_interface/magnetostatics/Actor_impl.hpp b/src/script_interface/magnetostatics/Actor_impl.hpp
index 58e24e64b1d..8a8e007224c 100644
--- a/src/script_interface/magnetostatics/Actor_impl.hpp
+++ b/src/script_interface/magnetostatics/Actor_impl.hpp
@@ -30,12 +30,16 @@
 
 #include "script_interface/auto_parameters/AutoParameter.hpp"
 
+#include <cassert>
+#include <string>
+
 namespace ScriptInterface {
 namespace Dipoles {
 
 template <class SIClass, class CoreClass>
 Variant Actor<SIClass, CoreClass>::do_call_method(std::string const &name,
                                                   VariantMap const &params) {
+  assert(params.empty());
   if (name == "activate") {
     context()->parallel_try_catch([this]() {
       auto &system = get_system();
diff --git a/src/script_interface/magnetostatics/Container.hpp b/src/script_interface/magnetostatics/Container.hpp
index 52e0f56c6ff..c3ae74940e9 100644
--- a/src/script_interface/magnetostatics/Container.hpp
+++ b/src/script_interface/magnetostatics/Container.hpp
@@ -46,7 +46,7 @@ class Container : public AutoParameters<Container, System::Leaf> {
     system.on_dipoles_change();
   }
 
-  void on_bind_system(::System::System &system) override {
+  void on_bind_system(::System::System &) override {
     auto const &params = *m_params;
     for (auto const &key : get_parameter_insertion_order()) {
       if (params.count(key)) {
diff --git a/src/script_interface/magnetostatics/DipolarBarnesHutGpu.hpp b/src/script_interface/magnetostatics/DipolarBarnesHutGpu.hpp
deleted file mode 100644
index 881f5cf51c3..00000000000
--- a/src/script_interface/magnetostatics/DipolarBarnesHutGpu.hpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (C) 2022 The ESPResSo project
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "config/config.hpp"
-
-#ifdef DIPOLAR_BARNES_HUT
-
-#include "Actor.hpp"
-
-#include "core/magnetostatics/barnes_hut_gpu.hpp"
-
-#include "script_interface/get_value.hpp"
-
-#include <memory>
-#include <string>
-
-namespace ScriptInterface {
-namespace Dipoles {
-
-class DipolarBarnesHutGpu
-    : public Actor<DipolarBarnesHutGpu, ::DipolarBarnesHutGpu> {
-public:
-  DipolarBarnesHutGpu() {
-    add_parameters({
-        {"epssq", AutoParameter::read_only,
-         [this]() { return actor()->m_epssq; }},
-        {"itolsq", AutoParameter::read_only,
-         [this]() { return actor()->m_itolsq; }},
-    });
-  }
-
-  void do_construct(VariantMap const &params) override {
-    context()->parallel_try_catch([&]() {
-      m_actor = std::make_shared<CoreActorClass>(
-          get_value<double>(params, "prefactor"),
-          get_value<double>(params, "epssq"),
-          get_value<double>(params, "itolsq"));
-    });
-  }
-};
-
-} // namespace Dipoles
-} // namespace ScriptInterface
-
-#endif // DIPOLAR_BARNES_HUT
diff --git a/src/script_interface/magnetostatics/DipolarP3M.hpp b/src/script_interface/magnetostatics/DipolarP3M.hpp
index d37733d75d6..7474345e9d4 100644
--- a/src/script_interface/magnetostatics/DipolarP3M.hpp
+++ b/src/script_interface/magnetostatics/DipolarP3M.hpp
@@ -26,6 +26,7 @@
 #include "Actor.hpp"
 
 #include "core/magnetostatics/dp3m.hpp"
+#include "core/p3m/FFTBackendLegacy.hpp"
 
 #include "script_interface/get_value.hpp"
 
@@ -85,6 +86,7 @@ class DipolarP3M : public Actor<DipolarP3M, ::DipolarP3M> {
           std::move(p3m), get_value<double>(params, "prefactor"),
           get_value<int>(params, "timings"),
           get_value<bool>(params, "verbose"));
+      m_actor->dp3m.make_fft_instance<FFTBackendLegacy>(true);
     });
   }
 };
diff --git a/src/script_interface/magnetostatics/initialize.cpp b/src/script_interface/magnetostatics/initialize.cpp
index 972d37879ee..04b22818ed5 100644
--- a/src/script_interface/magnetostatics/initialize.cpp
+++ b/src/script_interface/magnetostatics/initialize.cpp
@@ -25,7 +25,6 @@
 #include "Actor_impl.hpp"
 
 #include "Container.hpp"
-#include "DipolarBarnesHutGpu.hpp"
 #include "DipolarDirectSum.hpp"
 #include "DipolarDirectSumGpu.hpp"
 #include "DipolarLayerCorrection.hpp"
@@ -49,9 +48,6 @@ void initialize(Utils::Factory<ObjectHandle> *om) {
 #ifdef DIPOLAR_DIRECT_SUM
   om->register_new<DipolarDirectSumGpu>("Dipoles::DipolarDirectSumGpu");
 #endif
-#ifdef DIPOLAR_BARNES_HUT
-  om->register_new<DipolarBarnesHutGpu>("Dipoles::DipolarBarnesHutGpu");
-#endif
 #ifdef DP3M
   om->register_new<DipolarP3M>("Dipoles::DipolarP3M");
 #endif
diff --git a/src/script_interface/observables/CylindricalLBProfileObservable.hpp b/src/script_interface/observables/CylindricalLBProfileObservable.hpp
index 455e7f061a4..b0bcfc68b69 100644
--- a/src/script_interface/observables/CylindricalLBProfileObservable.hpp
+++ b/src/script_interface/observables/CylindricalLBProfileObservable.hpp
@@ -30,11 +30,11 @@
 
 #include "script_interface/math/CylindricalTransformationParameters.hpp"
 
-#include <boost/range/algorithm.hpp>
-
+#include <algorithm>
 #include <cstddef>
 #include <iterator>
 #include <memory>
+#include <numbers>
 #include <string>
 #include <type_traits>
 #include <vector>
@@ -114,8 +114,8 @@ class CylindricalLBProfileObservable
             get_value_or<int>(params, "n_z_bins", 1),
             get_value_or<double>(params, "min_r", 0.),
             get_value<double>(params, "max_r"),
-            get_value_or<double>(params, "min_phi", -Utils::pi()),
-            get_value_or<double>(params, "max_phi", Utils::pi()),
+            get_value_or<double>(params, "min_phi", -std::numbers::pi),
+            get_value_or<double>(params, "max_phi", std::numbers::pi),
             get_value<double>(params, "min_z"),
             get_value<double>(params, "max_z"),
             get_value<double>(params, "sampling_density"));
@@ -127,8 +127,8 @@ class CylindricalLBProfileObservable
                          VariantMap const &parameters) override {
     if (method == "edges") {
       std::vector<Variant> variant_edges;
-      boost::copy(cylindrical_profile_observable()->edges(),
-                  std::back_inserter(variant_edges));
+      std::ranges::copy(cylindrical_profile_observable()->edges(),
+                        std::back_inserter(variant_edges));
       return variant_edges;
     }
     return Base::do_call_method(method, parameters);
diff --git a/src/script_interface/observables/CylindricalPidProfileObservable.hpp b/src/script_interface/observables/CylindricalPidProfileObservable.hpp
index 629a491bf63..26d23aa0a2c 100644
--- a/src/script_interface/observables/CylindricalPidProfileObservable.hpp
+++ b/src/script_interface/observables/CylindricalPidProfileObservable.hpp
@@ -29,13 +29,11 @@
 
 #include "script_interface/math/CylindricalTransformationParameters.hpp"
 
-#include <utils/constants.hpp>
-
-#include <boost/range/algorithm.hpp>
-
+#include <algorithm>
 #include <cstddef>
 #include <iterator>
 #include <memory>
+#include <numbers>
 #include <string>
 #include <type_traits>
 #include <vector>
@@ -114,8 +112,8 @@ class CylindricalPidProfileObservable
             get_value_or<int>(params, "n_z_bins", 1),
             get_value_or<double>(params, "min_r", 0.),
             get_value<double>(params, "max_r"),
-            get_value_or<double>(params, "min_phi", -Utils::pi()),
-            get_value_or<double>(params, "max_phi", Utils::pi()),
+            get_value_or<double>(params, "min_phi", -std::numbers::pi),
+            get_value_or<double>(params, "max_phi", std::numbers::pi),
             get_value<double>(params, "min_z"),
             get_value<double>(params, "max_z"));
       });
@@ -126,8 +124,8 @@ class CylindricalPidProfileObservable
                          VariantMap const &parameters) override {
     if (method == "edges") {
       std::vector<Variant> variant_edges;
-      boost::copy(cylindrical_pid_profile_observable()->edges(),
-                  std::back_inserter(variant_edges));
+      std::ranges::copy(cylindrical_pid_profile_observable()->edges(),
+                        std::back_inserter(variant_edges));
       return variant_edges;
     }
     return Base::do_call_method(method, parameters);
diff --git a/src/script_interface/observables/LBProfileObservable.hpp b/src/script_interface/observables/LBProfileObservable.hpp
index 3ebe9f02780..8a867b60d7d 100644
--- a/src/script_interface/observables/LBProfileObservable.hpp
+++ b/src/script_interface/observables/LBProfileObservable.hpp
@@ -27,8 +27,7 @@
 #include "Observable.hpp"
 #include "core/observables/LBProfileObservable.hpp"
 
-#include <boost/range/algorithm.hpp>
-
+#include <algorithm>
 #include <cstddef>
 #include <iterator>
 #include <memory>
@@ -107,8 +106,8 @@ class LBProfileObservable
                          VariantMap const &parameters) override {
     if (method == "edges") {
       std::vector<Variant> variant_edges;
-      boost::copy(profile_observable()->edges(),
-                  std::back_inserter(variant_edges));
+      std::ranges::copy(profile_observable()->edges(),
+                        std::back_inserter(variant_edges));
       return variant_edges;
     }
     return Base::do_call_method(method, parameters);
diff --git a/src/script_interface/observables/PidProfileObservable.hpp b/src/script_interface/observables/PidProfileObservable.hpp
index 4e6c1ebe947..b37dede03c1 100644
--- a/src/script_interface/observables/PidProfileObservable.hpp
+++ b/src/script_interface/observables/PidProfileObservable.hpp
@@ -30,8 +30,7 @@
 #include "core/observables/ForceDensityProfile.hpp"
 #include "core/observables/PidProfileObservable.hpp"
 
-#include <boost/range/algorithm.hpp>
-
+#include <algorithm>
 #include <cstddef>
 #include <iterator>
 #include <memory>
@@ -93,8 +92,8 @@ class PidProfileObservable
                          VariantMap const &parameters) override {
     if (method == "edges") {
       std::vector<Variant> variant_edges;
-      boost::copy(pid_profile_observable()->edges(),
-                  std::back_inserter(variant_edges));
+      std::ranges::copy(pid_profile_observable()->edges(),
+                        std::back_inserter(variant_edges));
       return variant_edges;
     }
     return Base::do_call_method(method, parameters);
diff --git a/src/script_interface/observables/ProfileObservable.hpp b/src/script_interface/observables/ProfileObservable.hpp
index 7540177c215..08685d5b312 100644
--- a/src/script_interface/observables/ProfileObservable.hpp
+++ b/src/script_interface/observables/ProfileObservable.hpp
@@ -28,6 +28,7 @@
 #include "core/observables/LBVelocityProfile.hpp"
 #include "core/observables/ProfileObservable.hpp"
 
+#include <algorithm>
 #include <cstddef>
 #include <iterator>
 #include <memory>
@@ -108,8 +109,8 @@ class ProfileObservable
                       VariantMap const &parameters) override {
     if (method == "edges") {
       std::vector<Variant> variant_edges;
-      boost::copy(profile_observable()->edges(),
-                  std::back_inserter(variant_edges));
+      std::ranges::copy(profile_observable()->edges(),
+                        std::back_inserter(variant_edges));
       return variant_edges;
     }
     return Base::call_method(method, parameters);
diff --git a/src/script_interface/observables/RDF.hpp b/src/script_interface/observables/RDF.hpp
index 8d912c3ad7c..c9d66f14eb4 100644
--- a/src/script_interface/observables/RDF.hpp
+++ b/src/script_interface/observables/RDF.hpp
@@ -27,10 +27,6 @@
 
 #include "core/observables/RDF.hpp"
 
-#include <boost/range/algorithm.hpp>
-
-#include <cstddef>
-#include <iterator>
 #include <memory>
 #include <vector>
 
diff --git a/src/script_interface/packed_variant.hpp b/src/script_interface/packed_variant.hpp
index 5dfb9b09448..e4ee38f5a62 100644
--- a/src/script_interface/packed_variant.hpp
+++ b/src/script_interface/packed_variant.hpp
@@ -16,11 +16,12 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef SCRIPT_INTERFACE_PACKED_VARIANT_HPP
-#define SCRIPT_INTERFACE_PACKED_VARIANT_HPP
+
+#pragma once
 
 #include "Variant.hpp"
 
+#include <algorithm>
 #include <cstddef>
 #include <functional>
 #include <string>
@@ -80,7 +81,7 @@ struct PackVisitor : boost::static_visitor<PackedVariant> {
   auto operator()(const std::vector<Variant> &vec) const {
     std::vector<PackedVariant> ret(vec.size());
 
-    boost::transform(vec, ret.begin(), [this](const Variant &v) {
+    std::ranges::transform(vec, ret.begin(), [this](const Variant &v) {
       return boost::apply_visitor(*this, v);
     });
 
@@ -129,7 +130,7 @@ struct UnpackVisitor : boost::static_visitor<Variant> {
   auto operator()(const std::vector<PackedVariant> &vec) const {
     std::vector<Variant> ret(vec.size());
 
-    boost::transform(vec, ret.begin(), [this](const PackedVariant &v) {
+    std::ranges::transform(vec, ret.begin(), [this](const PackedVariant &v) {
       return boost::apply_visitor(*this, v);
     });
 
@@ -192,7 +193,7 @@ inline Variant unpack(const PackedVariant &v,
 inline PackedMap pack(const VariantMap &v) {
   PackedMap ret(v.size());
 
-  boost::transform(v, ret.begin(), [](auto const &kv) {
+  std::ranges::transform(v, ret.begin(), [](auto const &kv) {
     return std::pair<std::string, PackedVariant>{kv.first, pack(kv.second)};
   });
 
@@ -210,7 +211,7 @@ unpack(const PackedMap &v,
        std::unordered_map<ObjectId, ObjectRef> const &objects) {
   VariantMap ret;
 
-  boost::transform(
+  std::ranges::transform(
       v, std::inserter(ret, ret.end()),
       [&objects](auto const &kv) -> std::pair<std::string, Variant> {
         return {kv.first, unpack(kv.second, objects)};
@@ -219,5 +220,3 @@ unpack(const PackedMap &v,
   return ret;
 }
 } // namespace ScriptInterface
-
-#endif
diff --git a/src/script_interface/particle_data/ParticleHandle.cpp b/src/script_interface/particle_data/ParticleHandle.cpp
index fab4cb40323..c253831b99c 100644
--- a/src/script_interface/particle_data/ParticleHandle.cpp
+++ b/src/script_interface/particle_data/ParticleHandle.cpp
@@ -173,6 +173,11 @@ void ParticleHandle::set_particle_property(T &(Particle::*setter)(),
 }
 
 ParticleHandle::ParticleHandle() {
+  /* Warning: the order of particle property setters matters! Some properties
+   * override each other, e.g. quat/director/dip or dip/dipm.
+   * This is relevant during checkpointing: all particle properties are set at
+   * once, in the order specified in the following call to `add_parameters()`.
+   */
   add_parameters({
       {"id", AutoParameter::read_only, [this]() { return m_pid; }},
       {"type",
@@ -240,6 +245,28 @@ ParticleHandle::ParticleHandle() {
        },
 #endif // ELECTROSTATICS
        [this]() { return get_particle_data(m_pid).q(); }},
+#ifdef DIPOLES
+      {"dip",
+       [this](Variant const &value) {
+         set_particle_property([&value](Particle &p) {
+           auto const dip = get_value<Utils::Vector3d>(value);
+           std::tie(p.quat(), p.dipm()) = convert_dip_to_quat(dip);
+         });
+       },
+       [this]() { return get_particle_data(m_pid).calc_dip(); }},
+      {"dipm",
+       [this](Variant const &value) {
+         set_particle_property(&Particle::dipm, value);
+       },
+       [this]() { return get_particle_data(m_pid).dipm(); }},
+#endif // DIPOLES
+#ifdef DIPOLE_FIELD_TRACKING
+      {"dip_fld",
+       [this](Variant const &value) {
+         set_particle_property(&Particle::dip_fld, value);
+       },
+       [this]() { return get_particle_data(m_pid).dip_fld(); }},
+#endif
 #ifdef ROTATION
       {"director",
        [this](Variant const &value) {
@@ -300,28 +327,6 @@ ParticleHandle::ParticleHandle() {
          return convert_vector_body_to_space(p, p.torque());
        }},
 #endif // ROTATION
-#ifdef DIPOLES
-      {"dip",
-       [this](Variant const &value) {
-         set_particle_property([&value](Particle &p) {
-           auto const dip = get_value<Utils::Vector3d>(value);
-           std::tie(p.quat(), p.dipm()) = convert_dip_to_quat(dip);
-         });
-       },
-       [this]() { return get_particle_data(m_pid).calc_dip(); }},
-      {"dipm",
-       [this](Variant const &value) {
-         set_particle_property(&Particle::dipm, value);
-       },
-       [this]() { return get_particle_data(m_pid).dipm(); }},
-#endif // DIPOLES
-#ifdef DIPOLE_FIELD_TRACKING
-      {"dip_fld",
-       [this](Variant const &value) {
-         set_particle_property(&Particle::dip_fld, value);
-       },
-       [this]() { return get_particle_data(m_pid).dip_fld(); }},
-#endif
 #ifdef ROTATIONAL_INERTIA
       {"rinertia",
        [this](Variant const &value) {
@@ -364,17 +369,6 @@ ParticleHandle::ParticleHandle() {
        [this]() { return get_particle_data(m_pid).ext_torque(); }},
 #endif // ROTATION
 #endif // EXTERNAL_FORCES
-      {"propagation",
-       [this](Variant const &value) {
-         auto const propagation = get_value<int>(value);
-         if (!is_valid_propagation_combination(propagation)) {
-           throw std::domain_error(error_msg(
-               "propagation", "propagation combination not accepted: " +
-                                  propagation_bitmask_to_string(propagation)));
-         }
-         set_particle_property(&Particle::propagation, value);
-       },
-       [this]() { return get_particle_data(m_pid).propagation(); }},
 #ifdef THERMOSTAT_PER_PARTICLE
       {"gamma",
        [this](Variant const &value) {
@@ -404,7 +398,11 @@ ParticleHandle::ParticleHandle() {
       {"lees_edwards_flag", AutoParameter::read_only,
        [this]() { return get_particle_data(m_pid).lees_edwards_flag(); }},
       {"image_box", AutoParameter::read_only,
-       [this]() { return get_particle_data(m_pid).image_box(); }},
+       [this]() {
+         auto const &box_geo = *System::get_system().box_geo;
+         auto const p = get_particle_data(m_pid);
+         return box_geo.folded_image_box(p.pos(), p.image_box());
+       }},
       {"node", AutoParameter::read_only,
        [this]() {
          return (context()->is_head_node()) ? get_particle_node(m_pid) : -1;
@@ -445,14 +443,8 @@ ParticleHandle::ParticleHandle() {
            throw std::invalid_argument(error_msg(
                "vs_relative", "must take the form [id, distance, quaternion]"));
          }
-         set_particle_property([&vs_relative](Particle &p) {
-           p.vs_relative() = vs_relative;
-           if (vs_relative.to_particle_id != -1) {
-             p.propagation() = PropagationMode::TRANS_VS_RELATIVE |
-                               PropagationMode::ROT_VS_RELATIVE;
-           }
-           assert(is_valid_propagation_combination(p.propagation()));
-         });
+         set_particle_property(
+             [&vs_relative](Particle &p) { p.vs_relative() = vs_relative; });
        },
        [this]() {
          auto const vs_rel = get_particle_data(m_pid).vs_relative();
@@ -460,6 +452,17 @@ ParticleHandle::ParticleHandle() {
                                       quat2vector(vs_rel.rel_orientation)}};
        }},
 #endif // VIRTUAL_SITES_RELATIVE
+      {"propagation",
+       [this](Variant const &value) {
+         auto const propagation = get_value<int>(value);
+         if (!is_valid_propagation_combination(propagation)) {
+           throw std::domain_error(error_msg(
+               "propagation", "propagation combination not accepted: " +
+                                  propagation_bitmask_to_string(propagation)));
+         }
+         set_particle_property(&Particle::propagation, value);
+       },
+       [this]() { return get_particle_data(m_pid).propagation(); }},
 #ifdef ENGINE
       {"swimming",
        [this](Variant const &value) {
@@ -598,7 +601,7 @@ Variant ParticleHandle::do_call_method(std::string const &name,
     }
     return get_particle_data(m_pid).is_virtual();
 #ifdef VIRTUAL_SITES_RELATIVE
-  } else if (name == "vs_relate_to") {
+  } else if (name == "vs_auto_relate_to") {
     if (not context()->is_head_node()) {
       return {};
     }
@@ -629,6 +632,9 @@ Variant ParticleHandle::do_call_method(std::string const &name,
         override_cutoff_check);
     set_parameter("vs_relative", Variant{std::vector<Variant>{
                                      {other_pid, dist, quat2vector(quat)}}});
+    set_parameter("propagation",
+                  Variant{static_cast<int>(PropagationMode::TRANS_VS_RELATIVE |
+                                           PropagationMode::ROT_VS_RELATIVE)});
 #endif // VIRTUAL_SITES_RELATIVE
 #ifdef EXCLUSIONS
   } else if (name == "has_exclusion") {
@@ -724,14 +730,11 @@ static auto const contradicting_arguments_quat = std::vector<
 
 void ParticleHandle::do_construct(VariantMap const &params) {
   auto const n_extra_args = params.size() - params.count("id");
-  auto const has_param = [&params](std::string const key) {
-    return params.count(key) == 1;
-  };
-  m_pid = (has_param("id")) ? get_value<int>(params, "id")
-                            : get_maximal_particle_id() + 1;
+  m_pid = (params.contains("id")) ? get_value<int>(params, "id")
+                                  : get_maximal_particle_id() + 1;
 
 #ifndef NDEBUG
-  if (!has_param("id")) {
+  if (not params.contains("id")) {
     auto head_node_reference = m_pid;
     boost::mpi::broadcast(context()->get_comm(), head_node_reference, 0);
     assert(m_pid == head_node_reference && "global max_seen_pid has diverged");
@@ -759,11 +762,11 @@ void ParticleHandle::do_construct(VariantMap const &params) {
   context()->parallel_try_catch([&]() {
     // if we are not constructing a particle from a checkpoint file,
     // check the quaternion is not accidentally set twice by the user
-    if (not has_param("__cpt_sentinel")) {
+    if (not params.contains("__cpt_sentinel")) {
       auto formatter =
           boost::format("Contradicting particle attributes: '%s' and '%s'. %s");
       for (auto const &[prop1, prop2, reason] : contradicting_arguments_quat) {
-        if (has_param(prop1) and has_param(prop2)) {
+        if (params.contains(prop1) and params.contains(prop2)) {
           auto const err_msg = boost::str(formatter % prop1 % prop2 % reason);
           throw std::invalid_argument(err_msg);
         }
@@ -780,29 +783,16 @@ void ParticleHandle::do_construct(VariantMap const &params) {
       /* clang-format off */
       // set particle properties (filter out read-only and deferred properties)
       std::set<std::string> const skip = {
-          "pos_folded", "pos", "quat", "director", "id",
-          "exclusions", "dip", "node", "image_box", "bonds",
+          "pos_folded", "pos", "id", "exclusions", "node", "image_box", "bonds",
           "lees_edwards_flag", "__cpt_sentinel",
       };
       /* clang-format on */
-#ifdef ROTATION
-      // multiple parameters can potentially set the quaternion, but only one
-      // can be allowed to; these conditionals are required to handle a reload
-      // from a checkpoint file, where all properties exist (avoids accidentally
-      // overwriting the quaternion by the default-constructed dipole moment)
-      for (std::string name : {"quat", "director", "dip"}) {
-        if (has_param(name)) {
-          do_set_parameter(name, params.at(name));
-          break;
-        }
-      }
-#endif // ROTATION
       for (auto const &name : get_parameter_insertion_order()) {
-        if (params.count(name) != 0ul and skip.count(name) == 0ul) {
+        if (params.contains(name) and not skip.contains(name)) {
           do_set_parameter(name, params.at(name));
         }
       }
-      if (not has_param("type")) {
+      if (not params.contains("type")) {
         do_set_parameter("type", 0);
       }
     });
diff --git a/src/script_interface/particle_data/ParticleHandle.hpp b/src/script_interface/particle_data/ParticleHandle.hpp
index eee2c9f69ea..5c6bb460fca 100644
--- a/src/script_interface/particle_data/ParticleHandle.hpp
+++ b/src/script_interface/particle_data/ParticleHandle.hpp
@@ -17,8 +17,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#ifndef ESPRESSO_SRC_SCRIPT_INTERFACE_PARTICLE_DATA_PARTICLE_HANDLE_HPP
-#define ESPRESSO_SRC_SCRIPT_INTERFACE_PARTICLE_DATA_PARTICLE_HANDLE_HPP
+#pragma once
 
 #include "script_interface/ScriptInterface.hpp"
 #include "script_interface/auto_parameters/AutoParameters.hpp"
@@ -57,5 +56,3 @@ class ParticleHandle : public AutoParameters<ParticleHandle> {
 
 } // namespace Particles
 } // namespace ScriptInterface
-
-#endif
diff --git a/src/script_interface/particle_data/ParticleList.cpp b/src/script_interface/particle_data/ParticleList.cpp
index cbf82b4171e..e9b2ccd7a30 100644
--- a/src/script_interface/particle_data/ParticleList.cpp
+++ b/src/script_interface/particle_data/ParticleList.cpp
@@ -34,8 +34,9 @@
 
 #include <boost/mpi/collectives.hpp>
 #include <boost/mpi/communicator.hpp>
-#include <boost/range/algorithm.hpp>
 
+#include <algorithm>
+#include <cstddef>
 #include <memory>
 #include <stdexcept>
 #include <string>
@@ -67,7 +68,7 @@ std::string ParticleList::get_internal_state() const {
   auto const p_ids = get_particle_ids();
   std::vector<std::string> object_states(p_ids.size());
 
-  boost::transform(p_ids, object_states.begin(), [this](auto const p_id) {
+  std::ranges::transform(p_ids, object_states.begin(), [this](auto const p_id) {
     auto p_obj =
         context()->make_shared("Particles::ParticleHandle", {{"id", p_id}});
     auto &p_handle = dynamic_cast<ParticleHandle &>(*p_obj);
@@ -138,7 +139,7 @@ static void auto_exclusions(boost::mpi::communicator const &comm,
   for (auto const &p : cell_structure.local_particles()) {
     auto const pid1 = p.id();
     for (auto const bond : p.bonds()) {
-      if (bond.partner_ids().size() == 1) {
+      if (bond.partner_ids().size() == 1u) {
         auto const pid2 = bond.partner_ids()[0];
         if (pid1 != pid2) {
           bonded_pairs.emplace_back(pid1);
@@ -174,13 +175,13 @@ static void auto_exclusions(boost::mpi::communicator const &comm,
       for (auto const pid1 : pids) {
         // loop over partners (counter-based loops due to iterator invalidation)
         // NOLINTNEXTLINE(modernize-loop-convert)
-        for (int i = 0; i < partners[pid1].size(); ++i) {
+        for (std::size_t i = 0u; i < partners[pid1].size(); ++i) {
           auto const [pid2, dist21] = partners[pid1][i];
           if (dist21 > n_bonds_max)
             continue;
           // loop over all partners of the partner
           // NOLINTNEXTLINE(modernize-loop-convert)
-          for (int j = 0; j < partners[pid2].size(); ++j) {
+          for (std::size_t j = 0u; j < partners[pid2].size(); ++j) {
             auto const [pid3, dist32] = partners[pid2][j];
             auto const dist31 = dist32 + dist21;
             if (dist31 > n_bonds_max)
diff --git a/src/script_interface/particle_data/ParticleList.hpp b/src/script_interface/particle_data/ParticleList.hpp
index 9506506141c..7f8cbdc6d80 100644
--- a/src/script_interface/particle_data/ParticleList.hpp
+++ b/src/script_interface/particle_data/ParticleList.hpp
@@ -17,8 +17,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#ifndef ESPRESSO_SRC_SCRIPT_INTERFACE_PARTICLE_DATA_PARTICLE_LIST_HPP
-#define ESPRESSO_SRC_SCRIPT_INTERFACE_PARTICLE_DATA_PARTICLE_LIST_HPP
+#pragma once
 
 #include "script_interface/ScriptInterface.hpp"
 
@@ -33,7 +32,7 @@ class ParticleList : public ObjectHandle {
   Variant do_call_method(std::string const &name,
                          VariantMap const &params) override;
 
-  void do_construct(VariantMap const &params) override {}
+  void do_construct(VariantMap const &) override {}
 
 private:
   std::string get_internal_state() const override;
@@ -42,5 +41,3 @@ class ParticleList : public ObjectHandle {
 
 } // namespace Particles
 } // namespace ScriptInterface
-
-#endif
diff --git a/src/script_interface/particle_data/ParticleSlice.cpp b/src/script_interface/particle_data/ParticleSlice.cpp
index eef27f91f85..4b0da0aa56e 100644
--- a/src/script_interface/particle_data/ParticleSlice.cpp
+++ b/src/script_interface/particle_data/ParticleSlice.cpp
@@ -23,8 +23,6 @@
 
 #include "core/particle_node.hpp"
 
-#include <utils/Span.hpp>
-
 #include <string>
 #include <vector>
 
@@ -40,7 +38,7 @@ Variant ParticleSlice::do_call_method(std::string const &name,
                                       VariantMap const &params) {
   if (name == "prefetch_particle_data") {
     auto p_ids = get_value<std::vector<int>>(params, "chunk");
-    prefetch_particle_data(Utils::Span<int>(p_ids));
+    prefetch_particle_data(p_ids);
   } else if (name == "particle_exists") {
     return particle_exists(get_value<int>(params, "p_id"));
   }
diff --git a/src/script_interface/particle_data/ParticleSlice.hpp b/src/script_interface/particle_data/ParticleSlice.hpp
index b6c2ae57bdb..471a6dcecac 100644
--- a/src/script_interface/particle_data/ParticleSlice.hpp
+++ b/src/script_interface/particle_data/ParticleSlice.hpp
@@ -17,8 +17,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#ifndef ESPRESSO_SRC_SCRIPT_INTERFACE_PARTICLE_DATA_PARTICLE_SLICE_HPP
-#define ESPRESSO_SRC_SCRIPT_INTERFACE_PARTICLE_DATA_PARTICLE_SLICE_HPP
+#pragma once
 
 #include "ParticleHandle.hpp"
 
@@ -53,5 +52,3 @@ class ParticleSlice : public AutoParameters<ParticleSlice> {
 
 } // namespace Particles
 } // namespace ScriptInterface
-
-#endif
diff --git a/src/script_interface/particle_data/Polymer.hpp b/src/script_interface/particle_data/Polymer.hpp
index 504d568ac01..48720966afe 100644
--- a/src/script_interface/particle_data/Polymer.hpp
+++ b/src/script_interface/particle_data/Polymer.hpp
@@ -17,8 +17,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#ifndef ESPRESSO_SRC_SCRIPT_INTERFACE_PARTICLE_DATA_POLYMER_HPP
-#define ESPRESSO_SRC_SCRIPT_INTERFACE_PARTICLE_DATA_POLYMER_HPP
+#pragma once
 
 #include "script_interface/ScriptInterface.hpp"
 
@@ -35,5 +34,3 @@ class Polymer : public AutoParameters<Polymer> {
 
 } // namespace Particles
 } // namespace ScriptInterface
-
-#endif
diff --git a/src/script_interface/particle_data/initialize.hpp b/src/script_interface/particle_data/initialize.hpp
index 5b239d2068e..524b8849c51 100644
--- a/src/script_interface/particle_data/initialize.hpp
+++ b/src/script_interface/particle_data/initialize.hpp
@@ -17,8 +17,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#ifndef ESPRESSO_SRC_SCRIPT_INTERFACE_PARTICLE_DATA_INITIALIZE_HPP
-#define ESPRESSO_SRC_SCRIPT_INTERFACE_PARTICLE_DATA_INITIALIZE_HPP
+#pragma once
 
 #include <script_interface/ObjectHandle.hpp>
 
@@ -31,5 +30,3 @@ void initialize(Utils::Factory<ObjectHandle> *om);
 
 } // namespace Particles
 } // namespace ScriptInterface
-
-#endif
diff --git a/src/script_interface/reaction_methods/ReactionAlgorithm.cpp b/src/script_interface/reaction_methods/ReactionAlgorithm.cpp
index 64d9873201d..1c0b878ad0b 100644
--- a/src/script_interface/reaction_methods/ReactionAlgorithm.cpp
+++ b/src/script_interface/reaction_methods/ReactionAlgorithm.cpp
@@ -102,19 +102,6 @@ ReactionAlgorithm::ReactionAlgorithm() {
         }}});
 }
 
-static auto get_real_particle(boost::mpi::communicator const &comm, int p_id) {
-  assert(p_id >= 0);
-  auto const &system = ::System::get_system();
-  auto &cell_structure = *system.cell_structure;
-  auto ptr = cell_structure.get_local_particle(p_id);
-  if (ptr != nullptr and ptr->is_ghost()) {
-    ptr = nullptr;
-  }
-  assert(boost::mpi::all_reduce(comm, static_cast<int>(ptr != nullptr),
-                                std::plus<>()) == 1);
-  return ptr;
-}
-
 Variant ReactionAlgorithm::do_call_method(std::string const &name,
                                           VariantMap const &params) {
   if (name == "calculate_factorial_expression") {
diff --git a/src/script_interface/scafacos/scafacos.cpp b/src/script_interface/scafacos/scafacos.cpp
index a4249f0eb4d..e2f2698a185 100644
--- a/src/script_interface/scafacos/scafacos.cpp
+++ b/src/script_interface/scafacos/scafacos.cpp
@@ -30,13 +30,13 @@
 
 #include <utils/demangle.hpp>
 
-#include <boost/algorithm/string.hpp>
-#include <boost/optional.hpp>
+#include <boost/variant.hpp>
 
 #include <algorithm>
 #include <functional>
 #include <iomanip>
 #include <iterator>
+#include <optional>
 #include <sstream>
 #include <stdexcept>
 #include <string>
@@ -52,12 +52,10 @@ std::vector<std::string> available_methods() {
 
 struct ConvertToStringVector
     : public boost::static_visitor<std::vector<std::string>> {
-  auto operator()(std::string const &value) const {
-    return std::vector<std::string>{value};
-  }
+  auto operator()(std::string const &value) const { return result_type{value}; }
 
   template <typename T, typename = std::enable_if_t<!std::is_arithmetic_v<T>>>
-  std::vector<std::string> operator()(T const &value) const {
+  result_type operator()(T const &) const {
     throw std::runtime_error("Cannot convert " + Utils::demangle<T>());
   }
 
@@ -70,12 +68,10 @@ struct ConvertToStringVector
     return operator()(to_str(value));
   }
 
-  auto operator()(std::vector<std::string> const &values) const {
-    return values;
-  }
+  auto operator()(result_type const &values) const { return values; }
 
   auto operator()(std::vector<Variant> const &values) const {
-    std::vector<std::string> values_str;
+    result_type values_str;
     for (auto const &v : values) {
       values_str.emplace_back(boost::apply_visitor(*this, v).front());
     }
@@ -84,7 +80,7 @@ struct ConvertToStringVector
 
   template <typename T, typename = std::enable_if_t<std::is_arithmetic_v<T>>>
   auto operator()(std::vector<T> const &values) const {
-    std::vector<std::string> values_str;
+    result_type values_str;
     for (auto const &v : values) {
       values_str.emplace_back(to_str(v));
     }
@@ -104,9 +100,7 @@ struct ConvertToStringVector
 
 struct GetParameterList
     : public boost::static_visitor<std::unordered_map<std::string, Variant>> {
-  auto operator()(std::unordered_map<std::string, Variant> const &obj) const {
-    return obj;
-  }
+  auto operator()(result_type const &obj) const { return obj; }
 
   template <typename T>
   auto operator()(std::unordered_map<T, Variant> const &obj) const {
@@ -142,7 +136,7 @@ std::string serialize_parameters(Variant const &pack) {
 }
 
 template <typename T>
-boost::optional<Variant> string_to_number(std::string const &s) {
+std::optional<Variant> string_to_number(std::string const &s) {
   auto deserializer = std::istringstream(s);
   T result;
   deserializer >> result;
@@ -165,9 +159,11 @@ deserialize_parameters(std::string const &parameters) {
   auto const numbers = std::string("-0123456789");
   std::unordered_map<std::string, Variant> method_params{};
   std::vector<std::string> flat_array;
-  // Clang 10 false positive: https://github.com/boostorg/algorithm/issues/63
-  // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
-  boost::split(flat_array, parameters, boost::is_any_of(","));
+  std::istringstream buffer;
+  buffer.str(parameters);
+  for (std::string line; std::getline(buffer, line, ',');) {
+    flat_array.emplace_back(line);
+  }
   for (auto it = flat_array.begin(); it != flat_array.end();) {
     auto const parameter_name = *it;
     auto parameter_list = std::vector<Variant>{};
diff --git a/src/script_interface/system/CudaInitHandle.cpp b/src/script_interface/system/CudaInitHandle.cpp
index ace3f752c84..8f670b5a4f9 100644
--- a/src/script_interface/system/CudaInitHandle.cpp
+++ b/src/script_interface/system/CudaInitHandle.cpp
@@ -24,6 +24,10 @@
 #include "core/cuda/init.hpp"
 #include "core/cuda/utils.hpp"
 
+#if defined(CUDA) && defined(WALBERLA)
+#include "walberla_bridge/lattice_boltzmann/lb_walberla_init.hpp"
+#endif
+
 #include <string>
 #include <unordered_map>
 #include <utility>
@@ -59,7 +63,7 @@ Variant CudaInitHandle::do_call_method(std::string const &name,
       invoke_skip_cuda_exceptions([&n_gpus]() { n_gpus = cuda_get_n_gpus(); });
       for (int i = 0; i < n_gpus; ++i) {
         invoke_skip_cuda_exceptions([&devices, i]() {
-          char gpu_name_buffer[4 + 64];
+          char gpu_name_buffer[256] = {'\0'};
           cuda_get_gpu_name(i, gpu_name_buffer);
           devices[i] = std::string{gpu_name_buffer};
         });
@@ -100,6 +104,14 @@ Variant CudaInitHandle::do_call_method(std::string const &name,
 #endif // CUDA
     return n_gpus;
   }
+#if defined(CUDA) && defined(WALBERLA)
+  if (name == "set_device_id_per_rank") {
+    if (cuda_get_n_gpus()) {
+      set_device_id_per_rank();
+    }
+    return {};
+  }
+#endif
   return {};
 }
 
diff --git a/src/script_interface/system/System.cpp b/src/script_interface/system/System.cpp
index a6ffb25e8e1..dc5015e66e1 100644
--- a/src/script_interface/system/System.cpp
+++ b/src/script_interface/system/System.cpp
@@ -111,6 +111,7 @@ System::System() : m_instance{}, m_leaves{std::make_shared<Leaves>()} {
            if (not(new_value > Utils::Vector3d::broadcast(0.))) {
              throw std::domain_error("Attribute 'box_l' must be > 0");
            }
+           m_instance->veto_boxl_change();
            m_instance->box_geo->set_length(new_value);
            m_instance->on_boxl_change();
          });
@@ -256,19 +257,23 @@ Variant System::do_call_method(std::string const &name,
     auto &box_geo = *m_instance->box_geo;
     auto const coord = get_value<int>(parameters, "coord");
     auto const length = get_value<double>(parameters, "length");
+    assert(coord >= 0);
+    assert(coord != 3 or ((box_geo.length()[0] == box_geo.length()[1]) and
+                          (box_geo.length()[1] == box_geo.length()[2])));
     auto const scale = (coord == 3) ? length * box_geo.length_inv()[0]
                                     : length * box_geo.length_inv()[coord];
     context()->parallel_try_catch([&]() {
       if (length <= 0.) {
         throw std::domain_error("Parameter 'd_new' must be > 0");
       }
+      m_instance->veto_boxl_change(true);
     });
     auto new_value = Utils::Vector3d{};
     if (coord == 3) {
       new_value = Utils::Vector3d::broadcast(length);
     } else {
       new_value = box_geo.length();
-      new_value[coord] = length;
+      new_value[static_cast<unsigned>(coord)] = length;
     }
     // when shrinking, rescale the particles first
     if (scale <= 1.) {
diff --git a/src/script_interface/system/SystemFacade.hpp b/src/script_interface/system/SystemFacade.hpp
index e4100342cff..a6e30168d75 100644
--- a/src/script_interface/system/SystemFacade.hpp
+++ b/src/script_interface/system/SystemFacade.hpp
@@ -24,6 +24,7 @@
 #include "script_interface/ScriptInterface.hpp"
 
 #include <memory>
+#include <span>
 #include <string>
 
 namespace ScriptInterface {
@@ -45,14 +46,14 @@ class SystemFacade : public ObjectHandle {
     // create a dummy system to be able to read the list of parameters
     m_instance = std::make_shared<System>();
   }
-  void do_construct(VariantMap const &) override{};
+  void do_construct(VariantMap const &) override {};
   Variant get_parameter(const std::string &name) const override {
     return m_instance->get_parameter(name);
   }
   void do_set_parameter(const std::string &name, const Variant &v) override {
     m_instance->do_set_parameter(name, v);
   }
-  Utils::Span<const boost::string_ref> valid_parameters() const override {
+  std::span<const boost::string_ref> valid_parameters() const override {
     return m_instance->valid_parameters();
   }
   Variant do_call_method(std::string const &name,
diff --git a/src/script_interface/tests/Accumulators_test.cpp b/src/script_interface/tests/Accumulators_test.cpp
index 8cba30ee3f1..db69e5e0ce9 100644
--- a/src/script_interface/tests/Accumulators_test.cpp
+++ b/src/script_interface/tests/Accumulators_test.cpp
@@ -53,7 +53,7 @@ namespace Observables {
 class MockObservable : public Observable {
 public:
   std::vector<double>
-  operator()(boost::mpi::communicator const &comm) const override {
+  operator()(boost::mpi::communicator const &) const override {
     return {1., 2., 3., 4.};
   }
   std::vector<std::size_t> shape() const override { return {2u, 2u}; }
diff --git a/src/script_interface/tests/AutoParameter_test.cpp b/src/script_interface/tests/AutoParameter_test.cpp
index e71c38d687a..ab4d720766d 100644
--- a/src/script_interface/tests/AutoParameter_test.cpp
+++ b/src/script_interface/tests/AutoParameter_test.cpp
@@ -44,9 +44,8 @@ BOOST_AUTO_TEST_CASE(read_only) {
   BOOST_CHECK(boost::get<int>(p.get()) == i);
 
   /* Setting should throw */
-  BOOST_CHECK_EXCEPTION(
-      p.set(2), AutoParameter::WriteError,
-      [](AutoParameter::WriteError const &e) { return true; });
+  BOOST_CHECK_EXCEPTION(p.set(2), AutoParameter::WriteError,
+                        [](AutoParameter::WriteError const &) { return true; });
 }
 
 BOOST_AUTO_TEST_CASE(user_provided) {
diff --git a/src/script_interface/tests/AutoParameters_test.cpp b/src/script_interface/tests/AutoParameters_test.cpp
index 7b516bb283a..3445dca29d7 100644
--- a/src/script_interface/tests/AutoParameters_test.cpp
+++ b/src/script_interface/tests/AutoParameters_test.cpp
@@ -20,7 +20,7 @@
 #define BOOST_TEST_DYN_LINK
 #include <boost/test/unit_test.hpp>
 
-#include <boost/range/algorithm/find.hpp>
+#include <boost/variant.hpp>
 
 #include "script_interface/auto_parameters/AutoParameters.hpp"
 
@@ -36,11 +36,11 @@ struct A : AutoParameters<A> {
 BOOST_AUTO_TEST_CASE(basic) {
   A a{0, 42};
 
-  auto const &valid_parameters = a.valid_parameters();
+  auto const &parameters = a.valid_parameters();
 
-  BOOST_CHECK(valid_parameters.size() == 2);
-  BOOST_CHECK(boost::find(valid_parameters, "i") != valid_parameters.end());
-  BOOST_CHECK(boost::find(valid_parameters, "j") != valid_parameters.end());
+  BOOST_CHECK(parameters.size() == 2u);
+  BOOST_CHECK(std::ranges::find(parameters, "i") != parameters.end());
+  BOOST_CHECK(std::ranges::find(parameters, "j") != parameters.end());
 
   BOOST_CHECK(0 == boost::get<int>(a.get_parameter("i")));
   BOOST_CHECK(42 == boost::get<int>(a.get_parameter("j")));
diff --git a/src/script_interface/tests/CMakeLists.txt b/src/script_interface/tests/CMakeLists.txt
index 399e44255a7..dc34f91e1bf 100644
--- a/src/script_interface/tests/CMakeLists.txt
+++ b/src/script_interface/tests/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2020-2022 The ESPResSo project
+# Copyright (C) 2020-2024 The ESPResSo project
 #
 # This file is part of ESPResSo.
 #
@@ -17,46 +17,42 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-include(unit_test)
+include(espresso_unit_test)
 
-unit_test(NAME ObjectHandle_test SRC ObjectHandle_test.cpp DEPENDS
-          espresso::script_interface)
-unit_test(NAME AutoParameters_test SRC AutoParameters_test.cpp DEPENDS
-          espresso::script_interface)
-unit_test(NAME AutoParameter_test SRC AutoParameter_test.cpp DEPENDS
-          espresso::script_interface)
-unit_test(NAME Variant_test SRC Variant_test.cpp DEPENDS
-          espresso::script_interface)
-unit_test(NAME get_value_test SRC get_value_test.cpp DEPENDS
-          espresso::script_interface)
-unit_test(NAME None_test SRC None_test.cpp DEPENDS espresso::script_interface)
-unit_test(NAME reduction_test SRC reduction_test.cpp DEPENDS
-          espresso::script_interface Boost::mpi MPI::MPI_CXX NUM_PROC 4)
-unit_test(NAME LocalContext_test SRC LocalContext_test.cpp DEPENDS
-          espresso::script_interface Boost::mpi MPI::MPI_CXX NUM_PROC 1)
-unit_test(NAME GlobalContext_test SRC GlobalContext_test.cpp DEPENDS
-          espresso::script_interface Boost::mpi MPI::MPI_CXX NUM_PROC 2)
-unit_test(NAME Exception_test SRC Exception_test.cpp DEPENDS
-          espresso::script_interface)
-unit_test(NAME ParallelExceptionHandler_test SRC
-          ParallelExceptionHandler_test.cpp DEPENDS espresso::script_interface
-          espresso::core Boost::mpi MPI::MPI_CXX NUM_PROC 2)
-unit_test(NAME packed_variant_test SRC packed_variant_test.cpp DEPENDS
-          espresso::script_interface)
-unit_test(NAME ObjectList_test SRC ObjectList_test.cpp DEPENDS
-          espresso::script_interface espresso::core Boost::mpi)
-unit_test(NAME ObjectMap_test SRC ObjectMap_test.cpp DEPENDS
-          espresso::script_interface espresso::core Boost::mpi)
-unit_test(NAME Accumulators_test SRC Accumulators_test.cpp DEPENDS
-          espresso::script_interface espresso::core Boost::mpi MPI::MPI_CXX
-          NUM_PROC 2)
-unit_test(NAME Constraints_test SRC Constraints_test.cpp DEPENDS
-          espresso::script_interface espresso::core)
-unit_test(NAME Actors_test SRC Actors_test.cpp DEPENDS
-          espresso::script_interface espresso::core)
-unit_test(NAME ConstantpHEnsemble_test SRC ConstantpHEnsemble_test.cpp DEPENDS
-          espresso::core espresso::script_interface Boost::mpi MPI::MPI_CXX
-          NUM_PROC 2)
-unit_test(NAME ReactionEnsemble_test SRC ReactionEnsemble_test.cpp DEPENDS
-          espresso::core espresso::script_interface Boost::mpi MPI::MPI_CXX
-          NUM_PROC 2)
+espresso_unit_test(SRC ObjectHandle_test.cpp DEPENDS espresso::script_interface)
+espresso_unit_test(SRC AutoParameters_test.cpp DEPENDS
+                   espresso::script_interface)
+espresso_unit_test(SRC AutoParameter_test.cpp DEPENDS
+                   espresso::script_interface)
+espresso_unit_test(SRC Variant_test.cpp DEPENDS espresso::script_interface)
+espresso_unit_test(SRC get_value_test.cpp DEPENDS espresso::script_interface)
+espresso_unit_test(SRC None_test.cpp DEPENDS espresso::script_interface)
+espresso_unit_test(SRC reduction_test.cpp DEPENDS espresso::script_interface
+                   Boost::mpi MPI::MPI_CXX NUM_PROC 4)
+espresso_unit_test(SRC LocalContext_test.cpp DEPENDS espresso::script_interface
+                   Boost::mpi MPI::MPI_CXX NUM_PROC 1)
+espresso_unit_test(
+  SRC GlobalContext_test.cpp DEPENDS espresso::script_interface Boost::mpi
+  MPI::MPI_CXX NUM_PROC 2)
+espresso_unit_test(SRC Exception_test.cpp DEPENDS espresso::script_interface)
+espresso_unit_test(
+  SRC ParallelExceptionHandler_test.cpp DEPENDS espresso::script_interface
+  espresso::core Boost::mpi MPI::MPI_CXX NUM_PROC 2)
+espresso_unit_test(SRC packed_variant_test.cpp DEPENDS
+                   espresso::script_interface)
+espresso_unit_test(SRC ObjectList_test.cpp DEPENDS espresso::script_interface
+                   espresso::core Boost::mpi)
+espresso_unit_test(SRC ObjectMap_test.cpp DEPENDS espresso::script_interface
+                   espresso::core Boost::mpi)
+espresso_unit_test(SRC Accumulators_test.cpp DEPENDS espresso::script_interface
+                   espresso::core Boost::mpi MPI::MPI_CXX NUM_PROC 2)
+espresso_unit_test(SRC Constraints_test.cpp DEPENDS espresso::script_interface
+                   espresso::core)
+espresso_unit_test(SRC Actors_test.cpp DEPENDS espresso::script_interface
+                   espresso::core)
+espresso_unit_test(
+  SRC ConstantpHEnsemble_test.cpp DEPENDS espresso::core
+  espresso::script_interface Boost::mpi MPI::MPI_CXX NUM_PROC 2)
+espresso_unit_test(
+  SRC ReactionEnsemble_test.cpp DEPENDS espresso::core
+  espresso::script_interface Boost::mpi MPI::MPI_CXX NUM_PROC 2)
diff --git a/src/script_interface/tests/GlobalContext_test.cpp b/src/script_interface/tests/GlobalContext_test.cpp
index da0401ad413..a782d4d99f2 100644
--- a/src/script_interface/tests/GlobalContext_test.cpp
+++ b/src/script_interface/tests/GlobalContext_test.cpp
@@ -31,6 +31,7 @@
 #include <algorithm>
 #include <cstddef>
 #include <memory>
+#include <span>
 #include <string>
 
 static std::weak_ptr<boost::mpi::environment> mpi_env;
@@ -49,11 +50,10 @@ struct Dummy : si::ObjectHandle {
     params[name] = val;
   }
 
-  Utils::Span<const boost::string_ref> valid_parameters() const override {
+  std::span<const boost::string_ref> valid_parameters() const override {
     static const boost::string_ref parameter_names[] = {"id", "object_param"};
 
-    return Utils::make_const_span(parameter_names,
-                                  std::min(params.size(), std::size_t{2u}));
+    return {parameter_names, std::min(params.size(), std::size_t{2u})};
   }
 };
 
diff --git a/src/script_interface/tests/LocalContext_test.cpp b/src/script_interface/tests/LocalContext_test.cpp
index 5afdf3fa910..b9e2edeb904 100644
--- a/src/script_interface/tests/LocalContext_test.cpp
+++ b/src/script_interface/tests/LocalContext_test.cpp
@@ -30,6 +30,7 @@
 #include <algorithm>
 #include <cstddef>
 #include <memory>
+#include <span>
 #include <string>
 
 namespace si = ScriptInterface;
@@ -46,11 +47,10 @@ struct Dummy : si::ObjectHandle {
     params[name] = val;
   }
 
-  Utils::Span<const boost::string_ref> valid_parameters() const override {
+  std::span<const boost::string_ref> valid_parameters() const override {
     static const boost::string_ref parameter_names[] = {"id", "object_param"};
 
-    return Utils::make_const_span(parameter_names,
-                                  std::min(params.size(), std::size_t{2u}));
+    return {parameter_names, std::min(params.size(), std::size_t{2u})};
   }
 };
 
diff --git a/src/script_interface/tests/ObjectHandle_test.cpp b/src/script_interface/tests/ObjectHandle_test.cpp
index f2927e5bfc5..32ced4486d1 100644
--- a/src/script_interface/tests/ObjectHandle_test.cpp
+++ b/src/script_interface/tests/ObjectHandle_test.cpp
@@ -78,13 +78,13 @@ struct LogHandle : public ObjectHandle {
     call_log.emplace_back(MockCall::Construct{&params});
   }
 
-  void do_set_parameter(const std::string &name,
-                        const Variant &value) override {
+  void do_set_parameter(std::string const &name,
+                        Variant const &value) override {
     call_log.emplace_back(MockCall::SetParameter{&name, &value});
   }
 
-  Variant do_call_method(const std::string &name,
-                         const VariantMap &params) override {
+  Variant do_call_method(std::string const &name,
+                         VariantMap const &params) override {
     call_log.emplace_back(MockCall::CallMethod{&name, &params});
 
     return none;
@@ -177,19 +177,19 @@ namespace Testing {
  * Logging mock for Context.
  */
 struct LogContext : public Context {
-  std::vector<std::pair<const ObjectHandle *, MockCall::Info>> call_log;
+  std::vector<std::pair<ObjectHandle const *, MockCall::Info>> call_log;
 
-  void notify_call_method(const ObjectHandle *o, std::string const &n,
+  void notify_call_method(ObjectHandle const *o, std::string const &n,
                           VariantMap const &p) override {
     call_log.emplace_back(o, MockCall::CallMethod{&n, &p});
   }
-  void notify_set_parameter(const ObjectHandle *o, std::string const &n,
+  void notify_set_parameter(ObjectHandle const *o, std::string const &n,
                             Variant const &v) override {
     call_log.emplace_back(o, MockCall::SetParameter{&n, &v});
   }
 
   std::shared_ptr<ObjectHandle> make_shared(std::string const &,
-                                            const VariantMap &) override {
+                                            VariantMap const &) override {
     auto it = std::make_shared<Testing::LogHandle>();
     set_context(it.get());
 
@@ -200,7 +200,7 @@ struct LogContext : public Context {
     return make_shared(s, v);
   }
 
-  boost::string_ref name(const ObjectHandle *o) const override {
+  boost::string_ref name(ObjectHandle const *) const override {
     return "Dummy";
   }
 
diff --git a/src/script_interface/tests/ObjectList_test.cpp b/src/script_interface/tests/ObjectList_test.cpp
index 4298d37fa1b..2ab36c60d1d 100644
--- a/src/script_interface/tests/ObjectList_test.cpp
+++ b/src/script_interface/tests/ObjectList_test.cpp
@@ -23,8 +23,6 @@
 #define BOOST_TEST_DYN_LINK
 #include <boost/test/unit_test.hpp>
 
-#include <boost/range/algorithm/find.hpp>
-
 #include "script_interface/LocalContext.hpp"
 #include "script_interface/ObjectList.hpp"
 
@@ -44,15 +42,13 @@ struct ObjectListImpl : ObjectList<ObjectHandle> {
 
 private:
   bool has_in_core(const ObjectRef &obj_ptr) const override {
-    return std::find(mock_core.begin(), mock_core.end(), obj_ptr) !=
-           mock_core.end();
+    return std::ranges::count(mock_core, obj_ptr) >= 1;
   }
   void add_in_core(const ObjectRef &obj_ptr) override {
     mock_core.push_back(obj_ptr);
   }
   void remove_in_core(const ObjectRef &obj_ptr) override {
-    mock_core.erase(std::remove(mock_core.begin(), mock_core.end(), obj_ptr),
-                    mock_core.end());
+    std::erase(mock_core, obj_ptr);
   }
 };
 
@@ -68,7 +64,7 @@ BOOST_AUTO_TEST_CASE(adding_elements) {
   list.add(e);
   BOOST_CHECK(list.elements().back() == e);
   // And is added to the core
-  BOOST_CHECK(boost::find(list.mock_core, e) != list.mock_core.end());
+  BOOST_CHECK(std::ranges::count(list.mock_core, e) == 1);
 }
 
 BOOST_AUTO_TEST_CASE(removing_elements) {
@@ -78,9 +74,9 @@ BOOST_AUTO_TEST_CASE(removing_elements) {
   ObjectListImpl list;
   list.add(e);
   list.remove(e);
-  BOOST_CHECK(boost::find(list.elements(), e) == list.elements().end());
+  BOOST_CHECK(std::ranges::count(list.elements(), e) == 0);
   // And is removed from the core
-  BOOST_CHECK(boost::find(list.mock_core, e) == list.mock_core.end());
+  BOOST_CHECK(std::ranges::count(list.mock_core, e) == 0);
 }
 
 BOOST_AUTO_TEST_CASE(clearing_elements) {
diff --git a/src/script_interface/tests/get_value_test.cpp b/src/script_interface/tests/get_value_test.cpp
index 00baf0cf18f..8674aba1859 100644
--- a/src/script_interface/tests/get_value_test.cpp
+++ b/src/script_interface/tests/get_value_test.cpp
@@ -161,8 +161,9 @@ BOOST_AUTO_TEST_CASE(unordered_map) {
   }
 }
 
-auto exception_message_predicate(std::string const &pattern) {
-  return [=](std::exception const &ex) {
+struct exception_message_predicate {
+  std::string pattern;
+  auto operator()(std::exception const &ex) const {
     boost::test_tools::predicate_result result = true;
     std::string const what = ex.what();
     std::smatch match;
@@ -172,15 +173,15 @@ auto exception_message_predicate(std::string const &pattern) {
                        << "doesn't match pattern \"" << pattern << "\"";
     }
     return result;
-  };
-}
+  }
+};
 
 BOOST_AUTO_TEST_CASE(check_exceptions) {
   using ScriptInterface::get_value;
   using ScriptInterface::Variant;
 
-  assert(!!exception_message_predicate("A")(std::runtime_error("A")));
-  assert(!exception_message_predicate("A")(std::runtime_error("B")));
+  assert(!!exception_message_predicate{"A"}(std::runtime_error("A")));
+  assert(!exception_message_predicate{"A"}(std::runtime_error("B")));
 
   using so_ptr_t = std::shared_ptr<ScriptInterface::ObjectHandle>;
 
@@ -195,9 +196,9 @@ BOOST_AUTO_TEST_CASE(check_exceptions) {
     auto const obj_variant_pattern = Utils::demangle<so_ptr_t>();
     auto const what = msg_prefix + "'" + obj_variant_pattern + "'";
     auto const predicate_nullptr =
-        exception_message_predicate(what + " is a null pointer");
+        exception_message_predicate{what + " is a null pointer"};
     auto const predicate_conversion =
-        exception_message_predicate(what + " is not convertible to 'int'");
+        exception_message_predicate{what + " is not convertible to 'int'"};
     BOOST_CHECK_EXCEPTION(get_value<so_ptr_t>(obj_variant), std::exception,
                           predicate_nullptr);
     BOOST_CHECK_EXCEPTION(get_value<int>(obj_variant), std::exception,
@@ -209,13 +210,13 @@ BOOST_AUTO_TEST_CASE(check_exceptions) {
     auto const vec_variant = Variant{std::vector<Variant>{{so_obj}}};
     auto const vec_variant_pattern = "std::vector<" + variant_sip_name + ">";
     auto const what = msg_prefix + "'" + vec_variant_pattern + "\\{.size=1\\}'";
-    auto const predicate_nullptr = exception_message_predicate(
-        what + " contains a value that is a null pointer");
-    auto const predicate_conversion_containee = exception_message_predicate(
+    auto const predicate_nullptr = exception_message_predicate{
+        what + " contains a value that is a null pointer"};
+    auto const predicate_conversion_containee = exception_message_predicate{
         what + " is not convertible to 'std::vector<int>' because"
-               " it contains a value that is not convertible to 'int'");
-    auto const predicate_conversion = exception_message_predicate(
-        msg_prefix + "'double' is not convertible to 'std::vector<int>'");
+               " it contains a value that is not convertible to 'int'"};
+    auto const predicate_conversion = exception_message_predicate{
+        msg_prefix + "'double' is not convertible to 'std::vector<int>'"};
     BOOST_CHECK_EXCEPTION(get_value<std::vector<so_ptr_t>>(vec_variant),
                           std::exception, predicate_nullptr);
     BOOST_CHECK_EXCEPTION(get_value<std::vector<int>>(vec_variant),
@@ -230,12 +231,12 @@ BOOST_AUTO_TEST_CASE(check_exceptions) {
     auto const map_variant_pattern =
         "std::unordered_map<int, " + variant_sip_name + ">";
     auto const what = msg_prefix + "'" + map_variant_pattern + "'";
-    auto const predicate_nullptr = exception_message_predicate(
-        what + " contains a value that is a null pointer");
-    auto const predicate_conversion = exception_message_predicate(
+    auto const predicate_nullptr = exception_message_predicate{
+        what + " contains a value that is a null pointer"};
+    auto const predicate_conversion = exception_message_predicate{
         what +
         " is not convertible to 'std::unordered_map<int, double>' because"
-        " it contains a value that is not convertible to 'int' or 'double'");
+        " it contains a value that is not convertible to 'int' or 'double'"};
     BOOST_CHECK_EXCEPTION(
         (get_value<std::unordered_map<int, so_ptr_t>>(map_variant)),
         std::exception, predicate_nullptr);
@@ -250,13 +251,13 @@ BOOST_AUTO_TEST_CASE(check_exceptions) {
     auto const map_variant_pattern =
         "std::unordered_map<std::string, " + variant_sip_name + ">";
     auto const what = msg_prefix + "'" + map_variant_pattern + "'";
-    auto const predicate_nullptr = exception_message_predicate(
-        what + " contains a value that is a null pointer");
-    auto const predicate_conversion = exception_message_predicate(
+    auto const predicate_nullptr = exception_message_predicate{
+        what + " contains a value that is a null pointer"};
+    auto const predicate_conversion = exception_message_predicate{
         what +
         " is not convertible to 'std::unordered_map<std::string, int>' because"
         " it contains a value that is not convertible to 'std::string' or "
-        "'int'");
+        "'int'"};
     BOOST_CHECK_EXCEPTION(
         (get_value<std::unordered_map<std::string, so_ptr_t>>(map_variant)),
         std::exception, predicate_nullptr);
diff --git a/src/script_interface/thermostat/thermostat.hpp b/src/script_interface/thermostat/thermostat.hpp
index c2494362d8e..57318778f1a 100644
--- a/src/script_interface/thermostat/thermostat.hpp
+++ b/src/script_interface/thermostat/thermostat.hpp
@@ -612,6 +612,12 @@ class Thermostat : public AutoParameters<Thermostat, System::Leaf> {
 
   Variant do_call_method(std::string const &name,
                          VariantMap const &params) override {
+    if (params.contains("act_on_virtual")) {
+      context()->parallel_try_catch([&]() {
+        throw std::runtime_error(
+            name + "() got an unexpected keyword argument 'act_on_virtual'");
+      });
+    }
     if (name == "set_langevin") {
       setup_thermostat(langevin, params);
       return {};
@@ -701,7 +707,7 @@ class Thermostat : public AutoParameters<Thermostat, System::Leaf> {
   }
 
   /**
-   * @brief Instantiate default-contructed thermostats.
+   * @brief Instantiate default-constructed thermostats.
    * Can only be run on the head node!
    */
   void make_default_constructed_thermostats() {
diff --git a/src/script_interface/walberla/CMakeLists.txt b/src/script_interface/walberla/CMakeLists.txt
index eafd59aef4c..52005addce8 100644
--- a/src/script_interface/walberla/CMakeLists.txt
+++ b/src/script_interface/walberla/CMakeLists.txt
@@ -27,5 +27,8 @@ if(ESPRESSO_BUILD_WITH_WALBERLA)
             ${CMAKE_CURRENT_SOURCE_DIR}/EKSpecies.cpp
             ${CMAKE_CURRENT_SOURCE_DIR}/EKSpeciesNode.cpp
             ${CMAKE_CURRENT_SOURCE_DIR}/EKSpeciesSlice.cpp)
-  target_link_libraries(espresso_script_interface PRIVATE espresso::walberla)
+  target_link_libraries(
+    espresso_script_interface
+    PRIVATE espresso::walberla
+            $<$<BOOL:${ESPRESSO_BUILD_WITH_CUDA}>:espresso::walberla_cuda>)
 endif(ESPRESSO_BUILD_WITH_WALBERLA)
diff --git a/src/script_interface/walberla/EKSpecies.cpp b/src/script_interface/walberla/EKSpecies.cpp
index c38cb1c5dc6..06b79e031c6 100644
--- a/src/script_interface/walberla/EKSpecies.cpp
+++ b/src/script_interface/walberla/EKSpecies.cpp
@@ -88,17 +88,31 @@ Variant EKSpecies::do_call_method(std::string const &method,
   return Base::do_call_method(method, parameters);
 }
 
-void EKSpecies::do_construct(VariantMap const &args) {
-  m_lattice = get_value<std::shared_ptr<LatticeWalberla>>(args, "lattice");
+/** @brief Build the EK kernel from @p params using the m_conv_* factors. */
+void EKSpecies::make_instance(VariantMap const &params) {
+  auto const ek_diffusion =
+      get_value<double>(params, "diffusion") * m_conv_diffusion;
+  auto const ek_ext_efield =
+      get_value<Utils::Vector3d>(params, "ext_efield") * m_conv_ext_efield;
+  auto const ek_density = get_value<double>(params, "density") * m_conv_density;
+  auto const ek_kT = get_value<double>(params, "kT") * m_conv_energy;
+  auto const single_precision = get_value<bool>(params, "single_precision");
+  // the remaining arguments are forwarded without rescaling
+  m_instance = ::walberla::new_ek_walberla(
+      m_lattice->lattice(), ek_diffusion, ek_kT,
+      get_value<double>(params, "valency"), ek_ext_efield, ek_density,
+      get_value<bool>(params, "advection"),
+      get_value<bool>(params, "friction_coupling"), single_precision);
+}
+
+void EKSpecies::do_construct(VariantMap const &params) {
+  m_lattice = get_value<std::shared_ptr<LatticeWalberla>>(params, "lattice");
   m_vtk_writers =
-      get_value_or<decltype(m_vtk_writers)>(args, "vtk_writers", {});
-  auto const single_precision = get_value<bool>(args, "single_precision");
+      get_value_or<decltype(m_vtk_writers)>(params, "vtk_writers", {});
   auto const agrid = get_value<double>(m_lattice->get_parameter("agrid"));
-  auto const diffusion = get_value<double>(args, "diffusion");
-  auto const ext_efield = get_value<Utils::Vector3d>(args, "ext_efield");
-  auto const density = get_value<double>(args, "density");
-  auto const kT = get_value<double>(args, "kT");
-  auto const tau = m_tau = get_value<double>(args, "tau");
+  auto const density = get_value<double>(params, "density");
+  auto const kT = get_value<double>(params, "kT");
+  auto const tau = m_tau = get_value<double>(params, "tau");
   context()->parallel_try_catch([&]() {
     if (tau <= 0.) {
       throw std::domain_error("Parameter 'tau' must be > 0");
@@ -114,15 +128,8 @@ void EKSpecies::do_construct(VariantMap const &args) {
     m_conv_ext_efield = Utils::int_pow<2>(tau) / agrid;
     m_conv_density = Utils::int_pow<3>(agrid);
     m_conv_flux = tau * Utils::int_pow<2>(agrid);
-    auto const ek_diffusion = diffusion * m_conv_diffusion;
-    auto const ek_ext_efield = ext_efield * m_conv_ext_efield;
-    auto const ek_density = m_density = density * m_conv_density;
-    auto const ek_kT = kT * m_conv_energy;
-    m_instance = ::walberla::new_ek_walberla(
-        m_lattice->lattice(), ek_diffusion, ek_kT,
-        get_value<double>(args, "valency"), ek_ext_efield, ek_density,
-        get_value<bool>(args, "advection"),
-        get_value<bool>(args, "friction_coupling"), single_precision);
+    m_density = density * m_conv_density;
+    make_instance(params);
     for (auto &vtk : m_vtk_writers) {
       vtk->attach_to_lattice(m_instance, get_latice_to_md_units_conversion());
     }
@@ -138,8 +145,7 @@ void EKSpecies::load_checkpoint(std::string const &filename, int mode) {
     cpfile.read(read_grid_size);
     if (read_grid_size != expected_grid_size) {
       std::stringstream message;
-      message << "grid dimensions mismatch, "
-              << "read [" << read_grid_size << "], "
+      message << "grid dimensions mismatch, read [" << read_grid_size << "], "
               << "expected [" << expected_grid_size << "].";
       throw std::runtime_error(message.str());
     }
diff --git a/src/script_interface/walberla/EKSpecies.hpp b/src/script_interface/walberla/EKSpecies.hpp
index 4f304e2f970..45cf4d00b88 100644
--- a/src/script_interface/walberla/EKSpecies.hpp
+++ b/src/script_interface/walberla/EKSpecies.hpp
@@ -43,6 +43,7 @@ namespace ScriptInterface::walberla {
 class EKVTKHandle;
 
 class EKSpecies : public LatticeModel<::EKinWalberlaBase, EKVTKHandle> {
+protected:
   using Base = LatticeModel<::EKinWalberlaBase, EKVTKHandle>;
   double m_conv_diffusion;
   double m_conv_ext_efield;
@@ -52,6 +53,8 @@ class EKSpecies : public LatticeModel<::EKinWalberlaBase, EKVTKHandle> {
   double m_tau;
   double m_density;
 
+  void make_instance(VariantMap const &params) override;
+
 public:
   EKSpecies() {
     add_parameters({
@@ -99,7 +102,7 @@ class EKSpecies : public LatticeModel<::EKinWalberlaBase, EKVTKHandle> {
     });
   }
 
-  void do_construct(VariantMap const &args) override;
+  void do_construct(VariantMap const &params) override;
 
   [[nodiscard]] auto get_ekinstance() const { return m_instance; }
   [[nodiscard]] auto get_lattice() const { return m_lattice; }
diff --git a/src/script_interface/walberla/EKSpeciesNode.cpp b/src/script_interface/walberla/EKSpeciesNode.cpp
index 61433ffafd8..0a20dad05e5 100644
--- a/src/script_interface/walberla/EKSpeciesNode.cpp
+++ b/src/script_interface/walberla/EKSpeciesNode.cpp
@@ -28,7 +28,6 @@
 #include <walberla_bridge/electrokinetics/EKinWalberlaBase.hpp>
 
 #include <utils/Vector.hpp>
-#include <utils/constants.hpp>
 #include <utils/mpi/reduce_optional.hpp>
 
 #include <boost/mpi/collectives/all_reduce.hpp>
@@ -55,10 +54,10 @@ Variant EKSpeciesNode::do_call_method(std::string const &name,
     // rebuilding a EKSpeciesNode for each node in the slice
     auto const index = get_value<Utils::Vector3i>(params, "index");
     if (not is_index_valid(index, m_grid_size)) {
-      return ES_ERROR;
+      return 1;
     }
     m_index = index;
-    return ES_OK;
+    return 0;
   }
   if (name == "set_density") {
     auto const dens = get_value<double>(params, "value");
diff --git a/src/script_interface/walberla/EKSpeciesSlice.hpp b/src/script_interface/walberla/EKSpeciesSlice.hpp
index d16688729f4..81e2b4609e2 100644
--- a/src/script_interface/walberla/EKSpeciesSlice.hpp
+++ b/src/script_interface/walberla/EKSpeciesSlice.hpp
@@ -58,9 +58,9 @@ struct EKFieldSerializer {
       vec.reserve(values.size());
       for (auto const &opt : values) {
         if (opt) {
-          vec.emplace_back(Variant{*opt});
+          vec.emplace_back(*opt);
         } else {
-          vec.emplace_back(Variant{None{}});
+          vec.emplace_back(None{});
         }
       }
       return {vec};
diff --git a/src/script_interface/walberla/EKWalberlaNodeState.hpp b/src/script_interface/walberla/EKWalberlaNodeState.hpp
index 2dc0f8db047..eac916d20c9 100644
--- a/src/script_interface/walberla/EKWalberlaNodeState.hpp
+++ b/src/script_interface/walberla/EKWalberlaNodeState.hpp
@@ -35,7 +35,7 @@ struct EKWalberlaNodeState {
   friend boost::serialization::access;
   template <typename Archive>
   void serialize(Archive &ar, unsigned int /* version */) {
-    ar &density &is_boundary_density &density_boundary &is_boundary_flux
-        &flux_boundary;
+    ar & density & is_boundary_density & density_boundary & is_boundary_flux &
+        flux_boundary;
   }
 };
diff --git a/src/script_interface/walberla/LBFluid.cpp b/src/script_interface/walberla/LBFluid.cpp
index b66f954ef4c..0cad46f999e 100644
--- a/src/script_interface/walberla/LBFluid.cpp
+++ b/src/script_interface/walberla/LBFluid.cpp
@@ -124,6 +124,28 @@ Variant LBFluid::do_call_method(std::string const &name,
   return Base::do_call_method(name, params);
 }
 
+void LBFluidCPU::make_instance(VariantMap const &params) {
+  // scale viscosity and density by the stored conversion factors
+  auto const lb_visc =
+      m_conv_visc * get_value<double>(params, "kinematic_viscosity");
+  auto const lb_dens = m_conv_dens * get_value<double>(params, "density");
+  auto const single_precision = get_value<bool>(params, "single_precision");
+  m_instance = new_lb_walberla_cpu(m_lattice->lattice(), lb_visc, lb_dens,
+                                   single_precision);
+}
+
+#ifdef CUDA
+void LBFluidGPU::make_instance(VariantMap const &params) {
+  // scale viscosity and density by the stored conversion factors
+  auto const lb_visc =
+      m_conv_visc * get_value<double>(params, "kinematic_viscosity");
+  auto const lb_dens = m_conv_dens * get_value<double>(params, "density");
+  auto const single_precision = get_value<bool>(params, "single_precision");
+  m_instance = new_lb_walberla_gpu(m_lattice->lattice(), lb_visc, lb_dens,
+                                   single_precision);
+}
+#endif // CUDA
+
 void LBFluid::do_construct(VariantMap const &params) {
   m_lattice = get_value<std::shared_ptr<LatticeWalberla>>(params, "lattice");
   m_vtk_writers =
@@ -134,7 +156,6 @@ void LBFluid::do_construct(VariantMap const &params) {
   auto const dens = get_value<double>(params, "density");
   auto const kT = get_value<double>(params, "kT");
   auto const ext_f = get_value<Utils::Vector3d>(params, "ext_force_density");
-  auto const single_precision = get_value<bool>(params, "single_precision");
   m_lb_params = std::make_shared<::LB::LBWalberlaParams>(agrid, tau);
   m_is_active = false;
   m_seed = get_value<int>(params, "seed");
@@ -150,7 +171,6 @@ void LBFluid::do_construct(VariantMap const &params) {
     m_conv_press = Utils::int_pow<2>(tau) * Utils::int_pow<1>(agrid);
     m_conv_force = Utils::int_pow<2>(tau) / Utils::int_pow<1>(agrid);
     m_conv_force_dens = Utils::int_pow<2>(tau) * Utils::int_pow<2>(agrid);
-    auto const lb_lattice = m_lattice->lattice();
     auto const lb_visc = m_conv_visc * visc;
     auto const lb_dens = m_conv_dens * dens;
     auto const lb_kT = m_conv_energy * kT;
@@ -167,8 +187,7 @@ void LBFluid::do_construct(VariantMap const &params) {
     if (lb_visc < 0.) {
       throw std::domain_error("Parameter 'kinematic_viscosity' must be >= 0");
     }
-    m_instance =
-        new_lb_walberla(lb_lattice, lb_visc, lb_dens, single_precision);
+    make_instance(params);
     auto const &system = ::System::get_system();
     if (auto le_protocol = system.lees_edwards->get_protocol()) {
       if (lb_kT != 0.) {
@@ -228,8 +247,7 @@ void LBFluid::load_checkpoint(std::string const &filename, int mode) {
     cpfile.read(read_pop_size);
     if (read_grid_size != expected_grid_size) {
       std::stringstream message;
-      message << "grid dimensions mismatch, "
-              << "read [" << read_grid_size << "], "
+      message << "grid dimensions mismatch, read [" << read_grid_size << "], "
               << "expected [" << expected_grid_size << "].";
       throw std::runtime_error(message.str());
     }
diff --git a/src/script_interface/walberla/LBFluid.hpp b/src/script_interface/walberla/LBFluid.hpp
index 093e7159334..036f408729b 100644
--- a/src/script_interface/walberla/LBFluid.hpp
+++ b/src/script_interface/walberla/LBFluid.hpp
@@ -47,6 +47,7 @@ namespace ScriptInterface::walberla {
 class LBVTKHandle;
 
 class LBFluid : public LatticeModel<::LBWalberlaBase, LBVTKHandle> {
+protected:
   using Base = LatticeModel<::LBWalberlaBase, LBVTKHandle>;
   std::shared_ptr<::LB::LBWalberlaParams> m_lb_params;
   bool m_is_active;
@@ -135,6 +136,18 @@ class LBFluid : public LatticeModel<::LBWalberlaBase, LBVTKHandle> {
   Variant get_interpolated_velocity(Utils::Vector3d const &pos) const;
 };
 
+class LBFluidCPU : public LBFluid {
+protected:
+  void make_instance(VariantMap const &params) override;
+};
+
+#ifdef CUDA
+class LBFluidGPU : public LBFluid {
+protected:
+  void make_instance(VariantMap const &params) override;
+};
+#endif // CUDA
+
 class LBVTKHandle : public VTKHandleBase<::LBWalberlaBase> {
   static std::unordered_map<std::string, int> const obs_map;
 
diff --git a/src/script_interface/walberla/LBFluidSlice.hpp b/src/script_interface/walberla/LBFluidSlice.hpp
index 4701ee64bc7..415440a5ca3 100644
--- a/src/script_interface/walberla/LBFluidSlice.hpp
+++ b/src/script_interface/walberla/LBFluidSlice.hpp
@@ -55,9 +55,9 @@ struct LBFieldSerializer {
       vec.reserve(values.size());
       for (auto const &opt : values) {
         if (opt) {
-          vec.emplace_back(Variant{*opt});
+          vec.emplace_back(*opt);
         } else {
-          vec.emplace_back(Variant{None{}});
+          vec.emplace_back(None{});
         }
       }
       return {vec};
diff --git a/src/script_interface/walberla/LBWalberlaNodeState.hpp b/src/script_interface/walberla/LBWalberlaNodeState.hpp
index a47099b3cca..a0685cb7981 100644
--- a/src/script_interface/walberla/LBWalberlaNodeState.hpp
+++ b/src/script_interface/walberla/LBWalberlaNodeState.hpp
@@ -36,6 +36,6 @@ struct LBWalberlaNodeState {
   friend boost::serialization::access;
   template <typename Archive>
   void serialize(Archive &ar, unsigned int /* version */) {
-    ar &populations &last_applied_force &slip_velocity &is_boundary;
+    ar & populations & last_applied_force & slip_velocity & is_boundary;
   }
 };
diff --git a/src/script_interface/walberla/LatticeModel.hpp b/src/script_interface/walberla/LatticeModel.hpp
index c5830ab82ad..45184e64346 100644
--- a/src/script_interface/walberla/LatticeModel.hpp
+++ b/src/script_interface/walberla/LatticeModel.hpp
@@ -43,9 +43,10 @@ class LatticeModel : public AutoParameters<LatticeModel<Method, VTKHandle>> {
 
   virtual ::LatticeModel::units_map
   get_latice_to_md_units_conversion() const = 0;
+  virtual void make_instance(VariantMap const &params) = 0;
 
   auto find_vtk(std::shared_ptr<VTKHandle> const &vtk) const {
-    return std::find(m_vtk_writers.begin(), m_vtk_writers.end(), vtk);
+    return std::ranges::find(m_vtk_writers, vtk);
   }
 
   auto serialize_vtk_writers() const {
diff --git a/src/script_interface/walberla/LatticeSlice.impl.hpp b/src/script_interface/walberla/LatticeSlice.impl.hpp
index 0f0ac558e47..39d773f87fd 100644
--- a/src/script_interface/walberla/LatticeSlice.impl.hpp
+++ b/src/script_interface/walberla/LatticeSlice.impl.hpp
@@ -152,7 +152,7 @@ Variant LatticeSlice<FieldSerializer>::gather_3d(
     std::vector<T> out;
     detail::flatten_grid<FieldSerializer>(array, out, units_conversion);
     std::vector<int> shape = {m_shape.begin(), m_shape.end()};
-    if (not(data_dims.size() == 1ul and data_dims[0] == 1)) {
+    if (data_dims.size() != 1ul or data_dims[0] != 1) {
       shape.insert(shape.end(), data_dims.begin(), data_dims.end());
     }
     auto const variant = FieldSerializer::serialize(out);
diff --git a/src/script_interface/walberla/VTKHandle.hpp b/src/script_interface/walberla/VTKHandle.hpp
index edcf6aa5dbb..d4a0975e9e5 100644
--- a/src/script_interface/walberla/VTKHandle.hpp
+++ b/src/script_interface/walberla/VTKHandle.hpp
@@ -88,9 +88,9 @@ class VTKHandleBase : public AutoParameters<VTKHandleBase<Field>> {
       if (obs_map.count(name) == 0) {
         auto const valid_names = get_valid_observable_names();
         std::stringstream message;
-        message << "Only the following VTK observables are supported: ["
-                << "'" << boost::algorithm::join(valid_names, "', '") << "'"
-                << "], got '" << name << "'";
+        message << "Only the following VTK observables are supported: ['"
+                << boost::algorithm::join(valid_names, "', '") << "'], got '"
+                << name << "'";
         throw std::invalid_argument(message.str());
       }
       flag |= obs_map.at(name);
diff --git a/src/script_interface/walberla/initialize.cpp b/src/script_interface/walberla/initialize.cpp
index fde6f8d0d1c..b5c2071f06f 100644
--- a/src/script_interface/walberla/initialize.cpp
+++ b/src/script_interface/walberla/initialize.cpp
@@ -52,7 +52,10 @@ namespace ScriptInterface::walberla {
 void initialize(Utils::Factory<ObjectHandle> *om) {
   om->register_new<LatticeWalberla>("walberla::LatticeWalberla");
 
-  om->register_new<LBFluid>("walberla::LBFluid");
+  om->register_new<LBFluidCPU>("walberla::LBFluidCPU");
+#ifdef CUDA
+  om->register_new<LBFluidGPU>("walberla::LBFluidGPU");
+#endif // CUDA
   om->register_new<LBFluidNode>("walberla::LBFluidNode");
   om->register_new<LBFluidSlice>("walberla::LBFluidSlice");
   om->register_new<LBVTKHandle>("walberla::LBVTKHandle");
diff --git a/src/shapes/include/shapes/Ellipsoid.hpp b/src/shapes/include/shapes/Ellipsoid.hpp
index 39fc53bb293..e65149fb93c 100644
--- a/src/shapes/include/shapes/Ellipsoid.hpp
+++ b/src/shapes/include/shapes/Ellipsoid.hpp
@@ -23,7 +23,7 @@
 #define SHAPES_ELLIPSOID_HPP
 
 #include "Shape.hpp"
-#include <utils/Array.hpp>
+
 #include <utils/Vector.hpp>
 
 namespace Shapes {
diff --git a/src/shapes/include/shapes/Union.hpp b/src/shapes/include/shapes/Union.hpp
index c719ac37826..eb35f268f26 100644
--- a/src/shapes/include/shapes/Union.hpp
+++ b/src/shapes/include/shapes/Union.hpp
@@ -34,7 +34,7 @@ namespace Shapes {
 class Union : public Shape {
 public:
   bool contains(std::shared_ptr<Shapes::Shape> const &shape) const noexcept {
-    return std::find(m_shapes.begin(), m_shapes.end(), shape) != m_shapes.end();
+    return std::ranges::find(m_shapes, shape) != m_shapes.end();
   }
 
   void add(std::shared_ptr<Shapes::Shape> const &shape) {
@@ -42,8 +42,7 @@ class Union : public Shape {
   }
 
   void remove(std::shared_ptr<Shapes::Shape> const &shape) {
-    m_shapes.erase(std::remove(m_shapes.begin(), m_shapes.end(), shape),
-                   m_shapes.end());
+    std::erase(m_shapes, shape);
   }
 
   /**
diff --git a/src/shapes/src/HollowConicalFrustum.cpp b/src/shapes/src/HollowConicalFrustum.cpp
index c605fc20aa3..51ceb807dc2 100644
--- a/src/shapes/src/HollowConicalFrustum.cpp
+++ b/src/shapes/src/HollowConicalFrustum.cpp
@@ -20,9 +20,10 @@
 #include <shapes/HollowConicalFrustum.hpp>
 
 #include <utils/Vector.hpp>
-#include <utils/math/abs.hpp>
 #include <utils/math/coordinate_transformation.hpp>
 
+#include <cmath>
+
 namespace Shapes {
 void HollowConicalFrustum::calculate_dist(const Utils::Vector3d &pos,
                                           double &dist,
@@ -70,7 +71,7 @@ void HollowConicalFrustum::calculate_dist(const Utils::Vector3d &pos,
    */
 
   auto endpoint_angle = pos_phi;
-  if (Utils::abs(pos_phi) < m_central_angle / 2.) {
+  if (std::fabs(pos_phi) < m_central_angle / 2.) {
     // Cannot use Utils::sgn because of pos_phi==0 corner case
     endpoint_angle =
         pos_phi > 0. ? m_central_angle / 2. : -m_central_angle / 2.;
@@ -93,7 +94,7 @@ void HollowConicalFrustum::calculate_dist(const Utils::Vector3d &pos,
   /* It can be that the projection onto the (infinite) line is outside the
    * frustum. In that case, the closest point is actually one of the endpoints.
    */
-  if (Utils::abs(pos_closest_hcf_frame[2]) > m_length / 2.) {
+  if (std::fabs(pos_closest_hcf_frame[2]) > m_length / 2.) {
     pos_closest_hcf_frame =
         pos_closest_hcf_frame[2] > 0. ? r1_endpoint : r2_endpoint;
   }
diff --git a/src/shapes/unit_tests/CMakeLists.txt b/src/shapes/unit_tests/CMakeLists.txt
index 61480e1a238..5123a75d2c3 100644
--- a/src/shapes/unit_tests/CMakeLists.txt
+++ b/src/shapes/unit_tests/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2016-2022 The ESPResSo project
+# Copyright (C) 2016-2024 The ESPResSo project
 #
 # This file is part of ESPResSo.
 #
@@ -17,16 +17,14 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-include(unit_test)
-unit_test(NAME Wall_test SRC Wall_test.cpp DEPENDS espresso::shapes
-          espresso::utils)
-unit_test(NAME HollowConicalFrustum_test SRC HollowConicalFrustum_test.cpp
-          DEPENDS espresso::shapes espresso::utils)
-unit_test(NAME Union_test SRC Union_test.cpp DEPENDS espresso::shapes
-          espresso::utils)
-unit_test(NAME Ellipsoid_test SRC Ellipsoid_test.cpp DEPENDS espresso::shapes
-          espresso::utils)
-unit_test(NAME Sphere_test SRC Sphere_test.cpp DEPENDS espresso::shapes
-          espresso::utils)
-unit_test(NAME NoWhere_test SRC NoWhere_test.cpp DEPENDS espresso::shapes
-          espresso::utils)
+include(espresso_unit_test)
+
+espresso_unit_test(SRC Wall_test.cpp DEPENDS espresso::shapes espresso::utils)
+espresso_unit_test(SRC HollowConicalFrustum_test.cpp DEPENDS espresso::shapes
+                   espresso::utils)
+espresso_unit_test(SRC Union_test.cpp DEPENDS espresso::shapes espresso::utils)
+espresso_unit_test(SRC Ellipsoid_test.cpp DEPENDS espresso::shapes
+                   espresso::utils)
+espresso_unit_test(SRC Sphere_test.cpp DEPENDS espresso::shapes espresso::utils)
+espresso_unit_test(SRC NoWhere_test.cpp DEPENDS espresso::shapes
+                   espresso::utils)
diff --git a/src/shapes/unit_tests/Ellipsoid_test.cpp b/src/shapes/unit_tests/Ellipsoid_test.cpp
index d9d41874bf7..2be8cbcedf7 100644
--- a/src/shapes/unit_tests/Ellipsoid_test.cpp
+++ b/src/shapes/unit_tests/Ellipsoid_test.cpp
@@ -27,10 +27,10 @@
 #include <shapes/Shape.hpp>
 
 #include <utils/Vector.hpp>
-#include <utils/constants.hpp>
 
 #include <cmath>
 #include <limits>
+#include <numbers>
 
 BOOST_AUTO_TEST_CASE(dist_function) {
   // multiply by 100 because BOOST_REQUIRE_CLOSE takes a percentage tolerance
@@ -46,7 +46,7 @@ BOOST_AUTO_TEST_CASE(dist_function) {
   int N = 100;
   for (int i = 0; i < N; i++) {
     for (int j = 0; j < N; j++) {
-      double theta = 2. * i / N * Utils::pi();
+      double theta = 2. * i / N * std::numbers::pi;
       double v = j / (N - 1.);
 
       Utils::Vector3d dist;
diff --git a/src/shapes/unit_tests/Wall_test.cpp b/src/shapes/unit_tests/Wall_test.cpp
index 7c5a6d5f0d1..50eaffffb30 100644
--- a/src/shapes/unit_tests/Wall_test.cpp
+++ b/src/shapes/unit_tests/Wall_test.cpp
@@ -62,10 +62,10 @@ BOOST_AUTO_TEST_CASE(rasterize_function) {
     auto const agrid = 1.0;
 
     auto const raster = shape.rasterize({5, 5, 5}, agrid, 0.5);
-    for (int i = 0; i < 25; ++i) {
+    for (auto i = 0u; i < 25u; ++i) {
       BOOST_REQUIRE_EQUAL(raster[i], 1);
     }
-    for (int i = 25; i < 125; ++i) {
+    for (auto i = 25u; i < 125u; ++i) {
       BOOST_REQUIRE_EQUAL(raster[i], 0);
     }
   }
@@ -77,10 +77,10 @@ BOOST_AUTO_TEST_CASE(rasterize_function) {
     auto const agrid = 1.0;
 
     auto const raster = shape.rasterize({5, 5, 5}, agrid, 0.5);
-    for (int i = 0; i < 25; ++i) {
+    for (auto i = 0u; i < 25u; ++i) {
       BOOST_REQUIRE_EQUAL(raster[i], 1);
     }
-    for (int i = 25; i < 125; ++i) {
+    for (auto i = 25u; i < 125u; ++i) {
       BOOST_REQUIRE_EQUAL(raster[i], 0);
     }
   }
@@ -92,10 +92,10 @@ BOOST_AUTO_TEST_CASE(rasterize_function) {
     auto const agrid = 1.0;
 
     auto const raster = shape.rasterize({5, 5, 5}, agrid, 0.5);
-    for (int i = 0; i < 2 * 25; ++i) {
+    for (auto i = 0u; i < 2u * 25u; ++i) {
       BOOST_REQUIRE_EQUAL(raster[i], 1);
     }
-    for (int i = 2 * 25; i < 125; ++i) {
+    for (auto i = 2u * 25u; i < 125u; ++i) {
       BOOST_REQUIRE_EQUAL(raster[i], 0);
     }
   }
diff --git a/src/utils/include/utils/Accumulator.hpp b/src/utils/include/utils/Accumulator.hpp
index c59f743520e..3b1c78e77cc 100644
--- a/src/utils/include/utils/Accumulator.hpp
+++ b/src/utils/include/utils/Accumulator.hpp
@@ -41,7 +41,7 @@ template <typename T> struct AccumulatorData {
 
   template <typename Archive>
   void serialize(Archive &ar, const unsigned /*version*/) {
-    ar &mean &m;
+    ar & mean & m;
   }
 };
 
@@ -61,7 +61,7 @@ class Accumulator {
 
   template <typename Archive>
   void serialize(Archive &ar, const unsigned /*version*/) {
-    ar &m_n &m_acc_data;
+    ar & m_n & m_acc_data;
   }
 };
 
@@ -71,10 +71,9 @@ inline void Accumulator::operator()(const std::vector<double> &data) {
         "The given data size does not fit the initialized size!");
   ++m_n;
   if (m_n == 1) {
-    std::transform(data.begin(), data.end(), m_acc_data.begin(),
-                   [](double d) -> AccumulatorData<double> {
-                     return {d, 0.0};
-                   });
+    std::transform(
+        data.begin(), data.end(), m_acc_data.begin(),
+        [](double d) -> AccumulatorData<double> { return {d, 0.0}; });
   } else {
     std::transform(m_acc_data.begin(), m_acc_data.end(), data.begin(),
                    m_acc_data.begin(),
diff --git a/src/utils/include/utils/Array.hpp b/src/utils/include/utils/Array.hpp
index 8d00f071bea..df18195b83a 100644
--- a/src/utils/include/utils/Array.hpp
+++ b/src/utils/include/utils/Array.hpp
@@ -47,7 +47,7 @@ template <typename T, std::size_t N> struct Storage {
   friend boost::serialization::access;
   template <typename Archive>
   void serialize(Archive &ar, const unsigned int /* version */) {
-    ar &m_data;
+    ar & m_data;
   }
 };
 
@@ -178,7 +178,7 @@ template <typename T, std::size_t N> struct Array {
   friend boost::serialization::access;
   template <typename Archive>
   void serialize(Archive &ar, const unsigned int /* version */) {
-    ar &m_storage;
+    ar & m_storage;
   }
 
   static std::ostream &format(std::ostream &out, Array const &a,
diff --git a/src/utils/include/utils/Bag.hpp b/src/utils/include/utils/Bag.hpp
index 86f47a22f2e..2009e8504cc 100644
--- a/src/utils/include/utils/Bag.hpp
+++ b/src/utils/include/utils/Bag.hpp
@@ -75,7 +75,7 @@ template <class T> class Bag {
    * Serialization requires T to be serializable.
    */
   template <class Archive> void serialize(Archive &ar, long int /* version */) {
-    ar &m_storage;
+    ar & m_storage;
   }
 
 public:
diff --git a/src/utils/include/utils/Cache.hpp b/src/utils/include/utils/Cache.hpp
index 25f84b62394..d2ed3bb1a8d 100644
--- a/src/utils/include/utils/Cache.hpp
+++ b/src/utils/include/utils/Cache.hpp
@@ -17,9 +17,9 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#ifndef CORE_UTILS_CACHE_HPP
-#define CORE_UTILS_CACHE_HPP
+#pragma once
 
+#include <cstddef>
 #include <memory>
 #include <random>
 #include <type_traits>
@@ -76,12 +76,12 @@ template <typename Key, typename Value> class Cache {
     }
 
     /* Pick a random element form that bucket. */
-    auto const elem_in_bucket = std::uniform_int_distribution<size_type>{
+    auto const elem_index = std::uniform_int_distribution<size_type>{
         0, m_cache.bucket_size(bucket) - 1}(m_rand);
 
     /* Get the element in the bucket */
     auto const drop_key =
-        std::next(m_cache.cbegin(bucket), elem_in_bucket)->first;
+        std::next(m_cache.cbegin(bucket), static_cast<long>(elem_index))->first;
 
     /* And drop it. */
     m_cache.erase(drop_key);
@@ -149,11 +149,12 @@ template <typename Key, typename Value> class Cache {
   KeyInputIterator put(KeyInputIterator kbegin, KeyInputIterator kend,
                        ValueInputIterator vbegin) {
     auto const range_len = std::distance(kbegin, kend);
-    auto const len = (range_len > max_size()) ? max_size() : range_len;
+    auto const size_max = static_cast<decltype(range_len)>(max_size());
+    auto const len = (range_len > size_max) ? size_max : range_len;
     kend = std::next(kbegin, len);
 
     /* Make some space. */
-    while ((max_size() - size()) < len) {
+    while (static_cast<decltype(len)>(max_size() - size()) < len) {
       drop_random_element();
     }
 
@@ -180,5 +181,3 @@ template <typename Key, typename Value> class Cache {
   }
 };
 } // namespace Utils
-
-#endif
diff --git a/src/utils/include/utils/Counter.hpp b/src/utils/include/utils/Counter.hpp
index dca2726b897..ad63abb49e8 100644
--- a/src/utils/include/utils/Counter.hpp
+++ b/src/utils/include/utils/Counter.hpp
@@ -28,8 +28,8 @@ template <typename T> class Counter {
   T m_initial;
   friend class boost::serialization::access;
   template <class Archive> void serialize(Archive &ar, const unsigned int) {
-    ar &m_val;
-    ar &m_initial;
+    ar & m_val;
+    ar & m_initial;
   }
 
 public:
diff --git a/src/utils/include/utils/Histogram.hpp b/src/utils/include/utils/Histogram.hpp
index f16aeb015ad..d36421a2fb9 100644
--- a/src/utils/include/utils/Histogram.hpp
+++ b/src/utils/include/utils/Histogram.hpp
@@ -16,10 +16,8 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef UTILS_HISTOGRAM_HPP
-#define UTILS_HISTOGRAM_HPP
 
-#include "utils/Span.hpp"
+#pragma once
 
 #include <boost/multi_array.hpp>
 
@@ -30,6 +28,7 @@
 #include <cstddef>
 #include <functional>
 #include <numeric>
+#include <span>
 #include <stdexcept>
 #include <utility>
 #include <vector>
@@ -47,6 +46,8 @@ template <typename T, std::size_t N, std::size_t M = 3, typename U = double>
 class Histogram {
   using array_type = boost::multi_array<T, M + 1>;
   using count_type = boost::multi_array<std::size_t, M + 1>;
+
+protected:
   using array_index = typename array_type::index;
 
 public:
@@ -89,14 +90,14 @@ class Histogram {
    * \brief Add data to the histogram.
    * \param pos    Position to update.
    */
-  void update(Span<const U> pos) { update(pos, m_ones); }
+  void update(std::span<const U> pos) { update(pos, m_ones); }
 
   /**
    * \brief Add data to the histogram.
    * \param pos    Position to update.
    * \param value  Value to add.
    */
-  void update(Span<const U> pos, Span<const T> value) {
+  void update(std::span<const U> pos, std::span<const T> value) {
     if (pos.size() != M) {
       throw std::invalid_argument("Wrong dimensions for the coordinates");
     }
@@ -108,9 +109,9 @@ class Histogram {
       for (std::size_t i = 0; i < M; ++i) {
         index[i] = calc_bin_index(pos[i], m_limits[i].first, m_bin_sizes[i]);
       }
-      for (array_index i = 0; i < N; ++i) {
+      for (array_index i = 0; i < static_cast<array_index>(N); ++i) {
         index.back() = i;
-        m_array(index) += value[i];
+        m_array(index) += value[static_cast<std::size_t>(i)];
         m_count(index)++;
       }
     }
@@ -152,7 +153,7 @@ class Histogram {
    * \brief Check if the position lies within the histogram limits.
    * \param pos     Position to check.
    */
-  bool check_limits(Span<const U> pos) const {
+  bool check_limits(std::span<const U> pos) const {
     assert(pos.size() == M);
     bool within_range = true;
     for (std::size_t i = 0; i < M; ++i) {
@@ -196,6 +197,7 @@ class CylindricalHistogram : public Histogram<T, N, M, U> {
   using Histogram<T, N, M, U>::m_limits;
   using Histogram<T, N, M, U>::m_bin_sizes;
   using Histogram<T, N, M, U>::m_array;
+  using typename Histogram<T, N, M, U>::array_index;
 
 public:
   using Histogram<T, N, M, U>::Histogram;
@@ -205,12 +207,12 @@ class CylindricalHistogram : public Histogram<T, N, M, U> {
     auto const r_bin_size = m_bin_sizes[0];
     auto const phi_bin_size = m_bin_sizes[1];
     auto const z_bin_size = m_bin_sizes[2];
-    auto const n_bins_r = m_n_bins[0];
-    for (std::size_t i = 0; i < n_bins_r; i++) {
+    auto const n_bins_r = static_cast<array_index>(m_n_bins[0]);
+    for (array_index i = 0; i < n_bins_r; i++) {
       auto const r_left = min_r + static_cast<U>(i) * r_bin_size;
       auto const r_right = r_left + r_bin_size;
-      auto const bin_volume =
-          (r_right * r_right - r_left * r_left) * z_bin_size * phi_bin_size / 2;
+      auto const bin_volume = (r_right * r_right - r_left * r_left) *
+                              z_bin_size * phi_bin_size / U(2);
       auto *begin = m_array[i].origin();
       std::transform(
           begin, begin + m_array[i].num_elements(), begin,
@@ -220,5 +222,3 @@ class CylindricalHistogram : public Histogram<T, N, M, U> {
 };
 
 } // Namespace Utils
-
-#endif
diff --git a/src/utils/include/utils/NumeratedContainer.hpp b/src/utils/include/utils/NumeratedContainer.hpp
index 91bfab51f14..2c65bdb5d24 100644
--- a/src/utils/include/utils/NumeratedContainer.hpp
+++ b/src/utils/include/utils/NumeratedContainer.hpp
@@ -59,7 +59,9 @@ template <class T, typename index_type = int> class NumeratedContainer {
     for (auto const &e : l) {
       m_container[e.first] = e.second;
       /* Remove the index from the index set if it exists. */
-      m_free_indices.erase(m_free_indices.find(e.first), m_free_indices.end());
+      if (auto it = m_free_indices.find(e.first); it != m_free_indices.end()) {
+        m_free_indices.erase(it);
+      }
     }
 
     /* Refill the index set */
diff --git a/src/utils/include/utils/Span.hpp b/src/utils/include/utils/Span.hpp
deleted file mode 100644
index c56d1af4d83..00000000000
--- a/src/utils/include/utils/Span.hpp
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (C) 2010-2022 The ESPResSo project
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-#ifndef UTILS_SPAN_HPP
-#define UTILS_SPAN_HPP
-
-#include "device_qualifier.hpp"
-
-#include <cassert>
-#include <cstddef>
-#include <iterator>
-#include <stdexcept>
-#include <type_traits>
-
-namespace Utils {
-
-/**
- * @brief A stripped-down version of std::span from C++17.
- *
- * Behaves like a std::span where implemented.
- */
-
-template <class T> class Span {
-public:
-  using value_type = typename std::remove_cv<T>::type;
-  using pointer = T *;
-  using const_pointer = const T *;
-  using reference = T &;
-  using const_reference = const T &;
-  using iterator = pointer;
-  using const_iterator = const_pointer;
-  using reverse_iterator = std::reverse_iterator<iterator>;
-  using const_reverse_iterator = std::reverse_iterator<const_iterator>;
-  using size_type = std::size_t;
-  using difference_type = std::ptrdiff_t;
-
-private:
-  T *m_ptr;
-  std::size_t m_size{};
-
-  template <typename U>
-  using enable_if_const_t =
-      typename std::enable_if<std::is_const_v<T>, U>::type;
-  template <class U>
-  using enable_if_mutable_t =
-      typename std::enable_if<!std::is_const_v<T>, U>::type;
-  template <class U>
-  using enable_if_has_data_t = typename std::enable_if<
-      std::is_convertible_v<std::decay_t<decltype(std::declval<U>().data())> *,
-                            T *const *>,
-      U>::type;
-
-public:
-  Span() = default;
-  Span(const Span &) = default;
-  Span &operator=(const Span &) = default;
-
-  DEVICE_QUALIFIER
-  constexpr Span(pointer array, size_type length)
-      : m_ptr(array), m_size(length) {}
-  template <std::size_t N>
-  DEVICE_QUALIFIER constexpr Span(T (&a)[N]) noexcept : Span(a, N) {}
-
-  template <typename C, typename = enable_if_mutable_t<C>,
-            typename = enable_if_has_data_t<C>>
-  DEVICE_QUALIFIER explicit Span(C &c) noexcept : Span(c.data(), c.size()) {}
-  template <typename C, typename = enable_if_const_t<C>,
-            typename = enable_if_has_data_t<C>>
-  DEVICE_QUALIFIER Span(const C &c) noexcept : Span(c.data(), c.size()) {}
-
-  DEVICE_QUALIFIER constexpr size_type size() const { return m_size; }
-  DEVICE_QUALIFIER constexpr bool empty() const { return size() == 0; }
-
-  DEVICE_QUALIFIER constexpr iterator begin() const { return m_ptr; }
-  DEVICE_QUALIFIER constexpr const_iterator cbegin() const { return m_ptr; }
-  DEVICE_QUALIFIER constexpr iterator end() const { return m_ptr + m_size; }
-  DEVICE_QUALIFIER constexpr const_iterator cend() const {
-    return m_ptr + m_size;
-  }
-  constexpr reverse_iterator rbegin() const { return reverse_iterator(end()); }
-  constexpr reverse_iterator rend() const { return reverse_iterator(begin()); }
-
-  DEVICE_QUALIFIER constexpr reference operator[](size_type i) const {
-    return DEVICE_ASSERT(i < size()), m_ptr[i];
-  }
-
-  constexpr reference at(size_type i) const {
-    return (i < size()) ? m_ptr[i]
-                        : throw std::out_of_range("span access out of bounds."),
-           m_ptr[i];
-  }
-
-  DEVICE_QUALIFIER constexpr pointer data() const { return m_ptr; }
-};
-
-template <typename T>
-DEVICE_QUALIFIER constexpr Span<T> make_span(T *p, std::size_t N) {
-  return Span<T>(p, N);
-}
-
-template <class C> DEVICE_QUALIFIER constexpr auto make_span(C &c) {
-  return make_span(c.data(), c.size());
-}
-
-template <typename T>
-DEVICE_QUALIFIER constexpr Span<std::add_const_t<T>>
-make_const_span(T *p, std::size_t N) {
-  return Span<std::add_const_t<T>>(p, N);
-}
-
-template <class C> DEVICE_QUALIFIER constexpr auto make_const_span(C &c) {
-  return make_const_span(c.data(), c.size());
-}
-
-} // namespace Utils
-
-#endif
diff --git a/src/utils/include/utils/Vector.hpp b/src/utils/include/utils/Vector.hpp
index 288e459b83c..06167c901dc 100644
--- a/src/utils/include/utils/Vector.hpp
+++ b/src/utils/include/utils/Vector.hpp
@@ -39,6 +39,7 @@
 #include <initializer_list>
 #include <iterator>
 #include <numeric>
+#include <span>
 #include <type_traits>
 #include <vector>
 
@@ -79,7 +80,8 @@ template <typename T, std::size_t N> class Vector : public Array<T, N> {
 
 public:
   template <class Range>
-  explicit Vector(Range const &rng) : Vector(std::begin(rng), std::end(rng)) {}
+  explicit constexpr Vector(Range const &rng)
+      : Vector(std::begin(rng), std::end(rng)) {}
   explicit constexpr Vector(T const (&v)[N]) : Base() {
     copy_init(std::begin(v), std::end(v));
   }
@@ -103,14 +105,13 @@ template <typename T, std::size_t N> class Vector : public Array<T, N> {
     }
   }
 
-  /**
-   * @brief Create a vector that has all entries set to
-   *         one value.
-   */
-  static Vector<T, N> broadcast(T const &s) {
-    Vector<T, N> ret;
-    std::fill(ret.begin(), ret.end(), s);
-
+  /** @brief Create a vector that has all entries set to the same value. */
+  DEVICE_QUALIFIER static constexpr Vector<T, N>
+  broadcast(typename Base::value_type const &value) {
+    Vector<T, N> ret{};
+    for (std::size_t i = 0u; i != N; ++i) {
+      ret[i] = value;
+    }
     return ret;
   }
 
@@ -118,11 +119,17 @@ template <typename T, std::size_t N> class Vector : public Array<T, N> {
 
   operator std::vector<T>() const { return as_vector(); }
 
+  /** @brief Fixed-extent span over the elements (const_cast drops const). */
+  constexpr std::span<T, N> as_span() const {
+    return std::span<T, N>(const_cast<T *>(begin()), size());
+  }
+  constexpr operator std::span<T, N>() const { return as_span(); }
+
   template <class U> explicit operator Vector<U, N>() const {
     Vector<U, N> ret;
 
     std::transform(begin(), end(), ret.begin(),
-                   [](auto e) { return static_cast<U>(e); });
+                   [](auto const &e) { return static_cast<U>(e); });
 
     return ret;
   }
@@ -140,7 +147,7 @@ template <typename T, std::size_t N> class Vector : public Array<T, N> {
   Vector &normalize() {
     auto const l = norm();
     if (l > T(0)) {
-      for (std::size_t i = 0; i < N; i++)
+      for (std::size_t i = 0u; i < N; ++i)
         this->operator[](i) /= l;
     }
 
@@ -247,8 +254,7 @@ template <std::size_t N, typename T>
 Vector<T, N> operator-(Vector<T, N> const &a) {
   Vector<T, N> ret;
 
-  std::transform(std::begin(a), std::end(a), std::begin(ret),
-                 [](T const &v) { return -v; });
+  std::transform(std::begin(a), std::end(a), std::begin(ret), std::negate<T>());
 
   return ret;
 }
@@ -300,6 +306,15 @@ Vector<T, N> operator/(Vector<T, N> const &a, T const &b) {
   return ret;
 }
 
+/** @brief Divide a scalar by a vector: component i of the result is a / b[i]. */
+template <std::size_t N, typename T>
+Vector<T, N> operator/(T const &a, Vector<T, N> const &b) {
+  Vector<T, N> quotients;
+  std::transform(std::begin(b), std::end(b), std::begin(quotients),
+                 [a](T const &divisor) { return a / divisor; });
+  return quotients;
+}
+
 template <std::size_t N, typename T>
 Vector<T, N> &operator/=(Vector<T, N> &a, T const &b) {
   std::transform(std::begin(a), std::end(a), std::begin(a),
@@ -367,7 +382,7 @@ auto hadamard_product(Vector<T, N> const &a, Vector<U, N> const &b) {
 
   Vector<R, N> ret;
   std::transform(a.cbegin(), a.cend(), b.cbegin(), ret.begin(),
-                 [](auto ai, auto bi) { return ai * bi; });
+                 [](auto const &ai, auto const &bi) { return ai * bi; });
 
   return ret;
 }
@@ -410,7 +425,7 @@ auto hadamard_division(Vector<T, N> const &a, Vector<U, N> const &b) {
 
   Vector<R, N> ret;
   std::transform(a.cbegin(), a.cend(), b.cbegin(), ret.begin(),
-                 [](auto ai, auto bi) { return ai / bi; });
+                 [](auto const &ai, auto const &bi) { return ai / bi; });
 
   return ret;
 }
@@ -448,11 +463,11 @@ auto hadamard_division(T const &a, U const &b) {
 }
 
 template <typename T> Vector<T, 3> unit_vector(unsigned int i) {
-  if (i == 0)
+  if (i == 0u)
     return {T{1}, T{0}, T{0}};
-  if (i == 1)
+  if (i == 1u)
     return {T{0}, T{1}, T{0}};
-  if (i == 2)
+  if (i == 2u)
     return {T{0}, T{0}, T{1}};
   throw std::domain_error("coordinate out of range");
 }
@@ -465,7 +480,9 @@ template <typename T, std::size_t N> struct decay_to_scalar<Vector<T, N>> {
   using type = Vector<T, N>;
 };
 
-template <typename T> struct decay_to_scalar<Vector<T, 1>> { using type = T; };
+template <typename T> struct decay_to_scalar<Vector<T, 1>> {
+  using type = T;
+};
 
 template <std::size_t I, class T, std::size_t N>
 struct tuple_element<I, Vector<T, N>> {
diff --git a/src/utils/include/utils/checks/charge_neutrality.hpp b/src/utils/include/utils/checks/charge_neutrality.hpp
deleted file mode 100644
index 2d08eb85105..00000000000
--- a/src/utils/include/utils/checks/charge_neutrality.hpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (C) 2010-2022 The ESPResSo project
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-#ifndef UTILS_CHECKS_CHARGE_NEUTRALITY_HPP
-#define UTILS_CHECKS_CHARGE_NEUTRALITY_HPP
-
-#include <boost/accumulators/accumulators.hpp>
-#include <boost/accumulators/statistics/sum_kahan.hpp>
-
-#include <cmath>
-#include <limits>
-#include <stdexcept>
-
-namespace Utils {
-template <typename ParticleRange>
-bool check_charge_neutrality(ParticleRange &prange) {
-  using namespace boost::accumulators;
-  using KahanSum = accumulator_set<double, features<tag::sum_kahan>>;
-
-  KahanSum q_sum;
-  auto q_min = std::numeric_limits<double>::infinity();
-  constexpr auto relative_tolerance = 2e-12;
-
-  for (auto const &p : prange) {
-    auto const &q = p.q();
-
-    if (q != 0.0) {
-      q_sum(q);
-      q_min = std::min(q_min, std::abs(q));
-    }
-  }
-
-  auto const excess_ratio = std::abs(sum_kahan(q_sum)) / q_min;
-
-  return excess_ratio <= relative_tolerance;
-}
-} // namespace Utils
-#endif
diff --git a/src/utils/include/utils/constants.hpp b/src/utils/include/utils/constants.hpp
deleted file mode 100644
index 3c5d2abb7d0..00000000000
--- a/src/utils/include/utils/constants.hpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (C) 2010-2022 The ESPResSo project
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-#ifndef UTILS_CONSTANTS_HPP
-#define UTILS_CONSTANTS_HPP
-
-#include "device_qualifier.hpp"
-
-#include <boost/math/constants/constants.hpp>
-
-namespace Utils {
-
-/*************************************************************/
-/** \name Mathematical, physical and chemical constants.     */
-/*************************************************************/
-/**@{*/
-
-/**
- * @brief Ratio of diameter and circumference of a circle.
- */
-template <class T = double> DEVICE_QUALIFIER constexpr T pi() {
-  return T(3.14159265358979323846264338328L);
-}
-
-/**
- * @brief One over square root of pi.
- */
-template <class T = double> DEVICE_QUALIFIER constexpr T sqrt_pi_i() {
-  return T(0.56418958354775627928034964498L);
-}
-
-/**
- * @brief Euler-Mascheroni constant.
- */
-template <class T = double> DEVICE_QUALIFIER constexpr T gamma() {
-  return T(0.57721566490153286060651209008L);
-}
-
-/**
- * @brief Natural logarithm of 2.
- */
-template <class T = double> DEVICE_QUALIFIER constexpr T ln_2() {
-  return T(0.6931471805599453094172321214581766L);
-}
-
-/**
- * @brief Square root of 2.
- */
-template <class T = double> DEVICE_QUALIFIER constexpr T sqrt_2() {
-  return T(1.4142135623730950488016887242096981L);
-}
-
-/**
- * @brief Cube root of 2.
- */
-template <class T = double> DEVICE_QUALIFIER constexpr T cbrt_2() {
-  return T(1.25992104989487316476721060727822835057025L);
-}
-
-/**@}*/
-
-/// error code if no error occurred
-#define ES_OK 0
-/// error code if an error occurred
-#define ES_ERROR 1
-
-} // namespace Utils
-
-#endif
diff --git a/src/utils/include/utils/interpolation/bspline_3d.hpp b/src/utils/include/utils/interpolation/bspline_3d.hpp
index 448d5c4e84a..7e90b1d9a2c 100644
--- a/src/utils/include/utils/interpolation/bspline_3d.hpp
+++ b/src/utils/include/utils/interpolation/bspline_3d.hpp
@@ -16,8 +16,8 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef UTILS_INTERPOLATION_HPP
-#define UTILS_INTERPOLATION_HPP
+
+#pragma once
 
 #include "utils/Vector.hpp"
 
@@ -42,12 +42,12 @@ namespace Interpolation {
  * @param offset Shift of the grid relative to the origin.
  */
 template <int order, typename Kernel>
-void bspline_3d(const Vector3d &pos, const Kernel &kernel,
-                const Vector3d &grid_spacing, const Vector3d &offset) {
+void bspline_3d(Vector3d const &pos, Kernel const &kernel,
+                Vector3d const &grid_spacing, Vector3d const &offset) {
   using Utils::bspline;
 
   /* The coordinates and relative distance of the assignment cube. */
-  const auto block = detail::ll_and_dist<order>(pos, grid_spacing, offset);
+  auto const block = detail::ll_and_dist<order>(pos, grid_spacing, offset);
 
   /* Precalc weights that are used multiple times. */
   std::array<double, order> w_y{};
@@ -60,10 +60,10 @@ void bspline_3d(const Vector3d &pos, const Kernel &kernel,
   std::array<int, 3> ind;
   for (int i = 0; i < order; i++) {
     ind[0] = block.corner[0] + i;
-    const auto wx = bspline<order>(i, block.distance[0]);
+    auto const wx = bspline<order>(i, block.distance[0]);
     for (int j = 0; j < order; j++) {
       ind[1] = block.corner[1] + j;
-      const auto wxy = wx * w_y[static_cast<unsigned>(j)];
+      auto const wxy = wx * w_y[static_cast<unsigned>(j)];
       for (int k = 0; k < order; k++) {
         ind[2] = block.corner[2] + k;
         kernel(ind, wxy * w_z[static_cast<unsigned>(k)]);
@@ -76,13 +76,13 @@ void bspline_3d(const Vector3d &pos, const Kernel &kernel,
  * @brief cardinal B-spline weighted sum.
  */
 template <int order, typename T, typename Kernel>
-T bspline_3d_accumulate(const Vector3d &pos, const Kernel &kernel,
-                        const Vector3d &grid_spacing, const Vector3d &offset,
+T bspline_3d_accumulate(Vector3d const &pos, Kernel const &kernel,
+                        Vector3d const &grid_spacing, Vector3d const &offset,
                         T const &init) {
   T value = init;
   bspline_3d<order>(
       pos,
-      [&value, &kernel](const std::array<int, 3> &ind, double w) {
+      [&value, &kernel](std::array<int, 3> const &ind, double w) {
         value += w * kernel(ind);
       },
       grid_spacing, offset);
@@ -92,5 +92,3 @@ T bspline_3d_accumulate(const Vector3d &pos, const Kernel &kernel,
 
 } // namespace Interpolation
 } // namespace Utils
-
-#endif
diff --git a/src/utils/include/utils/interpolation/bspline_3d_gradient.hpp b/src/utils/include/utils/interpolation/bspline_3d_gradient.hpp
index 813c439db7e..8de1ada168f 100644
--- a/src/utils/include/utils/interpolation/bspline_3d_gradient.hpp
+++ b/src/utils/include/utils/interpolation/bspline_3d_gradient.hpp
@@ -16,8 +16,8 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef UTILS_INTERPOLATION_GRADIENT_HPP
-#define UTILS_INTERPOLATION_GRADIENT_HPP
+
+#pragma once
 
 #include "utils/Vector.hpp"
 
@@ -42,13 +42,13 @@ namespace Interpolation {
  * @param grid_spacing The distance between the grid points.
  * @param offset Shift of the grid relative to the origin.
  */
-template <std::size_t order, typename Kernel>
-void bspline_3d_gradient(const Vector3d &pos, const Kernel &kernel,
-                         const Vector3d &grid_spacing, const Vector3d &offset) {
+template <int order, typename Kernel>
+void bspline_3d_gradient(Vector3d const &pos, Kernel const &kernel,
+                         Vector3d const &grid_spacing, Vector3d const &offset) {
   using Utils::bspline;
 
   /* The coordinates and relative distance of the assignment cube. */
-  const auto block = detail::ll_and_dist<order>(pos, grid_spacing, offset);
+  auto const block = detail::ll_and_dist<order>(pos, grid_spacing, offset);
 
   /* Precalc weights that are used multiple times. */
   std::array<double, order> w_y;
@@ -65,8 +65,8 @@ void bspline_3d_gradient(const Vector3d &pos, const Kernel &kernel,
   std::array<int, 3> ind;
   for (int i = 0; i < order; i++) {
     ind[0] = block.corner[0] + i;
-    const auto w_x = bspline<order>(i, block.distance[0]);
-    const auto dw_x = bspline_d<order>(i, block.distance[0]) / grid_spacing[0];
+    auto const w_x = bspline<order>(i, block.distance[0]);
+    auto const dw_x = bspline_d<order>(i, block.distance[0]) / grid_spacing[0];
     for (int j = 0; j < order; j++) {
       ind[1] = block.corner[1] + j;
       for (int k = 0; k < order; k++) {
@@ -81,10 +81,10 @@ void bspline_3d_gradient(const Vector3d &pos, const Kernel &kernel,
 /**
  * @brief cardinal B-spline weighted sum.
  */
-template <std::size_t order, typename T, typename Kernel>
-T bspline_3d_gradient_accumulate(const Vector3d &pos, const Kernel &kernel,
-                                 const Vector3d &grid_spacing,
-                                 const Vector3d &offset, T const &init) {
+template <int order, typename T, typename Kernel>
+T bspline_3d_gradient_accumulate(Vector3d const &pos, Kernel const &kernel,
+                                 Vector3d const &grid_spacing,
+                                 Vector3d const &offset, T const &init) {
   T value = init;
   bspline_3d_gradient<order>(
       pos,
@@ -97,5 +97,3 @@ T bspline_3d_gradient_accumulate(const Vector3d &pos, const Kernel &kernel,
 }
 } // namespace Interpolation
 } // namespace Utils
-
-#endif
diff --git a/src/utils/include/utils/interpolation/detail/ll_and_dist.hpp b/src/utils/include/utils/interpolation/detail/ll_and_dist.hpp
index 62785639eca..dff3492d237 100644
--- a/src/utils/include/utils/interpolation/detail/ll_and_dist.hpp
+++ b/src/utils/include/utils/interpolation/detail/ll_and_dist.hpp
@@ -16,51 +16,44 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef UTILS_INTERPOLATION_DETAIL_LL_AND_DIST_HPP
-#define UTILS_INTERPOLATION_DETAIL_LL_AND_DIST_HPP
+
+#pragma once
 
 #include "utils/Vector.hpp"
 
 #include <array>
 #include <cmath>
 
-namespace Utils {
-namespace Interpolation {
-namespace detail {
+namespace Utils::Interpolation::detail {
 
 struct Block {
-  /* Index of the lower left corner of the assignment cube */
-  const std::array<int, 3> corner;
-  /* Distance to the nearest mesh point in units of h \in [-0.5, 0.5) */
-  const Vector3d distance;
+  /** Index of the lower left corner of the assignment cube */
+  std::array<int, 3> corner;
+  /** Distance to the nearest mesh point in units of agrid in [-0.5, 0.5) */
+  std::array<double, 3> distance;
 };
 
 /**
  * @brief Calculate the lower left index of a block
- *        stencil with order points side length.
+ *        stencil with @c order points side length.
  */
 template <int order>
-Block ll_and_dist(const Vector3d &pos, const Vector3d &grid_spacing,
-                  const Vector3d &offset) {
-  Vector3d dist;
-  std::array<int, 3> ll;
-
-  for (unsigned int dim = 0; dim < 3; ++dim) {
+auto ll_and_dist(Vector3d const &pos, Vector3d const &grid_spacing,
+                 Vector3d const &offset) {
+  Block block{};
+  for (unsigned int dim = 0u; dim < 3u; ++dim) {
     auto const fractional_index = (pos[dim] - offset[dim]) / grid_spacing[dim];
     int nmp;
     if constexpr (order % 2 == 0) {
       nmp = static_cast<int>(std::floor(fractional_index));
-      dist[dim] = fractional_index - nmp - 0.5;
+      block.distance[dim] = fractional_index - nmp - 0.5;
     } else {
       nmp = static_cast<int>(std::floor(fractional_index + 0.5));
-      dist[dim] = fractional_index - nmp;
+      block.distance[dim] = fractional_index - nmp;
     }
-    ll[dim] = nmp - (order - 1) / 2;
+    block.corner[dim] = nmp - (order - 1) / 2;
   }
-  return {ll, dist};
+  return block;
 }
-} // namespace detail
-} // namespace Interpolation
-} // namespace Utils
 
-#endif
+} // namespace Utils::Interpolation::detail
diff --git a/src/utils/include/utils/linear_interpolation.hpp b/src/utils/include/utils/linear_interpolation.hpp
index 63328bccdbb..87d3e6fba15 100644
--- a/src/utils/include/utils/linear_interpolation.hpp
+++ b/src/utils/include/utils/linear_interpolation.hpp
@@ -16,10 +16,11 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef UTILS_LINEAR_INTERPOLATION_HPP
-#define UTILS_LINEAR_INTERPOLATION_HPP
+
+#pragma once
 
 #include <cassert>
+#include <cstddef>
 
 namespace Utils {
 /** Linear interpolation between two data points.
@@ -34,7 +35,7 @@ T linear_interpolation(Container const &table, T hi, T offset, T x) {
   auto const dind = (x - offset) * hi;
   auto const ind = static_cast<int>(dind);
   assert(ind <= dind);
-  assert((ind >= 0) and (ind < table.size()));
+  assert((ind >= 0) and (static_cast<std::size_t>(ind) < table.size()));
   auto const dx = dind - static_cast<T>(ind);
   auto const uind = static_cast<unsigned int>(ind);
 
@@ -42,5 +43,3 @@ T linear_interpolation(Container const &table, T hi, T offset, T x) {
   return table[uind] * (T{1} - dx) + table[uind + 1] * dx;
 }
 } // namespace Utils
-
-#endif
diff --git a/src/utils/include/utils/mask.hpp b/src/utils/include/utils/mask.hpp
index 39ee422e9d2..954c517ea9a 100644
--- a/src/utils/include/utils/mask.hpp
+++ b/src/utils/include/utils/mask.hpp
@@ -54,11 +54,11 @@ auto mask_impl(Integral mask, T t, std::index_sequence<I...>) {
  * @return t partially zeroed out according to mask
  */
 template <class T, class Integral>
-auto mask(Integral mask, T t)
-    -> std::enable_if_t<std::is_unsigned_v<Integral> &&
-                            (size_in_bits<Integral>::value >=
-                             tuple_size<T>::value),
-                        T> {
+auto mask(Integral mask,
+          T t) -> std::enable_if_t<std::is_unsigned_v<Integral> &&
+                                       (size_in_bits<Integral>::value >=
+                                        tuple_size<T>::value),
+                                   T> {
   return detail::mask_impl(mask, t,
                            std::make_index_sequence<tuple_size<T>::value>{});
 }
diff --git a/src/utils/include/utils/math/abs.hpp b/src/utils/include/utils/math/abs.hpp
deleted file mode 100644
index 4576f2c9964..00000000000
--- a/src/utils/include/utils/math/abs.hpp
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (C) 2010-2022 The ESPResSo project
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-#ifndef ESPRESSO_ABS_HPP
-#define ESPRESSO_ABS_HPP
-
-#include "utils/device_qualifier.hpp"
-
-#ifndef __CUDACC__
-#include <cmath>
-#endif
-
-namespace Utils {
-/**
- * @brief Return the absolute value of x.
- */
-inline DEVICE_QUALIFIER double abs(double x) { return fabs(x); }
-
-/**
- * @brief Return the absolute value of x.
- */
-inline DEVICE_QUALIFIER float abs(float x) { return fabsf(x); }
-} // namespace Utils
-
-#endif // ESPRESSO_DEVICE_MATH_HPP
diff --git a/src/utils/include/utils/math/bspline.hpp b/src/utils/include/utils/math/bspline.hpp
index b481f87e324..37a5ef45c03 100644
--- a/src/utils/include/utils/math/bspline.hpp
+++ b/src/utils/include/utils/math/bspline.hpp
@@ -16,8 +16,8 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef UTILS_MATH_BSPLINE_HPP
-#define UTILS_MATH_BSPLINE_HPP
+
+#pragma once
 
 #include "sqr.hpp"
 #include "utils/device_qualifier.hpp"
@@ -28,8 +28,8 @@
 namespace Utils {
 /** @brief Formula of the B-spline. */
 template <int order, typename T>
-DEVICE_QUALIFIER auto bspline(int i, T x)
-    -> std::enable_if_t<(order > 0) && (order <= 7), T> {
+DEVICE_QUALIFIER auto
+bspline(int i, T x) -> std::enable_if_t<(order > 0) && (order <= 7), T> {
   DEVICE_ASSERT(i < order);
   DEVICE_ASSERT(x >= T(-0.5));
   DEVICE_ASSERT(x <= T(0.5));
@@ -206,8 +206,8 @@ template <class T> auto bspline(int i, T x, int k) {
 
 /** @brief Derivative of the B-spline. */
 template <int order, typename T = double>
-DEVICE_QUALIFIER auto bspline_d(int i, T x)
-    -> std::enable_if_t<(order > 0) && (order <= 7), T> {
+DEVICE_QUALIFIER auto
+bspline_d(int i, T x) -> std::enable_if_t<(order > 0) && (order <= 7), T> {
   DEVICE_ASSERT(i < order);
   DEVICE_ASSERT(x >= T(-0.5));
   DEVICE_ASSERT(x <= T(0.5));
@@ -323,5 +323,3 @@ DEVICE_QUALIFIER auto bspline_d(int i, T x)
   return T{};
 }
 } // namespace Utils
-
-#endif
diff --git a/src/utils/include/utils/math/coordinate_transformation.hpp b/src/utils/include/utils/math/coordinate_transformation.hpp
index c5808511be1..0ce6aff31b3 100644
--- a/src/utils/include/utils/math/coordinate_transformation.hpp
+++ b/src/utils/include/utils/math/coordinate_transformation.hpp
@@ -16,8 +16,8 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef UTILS_COORDINATE_TRANSFORMATION_HPP
-#define UTILS_COORDINATE_TRANSFORMATION_HPP
+
+#pragma once
 
 /**
  * @file
@@ -31,10 +31,8 @@
  */
 
 #include "utils/Vector.hpp"
-#include "utils/constants.hpp"
 #include "utils/math/vec_rotate.hpp"
 #include "utils/matrix.hpp"
-#include "utils/quaternion.hpp"
 
 #include <cassert>
 #include <cmath>
@@ -166,4 +164,3 @@ inline Vector3d transform_vector_cartesian_to_cylinder(Vector3d const &vec,
 }
 
 } // namespace Utils
-#endif
diff --git a/src/utils/include/utils/math/cylindrical_transformation_parameters.hpp b/src/utils/include/utils/math/cylindrical_transformation_parameters.hpp
index f3a055556bf..6893f04d58e 100644
--- a/src/utils/include/utils/math/cylindrical_transformation_parameters.hpp
+++ b/src/utils/include/utils/math/cylindrical_transformation_parameters.hpp
@@ -16,13 +16,14 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef ESPRESSO_CYLINDER_TRANSFORMATION_PARAMETERS_HPP
-#define ESPRESSO_CYLINDER_TRANSFORMATION_PARAMETERS_HPP
 
+#pragma once
+
+#include <cmath>
+#include <limits>
 #include <stdexcept>
 #include <string>
 
-#include <utils/math/abs.hpp>
 #include <utils/math/orthonormal_vec.hpp>
 
 namespace Utils {
@@ -63,30 +64,28 @@ class CylindricalTransformationParameters {
 
 private:
   void validate() const {
-    auto constexpr eps = 10 * std::numeric_limits<double>::epsilon();
-    if (Utils::abs(m_orientation * m_axis) > eps) {
+    auto constexpr eps = 10. * std::numeric_limits<double>::epsilon();
+    if (std::fabs(m_orientation * m_axis) > eps) {
       throw std::runtime_error(
           "CylindricalTransformationParameters: Axis and orientation must be "
           "orthogonal. Scalar product is " +
           std::to_string(m_orientation * m_axis));
     }
-    if (Utils::abs(m_axis.norm() - 1) > eps) {
+    if (std::fabs(m_axis.norm() - 1.) > eps) {
       throw std::runtime_error("CylindricalTransformationParameters: Axis must "
                                "be normalized. Norm is " +
                                std::to_string(m_axis.norm()));
     }
-    if (Utils::abs(m_orientation.norm() - 1) > eps) {
+    if (std::fabs(m_orientation.norm() - 1.) > eps) {
       throw std::runtime_error("CylindricalTransformationParameters: "
                                "orientation must be normalized. Norm is " +
                                std::to_string(m_orientation.norm()));
     }
   }
 
-  const Utils::Vector3d m_center{};
-  const Utils::Vector3d m_axis{0, 0, 1};
-  const Utils::Vector3d m_orientation{1, 0, 0};
+  Utils::Vector3d const m_center{};
+  Utils::Vector3d const m_axis{0., 0., 1.};
+  Utils::Vector3d const m_orientation{1., 0., 0.};
 };
 
 } // namespace Utils
-
-#endif // ESPRESSO_CYLINDER_TRANSFORMATION_PARAMETERS_HPP
diff --git a/src/utils/include/utils/math/make_lin_space.hpp b/src/utils/include/utils/math/make_lin_space.hpp
index 152edcb442b..bfea7ec555e 100644
--- a/src/utils/include/utils/math/make_lin_space.hpp
+++ b/src/utils/include/utils/math/make_lin_space.hpp
@@ -17,43 +17,32 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#ifndef UTILS_MAKE_LIN_SPACE_HPP
-#define UTILS_MAKE_LIN_SPACE_HPP
-
-#include <boost/iterator/counting_iterator.hpp>
-#include <boost/iterator/transform_iterator.hpp>
-#include <boost/range/iterator_range.hpp>
+#pragma once
 
+#include <algorithm>
 #include <cstddef>
+#include <ranges>
 
 namespace Utils {
 /**
  * @brief Equally spaced values in interval
  *
  * Returns a range of equally spaced values in
- * the range of start and stop, like numpy.linspace.
+ * the range @p start to @p stop, like @c numpy.linspace().
  *
  * @tparam T floating point type
  * @param start Start value of the interval
  * @param stop End value of the interval
  * @param number Number of partition points
- * @param endpoint If true, the last point is
- *        stop, otherwise one less.
- * @return Range of equally spaced values
+ * @param endpoint If true and @p number > 1, the last point is @p stop.
+ * @return Range of equally-spaced values; the first one is always @p start
  */
 template <class T>
 auto make_lin_space(T start, T stop, std::size_t number, bool endpoint = true) {
-  using boost::make_counting_iterator;
-  using boost::make_iterator_range;
-  using boost::make_transform_iterator;
-
   auto const dx = (stop - start) / T(number - endpoint);
-  auto x = [dx, start](std::size_t i) { return start + T(i) * dx; };
 
-  return make_iterator_range(
-      make_transform_iterator(make_counting_iterator(std::size_t(0)), x),
-      make_transform_iterator(make_counting_iterator(number), x));
+  return std::views::transform(
+      std::views::iota(std::size_t{0u}, number),
+      [dx, start](std::size_t i) { return i ? start + T(i) * dx : start; });
 }
 } // namespace Utils
-
-#endif
diff --git a/src/utils/include/utils/math/orthonormal_vec.hpp b/src/utils/include/utils/math/orthonormal_vec.hpp
index 52c83b939d9..2fb6cf98466 100644
--- a/src/utils/include/utils/math/orthonormal_vec.hpp
+++ b/src/utils/include/utils/math/orthonormal_vec.hpp
@@ -16,13 +16,13 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef ESPRESSO_ORTHONORMAL_VEC_HPP
-#define ESPRESSO_ORTHONORMAL_VEC_HPP
+
+#pragma once
 
 #include "utils/Vector.hpp"
-#include "utils/constants.hpp"
 
 #include <cstddef>
+#include <numbers>
 
 namespace Utils {
 /**
@@ -34,16 +34,16 @@ Vector<T, N> calc_orthonormal_vector(Vector<T, N> const &vec) {
    trial vector. Only works if the trial vector is not parallel, so we have to
    try a second one in that case
   */
-  Vector<Vector<T, N>, 2> try_vectors = {Vector<T, N>::broadcast(0),
-                                         Vector<T, N>::broadcast(0)};
-  try_vectors[0][0] = 1;
-  try_vectors[1][1] = 1;
+  Vector<Vector<T, N>, 2> try_vectors = {Vector<T, N>::broadcast(T(0)),
+                                         Vector<T, N>::broadcast(T(0))};
+  try_vectors[0][0] = T(1);
+  try_vectors[1][1] = T(1);
 
   Vector<T, N> ret;
   for (auto v : try_vectors) {
     auto orth_component = v - (v * vec) / vec.norm2() * vec;
     auto norm = orth_component.norm();
-    if (norm >= 1. / Utils::sqrt_2()) {
+    if (norm >= 1. / std::numbers::sqrt2) {
       ret = orth_component / norm;
       break;
     }
@@ -52,5 +52,3 @@ Vector<T, N> calc_orthonormal_vector(Vector<T, N> const &vec) {
 }
 
 } // namespace Utils
-
-#endif // ESPRESSO_ORTHONORMAL_VEC_HPP
\ No newline at end of file
diff --git a/src/utils/include/utils/math/quaternion.hpp b/src/utils/include/utils/math/quaternion.hpp
index 1c0aef6b6bc..05a2f47d3c3 100644
--- a/src/utils/include/utils/math/quaternion.hpp
+++ b/src/utils/include/utils/math/quaternion.hpp
@@ -19,18 +19,18 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#ifndef UTILS_MATH_QUATERNION_HPP
-#define UTILS_MATH_QUATERNION_HPP
+#pragma once
+
 /** \file
  *  Quaternion algebra.
  */
 
 #include "utils/Vector.hpp"
-#include "utils/constants.hpp"
 #include "utils/quaternion.hpp"
 
 #include <cmath>
 #include <limits>
+#include <numbers>
 
 namespace Utils {
 
@@ -63,19 +63,19 @@ Quaternion<T> convert_director_to_quaternion(Vector<T, 3> const &d) {
   // Calculate angles
   auto const d_xy = std::sqrt(d[0] * d[0] + d[1] * d[1]);
   T theta2, phi2;
-  if (d_xy == 0) {
+  if (d_xy == T{0}) {
     // Here the director is co-linear with the z-axis
     // We need to distinguish between (0, 0, +d_z) and (0, 0, -d_z)
-    theta2 = (d[2] > 0) ? 0 : Utils::pi<T>() / 2;
-    phi2 = 0;
+    theta2 = (d[2] > T{0}) ? T{0} : std::numbers::pi_v<T> / T{2};
+    phi2 = T{0};
   } else {
     // Here we take care of all other directions
     // We suppose that theta2 = theta/2 and phi2 = (phi - pi/2)/2,
     // where angles theta and phi are in spherical coordinates
-    theta2 = std::acos(d[2] / dm) / 2;
+    theta2 = std::acos(d[2] / dm) / T{2};
     // here we do not use the signum function due to the edge case d[1] = 0
-    auto const phi = ((d[1] > 0) ? 1 : -1) * std::acos(d[0] / d_xy);
-    phi2 = phi / 2 - Utils::pi<T>() / 4;
+    auto const phi = ((d[1] > T{0}) ? T{1} : T{-1}) * std::acos(d[0] / d_xy);
+    phi2 = phi / T{2} - std::numbers::pi_v<T> / T{4};
   }
+    phi2 = phi / 2. - std::numbers::pi_v<T> / 4.;
   }
 
   // Calculate the quaternion from the angles
@@ -88,4 +88,3 @@ Quaternion<T> convert_director_to_quaternion(Vector<T, 3> const &d) {
 }
 
 } // namespace Utils
-#endif
diff --git a/src/utils/include/utils/math/sinc.hpp b/src/utils/include/utils/math/sinc.hpp
deleted file mode 100644
index e6705c09fe1..00000000000
--- a/src/utils/include/utils/math/sinc.hpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (C) 2010-2022 The ESPResSo project
- * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
- *   Max-Planck-Institute for Polymer Research, Theory Group
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef UTILS_MATH_SINC_HPP
-#define UTILS_MATH_SINC_HPP
-
-#include "utils/constants.hpp"
-#include "utils/device_qualifier.hpp"
-#include "utils/math/abs.hpp"
-
-#include <cmath>
-
-namespace Utils {
-/**
- * @brief Calculates the sinc-function as sin(PI*x)/(PI*x).
- *
- * (same convention as in @cite hockney88a). In order to avoid
- * divisions by 0, arguments, whose modulus is smaller than epsi, will
- * be evaluated by an 8th order Taylor expansion of the sinc
- * function. Note that the difference between sinc(x) and this
- * expansion is smaller than 0.235e-12, if x is smaller than 0.1. (The
- * next term in the expansion is the 10th order contribution
- * PI^10/39916800 * x^10 = 0.2346...*x^12). This expansion should
- * also save time, since it reduces the number of function calls to
- * sin().
- */
-template <typename T> DEVICE_QUALIFIER T sinc(T d) {
-  const constexpr T epsi = T(0.1);
-
-  const auto PId = pi<T>() * d;
-
-  if (::Utils::abs(d) > epsi)
-    return sin(PId) / PId;
-
-  /* Coefficients of the Taylor expansion of sinc */
-  const constexpr T c2 = T(-0.1666666666667e-0);
-  const constexpr T c4 = T(0.8333333333333e-2);
-  const constexpr T c6 = T(-0.1984126984127e-3);
-  const constexpr T c8 = T(0.2755731922399e-5);
-
-  const auto PId2 = PId * PId;
-  return T(1) + PId2 * (c2 + PId2 * (c4 + PId2 * (c6 + PId2 * c8)));
-}
-} // namespace Utils
-
-#endif
diff --git a/src/utils/include/utils/math/sqr.hpp b/src/utils/include/utils/math/sqr.hpp
index e5b88709554..a5ac0f25969 100644
--- a/src/utils/include/utils/math/sqr.hpp
+++ b/src/utils/include/utils/math/sqr.hpp
@@ -1,5 +1,7 @@
 /*
  * Copyright (C) 2010-2022 The ESPResSo project
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
+ *   Max-Planck-Institute for Polymer Research, Theory Group
  *
  * This file is part of ESPResSo.
  *
@@ -16,8 +18,8 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef UTILS_MATH_SQR_HPP
-#define UTILS_MATH_SQR_HPP
+
+#pragma once
 
 #include "utils/device_qualifier.hpp"
 
@@ -25,5 +27,3 @@ namespace Utils {
 /** Calculates the SQuaRe of x */
 template <typename T> DEVICE_QUALIFIER constexpr T sqr(T x) { return x * x; }
 } // namespace Utils
-
-#endif
diff --git a/src/utils/include/utils/math/triangle_functions.hpp b/src/utils/include/utils/math/triangle_functions.hpp
index 32ccc102c37..389bd6d1154 100644
--- a/src/utils/include/utils/math/triangle_functions.hpp
+++ b/src/utils/include/utils/math/triangle_functions.hpp
@@ -20,10 +20,10 @@
 #define UTILS_MATH_TRIANGLE_FUNCTIONS_HPP
 
 #include "utils/Vector.hpp"
-#include "utils/constants.hpp"
 
 #include <algorithm>
 #include <cmath>
+#include <numbers>
 
 namespace Utils {
 /**
@@ -85,7 +85,7 @@ inline double angle_btw_triangles(const Vector3d &P1, const Vector3d &P2,
   // The angle between the faces (not considering
   // the orientation, always less or equal to Pi)
   // is equal to Pi minus angle between the normals
-  auto const phi = Utils::pi() - std::acos(cosine);
+  auto const phi = std::numbers::pi - std::acos(cosine);
 
   // Now we need to determine, if the angle between two triangles is less than
   // Pi or greater than Pi. To do this, we check if the point P4 lies in the
@@ -97,7 +97,7 @@ inline double angle_btw_triangles(const Vector3d &P1, const Vector3d &P2,
   // Point P4 lies in the halfspace given by normal iff n_x*P4_x + n_y*P4_y +
   // n_z*P4_z + d >= 0
   if (normal1 * P4 - normal1 * P1 < 0)
-    return 2 * Utils::pi() - phi;
+    return 2. * std::numbers::pi - phi;
 
   return phi;
 }
diff --git a/src/utils/include/utils/matrix.hpp b/src/utils/include/utils/matrix.hpp
index 30ecb31baef..bb21b6411bd 100644
--- a/src/utils/include/utils/matrix.hpp
+++ b/src/utils/include/utils/matrix.hpp
@@ -77,7 +77,7 @@ template <typename T, std::size_t Rows, std::size_t Cols> struct Matrix {
 private:
   friend class boost::serialization::access;
   template <class Archive> void serialize(Archive &ar, const unsigned int) {
-    ar &m_data;
+    ar & m_data;
   }
 
 public:
diff --git a/src/utils/include/utils/mpi/iall_gatherv.hpp b/src/utils/include/utils/mpi/iall_gatherv.hpp
index 14c5a9729fa..2d563ad8767 100644
--- a/src/utils/include/utils/mpi/iall_gatherv.hpp
+++ b/src/utils/include/utils/mpi/iall_gatherv.hpp
@@ -17,26 +17,25 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#ifndef UTILS_MPI_ALL_GATHERV_HPP
-#define UTILS_MPI_ALL_GATHERV_HPP
-
-#include "utils/Span.hpp"
+#pragma once
 
 #include <boost/mpi/communicator.hpp>
 #include <boost/mpi/request.hpp>
 
 #include <algorithm>
+#include <cstddef>
+#include <span>
 #include <vector>
 
 namespace Utils {
 namespace Mpi {
 namespace detail {
 
-inline std::vector<int> displacements(Span<int const> sizes) {
+inline std::vector<int> displacements(std::span<int const> sizes) {
   std::vector<int> displ(sizes.size());
 
   int offset = 0;
-  for (int i = 0; i < displ.size(); i++) {
+  for (std::size_t i = 0u; i < displ.size(); i++) {
     displ[i] = offset;
     offset += sizes[i];
   }
@@ -81,4 +80,3 @@ auto iall_gatherv(boost::mpi::communicator const &comm, T const *in_values,
 }
 } // namespace Mpi
 } // namespace Utils
-#endif
diff --git a/src/utils/include/utils/mpi/scatter_buffer.hpp b/src/utils/include/utils/mpi/scatter_buffer.hpp
index 9d14b436bcb..b2628ed1fd3 100644
--- a/src/utils/include/utils/mpi/scatter_buffer.hpp
+++ b/src/utils/include/utils/mpi/scatter_buffer.hpp
@@ -50,7 +50,7 @@ void scatter_buffer(T *buffer, int n_elem, boost::mpi::communicator comm,
 
     detail::size_and_offset<T>(sizes, displ, n_elem, comm, root);
 
-    for (int i = 0; i < comm.size(); i++) {
+    for (auto i = 0u; i < static_cast<unsigned>(comm.size()); i++) {
       sizes[i] *= sizeof(T);
       displ[i] *= sizeof(T);
     }
diff --git a/src/utils/include/utils/quaternion.hpp b/src/utils/include/utils/quaternion.hpp
index 4ad1f3b5780..d74b2161ec5 100644
--- a/src/utils/include/utils/quaternion.hpp
+++ b/src/utils/include/utils/quaternion.hpp
@@ -42,6 +42,7 @@
 #include "utils/Array.hpp"
 #include "utils/Vector.hpp"
 #include "utils/matrix.hpp"
+#include "utils/serialization/array.hpp"
 
 #include <cassert>
 #include <cstddef>
@@ -64,7 +65,7 @@ template <typename T> struct Quaternion {
 private:
   friend class boost::serialization::access;
   template <class Archive> void serialize(Archive &ar, const unsigned int) {
-    ar &m_data;
+    ar & m_data;
   }
 
 public:
diff --git a/src/utils/include/utils/raster.hpp b/src/utils/include/utils/raster.hpp
index 99ad9611fb8..1d50bb8e5c7 100644
--- a/src/utils/include/utils/raster.hpp
+++ b/src/utils/include/utils/raster.hpp
@@ -16,8 +16,8 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef UTILS_RASTER_HPP
-#define UTILS_RASTER_HPP
+
+#pragma once
 
 #include "utils/Vector.hpp"
 #include "utils/math/make_lin_space.hpp"
@@ -59,5 +59,3 @@ auto raster(Vector<T, 3> const &offset, Vector<T, 3> const &grid_spacing,
   return res;
 }
 } // namespace Utils
-
-#endif // UTILS_RASTER_HPP
diff --git a/src/utils/include/utils/sampling.hpp b/src/utils/include/utils/sampling.hpp
index f85122669e0..55aaa424f49 100644
--- a/src/utils/include/utils/sampling.hpp
+++ b/src/utils/include/utils/sampling.hpp
@@ -16,11 +16,10 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef UTILS_SAMPLING_HPP
-#define UTILS_SAMPLING_HPP
+
+#pragma once
 
 #include "utils/Vector.hpp"
-#include "utils/constants.hpp"
 #include "utils/math/make_lin_space.hpp"
 #include "utils/math/sqr.hpp"
 
@@ -66,8 +65,11 @@ std::vector<Vector3d> get_cylindrical_sampling_positions(
   auto const delta_z =
       (z_limits.second - z_limits.first) / static_cast<double>(min_n_samples);
 
-  auto const r_range = make_lin_space(r_limits.first + .5 * delta_r,
-                                      r_limits.second, n_r_bins, endpoint);
+  std::vector<double> r_range;
+  std::ranges::copy(make_lin_space(r_limits.first + .5 * delta_r,
+                                   r_limits.second, n_r_bins, endpoint),
+                    std::back_inserter(r_range));
+
   auto const phi_range =
       make_lin_space(phi_limits.first + .5 * delta_phi, phi_limits.second,
                      n_phi_bins, endpoint);
@@ -78,7 +80,7 @@ std::vector<Vector3d> get_cylindrical_sampling_positions(
   std::vector<Vector3d> sampling_positions;
   for (auto const z : z_range) {
     for (auto const phi : phi_range) {
-      sampling_positions.push_back(Vector3d{{*r_range.begin(), phi, z}});
+      sampling_positions.push_back(Vector3d{{r_range.front(), phi, z}});
     }
   }
 
@@ -86,17 +88,17 @@ std::vector<Vector3d> get_cylindrical_sampling_positions(
   auto phis = [n_phi_bins, phi_limits](long r_bin) {
     auto const phis_range = make_lin_space(
         phi_limits.first, phi_limits.second,
-        n_phi_bins * (static_cast<std::size_t>(r_bin) + 1), endpoint);
+        n_phi_bins * (static_cast<std::size_t>(r_bin) + 1u), endpoint);
     return phis_range;
   };
   // Calculate the sampling positions
   // Along z
   for (auto const z : z_range) {
     // Along r
-    for (auto r = ++r_range.begin(); r != r_range.end(); ++r) {
+    for (auto r_it = ++r_range.begin(); r_it != r_range.end(); ++r_it) {
       // Along phi
-      for (auto const phi : phis(std::distance(r_range.begin(), r))) {
-        sampling_positions.push_back(Vector3d{{*r, phi, z}});
+      for (auto const phi : phis(std::distance(r_range.begin(), r_it))) {
+        sampling_positions.push_back(Vector3d{{*r_it, phi, z}});
       }
     }
   }
@@ -105,5 +107,3 @@ std::vector<Vector3d> get_cylindrical_sampling_positions(
 }
 
 } // namespace Utils
-
-#endif
diff --git a/src/utils/include/utils/serialization/memcpy_archive.hpp b/src/utils/include/utils/serialization/memcpy_archive.hpp
index 04e0ba10aef..55368db0e10 100644
--- a/src/utils/include/utils/serialization/memcpy_archive.hpp
+++ b/src/utils/include/utils/serialization/memcpy_archive.hpp
@@ -16,10 +16,8 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef ESPRESSO_MEMCPY_ARCHIVE_HPP
-#define ESPRESSO_MEMCPY_ARCHIVE_HPP
 
-#include "utils/Span.hpp"
+#pragma once
 
 #include <boost/mpl/bool.hpp>
 #include <boost/serialization/is_bitwise_serializable.hpp>
@@ -30,13 +28,14 @@
 #include <cstddef>
 #include <cstring>
 #include <memory>
+#include <span>
 #include <type_traits>
 
 namespace Utils {
 /** @brief Type trait to indicate that a type is
  *         serializable with a static size, e.g. is
  *         suitable for memcpy serialization. Only
- *         specialize this to std::true_type if it is
+ *         specialize this to @c std::true_type if it is
  *         guaranteed that serializing this type always
  *         returns the same number of bytes, independent
  *         of object state.
@@ -45,31 +44,30 @@ namespace Utils {
  */
 template <class T>
 struct is_statically_serializable
-    : std::integral_constant<
-          bool, std::is_trivially_copyable_v<T> or
-                    boost::serialization::is_bitwise_serializable<T>::value> {};
+    : std::bool_constant<
+          std::is_trivially_copyable_v<T> or
+          boost::serialization::is_bitwise_serializable<T>::value> {};
 
 namespace detail {
 /* Use memcpy for packing */
 template <class T>
-using use_memcpy = std::integral_constant<
-    bool, std::is_trivially_copyable_v<T> or
-              boost::serialization::is_bitwise_serializable<T>::value>;
+using use_memcpy =
+    std::bool_constant<std::is_trivially_copyable_v<T> or
+                       boost::serialization::is_bitwise_serializable<T>::value>;
 /* Use serialize function only if the type is opt-in but not
  * trivially copyable, in which case memcpy is more efficient. */
 template <class T>
-using use_serialize =
-    std::integral_constant<bool, not use_memcpy<T>::value and
-                                     is_statically_serializable<T>::value>;
+using use_serialize = std::bool_constant<not use_memcpy<T>::value and
+                                         is_statically_serializable<T>::value>;
 
 template <class Derived> class BasicMemcpyArchive {
   /** Buffer to write to */
-  Utils::Span<char> buf;
+  std::span<char> buf;
   /** Current position in the buffer */
   char *insert;
 
 public:
-  explicit BasicMemcpyArchive(Utils::Span<char> buf)
+  explicit BasicMemcpyArchive(std::span<char> buf)
       : buf(buf), insert(buf.data()) {}
 
   auto get_library_version() const { return std::size_t{4}; }
@@ -79,21 +77,21 @@ template <class Derived> class BasicMemcpyArchive {
   }
 
   void skip(std::size_t bytes) {
-    assert((insert + bytes) <= buf.end());
+    assert((insert + bytes) <= (buf.data() + buf.size()));
     insert += bytes;
   }
 
 private:
   void read(void *data, std::size_t bytes) {
     /* check that there is enough space left in the buffer */
-    assert((insert + bytes) <= buf.end());
+    assert((insert + bytes) <= (buf.data() + buf.size()));
     std::memcpy(data, insert, bytes);
     insert += bytes;
   }
 
   void write(const void *data, std::size_t bytes) {
     /* check that there is enough space left in the buffer */
-    assert((insert + bytes) <= buf.end());
+    assert((insert + bytes) <= (buf.data() + buf.size()));
     std::memcpy(insert, data, bytes);
     insert += bytes;
   }
@@ -115,7 +113,7 @@ template <class Derived> class BasicMemcpyArchive {
     boost::serialization::serialize_adl(*static_cast<Derived *>(this), value,
                                         4);
     auto const new_pos = insert;
-    assert((new_pos - old_pos) <= sizeof(T));
+    assert(static_cast<std::size_t>(new_pos - old_pos) <= sizeof(T));
 
     auto const padding_size = sizeof(T) - (new_pos - old_pos);
     skip(padding_size);
@@ -123,14 +121,14 @@ template <class Derived> class BasicMemcpyArchive {
 
 public:
   template <class T>
-  auto operator>>(T &value)
-      -> std::enable_if_t<detail::use_serialize<T>::value> {
+  auto
+  operator>>(T &value) -> std::enable_if_t<detail::use_serialize<T>::value> {
     process(value);
   }
 
   template <class T>
-  auto operator<<(T &value)
-      -> std::enable_if_t<detail::use_serialize<T>::value> {
+  auto
+  operator<<(T &value) -> std::enable_if_t<detail::use_serialize<T>::value> {
     process(value);
   }
 
@@ -173,7 +171,7 @@ class MemcpyIArchive : public detail::BasicMemcpyArchive<MemcpyIArchive> {
   /**
    * @param buf Buffer to read from.
    */
-  explicit MemcpyIArchive(Utils::Span<char> buf) : base_type(buf) {}
+  explicit MemcpyIArchive(std::span<char> buf) : base_type(buf) {}
 
   /**
    * @brief Number of bytes read from the buffer.
@@ -207,7 +205,7 @@ class MemcpyOArchive : public detail::BasicMemcpyArchive<MemcpyOArchive> {
   /**
    * @param buf Buffer to write to.
    */
-  explicit MemcpyOArchive(Utils::Span<char> buf) : base_type(buf) {}
+  explicit MemcpyOArchive(std::span<char> buf) : base_type(buf) {}
 
   /**
    * @brief Number of bytes written to the buffer.
@@ -226,5 +224,3 @@ class MemcpyOArchive : public detail::BasicMemcpyArchive<MemcpyOArchive> {
   }
 };
 } // namespace Utils
-
-#endif // ESPRESSO_MEMCPY_ARCHIVE_HPP
diff --git a/src/utils/tests/Array_test.cpp b/src/utils/tests/Array_test.cpp
index 7b75165121c..6fcdef892b2 100644
--- a/src/utils/tests/Array_test.cpp
+++ b/src/utils/tests/Array_test.cpp
@@ -65,9 +65,9 @@ BOOST_AUTO_TEST_CASE(element_access) {
   auto a = Array<int, 5>{{{5, 6, 7, 8, 9}}};
   auto const &b = a;
 
-  int c = 5;
-  int j = 0;
-  for (int i : a) {
+  auto c = 5;
+  auto j = 0u;
+  for (auto i : a) {
     BOOST_CHECK_EQUAL(i, c);
     BOOST_CHECK_EQUAL(a[j], c);
     BOOST_CHECK_EQUAL(b[j], c);
diff --git a/src/utils/tests/Bag_test.cpp b/src/utils/tests/Bag_test.cpp
index fa509127219..046a7f87271 100644
--- a/src/utils/tests/Bag_test.cpp
+++ b/src/utils/tests/Bag_test.cpp
@@ -16,7 +16,8 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#define BOOST_TEST_MODULE Utils::Bag
+
+#define BOOST_TEST_MODULE Utils::Bag test
 #define BOOST_TEST_DYN_LINK
 #include <boost/test/unit_test.hpp>
 
@@ -24,7 +25,6 @@
 
 #include <boost/archive/text_iarchive.hpp>
 #include <boost/archive/text_oarchive.hpp>
-#include <boost/range/algorithm/find.hpp>
 
 #include <algorithm>
 #include <array>
@@ -55,7 +55,7 @@ BOOST_AUTO_TEST_CASE(insert_) {
 
     /* The elements are in the bag */
     for (auto e : elements) {
-      BOOST_CHECK(boost::find(bag, e) != bag.end());
+      BOOST_CHECK(std::ranges::find(bag, e) != bag.end());
     }
   }
 
@@ -88,8 +88,8 @@ BOOST_AUTO_TEST_CASE(erase_) {
     /* the begin iterator is returned */
     BOOST_CHECK(it == bag.begin());
     /* and the other elements are still in the bag */
-    BOOST_CHECK(boost::find(bag, elements[1]) != bag.end());
-    BOOST_CHECK(boost::find(bag, elements[2]) != bag.end());
+    BOOST_CHECK(std::ranges::find(bag, elements[1]) != bag.end());
+    BOOST_CHECK(std::ranges::find(bag, elements[2]) != bag.end());
   }
 
   {
@@ -104,8 +104,8 @@ BOOST_AUTO_TEST_CASE(erase_) {
     /* the correct iterator is returned */
     BOOST_CHECK(it == bag.begin() + 1);
     /* and the other elements are still in the bag */
-    BOOST_CHECK(boost::find(bag, elements[0]) != bag.end());
-    BOOST_CHECK(boost::find(bag, elements[2]) != bag.end());
+    BOOST_CHECK(std::ranges::find(bag, elements[0]) != bag.end());
+    BOOST_CHECK(std::ranges::find(bag, elements[2]) != bag.end());
   }
 
   {
@@ -120,8 +120,8 @@ BOOST_AUTO_TEST_CASE(erase_) {
     /* the correct iterator is returned */
     BOOST_CHECK(it == bag.end());
     /* and the other elements are still in the bag */
-    BOOST_CHECK(boost::find(bag, elements[0]) != bag.end());
-    BOOST_CHECK(boost::find(bag, elements[1]) != bag.end());
+    BOOST_CHECK(std::ranges::find(bag, elements[0]) != bag.end());
+    BOOST_CHECK(std::ranges::find(bag, elements[1]) != bag.end());
   }
 }
 
@@ -149,12 +149,12 @@ BOOST_AUTO_TEST_CASE(iterator_range_) {
 
   /* The range of the non-const iterators spans all elements */
   for (auto const &e : elements) {
-    BOOST_CHECK(boost::find(bag, e) != bag.end());
+    BOOST_CHECK(std::ranges::find(bag, e) != bag.end());
   }
   /* The range of the const iterators spans all elements */
   for (auto const &e : elements) {
-    BOOST_CHECK(boost::find(const_cast<const Utils::Bag<int> &>(bag), e) !=
-                bag.end());
+    BOOST_CHECK(std::ranges::find(const_cast<const Utils::Bag<int> &>(bag),
+                                  e) != bag.end());
   }
 }
 
@@ -207,7 +207,7 @@ BOOST_AUTO_TEST_CASE(resize_) {
   BOOST_CHECK_EQUAL(bag.size(), size);
   /* All the elements are still in the bag */
   for (auto const &e : bag) {
-    BOOST_CHECK(boost::find(bag, e) != bag.end());
+    BOOST_CHECK(std::ranges::find(bag, e) != bag.end());
   }
 }
 
@@ -232,12 +232,12 @@ BOOST_AUTO_TEST_CASE(swap_) {
   /* The elements are swapped */
   BOOST_CHECK_EQUAL(bag2.size(), elements1.size());
   for (auto const &e : elements1) {
-    BOOST_CHECK(boost::find(bag2, e) != bag2.end());
+    BOOST_CHECK(std::ranges::find(bag2, e) != bag2.end());
   }
 
   BOOST_CHECK_EQUAL(bag1.size(), elements2.size());
   for (auto const &e : elements2) {
-    BOOST_CHECK(boost::find(bag1, e) != bag1.end());
+    BOOST_CHECK(std::ranges::find(bag1, e) != bag1.end());
   }
 }
 
@@ -259,5 +259,5 @@ BOOST_AUTO_TEST_CASE(serialize_) {
   boost::archive::text_iarchive(stream) >> restored_bag;
 
   /* The deserialized object contains the same elements */
-  BOOST_CHECK(boost::equal(bag, restored_bag));
+  BOOST_CHECK(std::ranges::equal(bag, restored_bag));
 }
diff --git a/src/utils/tests/CMakeLists.txt b/src/utils/tests/CMakeLists.txt
index d7935217f6e..c7f954e78b4 100644
--- a/src/utils/tests/CMakeLists.txt
+++ b/src/utils/tests/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2010-2022 The ESPResSo project
+# Copyright (C) 2010-2024 The ESPResSo project
 # Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
 #   Max-Planck-Institute for Polymer Research, Theory Group
 #
@@ -19,95 +19,76 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-include(unit_test)
+include(espresso_unit_test)
 
-unit_test(NAME abs_test SRC abs_test.cpp DEPENDS espresso::utils)
-unit_test(NAME Vector_test SRC Vector_test.cpp DEPENDS espresso::utils)
-unit_test(NAME Factory_test SRC Factory_test.cpp DEPENDS espresso::utils)
-unit_test(NAME NumeratedContainer_test SRC NumeratedContainer_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME keys_test SRC keys_test.cpp DEPENDS espresso::utils)
-unit_test(NAME Cache_test SRC Cache_test.cpp DEPENDS espresso::utils)
-unit_test(NAME histogram SRC histogram.cpp DEPENDS espresso::utils)
-unit_test(NAME accumulator SRC accumulator.cpp DEPENDS espresso::utils
-          Boost::serialization)
-unit_test(NAME int_pow SRC int_pow_test.cpp DEPENDS espresso::utils)
-unit_test(NAME sgn SRC sgn_test.cpp DEPENDS espresso::utils)
-unit_test(NAME AS_erfc_part SRC AS_erfc_part_test.cpp DEPENDS espresso::utils)
-unit_test(NAME sinc SRC sinc_test.cpp DEPENDS espresso::utils)
-unit_test(NAME permute_ifield_test SRC permute_ifield_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME vec_rotate SRC vec_rotate_test.cpp DEPENDS espresso::utils)
-unit_test(NAME tensor_product SRC tensor_product_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME linear_interpolation SRC linear_interpolation_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME interpolation_gradient SRC interpolation_gradient_test.cpp
-          DEPENDS espresso::utils)
-unit_test(NAME interpolation SRC interpolation_test.cpp DEPENDS espresso::utils)
-unit_test(NAME bspline_test SRC bspline_test.cpp DEPENDS espresso::utils)
-unit_test(NAME Span_test SRC Span_test.cpp DEPENDS espresso::utils)
-unit_test(NAME matrix_vector_product SRC matrix_vector_product.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME index_test SRC index_test.cpp DEPENDS espresso::utils)
-unit_test(NAME tuple_test SRC tuple_test.cpp DEPENDS espresso::utils)
-unit_test(NAME Array_test SRC Array_test.cpp DEPENDS Boost::serialization
-          espresso::utils)
-unit_test(NAME contains_test SRC contains_test.cpp DEPENDS espresso::utils)
-unit_test(NAME Counter_test SRC Counter_test.cpp DEPENDS espresso::utils
-          Boost::serialization)
-unit_test(NAME RunningAverage_test SRC RunningAverage_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME for_each_pair_test SRC for_each_pair_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME raster_test SRC raster_test.cpp DEPENDS espresso::utils)
-unit_test(NAME make_lin_space_test SRC make_lin_space_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME sampling_test SRC sampling_test.cpp DEPENDS espresso::utils)
-unit_test(NAME coordinate_transformation_test SRC coordinate_transformation.cpp
-          DEPENDS espresso::utils)
-unit_test(NAME cylindrical_transformation_test SRC
-          cylindrical_transformation.cpp DEPENDS espresso::utils)
-unit_test(NAME rotation_matrix_test SRC rotation_matrix_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME quaternion_test SRC quaternion_test.cpp DEPENDS espresso::utils)
-unit_test(NAME mask_test SRC mask_test.cpp DEPENDS espresso::utils)
-unit_test(NAME type_traits_test SRC type_traits_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME uniform_test SRC uniform_test.cpp DEPENDS espresso::utils)
-unit_test(NAME memcpy_archive_test SRC memcpy_archive_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME triangle_functions_test SRC triangle_functions_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME Bag_test SRC Bag_test.cpp DEPENDS espresso::utils
-          Boost::serialization)
-unit_test(NAME integral_parameter_test SRC integral_parameter_test.cpp DEPENDS
-          espresso::utils)
-unit_test(NAME flatten_test SRC flatten_test.cpp DEPENDS espresso::utils)
-unit_test(NAME pack_test SRC pack_test.cpp DEPENDS Boost::serialization
-          espresso::utils)
-unit_test(NAME unordered_map_test SRC unordered_map_test.cpp DEPENDS
-          Boost::serialization espresso::utils)
-unit_test(NAME u32_to_u64_test SRC u32_to_u64_test.cpp DEPENDS espresso::utils
-          NUM_PROC 1)
-unit_test(NAME gather_buffer_test SRC gather_buffer_test.cpp DEPENDS
-          espresso::utils::mpi Boost::mpi MPI::MPI_CXX NUM_PROC 4)
-unit_test(NAME scatter_buffer_test SRC scatter_buffer_test.cpp DEPENDS
-          espresso::utils::mpi Boost::mpi MPI::MPI_CXX NUM_PROC 4)
-unit_test(NAME all_compare_test SRC all_compare_test.cpp DEPENDS
-          espresso::utils::mpi Boost::mpi MPI::MPI_CXX NUM_PROC 3)
-unit_test(NAME gatherv_test SRC gatherv_test.cpp DEPENDS espresso::utils::mpi
-          Boost::mpi MPI::MPI_CXX NUM_PROC 3)
-unit_test(NAME iall_gatherv_test SRC iall_gatherv_test.cpp DEPENDS
-          espresso::utils::mpi Boost::mpi MPI::MPI_CXX NUM_PROC 3)
-unit_test(NAME sendrecv_test SRC sendrecv_test.cpp DEPENDS espresso::utils::mpi
-          Boost::mpi MPI::MPI_CXX espresso::utils NUM_PROC 3)
-unit_test(NAME serialization_test SRC serialization_test.cpp DEPENDS
-          espresso::utils Boost::serialization Boost::mpi MPI::MPI_CXX NUM_PROC
-          1)
-unit_test(NAME matrix_test SRC matrix_test.cpp DEPENDS espresso::utils
-          Boost::serialization NUM_PROC 1)
-unit_test(NAME orthonormal_vec_test SRC orthonormal_vec_test.cpp DEPENDS
-          espresso::utils Boost::serialization NUM_PROC 1)
-unit_test(NAME reduce_optional_test SRC reduce_optional_test.cpp DEPENDS
-          espresso::utils::mpi Boost::mpi MPI::MPI_CXX NUM_PROC 4)
+espresso_unit_test(SRC Vector_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC Factory_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC NumeratedContainer_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC keys_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC Cache_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC histogram_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC accumulator_test.cpp DEPENDS espresso::utils
+                   Boost::serialization)
+espresso_unit_test(SRC int_pow_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC sgn_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC AS_erfc_part_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC permute_ifield_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC vec_rotate_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC tensor_product_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC linear_interpolation_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC interpolation_gradient_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC interpolation_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC bspline_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC matrix_vector_product_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC index_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC tuple_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC Array_test.cpp DEPENDS Boost::serialization
+                   espresso::utils)
+espresso_unit_test(SRC contains_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC Counter_test.cpp DEPENDS espresso::utils
+                   Boost::serialization)
+espresso_unit_test(SRC RunningAverage_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC for_each_pair_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC raster_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC make_lin_space_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC sampling_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC coordinate_transformation_test.cpp DEPENDS
+                   espresso::utils)
+espresso_unit_test(SRC cylindrical_transformation_test.cpp DEPENDS
+                   espresso::utils)
+espresso_unit_test(SRC rotation_matrix_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC quaternion_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC mask_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC type_traits_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC uniform_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC memcpy_archive_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC triangle_functions_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC Bag_test.cpp DEPENDS espresso::utils
+                   Boost::serialization)
+espresso_unit_test(SRC integral_parameter_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC flatten_test.cpp DEPENDS espresso::utils)
+espresso_unit_test(SRC pack_test.cpp DEPENDS Boost::serialization
+                   espresso::utils)
+espresso_unit_test(SRC unordered_map_test.cpp DEPENDS Boost::serialization
+                   espresso::utils)
+espresso_unit_test(SRC u32_to_u64_test.cpp DEPENDS espresso::utils NUM_PROC 1)
+espresso_unit_test(SRC gather_buffer_test.cpp DEPENDS espresso::utils::mpi
+                   Boost::mpi MPI::MPI_CXX NUM_PROC 4)
+espresso_unit_test(SRC scatter_buffer_test.cpp DEPENDS espresso::utils::mpi
+                   Boost::mpi MPI::MPI_CXX NUM_PROC 4)
+espresso_unit_test(SRC all_compare_test.cpp DEPENDS espresso::utils::mpi
+                   Boost::mpi MPI::MPI_CXX NUM_PROC 3)
+espresso_unit_test(SRC gatherv_test.cpp DEPENDS espresso::utils::mpi Boost::mpi
+                   MPI::MPI_CXX NUM_PROC 3)
+espresso_unit_test(SRC iall_gatherv_test.cpp DEPENDS espresso::utils::mpi
+                   Boost::mpi MPI::MPI_CXX NUM_PROC 3)
+espresso_unit_test(SRC sendrecv_test.cpp DEPENDS espresso::utils::mpi
+                   Boost::mpi MPI::MPI_CXX espresso::utils NUM_PROC 3)
+espresso_unit_test(SRC serialization_test.cpp DEPENDS espresso::utils
+                   Boost::serialization Boost::mpi MPI::MPI_CXX NUM_PROC 1)
+espresso_unit_test(SRC matrix_test.cpp DEPENDS espresso::utils
+                   Boost::serialization NUM_PROC 1)
+espresso_unit_test(SRC orthonormal_vec_test.cpp DEPENDS espresso::utils
+                   Boost::serialization NUM_PROC 1)
+espresso_unit_test(SRC reduce_optional_test.cpp DEPENDS espresso::utils::mpi
+                   Boost::mpi MPI::MPI_CXX NUM_PROC 4)
diff --git a/src/utils/tests/Span_test.cpp b/src/utils/tests/Span_test.cpp
deleted file mode 100644
index fc5b01628d3..00000000000
--- a/src/utils/tests/Span_test.cpp
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Copyright (C) 2018-2022 The ESPResSo project
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#define BOOST_TEST_MODULE Utils::Span test
-#define BOOST_TEST_DYN_LINK
-#include <boost/test/unit_test.hpp>
-
-#include "utils/Span.hpp"
-using Utils::Span;
-
-#include <cstddef>
-#include <numeric>
-#include <stdexcept>
-#include <type_traits>
-#include <vector>
-
-BOOST_AUTO_TEST_CASE(const_expr_ctor) {
-  static_assert(4 == Span<int>(nullptr, 4).size());
-  BOOST_TEST_PASSPOINT();
-}
-
-BOOST_AUTO_TEST_CASE(array_ctor) {
-  BOOST_CHECK((std::is_constructible_v<Span<const int>, int[3]>));
-  BOOST_CHECK((std::is_constructible_v<Span<const int>, const int[3]>));
-  BOOST_CHECK(not(std::is_constructible_v<Span<int>, const int[3]>));
-  BOOST_CHECK((std::is_convertible_v<int[3], Span<const int>>));
-  BOOST_CHECK((std::is_convertible_v<const int[3], Span<const int>>));
-
-  int a[4] = {1, 2, 3, 4};
-  Span<int> s(a);
-
-  BOOST_CHECK_EQUAL(s.data(), a);
-  BOOST_CHECK_EQUAL(s.size(), 4);
-}
-
-BOOST_AUTO_TEST_CASE(ctor) {
-  /* Container conversion rules */
-  {
-    BOOST_CHECK((std::is_constructible_v<Span<const int>, std::vector<int>>));
-    BOOST_CHECK(
-        (std::is_constructible_v<Span<const int>, const std::vector<int>>));
-    BOOST_CHECK(
-        not(std::is_constructible_v<Span<int>, const std::vector<int>>));
-    BOOST_CHECK((std::is_convertible_v<std::vector<int>, Span<const int>>));
-    BOOST_CHECK(
-        (std::is_convertible_v<const std::vector<int>, Span<const int>>));
-  }
-
-  /* from ptr + size */
-  {
-    std::vector<int> v(23);
-
-    auto s = Span<int>(v.data(), v.size());
-
-    BOOST_CHECK(v.size() == s.size());
-    BOOST_CHECK(v.data() == s.data());
-  }
-
-  /* From container */
-  {
-    std::vector<int> v{{1, 2, 3}};
-    auto s = Span<int>(v);
-
-    BOOST_CHECK(v.size() == s.size());
-    BOOST_CHECK(v.data() == s.data());
-  }
-}
-
-BOOST_AUTO_TEST_CASE(iterators) {
-  int dummy;
-  auto const p = &dummy;
-  auto const size = 11u;
-
-  auto s = Span<int>(p, size);
-
-  BOOST_CHECK(s.begin() == p);
-  BOOST_CHECK(s.cbegin() == p);
-  BOOST_CHECK(s.end() == (p + size));
-  BOOST_CHECK(s.cend() == (p + size));
-}
-
-BOOST_AUTO_TEST_CASE(element_access) {
-  std::vector<int> v(11);
-  std::iota(v.begin(), v.end(), 5);
-
-  auto s = Span<int>(v.data(), v.size());
-
-  for (Span<int>::size_type i = 0; i < s.size(); ++i) {
-    BOOST_CHECK(v.at(i) == s[i]);
-    BOOST_CHECK(v.at(i) == s.at(i));
-  }
-
-  BOOST_CHECK_THROW(s.at(s.size()), std::out_of_range);
-}
-
-BOOST_AUTO_TEST_CASE(make_span_) {
-  using std::declval;
-  using Utils::make_span;
-
-  static_assert(std::is_same_v<decltype(make_span(declval<int *>(),
-                                                  declval<std::size_t>())),
-                               Span<int>>);
-  static_assert(std::is_same_v<decltype(make_span(declval<const int *>(),
-                                                  declval<std::size_t>())),
-                               Span<const int>>);
-
-  /* From pointer and size */
-  {
-    const int p = 5;
-    auto s = make_span(&p, 1);
-    BOOST_CHECK_EQUAL(s.data(), &p);
-    BOOST_CHECK_EQUAL(s.size(), 1);
-  }
-
-  /* From container */
-  {
-    std::vector<int> vec(5);
-    auto result = make_span(vec);
-    BOOST_CHECK_EQUAL(result.data(), vec.data());
-    BOOST_CHECK_EQUAL(result.size(), vec.size());
-  }
-}
-
-BOOST_AUTO_TEST_CASE(make_const_span_) {
-  using std::declval;
-  using Utils::make_const_span;
-
-  static_assert(std::is_same_v<decltype(make_const_span(
-                                   declval<int *>(), declval<std::size_t>())),
-                               Span<const int>>);
-  static_assert(
-      std::is_same_v<decltype(make_const_span(declval<const int *>(),
-                                              declval<std::size_t>())),
-                     Span<const int>>);
-
-  {
-    const int p = 5;
-    auto s = make_const_span(&p, 1);
-    BOOST_CHECK_EQUAL(s.data(), &p);
-    BOOST_CHECK_EQUAL(s.size(), 1);
-  }
-
-  {
-    std::vector<int> vec(5);
-    auto result = make_const_span(vec);
-    BOOST_CHECK_EQUAL(result.data(), vec.data());
-    BOOST_CHECK_EQUAL(result.size(), vec.size());
-  }
-}
diff --git a/src/utils/tests/Vector_test.cpp b/src/utils/tests/Vector_test.cpp
index a29709ecfc6..64463077fdf 100644
--- a/src/utils/tests/Vector_test.cpp
+++ b/src/utils/tests/Vector_test.cpp
@@ -36,6 +36,7 @@
 #include <iterator>
 #include <limits>
 #include <numeric>
+#include <span>
 #include <stdexcept>
 #include <type_traits>
 #include <vector>
@@ -106,9 +107,8 @@ BOOST_AUTO_TEST_CASE(range_constructor_test) {
 }
 
 BOOST_AUTO_TEST_CASE(unit_vector_test) {
-  BOOST_CHECK((Utils::unit_vector<int>(2) == Utils::Vector3i{0, 0, 1}));
-  BOOST_CHECK_THROW(Utils::unit_vector<double>(3), std::domain_error);
-  BOOST_CHECK_THROW(Utils::unit_vector<float>(-1), std::domain_error);
+  BOOST_CHECK((Utils::unit_vector<int>(2u) == Utils::Vector3i{0, 0, 1}));
+  BOOST_CHECK_THROW(Utils::unit_vector<double>(3u), std::domain_error);
 }
 
 BOOST_AUTO_TEST_CASE(test_norm2) {
@@ -189,6 +189,13 @@ BOOST_AUTO_TEST_CASE(algebraic_operators) {
     BOOST_CHECK(v4 == (v3 /= 2));
   }
 
+  {
+    Utils::Vector3i v3{2, 12, 91};
+    Utils::Vector3i v4{180, 30, 3};
+    auto v5 = 360 / v3;
+    BOOST_CHECK(v5 == v4);
+  }
+
   BOOST_CHECK((sqrt(Utils::Vector3d{1., 2., 3.}) ==
                Utils::Vector3d{sqrt(1.), sqrt(2.), sqrt(3.)}));
 
@@ -310,6 +317,27 @@ BOOST_AUTO_TEST_CASE(conversion) {
     auto const result = Utils::Vector3d{orig.as_vector()};
     BOOST_TEST(result == orig);
   }
+
+  // check explicit conversion to a fixed-extent span
+  {
+    auto const view = static_cast<std::span<double, 3>>(orig);
+    BOOST_TEST(view.data() == orig.data());
+    BOOST_TEST(view.size() == orig.size());
+  }
+
+  // check span construction via class template argument deduction
+  {
+    auto const view = std::span(orig);
+    BOOST_TEST(view.data() == orig.data());
+    BOOST_TEST(view.size() == orig.size());
+  }
+
+  // check conversion via the as_span() member function
+  {
+    auto const view = orig.as_span();
+    BOOST_TEST(view.data() == orig.data());
+    BOOST_TEST(view.size() == orig.size());
+  }
 }
 
 BOOST_AUTO_TEST_CASE(vector_product_test) {
diff --git a/src/utils/tests/abs_test.cpp b/src/utils/tests/abs_test.cpp
deleted file mode 100644
index 94e43728e7b..00000000000
--- a/src/utils/tests/abs_test.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (C) 2019-2022 The ESPResSo project
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-#define BOOST_TEST_MODULE abs test
-#define BOOST_TEST_DYN_LINK
-#include <boost/test/unit_test.hpp>
-
-#include <utils/math/abs.hpp>
-
-#include <cmath>
-#include <type_traits>
-
-BOOST_AUTO_TEST_CASE(abs_test) {
-  using Utils::abs;
-
-  static_assert(std::is_same_v<float, decltype(abs(1.f))>);
-  static_assert(std::is_same_v<double, decltype(abs(1.))>);
-
-  BOOST_CHECK_EQUAL(std::abs(3.1415), abs(3.1415));
-  BOOST_CHECK_EQUAL(std::abs(-3.1415), abs(-3.1415));
-  BOOST_CHECK_EQUAL(std::abs(3.1415f), abs(3.1415f));
-  BOOST_CHECK_EQUAL(std::abs(-3.1415f), abs(-3.1415f));
-}
diff --git a/src/utils/tests/accumulator.cpp b/src/utils/tests/accumulator_test.cpp
similarity index 100%
rename from src/utils/tests/accumulator.cpp
rename to src/utils/tests/accumulator_test.cpp
diff --git a/src/utils/tests/coordinate_transformation.cpp b/src/utils/tests/coordinate_transformation_test.cpp
similarity index 90%
rename from src/utils/tests/coordinate_transformation.cpp
rename to src/utils/tests/coordinate_transformation_test.cpp
index 124c1cbc377..ac0c80449f7 100644
--- a/src/utils/tests/coordinate_transformation.cpp
+++ b/src/utils/tests/coordinate_transformation_test.cpp
@@ -22,11 +22,11 @@
 #include <boost/test/unit_test.hpp>
 
 #include <utils/Vector.hpp>
-#include <utils/constants.hpp>
 #include <utils/math/coordinate_transformation.hpp>
 #include <utils/math/vec_rotate.hpp>
 
 #include <cmath>
+#include <numbers>
 #include <random>
 
 using Utils::Vector3d;
@@ -57,7 +57,7 @@ BOOST_AUTO_TEST_CASE(basis_transform_test) {
   Vector3d const v3 = Utils::vector_product(v1, v2).normalized();
   Vector3d const v4 = basis_change(v1, v2, v3, 0.1 * v1 + 0.2 * v2 - 0.3 * v3);
   Vector3d const v4_expected = Vector3d{{0.1, 0.2, -0.3}};
-  for (int i = 0; i < 3; ++i) {
+  for (auto i = 0u; i < 3u; ++i) {
     BOOST_CHECK_SMALL(v_identity_transform[i] - v[i], eps);
     BOOST_CHECK_SMALL(v_swap_coord_transform[i] - v[i], eps);
     BOOST_CHECK_SMALL(v4[i] - v4_expected[i], eps);
@@ -80,7 +80,7 @@ BOOST_AUTO_TEST_CASE(
   auto const v5 = Utils::transform_coordinate_cartesian_to_cylinder(
       v1 + 2 * axis, axis, v2);
   Vector3d v5_ref{{v1.norm(), -angle_v1_v2, 2}};
-  for (int i = 0; i < 3; ++i) {
+  for (auto i = 0u; i < 3u; ++i) {
     BOOST_CHECK_SMALL(v3[i] - v3_ref[i], eps);
     BOOST_CHECK_SMALL(v4[i] - v4_ref[i], eps);
     BOOST_CHECK_SMALL(v5[i] - v5_ref[i], eps);
@@ -99,10 +99,10 @@ BOOST_AUTO_TEST_CASE(cartesian_to_cylinder_with_axis_and_orientation_test) {
     auto const x_cyl = transform_coordinate_cartesian_to_cylinder(x, z, y);
     auto const y_cyl = transform_coordinate_cartesian_to_cylinder(y, z, y);
     auto const z_cyl = transform_coordinate_cartesian_to_cylinder(z, z, y);
-    auto const x_ref = Vector3d{{1.0, -Utils::pi() / 2.0, 0.0}};
-    auto const y_ref = Vector3d{{1.0, 0.0, 0.0}};
-    auto const z_ref = Vector3d{{0.0, z_cyl[1], 1.0}};
-    for (int i = 0; i < 3; ++i) {
+    auto const x_ref = Vector3d{{1., -std::numbers::pi / 2., 0.}};
+    auto const y_ref = Vector3d{{1., 0., 0.}};
+    auto const z_ref = Vector3d{{0., z_cyl[1], 1.}};
+    for (auto i = 0u; i < 3u; ++i) {
       BOOST_CHECK_SMALL(x_cyl[i] - x_ref[i], eps);
       BOOST_CHECK_SMALL(y_cyl[i] - y_ref[i], eps);
       BOOST_CHECK_SMALL(z_cyl[i] - z_ref[i], eps);
@@ -110,13 +110,13 @@ BOOST_AUTO_TEST_CASE(cartesian_to_cylinder_with_axis_and_orientation_test) {
   }
   // check transformation with orientation for another angle
   {
-    auto const u = vec_rotate(z, Utils::pi() / 3.0, x);
-    auto const v = vec_rotate(z, Utils::pi() / 3.0, y);
+    auto const u = vec_rotate(z, std::numbers::pi / 3., x);
+    auto const v = vec_rotate(z, std::numbers::pi / 3., y);
     auto const u_cyl = transform_coordinate_cartesian_to_cylinder(u, z, y);
     auto const v_cyl = transform_coordinate_cartesian_to_cylinder(v, z, y);
-    auto const u_ref = Vector3d{{1.0, Utils::pi() * (1. / 3. - 1. / 2.), 0.0}};
-    auto const v_ref = Vector3d{{1.0, Utils::pi() / 3.0, 0.0}};
-    for (int i = 0; i < 3; ++i) {
+    auto const u_ref = Vector3d{{1., std::numbers::pi * (1. / 3. - 0.5), 0.}};
+    auto const v_ref = Vector3d{{1., std::numbers::pi / 3., 0.}};
+    for (auto i = 0u; i < 3u; ++i) {
       BOOST_CHECK_SMALL(u_cyl[i] - u_ref[i], eps);
       BOOST_CHECK_SMALL(v_cyl[i] - v_ref[i], eps);
     }
@@ -137,7 +137,7 @@ BOOST_AUTO_TEST_CASE(cartesian_to_cylinder_with_axis_and_orientation_test) {
       Vector3d const v1_v1_ref{v1.norm(), 0.0, 0.0};
       Vector3d const v2_v1_ref{v2.norm(), Utils::angle_between(v1, v2), 0.0};
       Vector3d const v1_v2_ref{v1.norm(), -Utils::angle_between(v1, v2), 0.0};
-      for (int i = 0; i < 3; ++i) {
+      for (auto i = 0u; i < 3u; ++i) {
         BOOST_CHECK_SMALL(v1_v1[i] - v1_v1_ref[i], eps);
         BOOST_CHECK_SMALL(v2_v1[i] - v2_v1_ref[i], eps);
         BOOST_CHECK_SMALL(v1_v2[i] - v1_v2_ref[i], eps);
@@ -148,7 +148,7 @@ BOOST_AUTO_TEST_CASE(cartesian_to_cylinder_with_axis_and_orientation_test) {
 
 BOOST_AUTO_TEST_CASE(cylinder_to_cartesian_test) {
   constexpr auto eps = 1e-14;
-  auto const cyl = Vector3d{{1.0, Utils::pi() / 4, 2.0}};
+  auto const cyl = Vector3d{{1., std::numbers::pi / 4., 2.}};
   auto const pos = transform_coordinate_cylinder_to_cartesian(cyl);
   BOOST_CHECK_SMALL(pos[0] - std::sqrt(2) / 2, eps);
   BOOST_CHECK_SMALL(pos[1] - std::sqrt(2) / 2, eps);
@@ -171,16 +171,16 @@ BOOST_AUTO_TEST_CASE(cylinder_to_cartesian_with_axis_and_orientation_test) {
   // We transform from cylinder zu cartesian and have to rotate back. See test
   // cartesian_to_cylinder_test.
   auto const expected_x = vec_rotate(
-      e_y, Utils::pi() / 2.0,
+      e_y, std::numbers::pi / 2.0,
       transform_coordinate_cylinder_to_cartesian(cylinder_coord, e_z, e_x));
   auto const expected_y = vec_rotate(
-      e_x, -Utils::pi() / 2.0,
+      e_x, -std::numbers::pi / 2.0,
       transform_coordinate_cylinder_to_cartesian(cylinder_coord, e_z, e_x));
   // x = r * cos(phi); y = r * sin(phi); z = z
   auto const expected_z = Vector3d{
       {cylinder_coord[0] * std::cos(cylinder_coord[1]),
        cylinder_coord[0] * std::sin(cylinder_coord[1]), cylinder_coord[2]}};
-  for (int i = 0; i < 3; ++i) {
+  for (auto i = 0u; i < 3u; ++i) {
     BOOST_CHECK_SMALL(transformed_x[i] - expected_x[i], eps);
     BOOST_CHECK_SMALL(transformed_y[i] - expected_y[i], eps);
     BOOST_CHECK_SMALL(transformed_z[i] - expected_z[i], eps);
@@ -202,7 +202,7 @@ BOOST_AUTO_TEST_CASE(cylinder_to_cartesian_with_axis_with_phi_2_test) {
     auto const x_cart = transform_coordinate_cylinder_to_cartesian(x_cyl, z, y);
     auto const y_cart = transform_coordinate_cylinder_to_cartesian(y_cyl, z, y);
     auto const z_cart = transform_coordinate_cylinder_to_cartesian(z_cyl, z, y);
-    for (int i = 0; i < 3; ++i) {
+    for (auto i = 0u; i < 3u; ++i) {
       BOOST_CHECK_SMALL(x_cart[i] - x[i], eps);
       BOOST_CHECK_SMALL(y_cart[i] - y[i], eps);
       BOOST_CHECK_SMALL(z_cart[i] - z[i], eps);
@@ -210,13 +210,13 @@ BOOST_AUTO_TEST_CASE(cylinder_to_cartesian_with_axis_with_phi_2_test) {
   }
   // check transformation with orientation for another angle
   {
-    auto const u = vec_rotate(z, Utils::pi() / 3.0, x);
-    auto const v = vec_rotate(z, Utils::pi() / 3.0, y);
+    auto const u = vec_rotate(z, std::numbers::pi / 3., x);
+    auto const v = vec_rotate(z, std::numbers::pi / 3., y);
     auto const u_cyl = transform_coordinate_cartesian_to_cylinder(u, z, y);
     auto const v_cyl = transform_coordinate_cartesian_to_cylinder(v, z, y);
     auto const u_cart = transform_coordinate_cylinder_to_cartesian(u_cyl, z, y);
     auto const v_cart = transform_coordinate_cylinder_to_cartesian(v_cyl, z, y);
-    for (int i = 0; i < 3; ++i) {
+    for (auto i = 0u; i < 3u; ++i) {
       BOOST_CHECK_SMALL(u_cart[i] - u[i], eps);
       BOOST_CHECK_SMALL(v_cart[i] - v[i], eps);
     }
@@ -233,7 +233,7 @@ BOOST_AUTO_TEST_CASE(cylinder_to_cartesian_with_axis_with_phi_2_test) {
       auto const a = Utils::vector_product(v1, v2) / v1.norm() / v2.norm();
       auto const v3 = transform_coordinate_cartesian_to_cylinder(v2, a, v1);
       auto const v4 = transform_coordinate_cylinder_to_cartesian(v3, a, v1);
-      for (int i = 0; i < 3; ++i) {
+      for (auto i = 0u; i < 3u; ++i) {
         BOOST_CHECK_SMALL(v4[i] - v2[i], eps);
       }
     }
diff --git a/src/utils/tests/cylindrical_transformation.cpp b/src/utils/tests/cylindrical_transformation_test.cpp
similarity index 100%
rename from src/utils/tests/cylindrical_transformation.cpp
rename to src/utils/tests/cylindrical_transformation_test.cpp
diff --git a/src/utils/tests/gather_buffer_test.cpp b/src/utils/tests/gather_buffer_test.cpp
index 7ee296b8cd5..db4532501bd 100644
--- a/src/utils/tests/gather_buffer_test.cpp
+++ b/src/utils/tests/gather_buffer_test.cpp
@@ -42,9 +42,9 @@ void check_vector(const boost::mpi::communicator &comm, int root) {
 
   if (comm.rank() == root) {
     auto const n = comm.size();
-    const int total_size = n * (n + 1) / 2;
+    auto const total_size = n * (n + 1) / 2;
 
-    BOOST_CHECK(buf.size() == total_size);
+    BOOST_CHECK_EQUAL(static_cast<int>(buf.size()), total_size);
 
     /* Check order in result */
     BOOST_CHECK(std::is_sorted(buf.begin(), buf.end()));
@@ -57,9 +57,9 @@ void check_vector(const boost::mpi::communicator &comm, int root) {
     }
   } else {
     /* Check that buffer is unchanged */
-    BOOST_CHECK(buf.size() == comm.rank() + 1);
+    BOOST_CHECK_EQUAL(static_cast<int>(buf.size()), comm.rank() + 1);
     for (auto const &i : buf) {
-      BOOST_CHECK(i == comm.rank() + 1);
+      BOOST_CHECK_EQUAL(i, comm.rank() + 1);
     }
   }
 }
@@ -71,14 +71,14 @@ void check_vector_out_of_bounds(const boost::mpi::communicator &comm) {
   if (comm.rank() == 1) {
     std::vector<int> buf = {2, 2};
     gather_buffer(buf, comm, root);
-    BOOST_CHECK(buf.size() == 3);
+    BOOST_CHECK(buf.size() == 3u);
     BOOST_CHECK(buf[0] == 1);
     BOOST_CHECK(buf[1] == 2);
     BOOST_CHECK(buf[2] == 2);
   } else if (comm.rank() == 0) {
     std::vector<int> buf = {1};
     gather_buffer(buf, comm, root);
-    BOOST_CHECK(buf.size() == 1);
+    BOOST_CHECK(buf.size() == 1u);
     BOOST_CHECK(buf[0] == 1);
   } else {
     std::vector<int> buf = {};
@@ -92,7 +92,7 @@ void check_vector_empty(const boost::mpi::communicator &comm, int empty) {
   gather_buffer(buf, comm);
 
   if (comm.rank() == 0) {
-    BOOST_CHECK(buf.size() == (comm.size() - 1) * 11);
+    BOOST_CHECK_EQUAL(static_cast<int>(buf.size()), (comm.size() - 1) * 11);
 
     for (int i = 0; i < comm.size(); i++) {
       auto const [lower, upper] = std::equal_range(buf.begin(), buf.end(), i);
@@ -156,7 +156,7 @@ BOOST_AUTO_TEST_CASE(non_trivial_type) {
 
   if (world.rank() == 0) {
     auto const n = world.size();
-    BOOST_CHECK(buf.size() == (n * (n + 1) / 2));
+    BOOST_CHECK_EQUAL(static_cast<int>(buf.size()), (n * (n + 1) / 2));
 
     for (auto const &e : buf) {
       BOOST_CHECK(e == s);
diff --git a/src/utils/tests/histogram.cpp b/src/utils/tests/histogram_test.cpp
similarity index 95%
rename from src/utils/tests/histogram.cpp
rename to src/utils/tests/histogram_test.cpp
index 11f4f5432c6..9a852702a7e 100644
--- a/src/utils/tests/histogram.cpp
+++ b/src/utils/tests/histogram_test.cpp
@@ -21,11 +21,11 @@
 #include <boost/test/unit_test.hpp>
 
 #include "utils/Histogram.hpp"
-#include "utils/constants.hpp"
 
 #include <algorithm>
 #include <array>
 #include <cstddef>
+#include <numbers>
 #include <stdexcept>
 #include <utility>
 #include <vector>
@@ -66,10 +66,10 @@ BOOST_AUTO_TEST_CASE(histogram) {
 }
 
 BOOST_AUTO_TEST_CASE(cylindrical_histogram) {
-  constexpr auto pi = Utils::pi<double>();
+  constexpr auto pi = std::numbers::pi;
   std::array<std::size_t, 3> n_bins{{10, 10, 10}};
   std::array<std::pair<double, double>, 3> limits{{std::make_pair(0.0, 2.0),
-                                                   std::make_pair(0.0, 2 * pi),
+                                                   std::make_pair(0.0, 2. * pi),
                                                    std::make_pair(0.0, 10.0)}};
   constexpr std::size_t n_dims_data = 3;
   auto hist = Utils::CylindricalHistogram<double, n_dims_data>(n_bins, limits);
@@ -77,12 +77,12 @@ BOOST_AUTO_TEST_CASE(cylindrical_histogram) {
   BOOST_CHECK(hist.get_limits() == limits);
   BOOST_CHECK(hist.get_n_bins() == n_bins);
   BOOST_CHECK((hist.get_bin_sizes() ==
-               std::array<double, 3>{{2.0 / 10.0, 2 * pi / 10.0, 1.0}}));
+               std::array<double, 3>{{2.0 / 10.0, 2. * pi / 10.0, 1.0}}));
   // Check that histogram is initialized to zero.
   BOOST_CHECK(hist.get_histogram() ==
               std::vector<double>(n_dims_data * 1000, 0.0));
   // Check that histogram still empty if data is out of bounds.
-  hist.update(std::vector<double>{{1.0, 3 * pi, 1.0}});
+  hist.update(std::vector<double>{{1.0, 3. * pi, 1.0}});
   BOOST_CHECK(hist.get_histogram() ==
               std::vector<double>(n_dims_data * 1000, 0.0));
   // Check if putting in data at the first bin is set correctly.
diff --git a/src/utils/tests/iall_gatherv_test.cpp b/src/utils/tests/iall_gatherv_test.cpp
index 708294802a2..f5e24891514 100644
--- a/src/utils/tests/iall_gatherv_test.cpp
+++ b/src/utils/tests/iall_gatherv_test.cpp
@@ -27,6 +27,7 @@
 #include <boost/mpi.hpp>
 
 #include <algorithm>
+#include <cstddef>
 #include <string>
 #include <vector>
 
@@ -96,7 +97,7 @@ BOOST_AUTO_TEST_CASE(multiple_elements) {
     auto reqs = Utils::Mpi::iall_gatherv(world, in.data(), rank + 1, out.data(),
                                          sizes.data());
     boost::mpi::wait_all(reqs.begin(), reqs.end());
-    for (int i = 0; i < expected_values.size(); i++) {
+    for (std::size_t i = 0u; i < expected_values.size(); i++) {
       BOOST_CHECK_EQUAL(out.at(i), expected_values.at(i));
     }
   }
@@ -112,7 +113,7 @@ BOOST_AUTO_TEST_CASE(multiple_elements) {
     auto reqs = Utils::Mpi::iall_gatherv(world, out.data(), rank + 1,
                                          out.data(), sizes.data());
     boost::mpi::wait_all(reqs.begin(), reqs.end());
-    for (int i = 0; i < expected_values.size(); i++) {
+    for (std::size_t i = 0u; i < expected_values.size(); i++) {
       BOOST_CHECK_EQUAL(out.at(i), expected_values.at(i));
     }
   }
diff --git a/src/utils/tests/interpolation_test.cpp b/src/utils/tests/interpolation_test.cpp
index b452ab32e9d..931f36fa507 100644
--- a/src/utils/tests/interpolation_test.cpp
+++ b/src/utils/tests/interpolation_test.cpp
@@ -152,7 +152,7 @@ BOOST_AUTO_TEST_CASE(nearest_point) {
 BOOST_AUTO_TEST_CASE(interpolation_points_3) {
   std::vector<std::array<int, 3>> int_points;
 
-  auto save_ind = [&int_points](const std::array<int, 3> &ind, double w) {
+  auto save_ind = [&int_points](const std::array<int, 3> &ind, double) {
     int_points.push_back(ind);
   };
 
@@ -178,7 +178,7 @@ BOOST_AUTO_TEST_CASE(interpolation_points_3) {
 BOOST_AUTO_TEST_CASE(interpolation_points_2) {
   std::vector<std::array<int, 3>> int_points;
 
-  auto save_ind = [&int_points](const std::array<int, 3> &ind, double w) {
+  auto save_ind = [&int_points](const std::array<int, 3> &ind, double) {
     int_points.push_back(ind);
   };
 
diff --git a/src/utils/tests/make_lin_space_test.cpp b/src/utils/tests/make_lin_space_test.cpp
index 3a01f0d773c..c6bbe1a1d09 100644
--- a/src/utils/tests/make_lin_space_test.cpp
+++ b/src/utils/tests/make_lin_space_test.cpp
@@ -25,30 +25,32 @@
 
 #include <algorithm>
 #include <cmath>
+#include <cstddef>
 #include <limits>
 #include <vector>
 
 BOOST_AUTO_TEST_CASE(make_lin_space_test) {
   using Utils::make_lin_space;
+  constexpr auto tol = 100. * std::numeric_limits<double>::epsilon();
 
   /* With endpoint */
   {
     auto const start = 1.;
     auto const stop = 2.;
-    auto const num = 13;
+    auto const num = 13u;
 
     auto const lin_space =
         make_lin_space(start, stop, num, /* endpoint */ true);
     BOOST_CHECK_EQUAL(lin_space.size(), num);
 
-    std::vector<double> values(lin_space.begin(), lin_space.end());
+    std::vector<double> values;
+    std::ranges::copy(lin_space, std::back_inserter(values));
     BOOST_CHECK_EQUAL(values.front(), start);
-    BOOST_CHECK_EQUAL(values.back(), stop);
+    BOOST_CHECK_CLOSE(values.back(), stop, tol);
 
-    auto const dx = (stop - start) / (num - 1);
-    for (int i = 0; i < values.size(); i++) {
-      BOOST_CHECK(std::fabs(start + i * dx - values.at(i)) <=
-                  std::numeric_limits<double>::epsilon());
+    auto const dx = (stop - start) / static_cast<double>(num - 1u);
+    for (std::size_t i = 0u; i < values.size(); i++) {
+      BOOST_CHECK_CLOSE(values.at(i), start + static_cast<double>(i) * dx, tol);
     }
   }
 
@@ -56,20 +58,20 @@ BOOST_AUTO_TEST_CASE(make_lin_space_test) {
   {
     auto const start = 1.;
     auto const stop = 2.;
-    auto const num = 13;
+    auto const num = 13u;
 
     auto const lin_space =
         make_lin_space(start, stop, num, /* endpoint */ false);
     BOOST_CHECK_EQUAL(lin_space.size(), num);
 
-    std::vector<double> values(lin_space.begin(), lin_space.end());
+    std::vector<double> values;
+    std::ranges::copy(lin_space, std::back_inserter(values));
     BOOST_CHECK_EQUAL(values.front(), start);
     BOOST_CHECK_LT(values.back(), stop);
 
-    auto const dx = (stop - start) / num;
-    for (int i = 0; i < values.size(); i++) {
-      BOOST_CHECK(std::fabs(start + i * dx - values.at(i)) <=
-                  std::numeric_limits<double>::epsilon());
+    auto const dx = (stop - start) / static_cast<double>(num);
+    for (std::size_t i = 0u; i < values.size(); i++) {
+      BOOST_CHECK_CLOSE(values.at(i), start + static_cast<double>(i) * dx, tol);
     }
   }
 }
diff --git a/src/utils/tests/matrix_vector_product.cpp b/src/utils/tests/matrix_vector_product_test.cpp
similarity index 97%
rename from src/utils/tests/matrix_vector_product.cpp
rename to src/utils/tests/matrix_vector_product_test.cpp
index f98d0f5bd4f..4edfefd6f0c 100644
--- a/src/utils/tests/matrix_vector_product.cpp
+++ b/src/utils/tests/matrix_vector_product_test.cpp
@@ -35,7 +35,7 @@ BOOST_AUTO_TEST_CASE(inner_product) {
   auto constexpr tol = 8. * 100. * std::numeric_limits<double>::epsilon();
   const std::array<double, 3> vector{{0.5, 1.25, 3.1}};
   auto const result = Utils::matrix_vector_product<double, 3, matrix>(vector);
-  for (int i = 0; i < 3; ++i) {
+  for (auto i = 0u; i < 3u; ++i) {
     auto const ref = boost::inner_product(matrix[i], vector, 0.0);
     BOOST_CHECK_CLOSE(result[i], ref, tol);
   }
diff --git a/src/utils/tests/memcpy_archive_test.cpp b/src/utils/tests/memcpy_archive_test.cpp
index 25762d3110b..ef6b833332a 100644
--- a/src/utils/tests/memcpy_archive_test.cpp
+++ b/src/utils/tests/memcpy_archive_test.cpp
@@ -21,40 +21,29 @@
 #define BOOST_TEST_DYN_LINK
 #include <boost/test/unit_test.hpp>
 
-/* This <boost/serialization/version.hpp> include guards against an issue
- * in boost::serialization from boost 1.74.0 that leads to compiler error
- * "explicit specialization of undeclared template struct 'version'" when
- * including <boost/serialization/optional.hpp>. More details in tickets:
- * https://github.com/boostorg/serialization/issues/210
- * https://github.com/boostorg/serialization/issues/217
- */
-#include <boost/serialization/version.hpp>
-
-#include <utils/Span.hpp>
 #include <utils/Vector.hpp>
 #include <utils/serialization/memcpy_archive.hpp>
+#include <utils/serialization/optional.hpp>
 #include <utils/type_traits.hpp>
 
-#include <boost/optional.hpp>
-#include <boost/serialization/optional.hpp>
-
 #include <array>
 #include <cstddef>
+#include <optional>
 #include <type_traits>
 
 struct NonTrivial {
-  boost::optional<Utils::Vector3d> ov;
+  std::optional<Utils::Vector3d> ov;
 
-  template <class Archive> void serialize(Archive &ar, long int) { ar &ov; }
+  template <class Archive> void serialize(Archive &ar, long int) { ar & ov; }
 };
 
-using OpVec = boost::optional<Utils::Vector3d>;
+using OpVec = std::optional<Utils::Vector3d>;
 
 namespace Utils {
 template <> struct is_statically_serializable<NonTrivial> : std::true_type {};
 
 template <class T>
-struct is_statically_serializable<boost::optional<T>>
+struct is_statically_serializable<std::optional<T>>
     : is_statically_serializable<T> {};
 } // namespace Utils
 
@@ -70,8 +59,12 @@ BOOST_AUTO_TEST_CASE(type_traits) {
   static_assert(not Utils::detail::use_serialize<int>::value);
 
   static_assert(Utils::is_statically_serializable<OpVec>::value);
-  static_assert(not Utils::detail::use_memcpy<OpVec>::value);
-  static_assert(Utils::detail::use_serialize<OpVec>::value);
+  static_assert(Utils::detail::use_memcpy<OpVec>::value);
+  static_assert(not Utils::detail::use_serialize<OpVec>::value);
+
+  static_assert(Utils::is_statically_serializable<NonTrivial>::value);
+  static_assert(Utils::detail::use_memcpy<NonTrivial>::value);
+  static_assert(not Utils::detail::use_serialize<NonTrivial>::value);
 
   BOOST_TEST_PASSPOINT();
 }
@@ -79,7 +72,7 @@ BOOST_AUTO_TEST_CASE(type_traits) {
 BOOST_AUTO_TEST_CASE(skipping_and_position) {
   std::array<char, 10> buf;
 
-  auto ar = Utils::MemcpyOArchive(Utils::make_span(buf));
+  auto ar = Utils::MemcpyOArchive(buf);
 
   BOOST_CHECK_EQUAL(0, ar.bytes_processed());
   ar.skip(5);
@@ -92,14 +85,14 @@ BOOST_AUTO_TEST_CASE(memcpy_processing) {
   auto const test_number = 5;
 
   {
-    auto oa = Utils::MemcpyOArchive(Utils::make_span(buf));
+    auto oa = Utils::MemcpyOArchive(buf);
     oa << test_number;
     BOOST_CHECK_EQUAL(oa.bytes_written(), sizeof(test_number));
     BOOST_CHECK_EQUAL(oa.get_library_version(), 4);
   }
 
   {
-    auto ia = Utils::MemcpyIArchive(Utils::make_span(buf));
+    auto ia = Utils::MemcpyIArchive(buf);
     int out;
     ia >> out;
     BOOST_CHECK_EQUAL(out, test_number);
@@ -112,9 +105,9 @@ BOOST_AUTO_TEST_CASE(serializaton_processing) {
   std::array<char, 2 * sizeof(OpVec)> buf;
 
   const OpVec active = Utils::Vector3d{1., 2., 3.};
-  const OpVec inactive = boost::none;
+  const OpVec inactive = std::nullopt;
   {
-    auto oa = Utils::MemcpyOArchive{Utils::make_span(buf)};
+    auto oa = Utils::MemcpyOArchive{buf};
     auto in1 = active;
     auto in2 = inactive;
     oa << in1;
@@ -124,8 +117,9 @@ BOOST_AUTO_TEST_CASE(serializaton_processing) {
   }
 
   {
-    auto ia = Utils::MemcpyIArchive{Utils::make_span(buf)};
-    OpVec out1 = Utils::Vector3d{}, out2;
+    auto ia = Utils::MemcpyIArchive{buf};
+    OpVec out1 = Utils::Vector3d{};
+    OpVec out2;
     ia >> out1;
     ia >> out2;
 
diff --git a/src/utils/tests/sampling_test.cpp b/src/utils/tests/sampling_test.cpp
index 5e90eecfcdf..130f2de88aa 100644
--- a/src/utils/tests/sampling_test.cpp
+++ b/src/utils/tests/sampling_test.cpp
@@ -22,18 +22,18 @@
 #include <boost/test/unit_test.hpp>
 
 #include <utils/Histogram.hpp>
-#include <utils/constants.hpp>
 #include <utils/sampling.hpp>
 
 #include <array>
 #include <cstddef>
+#include <numbers>
 #include <utility>
 
 BOOST_AUTO_TEST_CASE(get_cylindrical_sampling_positions_test) {
   auto const min_r = 0.0;
   auto const max_r = 5.0;
-  auto const min_phi = -Utils::pi();
-  auto const max_phi = Utils::pi();
+  auto const min_phi = -std::numbers::pi;
+  auto const max_phi = std::numbers::pi;
   auto const min_z = 0.0;
   auto const max_z = 10.0;
   auto const n_r_bins = std::size_t{10};
diff --git a/src/utils/tests/scatter_buffer_test.cpp b/src/utils/tests/scatter_buffer_test.cpp
index 07767635ac1..db3f6084b7b 100644
--- a/src/utils/tests/scatter_buffer_test.cpp
+++ b/src/utils/tests/scatter_buffer_test.cpp
@@ -43,7 +43,7 @@ void check_pointer(boost::mpi::communicator comm, int root) {
       }
     }
 
-    BOOST_CHECK(buf.size() == total_size);
+    BOOST_CHECK_EQUAL(static_cast<int>(buf.size()), total_size);
 
     Utils::Mpi::scatter_buffer(buf.data(), comm.rank(), comm, root);
   } else {
diff --git a/src/utils/tests/serialization_test.cpp b/src/utils/tests/serialization_test.cpp
index 7bdb1cdd1d8..fb9ac2b5305 100644
--- a/src/utils/tests/serialization_test.cpp
+++ b/src/utils/tests/serialization_test.cpp
@@ -22,15 +22,6 @@
 #define BOOST_TEST_DYN_LINK
 #include <boost/test/unit_test.hpp>
 
-/* This <boost/serialization/version.hpp> include guards against an issue
- * in boost::serialization from boost 1.74.0 that leads to compiler error
- * "explicit specialization of undeclared template struct 'version'" when
- * including <boost/serialization/optional.hpp>. More details in tickets:
- * https://github.com/boostorg/serialization/issues/210
- * https://github.com/boostorg/serialization/issues/217
- */
-#include <boost/serialization/version.hpp>
-
 #include <utils/Array.hpp>
 #include <utils/Vector.hpp>
 #include <utils/compact_vector.hpp>
@@ -281,8 +272,8 @@ BOOST_AUTO_TEST_CASE(mpi_archive_test) {
   BOOST_TEST(buffer_vector == buffer_ref, boost::test_tools::per_element());
   BOOST_TEST(buffer_storage == buffer_ref, boost::test_tools::per_element());
   BOOST_TEST(buffer_quat == buffer_ref, boost::test_tools::per_element());
-  auto const index_lsb = (is_big_endian()) ? 1 : 0;
-  auto const index_hsb = (is_big_endian()) ? 0 : 1;
+  auto const index_lsb = (is_big_endian()) ? 1u : 0u;
+  auto const index_hsb = (is_big_endian()) ? 0u : 1u;
   BOOST_TEST(buffer_cv[index_lsb] == Testing::N);
   BOOST_TEST(buffer_cv[index_hsb] == 0);
   buffer_cv.erase(buffer_cv.begin());
diff --git a/src/utils/tests/sinc_test.cpp b/src/utils/tests/sinc_test.cpp
deleted file mode 100644
index 1235365669d..00000000000
--- a/src/utils/tests/sinc_test.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (C) 2017-2022 The ESPResSo project
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#define BOOST_TEST_MODULE Utils::sinc test
-#define BOOST_TEST_DYN_LINK
-#include <boost/test/unit_test.hpp>
-
-#include "utils/math/sinc.hpp"
-
-#include <cmath>
-#include <cstdlib>
-
-BOOST_AUTO_TEST_CASE(zero) { BOOST_CHECK_EQUAL(Utils::sinc(0.0), 1.0); }
-
-BOOST_AUTO_TEST_CASE(approx) {
-  auto x = 0.001;
-  while (x <= 0.11) {
-    auto const approx = Utils::sinc(x);
-    auto const pi_x = boost::math::constants::pi<double>() * x;
-    auto const exact = std::sin(pi_x) / (pi_x);
-    BOOST_CHECK_SMALL(approx - exact, 1e-13);
-    x += 0.01;
-  }
-}
diff --git a/src/utils/tests/triangle_functions_test.cpp b/src/utils/tests/triangle_functions_test.cpp
index 3f6a6f81e01..5afacd819fe 100644
--- a/src/utils/tests/triangle_functions_test.cpp
+++ b/src/utils/tests/triangle_functions_test.cpp
@@ -26,10 +26,10 @@
 #include "utils/math/triangle_functions.hpp"
 
 #include <utils/Vector.hpp>
-#include <utils/constants.hpp>
 
 #include <cstdlib>
 #include <limits>
+#include <numbers>
 
 auto const epsilon = std::numeric_limits<double>::epsilon();
 
@@ -87,11 +87,11 @@ BOOST_AUTO_TEST_CASE(angle_triangles) {
   angle_btw_triangles(B,C,A,D) so that N1 = CB x CA and N2 = CA x CD.
   */
 
-  constexpr double half_pi = Utils::pi() / 2.0;
+  constexpr double half_pi = std::numbers::pi / 2.;
   const Utils::Vector3d a{1, 1, 1}, b{2, 1, 1}, c{1, 2, 1}, d{1, 1, 2};
   using Utils::angle_btw_triangles;
   BOOST_CHECK_SMALL(std::abs(angle_btw_triangles(b, a, c, d) - half_pi),
                     epsilon);
-  BOOST_CHECK_SMALL(std::abs(angle_btw_triangles(b, c, a, d) - 3 * half_pi),
+  BOOST_CHECK_SMALL(std::abs(angle_btw_triangles(b, c, a, d) - 3. * half_pi),
                     epsilon);
 }
diff --git a/src/utils/tests/tuple_test.cpp b/src/utils/tests/tuple_test.cpp
index eaba5e6ca7a..efd3fe644e1 100644
--- a/src/utils/tests/tuple_test.cpp
+++ b/src/utils/tests/tuple_test.cpp
@@ -17,9 +17,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-/* Unit test for Utils tuple algorithms. */
-
-#define BOOST_TEST_MODULE Utils::tuple_test
+#define BOOST_TEST_MODULE Utils::tuple test
 #define BOOST_TEST_DYN_LINK
 #include <boost/test/unit_test.hpp>
 
diff --git a/src/utils/tests/vec_rotate_test.cpp b/src/utils/tests/vec_rotate_test.cpp
index 88b2af17983..2911101f40f 100644
--- a/src/utils/tests/vec_rotate_test.cpp
+++ b/src/utils/tests/vec_rotate_test.cpp
@@ -20,11 +20,11 @@
 #include <boost/test/unit_test.hpp>
 
 #include <utils/Vector.hpp>
-#include <utils/constants.hpp>
 #include <utils/math/vec_rotate.hpp>
 
 #include <cmath>
 #include <limits>
+#include <numbers>
 
 BOOST_AUTO_TEST_CASE(rotation) {
   using std::cos;
@@ -52,5 +52,5 @@ BOOST_AUTO_TEST_CASE(angle_between) {
   Utils::Vector3d const v2 = {1.0, 1.0, 0.0};
 
   auto const angle = Utils::angle_between(v1, v2);
-  BOOST_CHECK_CLOSE(angle, Utils::pi() / 4.0, 1e-7);
+  BOOST_CHECK_CLOSE(angle, std::numbers::pi / 4., 1e-7);
 }
diff --git a/src/walberla_bridge/CMakeLists.txt b/src/walberla_bridge/CMakeLists.txt
index b5595397f35..6b2da504a0a 100644
--- a/src/walberla_bridge/CMakeLists.txt
+++ b/src/walberla_bridge/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2020-2023 The ESPResSo project
+# Copyright (C) 2020-2024 The ESPResSo project
 #
 # This file is part of ESPResSo.
 #
@@ -17,46 +17,47 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
+add_library(espresso_walberla_cpp_flags INTERFACE)
+add_library(espresso::walberla::cpp_flags ALIAS espresso_walberla_cpp_flags)
+target_link_libraries(
+  espresso_walberla_cpp_flags
+  INTERFACE espresso::cpp_flags
+            $<$<BOOL:${ESPRESSO_BUILD_WITH_WALBERLA_AVX}>:espresso::avx_flags>)
+add_library(espresso_walberla_cuda_flags INTERFACE)
+add_library(espresso::walberla::cuda_flags ALIAS espresso_walberla_cuda_flags)
+target_link_libraries(
+  espresso_walberla_cuda_flags
+  INTERFACE espresso::cuda_flags
+            $<$<BOOL:${ESPRESSO_BUILD_WITH_WALBERLA_AVX}>:espresso::avx_flags>)
+
+# Apply the settings shared by all waLBerla bridge libraries to one target.
+function(espresso_configure_walberla_target TARGET_NAME)
+  set_target_properties(${TARGET_NAME} PROPERTIES CXX_CLANG_TIDY "")
+  target_link_libraries(${TARGET_NAME} PRIVATE ${WALBERLA_LIBS})
+  target_include_directories(
+    ${TARGET_NAME} PUBLIC include PRIVATE ${WALBERLA_INCLUDE_DIRS}
+                                          ${walberla_BINARY_DIR}/src)
+  install(TARGETS ${TARGET_NAME}
+          LIBRARY DESTINATION ${ESPRESSO_INSTALL_PYTHON}/espressomd)
+endfunction()
+
 add_library(espresso_walberla SHARED)
 add_library(espresso::walberla ALIAS espresso_walberla)
 
-set_target_properties(espresso_walberla PROPERTIES CXX_CLANG_TIDY "")
-target_include_directories(espresso_walberla PUBLIC include)
+espresso_configure_walberla_target(espresso_walberla)
 
-add_library(espresso_walberla_cpp_flags INTERFACE)
-set_target_properties(espresso_walberla_cpp_flags PROPERTIES CXX_CLANG_TIDY "")
-add_library(espresso::walberla::cpp_flags ALIAS espresso_walberla_cpp_flags)
-if(ESPRESSO_BUILD_WITH_WALBERLA_AVX)
-  target_link_libraries(espresso_walberla_cpp_flags
-                        INTERFACE espresso::avx_flags)
-endif()
-install(TARGETS espresso_walberla
-        LIBRARY DESTINATION ${ESPRESSO_INSTALL_PYTHON}/espressomd)
+target_link_libraries(espresso_walberla PUBLIC MPI::MPI_CXX espresso::utils
+                      PRIVATE espresso::walberla::cpp_flags)
 
 if(ESPRESSO_BUILD_WITH_CUDA AND WALBERLA_BUILD_WITH_CUDA)
   espresso_add_gpu_library(espresso_walberla_cuda SHARED)
   add_library(espresso::walberla_cuda ALIAS espresso_walberla_cuda)
-  target_link_libraries(espresso_walberla_cuda PRIVATE CUDA::cuda_driver
-                                                       CUDA::cudart)
+  espresso_configure_walberla_target(espresso_walberla_cuda)
   target_link_libraries(espresso_walberla_cuda PUBLIC espresso::utils
-                        PRIVATE ${WALBERLA_LIBS})
-  target_include_directories(espresso_walberla_cuda PUBLIC include)
-  target_include_directories(
-    espresso_walberla_cuda PRIVATE ${WALBERLA_INCLUDE_DIRS}
-                                   ${walberla_BINARY_DIR}/src)
-  install(TARGETS espresso_walberla_cuda
-          LIBRARY DESTINATION ${ESPRESSO_INSTALL_PYTHON}/espressomd)
-  target_link_libraries(espresso_walberla PUBLIC espresso::walberla_cuda)
+                        PRIVATE CUDA::cuda_driver CUDA::cudart)
 endif()
 
-target_link_libraries(
-  espresso_walberla PUBLIC MPI::MPI_CXX espresso::utils
-  PRIVATE espresso::cpp_flags espresso::walberla::cpp_flags ${WALBERLA_LIBS})
-target_include_directories(espresso_walberla PRIVATE ${WALBERLA_INCLUDE_DIRS}
-                                                     ${walberla_BINARY_DIR}/src)
-
 add_subdirectory(src)
-
 if(ESPRESSO_BUILD_TESTS)
   add_subdirectory(tests)
 endif()
diff --git a/src/walberla_bridge/include/walberla_bridge/BlockAndCell.hpp b/src/walberla_bridge/include/walberla_bridge/BlockAndCell.hpp
index 2eca4fe554b..1355d195782 100644
--- a/src/walberla_bridge/include/walberla_bridge/BlockAndCell.hpp
+++ b/src/walberla_bridge/include/walberla_bridge/BlockAndCell.hpp
@@ -56,7 +56,7 @@ inline std::optional<BlockAndCell>
 get_block_and_cell(::LatticeWalberla const &lattice,
                    Utils::Vector3i const &node, bool consider_ghost_layers) {
   auto const &blocks = lattice.get_blocks();
-  int n_ghost_layers = 0;
+  auto n_ghost_layers = 0u;
   if (consider_ghost_layers) {
     n_ghost_layers = lattice.get_ghost_layers();
   }
diff --git a/src/walberla_bridge/include/walberla_bridge/electrokinetics/EKContainer.hpp b/src/walberla_bridge/include/walberla_bridge/electrokinetics/EKContainer.hpp
index 80597203083..7df70d15369 100644
--- a/src/walberla_bridge/include/walberla_bridge/electrokinetics/EKContainer.hpp
+++ b/src/walberla_bridge/include/walberla_bridge/electrokinetics/EKContainer.hpp
@@ -73,8 +73,7 @@ template <class EKSpecies> class EKContainer {
       : m_tau{tau}, m_poisson_solver{std::move(solver)}, m_ekcontainer{} {}
 
   bool contains(std::shared_ptr<EKSpecies> const &ek_species) const noexcept {
-    return std::find(m_ekcontainer.begin(), m_ekcontainer.end(), ek_species) !=
-           m_ekcontainer.end();
+    return std::ranges::find(m_ekcontainer, ek_species) != m_ekcontainer.end();
   }
 
   void add(std::shared_ptr<EKSpecies> const &ek_species) {
@@ -85,9 +84,7 @@ template <class EKSpecies> class EKContainer {
 
   void remove(std::shared_ptr<EKSpecies> const &ek_species) {
     assert(contains(ek_species));
-    m_ekcontainer.erase(
-        std::remove(m_ekcontainer.begin(), m_ekcontainer.end(), ek_species),
-        m_ekcontainer.end());
+    std::erase(m_ekcontainer, ek_species);
   }
 
   iterator begin() noexcept { return m_ekcontainer.begin(); }
diff --git a/src/walberla_bridge/include/walberla_bridge/electrokinetics/PoissonSolver/FFT.hpp b/src/walberla_bridge/include/walberla_bridge/electrokinetics/PoissonSolver/FFT.hpp
index a4e7025dbed..768fb106154 100644
--- a/src/walberla_bridge/include/walberla_bridge/electrokinetics/PoissonSolver/FFT.hpp
+++ b/src/walberla_bridge/include/walberla_bridge/electrokinetics/PoissonSolver/FFT.hpp
@@ -29,11 +29,10 @@
 #include <field/communication/PackInfo.h>
 #include <stencil/D3Q27.h>
 
-#include <utils/constants.hpp>
-
 #include <cmath>
 #include <cstddef>
 #include <memory>
+#include <numbers>
 #include <utility>
 
 namespace walberla {
@@ -67,9 +66,10 @@ template <typename FloatType> class FFT : public PoissonSolver {
       if (x == 0u && y == 0u && z == 0u)
         return 0.;
       return -0.5 /
-             (std::cos(2. * Utils::pi() * real_c(x) / real_c(dim[0])) +
-              std::cos(2. * Utils::pi() * real_c(y) / real_c(dim[1])) +
-              std::cos(2. * Utils::pi() * real_c(z) / real_c(dim[2])) - 3.) /
+             (std::cos(2. * std::numbers::pi * real_c(x) / real_c(dim[0])) +
+              std::cos(2. * std::numbers::pi * real_c(y) / real_c(dim[1])) +
+              std::cos(2. * std::numbers::pi * real_c(z) / real_c(dim[2])) -
+              3.) /
              real_c(dim[0] * dim[1] * dim[2]);
     };
 
diff --git a/src/walberla_bridge/include/walberla_bridge/lattice_boltzmann/LBWalberlaBase.hpp b/src/walberla_bridge/include/walberla_bridge/lattice_boltzmann/LBWalberlaBase.hpp
index 622a2561036..c0dd6406cc8 100644
--- a/src/walberla_bridge/include/walberla_bridge/lattice_boltzmann/LBWalberlaBase.hpp
+++ b/src/walberla_bridge/include/walberla_bridge/lattice_boltzmann/LBWalberlaBase.hpp
@@ -59,6 +59,10 @@ class LBWalberlaBase : public LatticeModel {
   get_velocity_at_pos(Utils::Vector3d const &position,
                       bool consider_points_in_halo = false) const = 0;
 
+  /** @brief Get interpolated velocities at positions. */
+  virtual std::vector<Utils::Vector3d>
+  get_velocities_at_pos(std::vector<Utils::Vector3d> const &pos) = 0;
+
   /** @brief Get interpolated densities at a position. */
   virtual std::optional<double>
   get_density_at_pos(Utils::Vector3d const &position,
@@ -71,6 +75,14 @@ class LBWalberlaBase : public LatticeModel {
   virtual bool add_force_at_pos(Utils::Vector3d const &position,
                                 Utils::Vector3d const &force) = 0;
 
+  /**
+   * @brief Interpolate forces to the stored forces to be applied on nodes
+   * in the next time step.
+   */
+  virtual void
+  add_forces_at_pos(std::vector<Utils::Vector3d> const &positions,
+                    std::vector<Utils::Vector3d> const &forces) = 0;
+
   /** @brief Get stored force to be applied on node in the next time step. */
   virtual std::optional<Utils::Vector3d>
   get_node_force_to_be_applied(Utils::Vector3i const &node) const = 0;
@@ -256,4 +268,6 @@ class LBWalberlaBase : public LatticeModel {
 
   /** @brief get the force field id */
   [[nodiscard]] virtual std::size_t get_force_field_id() const noexcept = 0;
+
+  [[nodiscard]] virtual bool is_gpu() const noexcept = 0;
 };
diff --git a/src/walberla_bridge/include/walberla_bridge/lattice_boltzmann/lb_walberla_init.hpp b/src/walberla_bridge/include/walberla_bridge/lattice_boltzmann/lb_walberla_init.hpp
index cf3177797f0..b64d2504219 100644
--- a/src/walberla_bridge/include/walberla_bridge/lattice_boltzmann/lb_walberla_init.hpp
+++ b/src/walberla_bridge/include/walberla_bridge/lattice_boltzmann/lb_walberla_init.hpp
@@ -19,12 +19,17 @@
 
 #pragma once
 
-#include "LBWalberlaBase.hpp"
-
 #include <walberla_bridge/LatticeWalberla.hpp>
+#include <walberla_bridge/lattice_boltzmann/LBWalberlaBase.hpp>
 
 #include <memory>
 
 std::shared_ptr<LBWalberlaBase>
-new_lb_walberla(std::shared_ptr<LatticeWalberla> const &lattice,
-                double viscosity, double density, bool single_precision);
+new_lb_walberla_cpu(std::shared_ptr<LatticeWalberla> const &lattice,
+                    double viscosity, double density, bool single_precision);
+
+std::shared_ptr<LBWalberlaBase>
+new_lb_walberla_gpu(std::shared_ptr<LatticeWalberla> const &lattice,
+                    double viscosity, double density, bool single_precision);
+
+void set_device_id_per_rank();
diff --git a/src/walberla_bridge/src/BoundaryHandling.hpp b/src/walberla_bridge/src/BoundaryHandling.hpp
index b5d59cbe875..d8b2082460e 100644
--- a/src/walberla_bridge/src/BoundaryHandling.hpp
+++ b/src/walberla_bridge/src/BoundaryHandling.hpp
@@ -39,6 +39,16 @@
 
 namespace walberla {
 
+/**
+ * @brief Boundary class optimized for sparse data.
+ *
+ * Instead of storing the boundary data on a vector field,
+ * store individual vectors in a map.
+ * The global cell is used as key.
+ *
+ * Requires a custom communicator:
+ * @ref walberla::field::communication::BoundaryPackInfo.
+ */
 template <typename T, typename BoundaryClass> class BoundaryHandling {
 private:
   /** Flag for domain cells, i.e. all cells. */
@@ -73,7 +83,7 @@ template <typename T, typename BoundaryClass> class BoundaryHandling {
       m_value_boundary->erase(global);
     }
 
-    [[nodiscard]] auto
+    [[nodiscard]] auto &
     get_node_boundary_value(Utils::Vector3i const &node) const {
       auto const global = Cell(node[0], node[1], node[2]);
       return get_value(global);
@@ -88,7 +98,7 @@ template <typename T, typename BoundaryClass> class BoundaryHandling {
     std::shared_ptr<std::unordered_map<Cell, T>> m_value_boundary;
     static constexpr T default_value{};
 
-    [[nodiscard]] T get_value(Cell const &cell) const {
+    [[nodiscard]] T const &get_value(Cell const &cell) const {
       if (m_value_boundary->count(cell) == 0) {
         return default_value;
       }
@@ -112,8 +122,8 @@ template <typename T, typename BoundaryClass> class BoundaryHandling {
       : m_blocks(std::move(blocks)), m_flag_field_id(flag_field_id),
         m_callback(DynamicValueCallback()), m_pending_changes(false) {
     // reinitialize the flag field
-    for (auto b = m_blocks->begin(); b != m_blocks->end(); ++b) {
-      flag_reset_kernel(&*b);
+    for (auto block = m_blocks->begin(); block != m_blocks->end(); ++block) {
+      flag_reset_kernel(block->template getData<FlagField>(m_flag_field_id));
     }
     // instantiate the boundary sweep
     std::function callback = m_callback;
@@ -127,7 +137,7 @@ template <typename T, typename BoundaryClass> class BoundaryHandling {
     return m_callback.node_is_boundary(node);
   }
 
-  [[nodiscard]] auto
+  [[nodiscard]] auto &
   get_node_value_at_boundary(Utils::Vector3i const &node) const {
     return m_callback.get_node_boundary_value(node);
   }
@@ -175,8 +185,7 @@ template <typename T, typename BoundaryClass> class BoundaryHandling {
   bool m_pending_changes;
 
   /** Register flags and reset all cells. */
-  void flag_reset_kernel(IBlock *const block) {
-    auto flag_field = block->template getData<FlagField>(m_flag_field_id);
+  void flag_reset_kernel(FlagField *flag_field) {
     // register flags
     if (!flag_field->flagExists(Domain_flag))
       flag_field->registerFlag(Domain_flag);
diff --git a/src/walberla_bridge/src/BoundaryPackInfo.hpp b/src/walberla_bridge/src/BoundaryPackInfo.hpp
index baeeb7c3853..83a26fa91d4 100644
--- a/src/walberla_bridge/src/BoundaryPackInfo.hpp
+++ b/src/walberla_bridge/src/BoundaryPackInfo.hpp
@@ -119,7 +119,7 @@ class BoundaryPackInfo : public PackInfo<GhostLayerField_T> {
     auto const boundary_flag = flag_field->getFlag(Boundary_flag);
     auto const gl = numberOfGhostLayersToCommunicate(flag_field);
     auto const begin = [gl, dir](auto const *flag_field) {
-      return flag_field->beginSliceBeforeGhostLayer(dir, gl);
+      return flag_field->beginSliceBeforeGhostLayer(dir, cell_idx_c(gl));
     };
 
 #ifndef NDEBUG
diff --git a/src/walberla_bridge/src/lattice_boltzmann/CMakeLists.txt b/src/walberla_bridge/src/lattice_boltzmann/CMakeLists.txt
index 3a2c214c478..891cc7aa2bd 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/CMakeLists.txt
+++ b/src/walberla_bridge/src/lattice_boltzmann/CMakeLists.txt
@@ -19,10 +19,8 @@
 
 add_subdirectory(generated_kernels)
 
-target_sources(espresso_walberla
-               PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/lb_walberla_init.cpp)
+target_sources(espresso_walberla PRIVATE lb_walberla_init.cpp)
 
 if(ESPRESSO_BUILD_WITH_CUDA AND WALBERLA_BUILD_WITH_CUDA)
-  target_sources(espresso_walberla_cuda
-                 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/lb_walberla_init.cu)
+  target_sources(espresso_walberla_cuda PRIVATE lb_walberla_init.cu)
 endif()
diff --git a/src/walberla_bridge/src/lattice_boltzmann/InterpolateAndShiftAtBoundary.hpp b/src/walberla_bridge/src/lattice_boltzmann/InterpolateAndShiftAtBoundary.hpp
index 72b5c326972..489e81be9d7 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/InterpolateAndShiftAtBoundary.hpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/InterpolateAndShiftAtBoundary.hpp
@@ -94,8 +94,6 @@ class InterpolateAndShiftAtBoundary {
     auto const dir = m_shear_direction;
     auto const dim = cell_idx_c(m_blocks->getNumberOfCells(*block, dir));
     auto const length = numeric_cast<FloatType>(dim);
-    auto const weight =
-        std::abs(std::fmod(get_pos_offset() + length, FloatType{1}));
 
     // setup slab
     auto field = block->template getData<FieldType>(m_field_id);
@@ -110,25 +108,24 @@ class InterpolateAndShiftAtBoundary {
     // the target
     auto const prefactor =
         ((slab_dir == m_slab_max) ? FloatType{-1} : FloatType{1});
-    auto const offset = get_pos_offset() * prefactor;
+    auto const offset = static_cast<FloatType>(get_pos_offset()) * prefactor;
+    auto const folded_offset = modulo(offset, length);
+    // 0 <= folded_offset < length
+    auto const weight1 = FloatType{1} - std::fmod(folded_offset, FloatType{1});
+    auto const weight2 = std::fmod(folded_offset, FloatType{1});
     for (auto const &&cell : ci) {
       Cell source1 = cell;
       Cell source2 = cell;
-      source1[dir] = cell_idx_c(std::floor(
-                         static_cast<FloatType>(source1[dir]) + offset)) %
-                     dim;
-      source1[dir] = cell_idx_c(static_cast<FloatType>(source1[dir]) + length);
-      source1[dir] = cell_idx_c(source1[dir] % dim);
-
-      source2[dir] =
-          cell_idx_c(std::ceil(static_cast<FloatType>(source2[dir]) + offset)) %
-          dim;
-      source2[dir] = cell_idx_c(static_cast<FloatType>(source2[dir]) + length);
-      source2[dir] = cell_idx_c(source2[dir] % dim);
-
-      for (uint_t f = 0; f < FieldType::F_SIZE; ++f) {
-        tmp_field->get(cell, f) = field->get(source1, f) * (1 - weight) +
-                                  field->get(source2, f) * weight;
+      auto const source_pos = static_cast<FloatType>(cell[dir]) + folded_offset;
+      auto const folded_source_pos = modulo(source_pos, length);
+      // 0 <= folded_source_pos < length
+      source1[dir] = cell_idx_c(std::floor(folded_source_pos));
+      // 0 <= source1[dir] < length, i.e. integer value up to length-1 inclusive
+      source2[dir] = cell_idx_c(modulo(FloatType(source1[dir] + 1), length));
+      // integer value between 0 and length - 1 inclusive
+      for (uint_t q = 0u; q < FieldType::F_SIZE; ++q) {
+        tmp_field->get(cell, q) =
+            field->get(source1, q) * weight1 + field->get(source2, q) * weight2;
       }
       tmp_field->get(cell, m_shear_direction) -= prefactor * shift;
     }
@@ -141,6 +138,11 @@ class InterpolateAndShiftAtBoundary {
     }
   }
 
+  FloatType modulo(FloatType a, FloatType b) const {
+    auto const res = std::fmod(a, b); // sign of a; res + b can round to b
+    return (res < FloatType{0}) ? std::fmod(res + b, b) : res; // [0, b), b > 0
+  }
+
 private:
   std::shared_ptr<StructuredBlockForest> m_blocks;
   BlockDataID m_field_id;
diff --git a/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp b/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp
index 5270c26aa4d..1f1a8bb24e1 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019-2023 The ESPResSo project
+ * Copyright (C) 2019-2024 The ESPResSo project
  *
  * This file is part of ESPResSo.
  *
@@ -38,6 +38,11 @@
 #include <field/vtk/VTKWriter.h>
 #include <stencil/D3Q19.h>
 #include <stencil/D3Q27.h>
+#if defined(__CUDACC__)
+#include <gpu/AddGPUFieldToStorage.h>
+#include <gpu/communication/MemcpyPackInfo.h>
+#include <gpu/communication/UniformGPUScheme.h>
+#endif
 
 #include "../BoundaryHandling.hpp"
 #include "../BoundaryPackInfo.hpp"
@@ -46,6 +51,9 @@
 #include "InterpolateAndShiftAtBoundary.hpp"
 #include "ResetForce.hpp"
 #include "lb_kernels.hpp"
+#if defined(__CUDACC__)
+#include "lb_kernels.cuh"
+#endif
 
 #include <walberla_bridge/Architecture.hpp>
 #include <walberla_bridge/BlockAndCell.hpp>
@@ -60,6 +68,7 @@
 #include <array>
 #include <cmath>
 #include <cstddef>
+#include <functional>
 #include <initializer_list>
 #include <limits>
 #include <memory>
@@ -96,8 +105,11 @@ class LBWalberlaImpl : public LBWalberlaBase {
       std::variant<CollisionModelThermalized, CollisionModelLeesEdwards>;
 
 public:
-  // Type definitions
+  /** @brief Stencil for collision and streaming operations. */
   using Stencil = stencil::D3Q19;
+  /** @brief Stencil for ghost communication (includes domain corners). */
+  using StencilFull = stencil::D3Q27;
+  /** @brief Lattice model (e.g. blockforest). */
   using Lattice_T = LatticeWalberla::Lattice_T;
 
 protected:
@@ -106,12 +118,39 @@ class LBWalberlaImpl : public LBWalberlaBase {
     using VectorField = field::GhostLayerField<FT, uint_t{3u}>;
     template <class Field>
     using PackInfo = field::communication::PackInfo<Field>;
+    template <class Stencil>
+    using RegularCommScheme =
+        blockforest::communication::UniformBufferedScheme<Stencil>;
+    template <class Stencil>
+    using BoundaryCommScheme =
+        blockforest::communication::UniformBufferedScheme<Stencil>;
   };
 
+#if defined(__CUDACC__)
+  template <typename FT> struct FieldTrait<FT, lbmpy::Arch::GPU> {
+    using PdfField = gpu::GPUField<FT>;
+    using VectorField = gpu::GPUField<FT>;
+    template <class Field>
+    using PackInfo = gpu::communication::MemcpyPackInfo<Field>;
+    template <class Stencil>
+    using RegularCommScheme = gpu::communication::UniformGPUScheme<Stencil>;
+    template <class Stencil>
+    using BoundaryCommScheme =
+        blockforest::communication::UniformBufferedScheme<Stencil>;
+  };
+#endif
+
+  // "underlying" field types (`GPUField` has no f-size info at compile time)
+  using _PdfField = typename FieldTrait<FloatType>::PdfField;
+  using _VectorField = typename FieldTrait<FloatType>::VectorField;
+
 public:
   using PdfField = typename FieldTrait<FloatType, Architecture>::PdfField;
   using VectorField = typename FieldTrait<FloatType, Architecture>::VectorField;
   using FlagField = typename BoundaryModel::FlagField;
+#if defined(__CUDACC__)
+  using GPUField = gpu::GPUField<FloatType>;
+#endif
 
 public:
   template <typename T> FloatType FloatType_c(T t) const {
@@ -123,13 +162,18 @@ class LBWalberlaImpl : public LBWalberlaBase {
   }
 
   [[nodiscard]] virtual bool is_double_precision() const noexcept override {
-    return std::is_same<FloatType, double>::value;
+    return std::is_same_v<FloatType, double>;
   }
 
 private:
   class CollideSweepVisitor {
   public:
-    void operator()(CollisionModelThermalized &cm, IBlock *b) { cm(b); }
+    using StructuredBlockStorage = LatticeWalberla::Lattice_T;
+
+    void operator()(CollisionModelThermalized &cm, IBlock *b) {
+      cm.configure(m_storage, b);
+      cm(b);
+    }
 
     void operator()(CollisionModelLeesEdwards &cm, IBlock *b) {
       cm.v_s_ = static_cast<decltype(cm.v_s_)>(
@@ -138,11 +182,17 @@ class LBWalberlaImpl : public LBWalberlaBase {
     }
 
     CollideSweepVisitor() = default;
-    explicit CollideSweepVisitor(std::shared_ptr<LeesEdwardsPack> callbacks) {
+    CollideSweepVisitor(std::shared_ptr<StructuredBlockStorage> storage) {
+      m_storage = std::move(storage);
+    }
+    CollideSweepVisitor(std::shared_ptr<StructuredBlockStorage> storage,
+                        std::shared_ptr<LeesEdwardsPack> callbacks) {
+      m_storage = std::move(storage);
       m_lees_edwards_callbacks = std::move(callbacks);
     }
 
   private:
+    std::shared_ptr<StructuredBlockStorage> m_storage{};
     std::shared_ptr<LeesEdwardsPack> m_lees_edwards_callbacks{};
   };
   CollideSweepVisitor m_run_collide_sweep{};
@@ -176,6 +226,13 @@ class LBWalberlaImpl : public LBWalberlaBase {
     }
   }
 
+  void pressure_tensor_correction(std::span<FloatType, 9ul> tensor) const {
+    // rescale every entry except indices 0, 4 and 8
+    auto const factor = pressure_tensor_correction_factor();
+    for (auto const index : {1u, 2u, 3u, 5u, 6u, 7u})
+      tensor[index] *= factor;
+  }
+
   class interpolation_illegal_access : public std::runtime_error {
   public:
     explicit interpolation_illegal_access(std::string const &field,
@@ -183,9 +240,8 @@ class LBWalberlaImpl : public LBWalberlaBase {
                                           std::array<int, 3> const &node,
                                           double weight)
         : std::runtime_error("Access to LB " + field + " field failed") {
-      std::cerr << "pos [" << pos << "], "
-                << "node [" << Utils::Vector3i(node) << "], "
-                << "weight " << weight << "\n";
+      std::cerr << "pos [" << pos << "], node [" << Utils::Vector3i(node)
+                << "], weight " << weight << "\n";
     }
   };
 
@@ -222,21 +278,26 @@ class LBWalberlaImpl : public LBWalberlaBase {
    * a full ghost communication. This is needed to properly update the corners
    * of the ghost layer when setting cell velocities or populations.
    */
-  using FullCommunicator = blockforest::communication::UniformBufferedScheme<
-      typename stencil::D3Q27>;
+  using RegularFullCommunicator =
+      typename FieldTrait<FloatType, Architecture>::template RegularCommScheme<
+          typename stencil::D3Q27>;
+  using BoundaryFullCommunicator =
+      typename FieldTrait<FloatType, Architecture>::template BoundaryCommScheme<
+          typename stencil::D3Q27>;
   /**
    * @brief Regular communicator.
    * We use the same directions as the stencil during integration.
    */
   using PDFStreamingCommunicator =
-      blockforest::communication::UniformBufferedScheme<Stencil>;
+      typename FieldTrait<FloatType,
+                          Architecture>::template RegularCommScheme<Stencil>;
   template <class Field>
   using PackInfo =
       typename FieldTrait<FloatType, Architecture>::template PackInfo<Field>;
 
   // communicators
-  std::shared_ptr<FullCommunicator> m_boundary_communicator;
-  std::shared_ptr<FullCommunicator> m_pdf_full_communicator;
+  std::shared_ptr<BoundaryFullCommunicator> m_boundary_communicator;
+  std::shared_ptr<RegularFullCommunicator> m_pdf_full_communicator;
   std::shared_ptr<PDFStreamingCommunicator> m_pdf_streaming_communicator;
 
   // ResetForce sweep + external force handling
@@ -247,11 +308,11 @@ class LBWalberlaImpl : public LBWalberlaBase {
 
   // Lees Edwards boundary interpolation
   std::shared_ptr<LeesEdwardsPack> m_lees_edwards_callbacks;
-  std::shared_ptr<InterpolateAndShiftAtBoundary<PdfField, FloatType>>
+  std::shared_ptr<InterpolateAndShiftAtBoundary<_PdfField, FloatType>>
       m_lees_edwards_pdf_interpol_sweep;
-  std::shared_ptr<InterpolateAndShiftAtBoundary<VectorField, FloatType>>
+  std::shared_ptr<InterpolateAndShiftAtBoundary<_VectorField, FloatType>>
       m_lees_edwards_vel_interpol_sweep;
-  std::shared_ptr<InterpolateAndShiftAtBoundary<VectorField, FloatType>>
+  std::shared_ptr<InterpolateAndShiftAtBoundary<_VectorField, FloatType>>
       m_lees_edwards_last_applied_force_interpol_sweep;
 
   // Collision sweep
@@ -295,11 +356,11 @@ class LBWalberlaImpl : public LBWalberlaBase {
     if constexpr (Architecture == lbmpy::Arch::CPU) {
 #ifdef ESPRESSO_BUILD_WITH_AVX_KERNELS
 #if defined(__AVX512F__)
-      constexpr uint_t alignment = 64;
+      constexpr uint_t alignment = 64u;
 #elif defined(__AVX__)
-      constexpr uint_t alignment = 32;
+      constexpr uint_t alignment = 32u;
 #elif defined(__SSE__)
-      constexpr uint_t alignment = 16;
+      constexpr uint_t alignment = 16u;
 #else
 #error "Unsupported arch, check walberla src/field/allocation/FieldAllocator.h"
 #endif
@@ -315,6 +376,25 @@ class LBWalberlaImpl : public LBWalberlaBase {
                                         n_ghost_layers);
 #endif // ESPRESSO_BUILD_WITH_AVX_KERNELS
     }
+#if defined(__CUDACC__)
+    else { // not the CPU architecture: register the field as a GPU field
+      auto field_id = gpu::addGPUFieldToStorage<GPUField>(
+          blocks, tag, Field::F_SIZE, field::fzyx, n_ghost_layers);
+      if constexpr (std::is_same_v<Field, _VectorField>) {
+        for (auto block = blocks->begin(); block != blocks->end(); ++block) {
+          auto field = block->template getData<GPUField>(field_id);
+          lbm::accessor::Vector::initialize(field, Vector3<FloatType>{0});
+        }
+      } else if constexpr (std::is_same_v<Field, _PdfField>) {
+        for (auto block = blocks->begin(); block != blocks->end(); ++block) {
+          auto field = block->template getData<GPUField>(field_id);
+          lbm::accessor::Population::initialize(
+              field, std::array<FloatType, Stencil::Size>{}); // all zeros
+        }
+      } // other field types are registered without explicit initialization
+      return field_id;
+    }
+#endif // __CUDACC__
   }
 
 public:
@@ -328,13 +408,13 @@ class LBWalberlaImpl : public LBWalberlaBase {
     if (n_ghost_layers == 0u)
       throw std::runtime_error("At least one ghost layer must be used");
 
-    // Initialize and register fields
-    m_pdf_field_id = add_to_storage<PdfField>("pdfs");
-    m_pdf_tmp_field_id = add_to_storage<PdfField>("pdfs_tmp");
-    m_last_applied_force_field_id = add_to_storage<VectorField>("force field");
-    m_force_to_be_applied_id = add_to_storage<VectorField>("force field");
-    m_velocity_field_id = add_to_storage<VectorField>("velocity field");
-    m_vec_tmp_field_id = add_to_storage<VectorField>("velocity_tmp field");
+    // Initialize and register fields (must use the "underlying" types)
+    m_pdf_field_id = add_to_storage<_PdfField>("pdfs");
+    m_pdf_tmp_field_id = add_to_storage<_PdfField>("pdfs_tmp");
+    m_last_applied_force_field_id = add_to_storage<_VectorField>("force last");
+    m_force_to_be_applied_id = add_to_storage<_VectorField>("force next");
+    m_velocity_field_id = add_to_storage<_VectorField>("velocity");
+    m_vec_tmp_field_id = add_to_storage<_VectorField>("velocity_tmp");
 
     // Initialize and register pdf field
     auto pdf_setter =
@@ -354,28 +434,26 @@ class LBWalberlaImpl : public LBWalberlaBase {
     m_pdf_streaming_communicator =
         std::make_shared<PDFStreamingCommunicator>(blocks);
     m_pdf_streaming_communicator->addPackInfo(
-        std::make_shared<PackInfo<PdfField>>(m_pdf_field_id, n_ghost_layers));
+        std::make_shared<PackInfo<PdfField>>(m_pdf_field_id));
     m_pdf_streaming_communicator->addPackInfo(
-        std::make_shared<PackInfo<VectorField>>(m_last_applied_force_field_id,
-                                                n_ghost_layers));
+        std::make_shared<PackInfo<VectorField>>(m_last_applied_force_field_id));
 
-    m_pdf_full_communicator = std::make_shared<FullCommunicator>(blocks);
+    m_pdf_full_communicator = std::make_shared<RegularFullCommunicator>(blocks);
     m_pdf_full_communicator->addPackInfo(
-        std::make_shared<PackInfo<PdfField>>(m_pdf_field_id, n_ghost_layers));
+        std::make_shared<PackInfo<PdfField>>(m_pdf_field_id));
     m_pdf_full_communicator->addPackInfo(
-        std::make_shared<PackInfo<VectorField>>(m_last_applied_force_field_id,
-                                                n_ghost_layers));
+        std::make_shared<PackInfo<VectorField>>(m_last_applied_force_field_id));
     m_pdf_full_communicator->addPackInfo(
-        std::make_shared<PackInfo<VectorField>>(m_velocity_field_id,
-                                                n_ghost_layers));
+        std::make_shared<PackInfo<VectorField>>(m_velocity_field_id));
 
-    m_boundary_communicator = std::make_shared<FullCommunicator>(blocks);
+    m_boundary_communicator =
+        std::make_shared<BoundaryFullCommunicator>(blocks);
     m_boundary_communicator->addPackInfo(
         std::make_shared<field::communication::PackInfo<FlagField>>(
-            m_flag_field_id, n_ghost_layers));
+            m_flag_field_id));
     auto boundary_packinfo = std::make_shared<
         field::communication::BoundaryPackInfo<FlagField, BoundaryModel>>(
-        m_flag_field_id, n_ghost_layers);
+        m_flag_field_id);
     boundary_packinfo->setup_boundary_handle(m_lattice, m_boundary);
     m_boundary_communicator->addPackInfo(boundary_packinfo);
 
@@ -457,13 +535,13 @@ class LBWalberlaImpl : public LBWalberlaBase {
 
   void integrate_pull_scheme() {
     auto const &blocks = get_lattice().get_blocks();
-    integrate_reset_force(blocks);
     // Handle boundaries
     integrate_boundaries(blocks);
     // LB stream
     integrate_stream(blocks);
     // LB collide
     integrate_collide(blocks);
+    integrate_reset_force(blocks);
     // Refresh ghost layers
     ghost_communication_pdfs();
   }
@@ -512,26 +590,23 @@ class LBWalberlaImpl : public LBWalberlaBase {
   void set_collision_model(double kT, unsigned int seed) override {
     auto const omega = shear_mode_relaxation_rate();
     auto const omega_odd = odd_mode_relaxation_rate(omega);
+    auto const blocks = get_lattice().get_blocks();
     m_kT = FloatType_c(kT);
-    auto obj = CollisionModelThermalized(
-        m_last_applied_force_field_id, m_pdf_field_id, uint32_t{0u},
-        uint32_t{0u}, uint32_t{0u}, m_kT, omega, omega, omega_odd, omega, seed,
-        uint32_t{0u});
-    obj.block_offset_generator =
-        [this](IBlock *const block, uint32_t &block_offset_0,
-               uint32_t &block_offset_1, uint32_t &block_offset_2) {
-          auto const &blocks = get_lattice().get_blocks();
-          auto const &ci = blocks->getBlockCellBB(*block);
-          block_offset_0 = static_cast<uint32_t>(ci.xMin());
-          block_offset_1 = static_cast<uint32_t>(ci.yMin());
-          block_offset_2 = static_cast<uint32_t>(ci.zMin());
-        };
+    auto obj = CollisionModelThermalized(m_last_applied_force_field_id,
+                                         m_pdf_field_id, m_kT, omega, omega,
+                                         omega_odd, omega, seed, uint32_t{0u});
     m_collision_model = std::make_shared<CollisionModel>(std::move(obj));
+    m_run_collide_sweep = CollideSweepVisitor(blocks);
   }
 
   void set_collision_model(
       std::unique_ptr<LeesEdwardsPack> &&lees_edwards_pack) override {
     assert(m_kT == 0.);
+#if defined(__CUDACC__)
+    if constexpr (Architecture == lbmpy::Arch::GPU) {
+      throw std::runtime_error("Lees-Edwards LB doesn't support GPU yet");
+    }
+#endif
     auto const shear_direction = lees_edwards_pack->shear_direction;
     auto const shear_plane_normal = lees_edwards_pack->shear_plane_normal;
     auto const shear_vel = FloatType_c(lees_edwards_pack->get_shear_velocity());
@@ -549,23 +624,23 @@ class LBWalberlaImpl : public LBWalberlaBase {
         m_last_applied_force_field_id, m_pdf_field_id, agrid, omega, shear_vel);
     m_collision_model = std::make_shared<CollisionModel>(std::move(obj));
     m_lees_edwards_callbacks = std::move(lees_edwards_pack);
-    m_run_collide_sweep = CollideSweepVisitor{m_lees_edwards_callbacks};
+    m_run_collide_sweep = CollideSweepVisitor(blocks, m_lees_edwards_callbacks);
     m_lees_edwards_pdf_interpol_sweep =
-        std::make_shared<InterpolateAndShiftAtBoundary<PdfField, FloatType>>(
+        std::make_shared<InterpolateAndShiftAtBoundary<_PdfField, FloatType>>(
             blocks, m_pdf_field_id, m_pdf_tmp_field_id, n_ghost_layers,
             shear_direction, shear_plane_normal,
             m_lees_edwards_callbacks->get_pos_offset);
-    m_lees_edwards_vel_interpol_sweep =
-        std::make_shared<InterpolateAndShiftAtBoundary<VectorField, FloatType>>(
-            blocks, m_velocity_field_id, m_vec_tmp_field_id, n_ghost_layers,
-            shear_direction, shear_plane_normal,
-            m_lees_edwards_callbacks->get_pos_offset,
-            m_lees_edwards_callbacks->get_shear_velocity);
-    m_lees_edwards_last_applied_force_interpol_sweep =
-        std::make_shared<InterpolateAndShiftAtBoundary<VectorField, FloatType>>(
-            blocks, m_last_applied_force_field_id, m_vec_tmp_field_id,
-            n_ghost_layers, shear_direction, shear_plane_normal,
-            m_lees_edwards_callbacks->get_pos_offset);
+    m_lees_edwards_vel_interpol_sweep = std::make_shared<
+        InterpolateAndShiftAtBoundary<_VectorField, FloatType>>(
+        blocks, m_velocity_field_id, m_vec_tmp_field_id, n_ghost_layers,
+        shear_direction, shear_plane_normal,
+        m_lees_edwards_callbacks->get_pos_offset,
+        m_lees_edwards_callbacks->get_shear_velocity);
+    m_lees_edwards_last_applied_force_interpol_sweep = std::make_shared<
+        InterpolateAndShiftAtBoundary<_VectorField, FloatType>>(
+        blocks, m_last_applied_force_field_id, m_vec_tmp_field_id,
+        n_ghost_layers, shear_direction, shear_plane_normal,
+        m_lees_edwards_callbacks->get_pos_offset);
   }
 
   void check_lebc(unsigned int shear_direction,
@@ -622,8 +697,8 @@ class LBWalberlaImpl : public LBWalberlaBase {
     auto force_field =
         bc->block->template getData<VectorField>(m_last_applied_force_field_id);
     auto const vel = to_vector3<FloatType>(v);
-    lbm::accessor::Velocity::set(pdf_field, force_field, vel, bc->cell);
-    lbm::accessor::Vector::set(vel_field, vel, bc->cell);
+    lbm::accessor::Velocity::set(pdf_field, vel_field, force_field, vel,
+                                 bc->cell);
 
     return true;
   }
@@ -637,24 +712,31 @@ class LBWalberlaImpl : public LBWalberlaBase {
       auto const &block = *(lattice.get_blocks()->begin());
       auto const field =
           block.template getData<VectorField>(m_velocity_field_id);
+      auto values = lbm::accessor::Vector::get(field, *ci); // moved below
+      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
+      assert(values.size() == 3u * ci->numCells());
+      if constexpr (std::is_same_v<typename decltype(values)::value_type,
+                                   double>) {
+        out = std::move(values); // non-const source: a real move, no copy
+      } else {
+        out = std::vector<double>(values.begin(), values.end()); // to double
+      }
       auto const local_offset = std::get<0>(lattice.get_local_grid_range());
       auto const lower_cell = ci->min();
       auto const upper_cell = ci->max();
-      out.reserve(ci->numCells());
+      auto it = out.begin();
       for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
         for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
           for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
             auto const node = local_offset + Utils::Vector3i{{x, y, z}};
             if (m_boundary->node_is_boundary(node)) {
-              auto const vec = m_boundary->get_node_value_at_boundary(node);
+              auto const &vec = m_boundary->get_node_value_at_boundary(node);
               for (uint_t f = 0u; f < 3u; ++f) {
-                out.emplace_back(double_c(vec[f]));
+                (*it) = double_c(vec[f]);
+                std::advance(it, 1l);
               }
             } else {
-              auto const vec = lbm::accessor::Vector::get(field, Cell{x, y, z});
-              for (uint_t f = 0u; f < 3u; ++f) {
-                out.emplace_back(double_c(vec[f]));
-              }
+              std::advance(it, 3l);
             }
           }
         }
@@ -669,30 +751,107 @@ class LBWalberlaImpl : public LBWalberlaBase {
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
       auto const &lattice = get_lattice();
       auto &block = *(lattice.get_blocks()->begin());
-      // We have to set both, the pdf and the stored velocity field
       auto pdf_field = block.template getData<PdfField>(m_pdf_field_id);
-      auto vel_field = block.template getData<VectorField>(m_velocity_field_id);
       auto force_field =
           block.template getData<VectorField>(m_last_applied_force_field_id);
-      auto const lower_cell = ci->min();
-      auto const upper_cell = ci->max();
-      auto it = velocity.begin();
+      auto vel_field = block.template getData<VectorField>(m_velocity_field_id);
+      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
       assert(velocity.size() == 3u * ci->numCells());
-      for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
-        for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
-          for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
-            auto const cell = Cell{x, y, z};
-            Vector3<FloatType> vec;
-            for (uint_t f = 0u; f < 3u; ++f) {
-              vec[f] = FloatType_c(*it);
-              ++it;
-            }
-            lbm::accessor::Velocity::set(pdf_field, force_field, vec, cell);
-            lbm::accessor::Vector::set(vel_field, vec, cell);
-          }
+      std::vector<FloatType> const values(velocity.begin(), velocity.end());
+      lbm::accessor::Velocity::set(pdf_field, vel_field, force_field, values,
+                                   *ci);
+    }
+  }
+
+  [[nodiscard]] bool is_gpu() const noexcept override {
+    return Architecture == lbmpy::Arch::GPU; // compile-time constant
+  }
+
+  void add_forces_at_pos(std::vector<Utils::Vector3d> const &pos,
+                         std::vector<Utils::Vector3d> const &forces) override {
+    assert(pos.size() == forces.size()); // one force per position
+    if (pos.empty()) {
+      return;
+    }
+    if constexpr (Architecture == lbmpy::Arch::CPU) {
+      for (std::size_t i = 0ul; i < pos.size(); ++i) {
+        add_force_at_pos(pos[i], forces[i]); // CPU: one pair at a time
+      }
+    }
+#if defined(__CUDACC__)
+    if constexpr (Architecture == lbmpy::Arch::GPU) {
+      auto const &lattice = get_lattice();
+      auto const &block = *(lattice.get_blocks()->begin()); // first block
+      auto const origin = block.getAABB().min();
+      std::vector<FloatType> host_pos; // flattened x,y,z triplets
+      std::vector<FloatType> host_force; // flattened x,y,z triplets
+      host_pos.reserve(3ul * pos.size());
+      host_force.reserve(3ul * forces.size());
+      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
+      for (auto const &vec : pos) { // positions relative to the block origin
+#pragma unroll
+        for (std::size_t i : {0ul, 1ul, 2ul}) {
+          host_pos.emplace_back(static_cast<FloatType>(vec[i] - origin[i]));
+        }
+      }
+      for (auto const &vec : forces) { // forces are copied unshifted
+#pragma unroll
+        for (std::size_t i : {0ul, 1ul, 2ul}) {
+          host_force.emplace_back(static_cast<FloatType>(vec[i]));
+        }
+      }
+      auto const gl = lattice.get_ghost_layers();
+      auto field = block.template uncheckedFastGetData<VectorField>(
+          m_force_to_be_applied_id);
+      lbm::accessor::Interpolation::set(field, host_pos, host_force, gl);
+    }
+#endif // __CUDACC__
+  }
+
+  std::vector<Utils::Vector3d>
+  get_velocities_at_pos(std::vector<Utils::Vector3d> const &pos) override {
+    if (pos.empty()) {
+      return {};
+    }
+    if constexpr (Architecture == lbmpy::Arch::CPU) {
+      std::vector<Utils::Vector3d> vel{};
+      vel.reserve(pos.size());
+      for (auto const &vec : pos) { // CPU: one lookup per position
+        auto res = get_velocity_at_pos(vec, true);
+        assert(res.has_value()); // every position must yield a velocity
+        vel.emplace_back(*res);
+      }
+      return vel;
+    }
+#if defined(__CUDACC__)
+    if constexpr (Architecture == lbmpy::Arch::GPU) {
+      auto const &lattice = get_lattice();
+      auto const &block = *(lattice.get_blocks()->begin()); // first block
+      auto const origin = block.getAABB().min();
+      std::vector<FloatType> host_pos; // flattened x,y,z triplets
+      host_pos.reserve(3ul * pos.size());
+      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
+      for (auto const &vec : pos) { // positions relative to the block origin
+#pragma unroll
+        for (std::size_t i : {0ul, 1ul, 2ul}) {
+          host_pos.emplace_back(static_cast<FloatType>(vec[i] - origin[i]));
         }
       }
+      auto const gl = lattice.get_ghost_layers();
+      auto field =
+          block.template uncheckedFastGetData<VectorField>(m_velocity_field_id);
+      auto const res = lbm::accessor::Interpolation::get(field, host_pos, gl);
+      std::vector<Utils::Vector3d> vel{};
+      vel.reserve(res.size() / 3ul);
+      for (auto it = res.begin(); it != res.end(); it += 3) { // 3 per vector
+        vel.emplace_back(Utils::Vector3d{static_cast<double>(*(it + 0)),
+                                         static_cast<double>(*(it + 1)),
+                                         static_cast<double>(*(it + 2))});
+      }
+      return vel;
     }
+#endif // __CUDACC__
+    return {}; // fallback when no branch above returned
   }
 
   std::optional<Utils::Vector3d>
@@ -793,10 +952,13 @@ class LBWalberlaImpl : public LBWalberlaBase {
     if (!bc)
       return false;
 
-    auto field =
+    auto pdf_field = bc->block->template getData<PdfField>(m_pdf_field_id);
+    auto force_field =
         bc->block->template getData<VectorField>(m_last_applied_force_field_id);
+    auto vel_field =
+        bc->block->template getData<VectorField>(m_velocity_field_id);
     auto const vec = to_vector3<FloatType>(force);
-    lbm::accessor::Vector::set(field, vec, bc->cell);
+    lbm::accessor::Force::set(pdf_field, vel_field, force_field, vec, bc->cell);
 
     return true;
   }
@@ -810,21 +972,15 @@ class LBWalberlaImpl : public LBWalberlaBase {
       auto const &block = *(lattice.get_blocks()->begin());
       auto const field =
           block.template getData<VectorField>(m_last_applied_force_field_id);
-      auto const lower_cell = ci->min();
-      auto const upper_cell = ci->max();
-      auto const n_values = 3u * ci->numCells();
-      out.reserve(n_values);
-      for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
-        for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
-          for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
-            auto const vec = lbm::accessor::Vector::get(field, Cell{x, y, z});
-            for (uint_t f = 0u; f < 3u; ++f) {
-              out.emplace_back(double_c(vec[f]));
-            }
-          }
-        }
+      auto values = lbm::accessor::Vector::get(field, *ci); // moved below
+      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
+      assert(values.size() == 3u * ci->numCells());
+      if constexpr (std::is_same_v<typename decltype(values)::value_type,
+                                   double>) {
+        out = std::move(values); // non-const source: a real move, no copy
+      } else {
+        out = std::vector<double>(values.begin(), values.end()); // to double
       }
-      assert(out.size() == n_values);
     }
     return out;
   }
@@ -835,24 +991,14 @@ class LBWalberlaImpl : public LBWalberlaBase {
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
       auto const &lattice = get_lattice();
       auto &block = *(lattice.get_blocks()->begin());
-      auto field =
+      auto pdf_field = block.template getData<PdfField>(m_pdf_field_id);
+      auto force_field =
           block.template getData<VectorField>(m_last_applied_force_field_id);
-      auto const lower_cell = ci->min();
-      auto const upper_cell = ci->max();
-      auto it = force.begin();
+      auto vel_field = block.template getData<VectorField>(m_velocity_field_id);
+      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
       assert(force.size() == 3u * ci->numCells());
-      for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
-        for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
-          for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
-            Vector3<FloatType> vec;
-            for (uint_t f = 0u; f < 3u; ++f) {
-              vec[f] = FloatType_c(*it);
-              ++it;
-            }
-            lbm::accessor::Vector::set(field, vec, Cell{x, y, z});
-          }
-        }
-      }
+      std::vector<FloatType> const values(force.begin(), force.end());
+      lbm::accessor::Force::set(pdf_field, vel_field, force_field, values, *ci);
     }
   }
 
@@ -881,11 +1027,16 @@ class LBWalberlaImpl : public LBWalberlaBase {
       return false;
 
     auto pdf_field = bc->block->template getData<PdfField>(m_pdf_field_id);
+    auto force_field =
+        bc->block->template getData<VectorField>(m_last_applied_force_field_id);
+    auto vel_field =
+        bc->block->template getData<VectorField>(m_velocity_field_id);
     std::array<FloatType, Stencil::Size> pop;
     for (uint_t f = 0u; f < Stencil::Size; ++f) {
       pop[f] = FloatType_c(population[f]);
     }
-    lbm::accessor::Population::set(pdf_field, pop, bc->cell);
+    lbm::accessor::Population::set(pdf_field, vel_field, force_field, pop,
+                                   bc->cell);
 
     return true;
   }
@@ -899,13 +1050,14 @@ class LBWalberlaImpl : public LBWalberlaBase {
       auto const &block = *(lattice.get_blocks()->begin());
       auto const pdf_field = block.template getData<PdfField>(m_pdf_field_id);
       auto const values = lbm::accessor::Population::get(pdf_field, *ci);
+      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
+      assert(values.size() == stencil_size() * ci->numCells());
       if constexpr (std::is_same_v<typename decltype(values)::value_type,
                                    double>) {
         out = std::move(values);
       } else {
         out = std::vector<double>(values.begin(), values.end());
       }
-      assert(out.size() == stencil_size() * ci->numCells());
     }
     return out;
   }
@@ -917,9 +1069,14 @@ class LBWalberlaImpl : public LBWalberlaBase {
       auto const &lattice = get_lattice();
       auto &block = *(lattice.get_blocks()->begin());
       auto pdf_field = block.template getData<PdfField>(m_pdf_field_id);
+      auto force_field =
+          block.template getData<VectorField>(m_last_applied_force_field_id);
+      auto vel_field = block.template getData<VectorField>(m_velocity_field_id);
       assert(population.size() == stencil_size() * ci->numCells());
+      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
       std::vector<FloatType> const values(population.begin(), population.end());
-      lbm::accessor::Population::set(pdf_field, values, *ci);
+      lbm::accessor::Population::set(pdf_field, vel_field, force_field, values,
+                                     *ci);
     }
   }
 
@@ -957,13 +1114,14 @@ class LBWalberlaImpl : public LBWalberlaBase {
       auto const &block = *(lattice.get_blocks()->begin());
       auto const pdf_field = block.template getData<PdfField>(m_pdf_field_id);
       auto const values = lbm::accessor::Density::get(pdf_field, *ci);
+      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
+      assert(values.size() == ci->numCells());
       if constexpr (std::is_same_v<typename decltype(values)::value_type,
                                    double>) {
         out = std::move(values);
       } else {
         out = std::vector<double>(values.begin(), values.end());
       }
-      assert(out.size() == ci->numCells());
     }
     return out;
   }
@@ -976,6 +1134,7 @@ class LBWalberlaImpl : public LBWalberlaBase {
       auto &block = *(lattice.get_blocks()->begin());
       auto pdf_field = block.template getData<PdfField>(m_pdf_field_id);
       assert(density.size() == ci->numCells());
+      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
       std::vector<FloatType> const values(density.begin(), density.end());
       lbm::accessor::Density::set(pdf_field, values, *ci);
     }
@@ -1149,23 +1308,18 @@ class LBWalberlaImpl : public LBWalberlaBase {
       auto const &lattice = get_lattice();
       auto const &block = *(lattice.get_blocks()->begin());
       auto const pdf_field = block.template getData<PdfField>(m_pdf_field_id);
-      auto const lower_cell = ci->min();
-      auto const upper_cell = ci->max();
-      auto const n_values = 9u * ci->numCells();
-      out.reserve(n_values);
-      for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
-        for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
-          for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
-            auto const cell = Cell{x, y, z};
-            auto tensor = lbm::accessor::PressureTensor::get(pdf_field, cell);
-            pressure_tensor_correction(tensor);
-            for (auto i = 0u; i < 9u; ++i) {
-              out.emplace_back(tensor[i]);
-            }
-          }
-        }
+      auto values = lbm::accessor::PressureTensor::get(pdf_field, *ci);
+      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
+      assert(values.size() == 9u * ci->numCells());
+      for (auto it = values.begin(); it != values.end(); std::advance(it, 9l)) {
+        pressure_tensor_correction(std::span<FloatType, 9ul>(it, 9ul));
+      }
+      if constexpr (std::is_same_v<typename decltype(values)::value_type,
+                                   double>) {
+        out = std::move(values);
+      } else {
+        out = std::vector<double>(values.begin(), values.end());
       }
-      assert(out.size() == n_values);
     }
     return out;
   }
@@ -1243,9 +1397,13 @@ class LBWalberlaImpl : public LBWalberlaBase {
   }
 
   void register_vtk_field_filters(walberla::vtk::VTKOutput &vtk_obj) override {
-    field::FlagFieldCellFilter<FlagField> fluid_filter(m_flag_field_id);
-    fluid_filter.addFlag(Boundary_flag);
-    vtk_obj.addCellExclusionFilter(fluid_filter);
+    if constexpr (Architecture == lbmpy::Arch::GPU) {
+      throw std::runtime_error("VTK output not supported for GPU");
+    } else {
+      field::FlagFieldCellFilter<FlagField> fluid_filter(m_flag_field_id);
+      fluid_filter.addFlag(Boundary_flag); // filter matches boundary cells
+      vtk_obj.addCellExclusionFilter(fluid_filter); // ...which are left out
+    }
   }
 
 protected:
@@ -1272,7 +1430,9 @@ class LBWalberlaImpl : public LBWalberlaBase {
   template <typename OutputType = float>
   class DensityVTKWriter : public VTKWriter<PdfField, 1u, OutputType> {
   public:
-    using VTKWriter<PdfField, 1u, OutputType>::VTKWriter;
+    using Base = VTKWriter<PdfField, 1u, OutputType>;
+    using Base::Base;
+    using Base::evaluate;
 
   protected:
     OutputType evaluate(cell_idx_t const x, cell_idx_t const y,
@@ -1287,7 +1447,9 @@ class LBWalberlaImpl : public LBWalberlaBase {
   template <typename OutputType = float>
   class VelocityVTKWriter : public VTKWriter<VectorField, 3u, OutputType> {
   public:
-    using VTKWriter<VectorField, 3u, OutputType>::VTKWriter;
+    using Base = VTKWriter<VectorField, 3u, OutputType>;
+    using Base::Base;
+    using Base::evaluate;
 
   protected:
     OutputType evaluate(cell_idx_t const x, cell_idx_t const y,
@@ -1302,11 +1464,14 @@ class LBWalberlaImpl : public LBWalberlaBase {
   template <typename OutputType = float>
   class PressureTensorVTKWriter : public VTKWriter<PdfField, 9u, OutputType> {
   public:
+    using Base = VTKWriter<PdfField, 9u, OutputType>;
+    using Base::Base;
+    using Base::evaluate;
+
     PressureTensorVTKWriter(ConstBlockDataID const &block_id,
                             std::string const &id, FloatType unit_conversion,
                             FloatType off_diag_factor)
-        : VTKWriter<PdfField, 9u, OutputType>::VTKWriter(block_id, id,
-                                                         unit_conversion),
+        : Base(block_id, id, unit_conversion),
           m_off_diag_factor(off_diag_factor) {}
 
   protected:
@@ -1327,21 +1492,26 @@ class LBWalberlaImpl : public LBWalberlaBase {
   void register_vtk_field_writers(walberla::vtk::VTKOutput &vtk_obj,
                                   LatticeModel::units_map const &units,
                                   int flag_observables) override {
-    if (flag_observables & static_cast<int>(OutputVTK::density)) {
-      auto const unit_conversion = FloatType_c(units.at("density"));
-      vtk_obj.addCellDataWriter(make_shared<DensityVTKWriter<float>>(
-          m_pdf_field_id, "density", unit_conversion));
-    }
-    if (flag_observables & static_cast<int>(OutputVTK::velocity_vector)) {
-      auto const unit_conversion = FloatType_c(units.at("velocity"));
-      vtk_obj.addCellDataWriter(make_shared<VelocityVTKWriter<float>>(
-          m_velocity_field_id, "velocity_vector", unit_conversion));
-    }
-    if (flag_observables & static_cast<int>(OutputVTK::pressure_tensor)) {
-      auto const unit_conversion = FloatType_c(units.at("pressure"));
-      vtk_obj.addCellDataWriter(make_shared<PressureTensorVTKWriter<float>>(
-          m_pdf_field_id, "pressure_tensor", unit_conversion,
-          pressure_tensor_correction_factor()));
+    if constexpr (Architecture == lbmpy::Arch::GPU) {
+      throw std::runtime_error("VTK output not supported for GPU");
+    } else {
+      if (flag_observables & static_cast<int>(OutputVTK::density)) {
+        auto const unit_conversion = FloatType_c(units.at("density"));
+        vtk_obj.addCellDataWriter(std::make_shared<DensityVTKWriter<float>>(
+            m_pdf_field_id, "density", unit_conversion));
+      }
+      if (flag_observables & static_cast<int>(OutputVTK::velocity_vector)) {
+        auto const unit_conversion = FloatType_c(units.at("velocity"));
+        vtk_obj.addCellDataWriter(std::make_shared<VelocityVTKWriter<float>>(
+            m_velocity_field_id, "velocity_vector", unit_conversion));
+      }
+      if (flag_observables & static_cast<int>(OutputVTK::pressure_tensor)) {
+        auto const unit_conversion = FloatType_c(units.at("pressure"));
+        vtk_obj.addCellDataWriter(
+            std::make_shared<PressureTensorVTKWriter<float>>(
+                m_pdf_field_id, "pressure_tensor", unit_conversion,
+                pressure_tensor_correction_factor()));
+      }
     }
   }
 
diff --git a/src/walberla_bridge/src/lattice_boltzmann/ResetForce.hpp b/src/walberla_bridge/src/lattice_boltzmann/ResetForce.hpp
index ce7d19295ac..d14f846ac54 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/ResetForce.hpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/ResetForce.hpp
@@ -21,6 +21,10 @@
 
 #include "generated_kernels/FieldAccessorsDoublePrecision.h"
 #include "generated_kernels/FieldAccessorsSinglePrecision.h"
+#if defined(__CUDACC__)
+#include "generated_kernels/FieldAccessorsDoublePrecisionCUDA.cuh"
+#include "generated_kernels/FieldAccessorsSinglePrecisionCUDA.cuh"
+#endif
 
 #include "../utils/types_conversion.hpp"
 
@@ -60,8 +64,8 @@ template <typename PdfField, typename ForceField> class ResetForce {
     force_field->swapDataPointers(force_to_be_applied);
 
     lbm::accessor::Vector::add_to_all(force_field, m_ext_force);
-    lbm::accessor::Vector::broadcast(force_to_be_applied,
-                                     Vector3<FloatType>{0});
+    lbm::accessor::Vector::initialize(force_to_be_applied,
+                                      Vector3<FloatType>{0});
   }
 
 private:
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CMakeLists.txt b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CMakeLists.txt
index 7756ccd63a7..27c3d39749d 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CMakeLists.txt
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CMakeLists.txt
@@ -38,3 +38,19 @@ else()
             CollideSweepSinglePrecisionThermalized.cpp
             CollideSweepDoublePrecisionThermalized.cpp)
 endif()
+if(ESPRESSO_BUILD_WITH_CUDA AND WALBERLA_BUILD_WITH_CUDA) # both required
+  target_sources(
+    espresso_walberla_cuda
+    PRIVATE CollideSweepDoublePrecisionLeesEdwardsCUDA.cu
+            CollideSweepDoublePrecisionThermalizedCUDA.cu
+            CollideSweepSinglePrecisionLeesEdwardsCUDA.cu
+            CollideSweepSinglePrecisionThermalizedCUDA.cu
+            FieldAccessorsDoublePrecisionCUDA.cu
+            FieldAccessorsSinglePrecisionCUDA.cu
+            StreamSweepDoublePrecisionCUDA.cu
+            StreamSweepSinglePrecisionCUDA.cu
+            InitialPDFsSetterDoublePrecisionCUDA.cu
+            InitialPDFsSetterSinglePrecisionCUDA.cu
+            Dynamic_UBB_double_precisionCUDA.cu
+            Dynamic_UBB_single_precisionCUDA.cu)
+endif() # ESPRESSO_BUILD_WITH_CUDA AND WALBERLA_BUILD_WITH_CUDA
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwards.cpp b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwards.cpp
index 2814fff59d8..bb22b683b1c 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwards.cpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwards.cpp
@@ -14,11 +14,10 @@
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
 //! \\file CollideSweepDoublePrecisionLeesEdwards.cpp
-//! \\ingroup lbm
-//! \\author lbmpy
+//! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2, lbmpy_walberla/pystencils_walberla from waLBerla commit 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #include <cmath>
 
@@ -51,149 +50,105 @@ static FUNC_PREFIX void collidesweepdoubleprecisionleesedwards_collidesweepdoubl
   const double xi_0 = ((1.0) / (omega_shear * -0.25 + 2.0));
   const double rr_0 = xi_0 * (omega_shear * -2.0 + 4.0);
   for (int64_t ctr_2 = 0; ctr_2 < _size_force_2; ctr_2 += 1) {
-    double *RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_36 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_315 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_312 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_318 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2 * ctr_2;
-    double *RESTRICT _data_force_20_31 = _data_force + _stride_force_2 * ctr_2 + _stride_force_3;
-    double *RESTRICT _data_pdfs_20_316 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_313 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3;
-    double *RESTRICT _data_force_20_32 = _data_force + _stride_force_2 * ctr_2 + 2 * _stride_force_3;
-    double *RESTRICT _data_pdfs_20_314 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3;
-    double *RESTRICT _data_force_20_30 = _data_force + _stride_force_2 * ctr_2;
-    double *RESTRICT _data_pdfs_20_317 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_311 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_35 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3;
     for (int64_t ctr_1 = 0; ctr_1 < _size_force_1; ctr_1 += 1) {
-      double *RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_34;
-      double *RESTRICT _data_pdfs_20_36_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_36;
-      double *RESTRICT _data_pdfs_20_315_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_315;
-      double *RESTRICT _data_pdfs_20_310_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_310;
-      double *RESTRICT _data_pdfs_20_312_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_312;
-      double *RESTRICT _data_pdfs_20_318_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_318;
-      double *RESTRICT _data_pdfs_20_39_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_39;
-      double *RESTRICT _data_pdfs_20_31_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_31;
-      double *RESTRICT _data_pdfs_20_37_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_37;
-      double *RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_30;
-      double *RESTRICT _data_force_20_31_10 = _stride_force_1 * ctr_1 + _data_force_20_31;
-      double *RESTRICT _data_pdfs_20_316_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_316;
-      double *RESTRICT _data_pdfs_20_38_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_38;
-      double *RESTRICT _data_pdfs_20_313_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_313;
-      double *RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_33;
-      double *RESTRICT _data_force_20_32_10 = _stride_force_1 * ctr_1 + _data_force_20_32;
-      double *RESTRICT _data_pdfs_20_314_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_314;
-      double *RESTRICT _data_force_20_30_10 = _stride_force_1 * ctr_1 + _data_force_20_30;
-      double *RESTRICT _data_pdfs_20_317_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_317;
-      double *RESTRICT _data_pdfs_20_311_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_311;
-      double *RESTRICT _data_pdfs_20_32_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_32;
-      double *RESTRICT _data_pdfs_20_35_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_35;
       for (int64_t ctr_0 = 0; ctr_0 < _size_force_0; ctr_0 += 1) {
-        const double xi_25 = _data_pdfs_20_34_10[_stride_pdfs_0 * ctr_0];
-        const double xi_26 = _data_pdfs_20_36_10[_stride_pdfs_0 * ctr_0];
-        const double xi_27 = _data_pdfs_20_315_10[_stride_pdfs_0 * ctr_0];
-        const double xi_28 = _data_pdfs_20_310_10[_stride_pdfs_0 * ctr_0];
-        const double xi_29 = _data_pdfs_20_312_10[_stride_pdfs_0 * ctr_0];
-        const double xi_30 = _data_pdfs_20_318_10[_stride_pdfs_0 * ctr_0];
-        const double xi_31 = _data_pdfs_20_39_10[_stride_pdfs_0 * ctr_0];
-        const double xi_32 = _data_pdfs_20_31_10[_stride_pdfs_0 * ctr_0];
-        const double xi_33 = _data_pdfs_20_37_10[_stride_pdfs_0 * ctr_0];
-        const double xi_34 = _data_pdfs_20_30_10[_stride_pdfs_0 * ctr_0];
-        const double xi_35 = _data_force_20_31_10[_stride_force_0 * ctr_0];
-        const double xi_36 = _data_pdfs_20_316_10[_stride_pdfs_0 * ctr_0];
-        const double xi_37 = _data_pdfs_20_38_10[_stride_pdfs_0 * ctr_0];
-        const double xi_38 = _data_pdfs_20_313_10[_stride_pdfs_0 * ctr_0];
-        const double xi_39 = _data_pdfs_20_33_10[_stride_pdfs_0 * ctr_0];
-        const double xi_40 = _data_force_20_32_10[_stride_force_0 * ctr_0];
-        const double xi_41 = _data_pdfs_20_314_10[_stride_pdfs_0 * ctr_0];
-        const double xi_42 = _data_force_20_30_10[_stride_force_0 * ctr_0];
-        const double xi_43 = _data_pdfs_20_317_10[_stride_pdfs_0 * ctr_0];
-        const double xi_44 = _data_pdfs_20_311_10[_stride_pdfs_0 * ctr_0];
-        const double xi_45 = _data_pdfs_20_32_10[_stride_pdfs_0 * ctr_0];
-        const double xi_46 = _data_pdfs_20_35_10[_stride_pdfs_0 * ctr_0];
-        const double xi_3 = xi_25;
-        const double xi_4 = xi_26;
-        const double xi_5 = xi_27;
-        const double xi_6 = xi_28;
-        const double xi_7 = xi_29;
-        const double xi_8 = xi_30;
-        const double xi_9 = xi_31;
-        const double xi_10 = xi_32;
-        const double xi_11 = xi_33;
-        const double xi_12 = xi_34;
-        const double xi_13 = xi_35;
-        const double xi_14 = xi_36;
-        const double xi_15 = xi_37;
-        const double xi_16 = xi_38;
-        const double xi_17 = xi_39;
-        const double xi_18 = xi_40;
-        const double xi_19 = xi_41;
-        const double xi_20 = xi_42;
-        const double xi_21 = xi_43;
-        const double xi_22 = xi_44;
-        const double xi_23 = xi_45;
-        const double xi_24 = xi_46;
-        const double vel0Term = xi_15 + xi_19 + xi_3 + xi_6 + xi_8;
-        const double vel1Term = xi_10 + xi_11 + xi_22 + xi_5;
-        const double vel2Term = xi_16 + xi_24 + xi_7;
-        const double rho = vel0Term + vel1Term + vel2Term + xi_12 + xi_14 + xi_17 + xi_21 + xi_23 + xi_4 + xi_9;
+        const double xi_25 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3];
+        const double xi_26 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3];
+        const double xi_27 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3];
+        const double xi_28 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3];
+        const double xi_29 = _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2];
+        const double xi_30 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3];
+        const double xi_31 = _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3];
+        const double xi_32 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3];
+        const double xi_33 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3];
+        const double xi_34 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3];
+        const double xi_35 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3];
+        const double xi_36 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3];
+        const double xi_37 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3];
+        const double xi_38 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3];
+        const double xi_39 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3];
+        const double xi_40 = _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3];
+        const double xi_41 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3];
+        const double xi_42 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3];
+        const double xi_43 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3];
+        const double xi_44 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3];
+        const double xi_45 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2];
+        const double xi_46 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3];
+        const double xi_3 = xi_40;
+        const double xi_4 = xi_44;
+        const double xi_5 = xi_37;
+        const double xi_6 = xi_35;
+        const double xi_7 = xi_46;
+        const double xi_8 = xi_43;
+        const double xi_9 = xi_28;
+        const double xi_10 = xi_30;
+        const double xi_11 = xi_41;
+        const double xi_12 = xi_29;
+        const double xi_13 = xi_45;
+        const double xi_14 = xi_38;
+        const double xi_15 = xi_42;
+        const double xi_16 = xi_39;
+        const double xi_17 = xi_34;
+        const double xi_18 = xi_36;
+        const double xi_19 = xi_27;
+        const double xi_20 = xi_31;
+        const double xi_21 = xi_32;
+        const double xi_22 = xi_26;
+        const double xi_23 = xi_33;
+        const double xi_24 = xi_25;
+        const double vel0Term = xi_10 + xi_19 + xi_21 + xi_22 + xi_9;
+        const double vel1Term = xi_18 + xi_24 + xi_4 + xi_6;
+        const double vel2Term = xi_16 + xi_23 + xi_7;
+        const double rho = vel0Term + vel1Term + vel2Term + xi_11 + xi_13 + xi_14 + xi_15 + xi_17 + xi_5 + xi_8;
         const double xi_1 = ((1.0) / (rho));
-        const double u_0 = xi_1 * xi_20 * 0.5 + xi_1 * (vel0Term + xi_11 * -1.0 + xi_16 * -1.0 + xi_17 * -1.0 + xi_21 * -1.0 + xi_9 * -1.0);
-        const double u_1 = xi_1 * xi_13 * 0.5 + xi_1 * (vel1Term + xi_14 * -1.0 + xi_15 + xi_23 * -1.0 + xi_6 * -1.0 + xi_7 * -1.0 + xi_9 * -1.0);
-        const double u_2 = xi_1 * xi_18 * 0.5 + xi_1 * (vel2Term + xi_14 * -1.0 + xi_19 + xi_21 * -1.0 + xi_22 + xi_4 * -1.0 + xi_5 * -1.0 + xi_8 * -1.0);
-        const double forceTerm_0 = omega_shear * u_0 * xi_20 * 0.5 + omega_shear * u_1 * xi_13 * 0.5 + omega_shear * u_2 * xi_18 * 0.5 + u_0 * xi_20 * -1.0 + u_1 * xi_13 * -1.0 + u_2 * xi_18 * -1.0;
-        const double forceTerm_1 = omega_shear * u_0 * xi_20 * 0.083333333333333329 + omega_shear * u_1 * xi_13 * -0.16666666666666666 + omega_shear * u_2 * xi_18 * 0.083333333333333329 + rr_0 * xi_13 * -0.083333333333333329 + u_0 * xi_20 * -0.16666666666666666 + u_1 * xi_13 * 0.33333333333333331 + u_2 * xi_18 * -0.16666666666666666 + xi_13 * 0.16666666666666666;
-        const double forceTerm_2 = omega_shear * u_0 * xi_20 * 0.083333333333333329 + omega_shear * u_1 * xi_13 * -0.16666666666666666 + omega_shear * u_2 * xi_18 * 0.083333333333333329 + rr_0 * xi_13 * 0.083333333333333329 + u_0 * xi_20 * -0.16666666666666666 + u_1 * xi_13 * 0.33333333333333331 + u_2 * xi_18 * -0.16666666666666666 + xi_13 * -0.16666666666666666;
-        const double forceTerm_3 = omega_shear * u_0 * xi_20 * -0.16666666666666666 + omega_shear * u_1 * xi_13 * 0.083333333333333329 + omega_shear * u_2 * xi_18 * 0.083333333333333329 + rr_0 * xi_20 * 0.083333333333333329 + u_0 * xi_20 * 0.33333333333333331 + u_1 * xi_13 * -0.16666666666666666 + u_2 * xi_18 * -0.16666666666666666 + xi_20 * -0.16666666666666666;
-        const double forceTerm_4 = omega_shear * u_0 * xi_20 * -0.16666666666666666 + omega_shear * u_1 * xi_13 * 0.083333333333333329 + omega_shear * u_2 * xi_18 * 0.083333333333333329 + rr_0 * xi_20 * -0.083333333333333329 + u_0 * xi_20 * 0.33333333333333331 + u_1 * xi_13 * -0.16666666666666666 + u_2 * xi_18 * -0.16666666666666666 + xi_20 * 0.16666666666666666;
-        const double forceTerm_5 = omega_shear * u_0 * xi_20 * 0.083333333333333329 + omega_shear * u_1 * xi_13 * 0.083333333333333329 + omega_shear * u_2 * xi_18 * -0.16666666666666666 + rr_0 * xi_18 * -0.083333333333333329 + u_0 * xi_20 * -0.16666666666666666 + u_1 * xi_13 * -0.16666666666666666 + u_2 * xi_18 * 0.33333333333333331 + xi_18 * 0.16666666666666666;
-        const double forceTerm_6 = omega_shear * u_0 * xi_20 * 0.083333333333333329 + omega_shear * u_1 * xi_13 * 0.083333333333333329 + omega_shear * u_2 * xi_18 * -0.16666666666666666 + rr_0 * xi_18 * 0.083333333333333329 + u_0 * xi_20 * -0.16666666666666666 + u_1 * xi_13 * -0.16666666666666666 + u_2 * xi_18 * 0.33333333333333331 + xi_18 * -0.16666666666666666;
-        const double forceTerm_7 = omega_shear * u_0 * xi_13 * 0.125 + omega_shear * u_0 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_13 * -0.083333333333333329 + omega_shear * u_1 * xi_20 * 0.125 + omega_shear * u_2 * xi_18 * 0.041666666666666664 + rr_0 * xi_13 * -0.041666666666666664 + rr_0 * xi_20 * 0.041666666666666664 + u_0 * xi_13 * -0.25 + u_0 * xi_20 * 0.16666666666666666 + u_1 * xi_13 * 0.16666666666666666 + u_1 * xi_20 * -0.25 + u_2 * xi_18 * -0.083333333333333329 + xi_13 * 0.083333333333333329 + xi_20 * -0.083333333333333329;
-        const double forceTerm_8 = omega_shear * u_0 * xi_13 * -0.125 + omega_shear * u_0 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_13 * -0.083333333333333329 + omega_shear * u_1 * xi_20 * -0.125 + omega_shear * u_2 * xi_18 * 0.041666666666666664 + rr_0 * xi_13 * -0.041666666666666664 + rr_0 * xi_20 * -0.041666666666666664 + u_0 * xi_13 * 0.25 + u_0 * xi_20 * 0.16666666666666666 + u_1 * xi_13 * 0.16666666666666666 + u_1 * xi_20 * 0.25 + u_2 * xi_18 * -0.083333333333333329 + xi_13 * 0.083333333333333329 + xi_20 * 0.083333333333333329;
-        const double forceTerm_9 = omega_shear * u_0 * xi_13 * -0.125 + omega_shear * u_0 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_13 * -0.083333333333333329 + omega_shear * u_1 * xi_20 * -0.125 + omega_shear * u_2 * xi_18 * 0.041666666666666664 + rr_0 * xi_13 * 0.041666666666666664 + rr_0 * xi_20 * 0.041666666666666664 + u_0 * xi_13 * 0.25 + u_0 * xi_20 * 0.16666666666666666 + u_1 * xi_13 * 0.16666666666666666 + u_1 * xi_20 * 0.25 + u_2 * xi_18 * -0.083333333333333329 + xi_13 * -0.083333333333333329 + xi_20 * -0.083333333333333329;
-        const double forceTerm_10 = omega_shear * u_0 * xi_13 * 0.125 + omega_shear * u_0 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_13 * -0.083333333333333329 + omega_shear * u_1 * xi_20 * 0.125 + omega_shear * u_2 * xi_18 * 0.041666666666666664 + rr_0 * xi_13 * 0.041666666666666664 + rr_0 * xi_20 * -0.041666666666666664 + u_0 * xi_13 * -0.25 + u_0 * xi_20 * 0.16666666666666666 + u_1 * xi_13 * 0.16666666666666666 + u_1 * xi_20 * -0.25 + u_2 * xi_18 * -0.083333333333333329 + xi_13 * -0.083333333333333329 + xi_20 * 0.083333333333333329;
-        const double forceTerm_11 = omega_shear * u_0 * xi_20 * 0.041666666666666664 + omega_shear * u_1 * xi_13 * -0.083333333333333329 + omega_shear * u_1 * xi_18 * -0.125 + omega_shear * u_2 * xi_13 * -0.125 + omega_shear * u_2 * xi_18 * -0.083333333333333329 + rr_0 * xi_13 * -0.041666666666666664 + rr_0 * xi_18 * -0.041666666666666664 + u_0 * xi_20 * -0.083333333333333329 + u_1 * xi_13 * 0.16666666666666666 + u_1 * xi_18 * 0.25 + u_2 * xi_13 * 0.25 + u_2 * xi_18 * 0.16666666666666666 + xi_13 * 0.083333333333333329 + xi_18 * 0.083333333333333329;
-        const double forceTerm_12 = omega_shear * u_0 * xi_20 * 0.041666666666666664 + omega_shear * u_1 * xi_13 * -0.083333333333333329 + omega_shear * u_1 * xi_18 * 0.125 + omega_shear * u_2 * xi_13 * 0.125 + omega_shear * u_2 * xi_18 * -0.083333333333333329 + rr_0 * xi_13 * 0.041666666666666664 + rr_0 * xi_18 * -0.041666666666666664 + u_0 * xi_20 * -0.083333333333333329 + u_1 * xi_13 * 0.16666666666666666 + u_1 * xi_18 * -0.25 + u_2 * xi_13 * -0.25 + u_2 * xi_18 * 0.16666666666666666 + xi_13 * -0.083333333333333329 + xi_18 * 0.083333333333333329;
-        const double forceTerm_13 = omega_shear * u_0 * xi_18 * 0.125 + omega_shear * u_0 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_13 * 0.041666666666666664 + omega_shear * u_2 * xi_18 * -0.083333333333333329 + omega_shear * u_2 * xi_20 * 0.125 + rr_0 * xi_18 * -0.041666666666666664 + rr_0 * xi_20 * 0.041666666666666664 + u_0 * xi_18 * -0.25 + u_0 * xi_20 * 0.16666666666666666 + u_1 * xi_13 * -0.083333333333333329 + u_2 * xi_18 * 0.16666666666666666 + u_2 * xi_20 * -0.25 + xi_18 * 0.083333333333333329 + xi_20 * -0.083333333333333329;
-        const double forceTerm_14 = omega_shear * u_0 * xi_18 * -0.125 + omega_shear * u_0 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_13 * 0.041666666666666664 + omega_shear * u_2 * xi_18 * -0.083333333333333329 + omega_shear * u_2 * xi_20 * -0.125 + rr_0 * xi_18 * -0.041666666666666664 + rr_0 * xi_20 * -0.041666666666666664 + u_0 * xi_18 * 0.25 + u_0 * xi_20 * 0.16666666666666666 + u_1 * xi_13 * -0.083333333333333329 + u_2 * xi_18 * 0.16666666666666666 + u_2 * xi_20 * 0.25 + xi_18 * 0.083333333333333329 + xi_20 * 0.083333333333333329;
-        const double forceTerm_15 = omega_shear * u_0 * xi_20 * 0.041666666666666664 + omega_shear * u_1 * xi_13 * -0.083333333333333329 + omega_shear * u_1 * xi_18 * 0.125 + omega_shear * u_2 * xi_13 * 0.125 + omega_shear * u_2 * xi_18 * -0.083333333333333329 + rr_0 * xi_13 * -0.041666666666666664 + rr_0 * xi_18 * 0.041666666666666664 + u_0 * xi_20 * -0.083333333333333329 + u_1 * xi_13 * 0.16666666666666666 + u_1 * xi_18 * -0.25 + u_2 * xi_13 * -0.25 + u_2 * xi_18 * 0.16666666666666666 + xi_13 * 0.083333333333333329 + xi_18 * -0.083333333333333329;
-        const double forceTerm_16 = omega_shear * u_0 * xi_20 * 0.041666666666666664 + omega_shear * u_1 * xi_13 * -0.083333333333333329 + omega_shear * u_1 * xi_18 * -0.125 + omega_shear * u_2 * xi_13 * -0.125 + omega_shear * u_2 * xi_18 * -0.083333333333333329 + rr_0 * xi_13 * 0.041666666666666664 + rr_0 * xi_18 * 0.041666666666666664 + u_0 * xi_20 * -0.083333333333333329 + u_1 * xi_13 * 0.16666666666666666 + u_1 * xi_18 * 0.25 + u_2 * xi_13 * 0.25 + u_2 * xi_18 * 0.16666666666666666 + xi_13 * -0.083333333333333329 + xi_18 * -0.083333333333333329;
-        const double forceTerm_17 = omega_shear * u_0 * xi_18 * -0.125 + omega_shear * u_0 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_13 * 0.041666666666666664 + omega_shear * u_2 * xi_18 * -0.083333333333333329 + omega_shear * u_2 * xi_20 * -0.125 + rr_0 * xi_18 * 0.041666666666666664 + rr_0 * xi_20 * 0.041666666666666664 + u_0 * xi_18 * 0.25 + u_0 * xi_20 * 0.16666666666666666 + u_1 * xi_13 * -0.083333333333333329 + u_2 * xi_18 * 0.16666666666666666 + u_2 * xi_20 * 0.25 + xi_18 * -0.083333333333333329 + xi_20 * -0.083333333333333329;
-        const double forceTerm_18 = omega_shear * u_0 * xi_18 * 0.125 + omega_shear * u_0 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_13 * 0.041666666666666664 + omega_shear * u_2 * xi_18 * -0.083333333333333329 + omega_shear * u_2 * xi_20 * 0.125 + rr_0 * xi_18 * 0.041666666666666664 + rr_0 * xi_20 * -0.041666666666666664 + u_0 * xi_18 * -0.25 + u_0 * xi_20 * 0.16666666666666666 + u_1 * xi_13 * -0.083333333333333329 + u_2 * xi_18 * 0.16666666666666666 + u_2 * xi_20 * -0.25 + xi_18 * -0.083333333333333329 + xi_20 * 0.083333333333333329;
-        const double u0Mu1 = u_0 + u_1 * -1.0;
+        const double u_0 = xi_1 * xi_12 * 0.5 + xi_1 * (vel0Term - xi_14 - xi_15 - xi_23 - xi_5 - xi_6);
+        const double u_1 = xi_1 * xi_20 * 0.5 + xi_1 * (vel1Term - xi_11 - xi_14 - xi_17 + xi_22 - xi_7 - xi_9);
+        const double u_2 = xi_1 * xi_3 * 0.5 + xi_1 * (vel2Term - xi_15 - xi_17 - xi_18 - xi_19 + xi_21 + xi_4 - xi_8);
+        const double forceTerm_0 = omega_shear * u_0 * xi_12 * 0.5 + omega_shear * u_1 * xi_20 * 0.5 + omega_shear * u_2 * xi_3 * 0.5 - u_0 * xi_12 - u_1 * xi_20 - u_2 * xi_3;
+        const double forceTerm_1 = omega_shear * u_0 * xi_12 * 0.083333333333333329 + omega_shear * u_1 * xi_20 * -0.16666666666666666 + omega_shear * u_2 * xi_3 * 0.083333333333333329 + rr_0 * xi_20 * -0.083333333333333329 + u_0 * xi_12 * -0.16666666666666666 + u_1 * xi_20 * 0.33333333333333331 + u_2 * xi_3 * -0.16666666666666666 + xi_20 * 0.16666666666666666;
+        const double forceTerm_2 = omega_shear * u_0 * xi_12 * 0.083333333333333329 + omega_shear * u_1 * xi_20 * -0.16666666666666666 + omega_shear * u_2 * xi_3 * 0.083333333333333329 + rr_0 * xi_20 * 0.083333333333333329 + u_0 * xi_12 * -0.16666666666666666 + u_1 * xi_20 * 0.33333333333333331 + u_2 * xi_3 * -0.16666666666666666 + xi_20 * -0.16666666666666666;
+        const double forceTerm_3 = omega_shear * u_0 * xi_12 * -0.16666666666666666 + omega_shear * u_1 * xi_20 * 0.083333333333333329 + omega_shear * u_2 * xi_3 * 0.083333333333333329 + rr_0 * xi_12 * 0.083333333333333329 + u_0 * xi_12 * 0.33333333333333331 + u_1 * xi_20 * -0.16666666666666666 + u_2 * xi_3 * -0.16666666666666666 + xi_12 * -0.16666666666666666;
+        const double forceTerm_4 = omega_shear * u_0 * xi_12 * -0.16666666666666666 + omega_shear * u_1 * xi_20 * 0.083333333333333329 + omega_shear * u_2 * xi_3 * 0.083333333333333329 + rr_0 * xi_12 * -0.083333333333333329 + u_0 * xi_12 * 0.33333333333333331 + u_1 * xi_20 * -0.16666666666666666 + u_2 * xi_3 * -0.16666666666666666 + xi_12 * 0.16666666666666666;
+        const double forceTerm_5 = omega_shear * u_0 * xi_12 * 0.083333333333333329 + omega_shear * u_1 * xi_20 * 0.083333333333333329 + omega_shear * u_2 * xi_3 * -0.16666666666666666 + rr_0 * xi_3 * -0.083333333333333329 + u_0 * xi_12 * -0.16666666666666666 + u_1 * xi_20 * -0.16666666666666666 + u_2 * xi_3 * 0.33333333333333331 + xi_3 * 0.16666666666666666;
+        const double forceTerm_6 = omega_shear * u_0 * xi_12 * 0.083333333333333329 + omega_shear * u_1 * xi_20 * 0.083333333333333329 + omega_shear * u_2 * xi_3 * -0.16666666666666666 + rr_0 * xi_3 * 0.083333333333333329 + u_0 * xi_12 * -0.16666666666666666 + u_1 * xi_20 * -0.16666666666666666 + u_2 * xi_3 * 0.33333333333333331 + xi_3 * -0.16666666666666666;
+        const double forceTerm_7 = omega_shear * u_0 * xi_12 * -0.083333333333333329 + omega_shear * u_0 * xi_20 * 0.125 + omega_shear * u_1 * xi_12 * 0.125 + omega_shear * u_1 * xi_20 * -0.083333333333333329 + omega_shear * u_2 * xi_3 * 0.041666666666666664 + rr_0 * xi_12 * 0.041666666666666664 + rr_0 * xi_20 * -0.041666666666666664 + u_0 * xi_12 * 0.16666666666666666 + u_0 * xi_20 * -0.25 + u_1 * xi_12 * -0.25 + u_1 * xi_20 * 0.16666666666666666 + u_2 * xi_3 * -0.083333333333333329 + xi_12 * -0.083333333333333329 + xi_20 * 0.083333333333333329;
+        const double forceTerm_8 = omega_shear * u_0 * xi_12 * -0.083333333333333329 + omega_shear * u_0 * xi_20 * -0.125 + omega_shear * u_1 * xi_12 * -0.125 + omega_shear * u_1 * xi_20 * -0.083333333333333329 + omega_shear * u_2 * xi_3 * 0.041666666666666664 + rr_0 * xi_12 * -0.041666666666666664 + rr_0 * xi_20 * -0.041666666666666664 + u_0 * xi_12 * 0.16666666666666666 + u_0 * xi_20 * 0.25 + u_1 * xi_12 * 0.25 + u_1 * xi_20 * 0.16666666666666666 + u_2 * xi_3 * -0.083333333333333329 + xi_12 * 0.083333333333333329 + xi_20 * 0.083333333333333329;
+        const double forceTerm_9 = omega_shear * u_0 * xi_12 * -0.083333333333333329 + omega_shear * u_0 * xi_20 * -0.125 + omega_shear * u_1 * xi_12 * -0.125 + omega_shear * u_1 * xi_20 * -0.083333333333333329 + omega_shear * u_2 * xi_3 * 0.041666666666666664 + rr_0 * xi_12 * 0.041666666666666664 + rr_0 * xi_20 * 0.041666666666666664 + u_0 * xi_12 * 0.16666666666666666 + u_0 * xi_20 * 0.25 + u_1 * xi_12 * 0.25 + u_1 * xi_20 * 0.16666666666666666 + u_2 * xi_3 * -0.083333333333333329 + xi_12 * -0.083333333333333329 + xi_20 * -0.083333333333333329;
+        const double forceTerm_10 = omega_shear * u_0 * xi_12 * -0.083333333333333329 + omega_shear * u_0 * xi_20 * 0.125 + omega_shear * u_1 * xi_12 * 0.125 + omega_shear * u_1 * xi_20 * -0.083333333333333329 + omega_shear * u_2 * xi_3 * 0.041666666666666664 + rr_0 * xi_12 * -0.041666666666666664 + rr_0 * xi_20 * 0.041666666666666664 + u_0 * xi_12 * 0.16666666666666666 + u_0 * xi_20 * -0.25 + u_1 * xi_12 * -0.25 + u_1 * xi_20 * 0.16666666666666666 + u_2 * xi_3 * -0.083333333333333329 + xi_12 * 0.083333333333333329 + xi_20 * -0.083333333333333329;
+        const double forceTerm_11 = omega_shear * u_0 * xi_12 * 0.041666666666666664 + omega_shear * u_1 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_3 * -0.125 + omega_shear * u_2 * xi_20 * -0.125 + omega_shear * u_2 * xi_3 * -0.083333333333333329 + rr_0 * xi_20 * -0.041666666666666664 + rr_0 * xi_3 * -0.041666666666666664 + u_0 * xi_12 * -0.083333333333333329 + u_1 * xi_20 * 0.16666666666666666 + u_1 * xi_3 * 0.25 + u_2 * xi_20 * 0.25 + u_2 * xi_3 * 0.16666666666666666 + xi_20 * 0.083333333333333329 + xi_3 * 0.083333333333333329;
+        const double forceTerm_12 = omega_shear * u_0 * xi_12 * 0.041666666666666664 + omega_shear * u_1 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_3 * 0.125 + omega_shear * u_2 * xi_20 * 0.125 + omega_shear * u_2 * xi_3 * -0.083333333333333329 + rr_0 * xi_20 * 0.041666666666666664 + rr_0 * xi_3 * -0.041666666666666664 + u_0 * xi_12 * -0.083333333333333329 + u_1 * xi_20 * 0.16666666666666666 + u_1 * xi_3 * -0.25 + u_2 * xi_20 * -0.25 + u_2 * xi_3 * 0.16666666666666666 + xi_20 * -0.083333333333333329 + xi_3 * 0.083333333333333329;
+        const double forceTerm_13 = omega_shear * u_0 * xi_12 * -0.083333333333333329 + omega_shear * u_0 * xi_3 * 0.125 + omega_shear * u_1 * xi_20 * 0.041666666666666664 + omega_shear * u_2 * xi_12 * 0.125 + omega_shear * u_2 * xi_3 * -0.083333333333333329 + rr_0 * xi_12 * 0.041666666666666664 + rr_0 * xi_3 * -0.041666666666666664 + u_0 * xi_12 * 0.16666666666666666 + u_0 * xi_3 * -0.25 + u_1 * xi_20 * -0.083333333333333329 + u_2 * xi_12 * -0.25 + u_2 * xi_3 * 0.16666666666666666 + xi_12 * -0.083333333333333329 + xi_3 * 0.083333333333333329;
+        const double forceTerm_14 = omega_shear * u_0 * xi_12 * -0.083333333333333329 + omega_shear * u_0 * xi_3 * -0.125 + omega_shear * u_1 * xi_20 * 0.041666666666666664 + omega_shear * u_2 * xi_12 * -0.125 + omega_shear * u_2 * xi_3 * -0.083333333333333329 + rr_0 * xi_12 * -0.041666666666666664 + rr_0 * xi_3 * -0.041666666666666664 + u_0 * xi_12 * 0.16666666666666666 + u_0 * xi_3 * 0.25 + u_1 * xi_20 * -0.083333333333333329 + u_2 * xi_12 * 0.25 + u_2 * xi_3 * 0.16666666666666666 + xi_12 * 0.083333333333333329 + xi_3 * 0.083333333333333329;
+        const double forceTerm_15 = omega_shear * u_0 * xi_12 * 0.041666666666666664 + omega_shear * u_1 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_3 * 0.125 + omega_shear * u_2 * xi_20 * 0.125 + omega_shear * u_2 * xi_3 * -0.083333333333333329 + rr_0 * xi_20 * -0.041666666666666664 + rr_0 * xi_3 * 0.041666666666666664 + u_0 * xi_12 * -0.083333333333333329 + u_1 * xi_20 * 0.16666666666666666 + u_1 * xi_3 * -0.25 + u_2 * xi_20 * -0.25 + u_2 * xi_3 * 0.16666666666666666 + xi_20 * 0.083333333333333329 + xi_3 * -0.083333333333333329;
+        const double forceTerm_16 = omega_shear * u_0 * xi_12 * 0.041666666666666664 + omega_shear * u_1 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_3 * -0.125 + omega_shear * u_2 * xi_20 * -0.125 + omega_shear * u_2 * xi_3 * -0.083333333333333329 + rr_0 * xi_20 * 0.041666666666666664 + rr_0 * xi_3 * 0.041666666666666664 + u_0 * xi_12 * -0.083333333333333329 + u_1 * xi_20 * 0.16666666666666666 + u_1 * xi_3 * 0.25 + u_2 * xi_20 * 0.25 + u_2 * xi_3 * 0.16666666666666666 + xi_20 * -0.083333333333333329 + xi_3 * -0.083333333333333329;
+        const double forceTerm_17 = omega_shear * u_0 * xi_12 * -0.083333333333333329 + omega_shear * u_0 * xi_3 * -0.125 + omega_shear * u_1 * xi_20 * 0.041666666666666664 + omega_shear * u_2 * xi_12 * -0.125 + omega_shear * u_2 * xi_3 * -0.083333333333333329 + rr_0 * xi_12 * 0.041666666666666664 + rr_0 * xi_3 * 0.041666666666666664 + u_0 * xi_12 * 0.16666666666666666 + u_0 * xi_3 * 0.25 + u_1 * xi_20 * -0.083333333333333329 + u_2 * xi_12 * 0.25 + u_2 * xi_3 * 0.16666666666666666 + xi_12 * -0.083333333333333329 + xi_3 * -0.083333333333333329;
+        const double forceTerm_18 = omega_shear * u_0 * xi_12 * -0.083333333333333329 + omega_shear * u_0 * xi_3 * 0.125 + omega_shear * u_1 * xi_20 * 0.041666666666666664 + omega_shear * u_2 * xi_12 * 0.125 + omega_shear * u_2 * xi_3 * -0.083333333333333329 + rr_0 * xi_12 * -0.041666666666666664 + rr_0 * xi_3 * 0.041666666666666664 + u_0 * xi_12 * 0.16666666666666666 + u_0 * xi_3 * -0.25 + u_1 * xi_20 * -0.083333333333333329 + u_2 * xi_12 * -0.25 + u_2 * xi_3 * 0.16666666666666666 + xi_12 * 0.083333333333333329 + xi_3 * -0.083333333333333329;
+        const double u0Mu1 = u_0 - u_1;
         const double u0Pu1 = u_0 + u_1;
         const double u1Pu2 = u_1 + u_2;
-        const double u1Mu2 = u_1 + u_2 * -1.0;
-        const double u0Mu2 = u_0 + u_2 * -1.0;
+        const double u1Mu2 = u_1 - u_2;
+        const double u0Mu2 = u_0 - u_2;
         const double u0Pu2 = u_0 + u_2;
-        const double f_eq_common = rho * -1.0 * (u_0 * u_0) + rho * -1.0 * (u_1 * u_1) + rho * -1.0 * (u_2 * u_2) + rho;
-        _data_pdfs_20_30_10[_stride_pdfs_0 * ctr_0] = forceTerm_0 + omega_shear * (f_eq_common * 0.33333333333333331 + xi_12 * -1.0) + xi_12;
-        _data_pdfs_20_31_10[_stride_pdfs_0 * ctr_0] = forceTerm_1 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * (u_1 * u_1)) + xi_10 * -0.5 + xi_23 * -0.5) + rr_0 * (rho * u_1 * 0.16666666666666666 + xi_10 * -0.5 + xi_23 * 0.5) + xi_10 + ((-1.0 <= grid_size * -1.0 + ((double)(ctr_1))) ? (rho * v_s * (u_0 * 2.0 + v_s) * 0.16666666666666666) : (0.0));
-        _data_pdfs_20_32_10[_stride_pdfs_0 * ctr_0] = forceTerm_2 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * (u_1 * u_1)) + xi_10 * -0.5 + xi_23 * -0.5) + rr_0 * (rho * u_1 * -0.16666666666666666 + xi_10 * 0.5 + xi_23 * -0.5) + xi_23 + ((0.0 >= ((double)(ctr_1))) ? (rho * v_s * (u_0 * -2.0 + v_s) * 0.16666666666666666) : (0.0));
-        _data_pdfs_20_33_10[_stride_pdfs_0 * ctr_0] = forceTerm_3 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * (u_0 * u_0)) + xi_17 * -0.5 + xi_3 * -0.5) + rr_0 * (rho * u_0 * -0.16666666666666666 + xi_17 * -0.5 + xi_3 * 0.5) + xi_17;
-        _data_pdfs_20_34_10[_stride_pdfs_0 * ctr_0] = forceTerm_4 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * (u_0 * u_0)) + xi_17 * -0.5 + xi_3 * -0.5) + rr_0 * (rho * u_0 * 0.16666666666666666 + xi_17 * 0.5 + xi_3 * -0.5) + xi_3;
-        _data_pdfs_20_35_10[_stride_pdfs_0 * ctr_0] = forceTerm_5 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * (u_2 * u_2)) + xi_24 * -0.5 + xi_4 * -0.5) + rr_0 * (rho * u_2 * 0.16666666666666666 + xi_24 * -0.5 + xi_4 * 0.5) + xi_24;
-        _data_pdfs_20_36_10[_stride_pdfs_0 * ctr_0] = forceTerm_6 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * (u_2 * u_2)) + xi_24 * -0.5 + xi_4 * -0.5) + rr_0 * (rho * u_2 * -0.16666666666666666 + xi_24 * 0.5 + xi_4 * -0.5) + xi_4;
-        _data_pdfs_20_37_10[_stride_pdfs_0 * ctr_0] = forceTerm_7 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_2 * u_2) + 0.125 * (u0Mu1 * u0Mu1)) + xi_11 * -0.5 + xi_6 * -0.5) + rr_0 * (rho * u0Mu1 * -0.083333333333333329 + xi_11 * -0.5 + xi_6 * 0.5) + xi_11 + ((-1.0 <= grid_size * -1.0 + ((double)(ctr_1))) ? (rho * v_s * (u_0 * -2.0 + u_1 * 3.0 + v_s * -1.0 + 1.0) * 0.083333333333333329) : (0.0));
-        _data_pdfs_20_38_10[_stride_pdfs_0 * ctr_0] = forceTerm_8 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_2 * u_2) + 0.125 * (u0Pu1 * u0Pu1)) + xi_15 * -0.5 + xi_9 * -0.5) + rr_0 * (rho * u0Pu1 * 0.083333333333333329 + xi_15 * -0.5 + xi_9 * 0.5) + xi_15 + ((-1.0 <= grid_size * -1.0 + ((double)(ctr_1))) ? (rho * v_s * (u_0 * 2.0 + u_1 * 3.0 + v_s + 1.0) * -0.083333333333333329) : (0.0));
-        _data_pdfs_20_39_10[_stride_pdfs_0 * ctr_0] = forceTerm_9 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_2 * u_2) + 0.125 * (u0Pu1 * u0Pu1)) + xi_15 * -0.5 + xi_9 * -0.5) + rr_0 * (rho * u0Pu1 * -0.083333333333333329 + xi_15 * 0.5 + xi_9 * -0.5) + xi_9 + ((0.0 >= ((double)(ctr_1))) ? (rho * v_s * (u_0 * 2.0 + u_1 * 3.0 + v_s * -1.0 - 1.0) * 0.083333333333333329) : (0.0));
-        _data_pdfs_20_310_10[_stride_pdfs_0 * ctr_0] = forceTerm_10 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_2 * u_2) + 0.125 * (u0Mu1 * u0Mu1)) + xi_11 * -0.5 + xi_6 * -0.5) + rr_0 * (rho * u0Mu1 * 0.083333333333333329 + xi_11 * 0.5 + xi_6 * -0.5) + xi_6 + ((0.0 >= ((double)(ctr_1))) ? (rho * v_s * (u_0 * 2.0 + u_1 * -3.0 + v_s * -1.0 + 1.0) * 0.083333333333333329) : (0.0));
-        _data_pdfs_20_311_10[_stride_pdfs_0 * ctr_0] = forceTerm_11 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_0 * u_0) + 0.125 * (u1Pu2 * u1Pu2)) + xi_14 * -0.5 + xi_22 * -0.5) + rr_0 * (rho * u1Pu2 * 0.083333333333333329 + xi_14 * 0.5 + xi_22 * -0.5) + xi_22;
-        _data_pdfs_20_312_10[_stride_pdfs_0 * ctr_0] = forceTerm_12 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_0 * u_0) + 0.125 * (u1Mu2 * u1Mu2)) + xi_5 * -0.5 + xi_7 * -0.5) + rr_0 * (rho * u1Mu2 * -0.083333333333333329 + xi_5 * 0.5 + xi_7 * -0.5) + xi_7;
-        _data_pdfs_20_313_10[_stride_pdfs_0 * ctr_0] = forceTerm_13 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_1 * u_1) + 0.125 * (u0Mu2 * u0Mu2)) + xi_16 * -0.5 + xi_8 * -0.5) + rr_0 * (rho * u0Mu2 * -0.083333333333333329 + xi_16 * -0.5 + xi_8 * 0.5) + xi_16;
-        _data_pdfs_20_314_10[_stride_pdfs_0 * ctr_0] = forceTerm_14 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_1 * u_1) + 0.125 * (u0Pu2 * u0Pu2)) + xi_19 * -0.5 + xi_21 * -0.5) + rr_0 * (rho * u0Pu2 * 0.083333333333333329 + xi_19 * -0.5 + xi_21 * 0.5) + xi_19;
-        _data_pdfs_20_315_10[_stride_pdfs_0 * ctr_0] = forceTerm_15 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_0 * u_0) + 0.125 * (u1Mu2 * u1Mu2)) + xi_5 * -0.5 + xi_7 * -0.5) + rr_0 * (rho * u1Mu2 * 0.083333333333333329 + xi_5 * -0.5 + xi_7 * 0.5) + xi_5;
-        _data_pdfs_20_316_10[_stride_pdfs_0 * ctr_0] = forceTerm_16 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_0 * u_0) + 0.125 * (u1Pu2 * u1Pu2)) + xi_14 * -0.5 + xi_22 * -0.5) + rr_0 * (rho * u1Pu2 * -0.083333333333333329 + xi_14 * -0.5 + xi_22 * 0.5) + xi_14;
-        _data_pdfs_20_317_10[_stride_pdfs_0 * ctr_0] = forceTerm_17 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_1 * u_1) + 0.125 * (u0Pu2 * u0Pu2)) + xi_19 * -0.5 + xi_21 * -0.5) + rr_0 * (rho * u0Pu2 * -0.083333333333333329 + xi_19 * 0.5 + xi_21 * -0.5) + xi_21;
-        _data_pdfs_20_318_10[_stride_pdfs_0 * ctr_0] = forceTerm_18 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_1 * u_1) + 0.125 * (u0Mu2 * u0Mu2)) + xi_16 * -0.5 + xi_8 * -0.5) + rr_0 * (rho * u0Mu2 * 0.083333333333333329 + xi_16 * 0.5 + xi_8 * -0.5) + xi_8;
+        const double f_eq_common = rho - rho * u_0 * u_0 - rho * u_1 * u_1 - rho * u_2 * u_2;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2] = forceTerm_0 + omega_shear * (f_eq_common * 0.33333333333333331 - xi_13) + xi_13;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3] = forceTerm_1 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * (u_1 * u_1)) + xi_11 * -0.5 + xi_24 * -0.5) + rr_0 * (rho * u_1 * 0.16666666666666666 + xi_11 * 0.5 + xi_24 * -0.5) + xi_24 + ((-1.0 <= -grid_size + ((double)(ctr_1))) ? (rho * v_s * (u_0 * 2.0 + v_s) * 0.16666666666666666) : (0.0));
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3] = forceTerm_2 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * (u_1 * u_1)) + xi_11 * -0.5 + xi_24 * -0.5) + rr_0 * (rho * u_1 * -0.16666666666666666 + xi_11 * -0.5 + xi_24 * 0.5) + xi_11 + ((0.0 >= ((double)(ctr_1))) ? (rho * v_s * (u_0 * -2.0 + v_s) * 0.16666666666666666) : (0.0));
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3] = forceTerm_3 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * (u_0 * u_0)) + xi_10 * -0.5 + xi_5 * -0.5) + rr_0 * (rho * u_0 * -0.16666666666666666 + xi_10 * 0.5 + xi_5 * -0.5) + xi_5;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3] = forceTerm_4 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * (u_0 * u_0)) + xi_10 * -0.5 + xi_5 * -0.5) + rr_0 * (rho * u_0 * 0.16666666666666666 + xi_10 * -0.5 + xi_5 * 0.5) + xi_10;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3] = forceTerm_5 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * (u_2 * u_2)) + xi_16 * -0.5 + xi_8 * -0.5) + rr_0 * (rho * u_2 * 0.16666666666666666 + xi_16 * -0.5 + xi_8 * 0.5) + xi_16;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3] = forceTerm_6 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * (u_2 * u_2)) + xi_16 * -0.5 + xi_8 * -0.5) + rr_0 * (rho * u_2 * -0.16666666666666666 + xi_16 * 0.5 + xi_8 * -0.5) + xi_8;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3] = forceTerm_7 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_2 * u_2) + 0.125 * (u0Mu1 * u0Mu1)) + xi_6 * -0.5 + xi_9 * -0.5) + rr_0 * (rho * u0Mu1 * -0.083333333333333329 + xi_6 * -0.5 + xi_9 * 0.5) + xi_6 + ((-1.0 <= -grid_size + ((double)(ctr_1))) ? (rho * v_s * (u_0 * -2.0 + u_1 * 3.0 - v_s + 1.0) * 0.083333333333333329) : (0.0));
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3] = forceTerm_8 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_2 * u_2) + 0.125 * (u0Pu1 * u0Pu1)) + xi_14 * -0.5 + xi_22 * -0.5) + rr_0 * (rho * u0Pu1 * 0.083333333333333329 + xi_14 * 0.5 + xi_22 * -0.5) + xi_22 + ((-1.0 <= -grid_size + ((double)(ctr_1))) ? (rho * v_s * (u_0 * 2.0 + u_1 * 3.0 + v_s + 1.0) * -0.083333333333333329) : (0.0));
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3] = forceTerm_9 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_2 * u_2) + 0.125 * (u0Pu1 * u0Pu1)) + xi_14 * -0.5 + xi_22 * -0.5) + rr_0 * (rho * u0Pu1 * -0.083333333333333329 + xi_14 * -0.5 + xi_22 * 0.5) + xi_14 + ((0.0 >= ((double)(ctr_1))) ? (rho * v_s * (u_0 * 2.0 + u_1 * 3.0 - v_s - 1.0) * 0.083333333333333329) : (0.0));
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3] = forceTerm_10 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_2 * u_2) + 0.125 * (u0Mu1 * u0Mu1)) + xi_6 * -0.5 + xi_9 * -0.5) + rr_0 * (rho * u0Mu1 * 0.083333333333333329 + xi_6 * 0.5 + xi_9 * -0.5) + xi_9 + ((0.0 >= ((double)(ctr_1))) ? (rho * v_s * (u_0 * 2.0 + u_1 * -3.0 - v_s + 1.0) * 0.083333333333333329) : (0.0));
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3] = forceTerm_11 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_0 * u_0) + 0.125 * (u1Pu2 * u1Pu2)) + xi_17 * -0.5 + xi_4 * -0.5) + rr_0 * (rho * u1Pu2 * 0.083333333333333329 + xi_17 * 0.5 + xi_4 * -0.5) + xi_4;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3] = forceTerm_12 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_0 * u_0) + 0.125 * (u1Mu2 * u1Mu2)) + xi_18 * -0.5 + xi_7 * -0.5) + rr_0 * (rho * u1Mu2 * -0.083333333333333329 + xi_18 * 0.5 + xi_7 * -0.5) + xi_7;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3] = forceTerm_13 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_1 * u_1) + 0.125 * (u0Mu2 * u0Mu2)) + xi_19 * -0.5 + xi_23 * -0.5) + rr_0 * (rho * u0Mu2 * -0.083333333333333329 + xi_19 * 0.5 + xi_23 * -0.5) + xi_23;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3] = forceTerm_14 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_1 * u_1) + 0.125 * (u0Pu2 * u0Pu2)) + xi_15 * -0.5 + xi_21 * -0.5) + rr_0 * (rho * u0Pu2 * 0.083333333333333329 + xi_15 * 0.5 + xi_21 * -0.5) + xi_21;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3] = forceTerm_15 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_0 * u_0) + 0.125 * (u1Mu2 * u1Mu2)) + xi_18 * -0.5 + xi_7 * -0.5) + rr_0 * (rho * u1Mu2 * 0.083333333333333329 + xi_18 * -0.5 + xi_7 * 0.5) + xi_18;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3] = forceTerm_16 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_0 * u_0) + 0.125 * (u1Pu2 * u1Pu2)) + xi_17 * -0.5 + xi_4 * -0.5) + rr_0 * (rho * u1Pu2 * -0.083333333333333329 + xi_17 * -0.5 + xi_4 * 0.5) + xi_17;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3] = forceTerm_17 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_1 * u_1) + 0.125 * (u0Pu2 * u0Pu2)) + xi_15 * -0.5 + xi_21 * -0.5) + rr_0 * (rho * u0Pu2 * -0.083333333333333329 + xi_15 * -0.5 + xi_21 * 0.5) + xi_15;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3] = forceTerm_18 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_1 * u_1) + 0.125 * (u0Mu2 * u0Mu2)) + xi_19 * -0.5 + xi_23 * -0.5) + rr_0 * (rho * u0Mu2 * 0.083333333333333329 + xi_19 * -0.5 + xi_23 * 0.5) + xi_19;
       }
     }
   }
@@ -201,27 +156,28 @@ static FUNC_PREFIX void collidesweepdoubleprecisionleesedwards_collidesweepdoubl
 } // namespace internal_607d8a5c7ac58c25acf09ad94bb82cf4
 
 void CollideSweepDoublePrecisionLeesEdwards::run(IBlock *block) {
+
   auto force = block->getData<field::GhostLayerField<double, 3>>(forceID);
   auto pdfs = block->getData<field::GhostLayerField<double, 19>>(pdfsID);
 
+  auto &grid_size = this->grid_size_;
   auto &v_s = this->v_s_;
   auto &omega_shear = this->omega_shear_;
-  auto &grid_size = this->grid_size_;
-  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()))
   double *RESTRICT const _data_force = force->dataAt(0, 0, 0, 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()))
   double *RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(force->xSize()) + 0));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(force->xSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(force->ySize()) + 0));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(force->ySize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(force->zSize()) + 0));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(force->zSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(force->xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(force->xSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(force->ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(force->ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(force->zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(force->zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
   const int64_t _stride_force_0 = int64_t(force->xStride());
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
@@ -234,6 +190,7 @@ void CollideSweepDoublePrecisionLeesEdwards::run(IBlock *block) {
 }
 
 void CollideSweepDoublePrecisionLeesEdwards::runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block) {
+
   CellInterval ci = globalCellInterval;
   CellInterval blockBB = blocks->getBlockCellBB(*block);
   blockBB.expand(ghostLayers);
@@ -245,28 +202,28 @@ void CollideSweepDoublePrecisionLeesEdwards::runOnCellInterval(const shared_ptr<
   auto force = block->getData<field::GhostLayerField<double, 3>>(forceID);
   auto pdfs = block->getData<field::GhostLayerField<double, 19>>(pdfsID);
 
+  auto &grid_size = this->grid_size_;
   auto &v_s = this->v_s_;
   auto &omega_shear = this->omega_shear_;
-  auto &grid_size = this->grid_size_;
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()))
   double *RESTRICT const _data_force = force->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()))
   double *RESTRICT _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(ci.xSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(ci.ySize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(ci.zSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(ci.xSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(ci.ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(ci.zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
   const int64_t _stride_force_0 = int64_t(force->xStride());
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
@@ -287,4 +244,4 @@ void CollideSweepDoublePrecisionLeesEdwards::runOnCellInterval(const shared_ptr<
 
 #if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
 #pragma warning pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwards.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwards.h
index aa33168b644..fb144478e13 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwards.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwards.h
@@ -17,12 +17,13 @@
 //! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2,
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
 // lbmpy_walberla/pystencils_walberla from waLBerla commit
-// 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #pragma once
 #include "core/DataTypes.h"
+#include "core/logging/Logging.h"
 
 #include "domain_decomposition/BlockDataID.h"
 #include "domain_decomposition/IBlock.h"
@@ -92,6 +93,9 @@ class CollideSweepDoublePrecisionLeesEdwards {
     };
   }
 
+  void configure(const shared_ptr<StructuredBlockStorage> &blocks,
+                 IBlock *block) {} // no-op: empty body, both arguments unused
+
   BlockDataID forceID;
   BlockDataID pdfsID;
   double grid_size_;
@@ -105,4 +109,4 @@ class CollideSweepDoublePrecisionLeesEdwards {
 #if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
     (defined WALBERLA_CXX_COMPILER_IS_CLANG)
 #pragma GCC diagnostic pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsAVX.cpp b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsAVX.cpp
index 5c339796387..874b255195f 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsAVX.cpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsAVX.cpp
@@ -14,11 +14,10 @@
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
 //! \\file CollideSweepDoublePrecisionLeesEdwardsAVX.cpp
-//! \\ingroup lbm
-//! \\author lbmpy
+//! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2, lbmpy_walberla/pystencils_walberla from waLBerla commit 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #include <cmath>
 
@@ -53,124 +52,80 @@ static FUNC_PREFIX void collidesweepdoubleprecisionleesedwardsavx_collidesweepdo
   const double xi_0 = ((1.0) / (omega_shear * -0.25 + 2.0));
   const double rr_0 = xi_0 * (omega_shear * -2.0 + 4.0);
   for (int64_t ctr_2 = 0; ctr_2 < _size_force_2; ctr_2 += 1) {
-    double *RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_36 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_315 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_312 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_318 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2 * ctr_2;
-    double *RESTRICT _data_force_20_31 = _data_force + _stride_force_2 * ctr_2 + _stride_force_3;
-    double *RESTRICT _data_pdfs_20_316 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_313 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3;
-    double *RESTRICT _data_force_20_32 = _data_force + _stride_force_2 * ctr_2 + 2 * _stride_force_3;
-    double *RESTRICT _data_pdfs_20_314 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3;
-    double *RESTRICT _data_force_20_30 = _data_force + _stride_force_2 * ctr_2;
-    double *RESTRICT _data_pdfs_20_317 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_311 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_35 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3;
     for (int64_t ctr_1 = 0; ctr_1 < _size_force_1; ctr_1 += 1) {
-      double *RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_34;
-      double *RESTRICT _data_pdfs_20_36_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_36;
-      double *RESTRICT _data_pdfs_20_315_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_315;
-      double *RESTRICT _data_pdfs_20_310_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_310;
-      double *RESTRICT _data_pdfs_20_312_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_312;
-      double *RESTRICT _data_pdfs_20_318_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_318;
-      double *RESTRICT _data_pdfs_20_39_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_39;
-      double *RESTRICT _data_pdfs_20_31_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_31;
-      double *RESTRICT _data_pdfs_20_37_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_37;
-      double *RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_30;
-      double *RESTRICT _data_force_20_31_10 = _stride_force_1 * ctr_1 + _data_force_20_31;
-      double *RESTRICT _data_pdfs_20_316_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_316;
-      double *RESTRICT _data_pdfs_20_38_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_38;
-      double *RESTRICT _data_pdfs_20_313_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_313;
-      double *RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_33;
-      double *RESTRICT _data_force_20_32_10 = _stride_force_1 * ctr_1 + _data_force_20_32;
-      double *RESTRICT _data_pdfs_20_314_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_314;
-      double *RESTRICT _data_force_20_30_10 = _stride_force_1 * ctr_1 + _data_force_20_30;
-      double *RESTRICT _data_pdfs_20_317_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_317;
-      double *RESTRICT _data_pdfs_20_311_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_311;
-      double *RESTRICT _data_pdfs_20_32_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_32;
-      double *RESTRICT _data_pdfs_20_35_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_35;
       {
         for (int64_t ctr_0 = 0; ctr_0 < (int64_t)((_size_force_0) / (4)) * (4); ctr_0 += 4) {
-          const __m256d xi_25 = _mm256_load_pd(&_data_pdfs_20_34_10[ctr_0]);
-          const __m256d xi_26 = _mm256_load_pd(&_data_pdfs_20_36_10[ctr_0]);
-          const __m256d xi_27 = _mm256_load_pd(&_data_pdfs_20_315_10[ctr_0]);
-          const __m256d xi_28 = _mm256_load_pd(&_data_pdfs_20_310_10[ctr_0]);
-          const __m256d xi_29 = _mm256_load_pd(&_data_pdfs_20_312_10[ctr_0]);
-          const __m256d xi_30 = _mm256_load_pd(&_data_pdfs_20_318_10[ctr_0]);
-          const __m256d xi_31 = _mm256_load_pd(&_data_pdfs_20_39_10[ctr_0]);
-          const __m256d xi_32 = _mm256_load_pd(&_data_pdfs_20_31_10[ctr_0]);
-          const __m256d xi_33 = _mm256_load_pd(&_data_pdfs_20_37_10[ctr_0]);
-          const __m256d xi_34 = _mm256_load_pd(&_data_pdfs_20_30_10[ctr_0]);
-          const __m256d xi_35 = _mm256_load_pd(&_data_force_20_31_10[ctr_0]);
-          const __m256d xi_36 = _mm256_load_pd(&_data_pdfs_20_316_10[ctr_0]);
-          const __m256d xi_37 = _mm256_load_pd(&_data_pdfs_20_38_10[ctr_0]);
-          const __m256d xi_38 = _mm256_load_pd(&_data_pdfs_20_313_10[ctr_0]);
-          const __m256d xi_39 = _mm256_load_pd(&_data_pdfs_20_33_10[ctr_0]);
-          const __m256d xi_40 = _mm256_load_pd(&_data_force_20_32_10[ctr_0]);
-          const __m256d xi_41 = _mm256_load_pd(&_data_pdfs_20_314_10[ctr_0]);
-          const __m256d xi_42 = _mm256_load_pd(&_data_force_20_30_10[ctr_0]);
-          const __m256d xi_43 = _mm256_load_pd(&_data_pdfs_20_317_10[ctr_0]);
-          const __m256d xi_44 = _mm256_load_pd(&_data_pdfs_20_311_10[ctr_0]);
-          const __m256d xi_45 = _mm256_load_pd(&_data_pdfs_20_32_10[ctr_0]);
-          const __m256d xi_46 = _mm256_load_pd(&_data_pdfs_20_35_10[ctr_0]);
-          const __m256d xi_3 = xi_25;
-          const __m256d xi_4 = xi_26;
-          const __m256d xi_5 = xi_27;
-          const __m256d xi_6 = xi_28;
-          const __m256d xi_7 = xi_29;
-          const __m256d xi_8 = xi_30;
-          const __m256d xi_9 = xi_31;
-          const __m256d xi_10 = xi_32;
-          const __m256d xi_11 = xi_33;
-          const __m256d xi_12 = xi_34;
-          const __m256d xi_13 = xi_35;
-          const __m256d xi_14 = xi_36;
-          const __m256d xi_15 = xi_37;
-          const __m256d xi_16 = xi_38;
-          const __m256d xi_17 = xi_39;
-          const __m256d xi_18 = xi_40;
-          const __m256d xi_19 = xi_41;
-          const __m256d xi_20 = xi_42;
-          const __m256d xi_21 = xi_43;
-          const __m256d xi_22 = xi_44;
-          const __m256d xi_23 = xi_45;
-          const __m256d xi_24 = xi_46;
-          const __m256d vel0Term = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(xi_15, xi_19), xi_3), xi_6), xi_8);
-          const __m256d vel1Term = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(xi_10, xi_11), xi_22), xi_5);
-          const __m256d vel2Term = _mm256_add_pd(_mm256_add_pd(xi_16, xi_24), xi_7);
-          const __m256d rho = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(vel0Term, vel1Term), vel2Term), xi_12), xi_14), xi_17), xi_21), xi_23), xi_4), xi_9);
+          const __m256d xi_25 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_26 = _mm256_load_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_27 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_28 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_29 = _mm256_load_pd(&_data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + ctr_0]);
+          const __m256d xi_30 = _mm256_load_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_31 = _mm256_loadu_pd(&_data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3 + ctr_0]);
+          const __m256d xi_32 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_33 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_34 = _mm256_load_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_35 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_36 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_37 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_38 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_39 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_40 = _mm256_loadu_pd(&_data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3 + ctr_0]);
+          const __m256d xi_41 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_42 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_43 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_44 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_45 = _mm256_load_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + ctr_0]);
+          const __m256d xi_46 = _mm256_load_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_3 = xi_40;
+          const __m256d xi_4 = xi_44;
+          const __m256d xi_5 = xi_37;
+          const __m256d xi_6 = xi_35;
+          const __m256d xi_7 = xi_46;
+          const __m256d xi_8 = xi_43;
+          const __m256d xi_9 = xi_28;
+          const __m256d xi_10 = xi_30;
+          const __m256d xi_11 = xi_41;
+          const __m256d xi_12 = xi_29;
+          const __m256d xi_13 = xi_45;
+          const __m256d xi_14 = xi_38;
+          const __m256d xi_15 = xi_42;
+          const __m256d xi_16 = xi_39;
+          const __m256d xi_17 = xi_34;
+          const __m256d xi_18 = xi_36;
+          const __m256d xi_19 = xi_27;
+          const __m256d xi_20 = xi_31;
+          const __m256d xi_21 = xi_32;
+          const __m256d xi_22 = xi_26;
+          const __m256d xi_23 = xi_33;
+          const __m256d xi_24 = xi_25;
+          const __m256d vel0Term = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(xi_10, xi_19), xi_21), xi_22), xi_9);
+          const __m256d vel1Term = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(xi_18, xi_24), xi_4), xi_6);
+          const __m256d vel2Term = _mm256_add_pd(_mm256_add_pd(xi_16, xi_23), xi_7);
+          const __m256d rho = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(vel0Term, vel1Term), vel2Term), xi_11), xi_13), xi_14), xi_15), xi_17), xi_5), xi_8);
           const __m256d xi_1 = _mm256_div_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), rho);
-          const __m256d u_0 = _mm256_add_pd(_mm256_mul_pd(xi_1, _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_11, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(xi_16, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(xi_17, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(xi_21, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(xi_9, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), vel0Term)), _mm256_mul_pd(_mm256_mul_pd(xi_1, xi_20), _mm256_set_pd(0.5, 0.5, 0.5, 0.5)));
-          const __m256d u_1 = _mm256_add_pd(_mm256_mul_pd(xi_1, _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_14, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(xi_23, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(xi_6, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(xi_7, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(xi_9, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), vel1Term), xi_15)), _mm256_mul_pd(_mm256_mul_pd(xi_1, xi_13), _mm256_set_pd(0.5, 0.5, 0.5, 0.5)));
-          const __m256d u_2 = _mm256_add_pd(_mm256_mul_pd(xi_1, _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_14, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(xi_21, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(xi_4, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(xi_5, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(xi_8, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), vel2Term), xi_19), xi_22)), _mm256_mul_pd(_mm256_mul_pd(xi_1, xi_18), _mm256_set_pd(0.5, 0.5, 0.5, 0.5)));
-          const __m256d forceTerm_0 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256d forceTerm_1 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_13, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666)), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331))), _mm256_mul_pd(_mm256_mul_pd(xi_13, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256d forceTerm_2 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_13, _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666)), _mm256_mul_pd(_mm256_mul_pd(xi_13, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256d forceTerm_3 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666)), _mm256_mul_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256d forceTerm_4 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666)), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331))), _mm256_mul_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256d forceTerm_5 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_18, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666)), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331))), _mm256_mul_pd(_mm256_mul_pd(xi_18, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256d forceTerm_6 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_18, _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666)), _mm256_mul_pd(_mm256_mul_pd(xi_18, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256d forceTerm_7 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_13, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_mul_pd(xi_20, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_13), _mm256_set_pd(-0.25, -0.25, -0.25, -0.25))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(-0.25, -0.25, -0.25, -0.25))), _mm256_mul_pd(_mm256_mul_pd(xi_13, _mm256_set_pd(-0.041666666666666664, -0.041666666666666664, -0.041666666666666664, -0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_13), _mm256_set_pd(0.125, 0.125, 0.125, 0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(0.125, 0.125, 0.125, 0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, 
omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256d forceTerm_8 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_13, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_mul_pd(xi_20, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_13), _mm256_set_pd(0.25, 0.25, 0.25, 0.25))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(0.25, 0.25, 0.25, 0.25))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(xi_13, _mm256_set_pd(-0.041666666666666664, -0.041666666666666664, -0.041666666666666664, -0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(-0.041666666666666664, -0.041666666666666664, -0.041666666666666664, -0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_13), _mm256_set_pd(-0.125, -0.125, -0.125, -0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(-0.125, -0.125, -0.125, -0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, 
omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256d forceTerm_9 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_13, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_mul_pd(xi_20, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_13), _mm256_set_pd(0.25, 0.25, 0.25, 0.25))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(0.25, 0.25, 0.25, 0.25))), _mm256_mul_pd(_mm256_mul_pd(xi_13, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_13), _mm256_set_pd(-0.125, -0.125, -0.125, -0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(-0.125, -0.125, -0.125, -0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, 
omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256d forceTerm_10 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_mul_pd(xi_13, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(xi_13, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_13), _mm256_set_pd(-0.25, -0.25, -0.25, -0.25))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(-0.25, -0.25, -0.25, -0.25))), _mm256_mul_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(-0.041666666666666664, -0.041666666666666664, -0.041666666666666664, -0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_13), _mm256_set_pd(0.125, 0.125, 0.125, 0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(0.125, 0.125, 0.125, 0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, 
omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256d forceTerm_11 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_13, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_mul_pd(xi_18, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_18), _mm256_set_pd(0.25, 0.25, 0.25, 0.25))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_13), _mm256_set_pd(0.25, 0.25, 0.25, 0.25))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(xi_13, _mm256_set_pd(-0.041666666666666664, -0.041666666666666664, -0.041666666666666664, -0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(xi_18, _mm256_set_pd(-0.041666666666666664, -0.041666666666666664, -0.041666666666666664, -0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_18), _mm256_set_pd(-0.125, -0.125, -0.125, -0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_13), _mm256_set_pd(-0.125, -0.125, -0.125, -0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, 
omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256d forceTerm_12 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_18, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_mul_pd(xi_13, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(xi_13, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_18), _mm256_set_pd(-0.25, -0.25, -0.25, -0.25))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_13), _mm256_set_pd(-0.25, -0.25, -0.25, -0.25))), _mm256_mul_pd(_mm256_mul_pd(xi_18, _mm256_set_pd(-0.041666666666666664, -0.041666666666666664, -0.041666666666666664, -0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_18), _mm256_set_pd(0.125, 0.125, 0.125, 0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_13), _mm256_set_pd(0.125, 0.125, 0.125, 0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, 
omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256d forceTerm_13 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_18, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_mul_pd(xi_20, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_18), _mm256_set_pd(-0.25, -0.25, -0.25, -0.25))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_20), _mm256_set_pd(-0.25, -0.25, -0.25, -0.25))), _mm256_mul_pd(_mm256_mul_pd(xi_18, _mm256_set_pd(-0.041666666666666664, -0.041666666666666664, -0.041666666666666664, -0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_18), _mm256_set_pd(0.125, 0.125, 0.125, 0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_20), _mm256_set_pd(0.125, 0.125, 0.125, 0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, 
omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256d forceTerm_14 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_18, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_mul_pd(xi_20, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_18), _mm256_set_pd(0.25, 0.25, 0.25, 0.25))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_20), _mm256_set_pd(0.25, 0.25, 0.25, 0.25))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(xi_18, _mm256_set_pd(-0.041666666666666664, -0.041666666666666664, -0.041666666666666664, -0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(-0.041666666666666664, -0.041666666666666664, -0.041666666666666664, -0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_18), _mm256_set_pd(-0.125, -0.125, -0.125, -0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_20), _mm256_set_pd(-0.125, -0.125, -0.125, -0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, 
omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256d forceTerm_15 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_13, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_mul_pd(xi_18, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(xi_18, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_18), _mm256_set_pd(-0.25, -0.25, -0.25, -0.25))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_13), _mm256_set_pd(-0.25, -0.25, -0.25, -0.25))), _mm256_mul_pd(_mm256_mul_pd(xi_13, _mm256_set_pd(-0.041666666666666664, -0.041666666666666664, -0.041666666666666664, -0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_18), _mm256_set_pd(0.125, 0.125, 0.125, 0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_13), _mm256_set_pd(0.125, 0.125, 0.125, 0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, 
omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256d forceTerm_16 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_13, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_mul_pd(xi_18, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_18), _mm256_set_pd(0.25, 0.25, 0.25, 0.25))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_13), _mm256_set_pd(0.25, 0.25, 0.25, 0.25))), _mm256_mul_pd(_mm256_mul_pd(xi_13, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(xi_18, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_18), _mm256_set_pd(-0.125, -0.125, -0.125, -0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_13), _mm256_set_pd(-0.125, -0.125, -0.125, -0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, 
omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256d forceTerm_17 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_18, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_mul_pd(xi_20, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_18), _mm256_set_pd(0.25, 0.25, 0.25, 0.25))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_20), _mm256_set_pd(0.25, 0.25, 0.25, 0.25))), _mm256_mul_pd(_mm256_mul_pd(xi_18, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_18), _mm256_set_pd(-0.125, -0.125, -0.125, -0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_20), _mm256_set_pd(-0.125, -0.125, -0.125, -0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, 
omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256d forceTerm_18 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_mul_pd(xi_18, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(xi_18, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_18), _mm256_set_pd(-0.25, -0.25, -0.25, -0.25))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_20), _mm256_set_pd(-0.25, -0.25, -0.25, -0.25))), _mm256_mul_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(-0.041666666666666664, -0.041666666666666664, -0.041666666666666664, -0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_18), _mm256_set_pd(0.125, 0.125, 0.125, 0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_20), _mm256_set_pd(0.125, 0.125, 0.125, 0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_13), _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, 
omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_18), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256d u_0 = _mm256_add_pd(_mm256_mul_pd(xi_1, _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_14, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(xi_15, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(xi_23, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(xi_5, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(xi_6, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), vel0Term)), _mm256_mul_pd(_mm256_mul_pd(xi_1, xi_12), _mm256_set_pd(0.5, 0.5, 0.5, 0.5)));
+          const __m256d u_1 = _mm256_add_pd(_mm256_mul_pd(xi_1, _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_11, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(xi_14, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(xi_17, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(xi_7, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(xi_9, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), vel1Term), xi_22)), _mm256_mul_pd(_mm256_mul_pd(xi_1, xi_20), _mm256_set_pd(0.5, 0.5, 0.5, 0.5)));
+          const __m256d u_2 = _mm256_add_pd(_mm256_mul_pd(xi_1, _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_15, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(xi_17, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(xi_18, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(xi_19, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(xi_8, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), vel2Term), xi_21), xi_4)), _mm256_mul_pd(_mm256_mul_pd(xi_1, xi_3), _mm256_set_pd(0.5, 0.5, 0.5, 0.5)));
+          const __m256d forceTerm_0 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256d forceTerm_1 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666)), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331))), _mm256_mul_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256d forceTerm_2 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666)), _mm256_mul_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256d forceTerm_3 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_12, _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666)), _mm256_mul_pd(_mm256_mul_pd(xi_12, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256d forceTerm_4 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_12, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666)), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331))), _mm256_mul_pd(_mm256_mul_pd(xi_12, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256d forceTerm_5 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_3, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666)), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331))), _mm256_mul_pd(_mm256_mul_pd(xi_3, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256d forceTerm_6 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_3, _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666)), _mm256_mul_pd(_mm256_mul_pd(xi_3, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256d forceTerm_7 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_mul_pd(xi_12, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(xi_12, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(-0.25, -0.25, -0.25, -0.25))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_12), _mm256_set_pd(-0.25, -0.25, -0.25, -0.25))), _mm256_mul_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(-0.041666666666666664, -0.041666666666666664, -0.041666666666666664, -0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(0.125, 0.125, 0.125, 0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_12), _mm256_set_pd(0.125, 0.125, 0.125, 0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, 
omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256d forceTerm_8 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_12, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_mul_pd(xi_20, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(0.25, 0.25, 0.25, 0.25))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_12), _mm256_set_pd(0.25, 0.25, 0.25, 0.25))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(xi_12, _mm256_set_pd(-0.041666666666666664, -0.041666666666666664, -0.041666666666666664, -0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(-0.041666666666666664, -0.041666666666666664, -0.041666666666666664, -0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(-0.125, -0.125, -0.125, -0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_12), _mm256_set_pd(-0.125, -0.125, -0.125, -0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, 
omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256d forceTerm_9 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_12, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_mul_pd(xi_20, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(0.25, 0.25, 0.25, 0.25))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_12), _mm256_set_pd(0.25, 0.25, 0.25, 0.25))), _mm256_mul_pd(_mm256_mul_pd(xi_12, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(-0.125, -0.125, -0.125, -0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_12), _mm256_set_pd(-0.125, -0.125, -0.125, -0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, 
omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256d forceTerm_10 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_12, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_mul_pd(xi_20, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(-0.25, -0.25, -0.25, -0.25))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_12), _mm256_set_pd(-0.25, -0.25, -0.25, -0.25))), _mm256_mul_pd(_mm256_mul_pd(xi_12, _mm256_set_pd(-0.041666666666666664, -0.041666666666666664, -0.041666666666666664, -0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_20), _mm256_set_pd(0.125, 0.125, 0.125, 0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_12), _mm256_set_pd(0.125, 0.125, 0.125, 0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, 
omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256d forceTerm_11 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_mul_pd(xi_3, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_3), _mm256_set_pd(0.25, 0.25, 0.25, 0.25))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_20), _mm256_set_pd(0.25, 0.25, 0.25, 0.25))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(-0.041666666666666664, -0.041666666666666664, -0.041666666666666664, -0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(xi_3, _mm256_set_pd(-0.041666666666666664, -0.041666666666666664, -0.041666666666666664, -0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_3), _mm256_set_pd(-0.125, -0.125, -0.125, -0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_20), _mm256_set_pd(-0.125, -0.125, -0.125, -0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, 
omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256d forceTerm_12 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_3, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_mul_pd(xi_20, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_3), _mm256_set_pd(-0.25, -0.25, -0.25, -0.25))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_20), _mm256_set_pd(-0.25, -0.25, -0.25, -0.25))), _mm256_mul_pd(_mm256_mul_pd(xi_3, _mm256_set_pd(-0.041666666666666664, -0.041666666666666664, -0.041666666666666664, -0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_3), _mm256_set_pd(0.125, 0.125, 0.125, 0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_20), _mm256_set_pd(0.125, 0.125, 0.125, 0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, 
omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256d forceTerm_13 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_3, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_mul_pd(xi_12, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(xi_12, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_3), _mm256_set_pd(-0.25, -0.25, -0.25, -0.25))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_12), _mm256_set_pd(-0.25, -0.25, -0.25, -0.25))), _mm256_mul_pd(_mm256_mul_pd(xi_3, _mm256_set_pd(-0.041666666666666664, -0.041666666666666664, -0.041666666666666664, -0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_3), _mm256_set_pd(0.125, 0.125, 0.125, 0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_12), _mm256_set_pd(0.125, 0.125, 0.125, 0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, 
omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256d forceTerm_14 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_12, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_mul_pd(xi_3, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_3), _mm256_set_pd(0.25, 0.25, 0.25, 0.25))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_12), _mm256_set_pd(0.25, 0.25, 0.25, 0.25))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(xi_12, _mm256_set_pd(-0.041666666666666664, -0.041666666666666664, -0.041666666666666664, -0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(xi_3, _mm256_set_pd(-0.041666666666666664, -0.041666666666666664, -0.041666666666666664, -0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_3), _mm256_set_pd(-0.125, -0.125, -0.125, -0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_12), _mm256_set_pd(-0.125, -0.125, -0.125, -0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, 
omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256d forceTerm_15 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_mul_pd(xi_3, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(xi_3, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_3), _mm256_set_pd(-0.25, -0.25, -0.25, -0.25))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_20), _mm256_set_pd(-0.25, -0.25, -0.25, -0.25))), _mm256_mul_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(-0.041666666666666664, -0.041666666666666664, -0.041666666666666664, -0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_3), _mm256_set_pd(0.125, 0.125, 0.125, 0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_20), _mm256_set_pd(0.125, 0.125, 0.125, 0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, 
omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256d forceTerm_16 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_mul_pd(xi_3, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_3), _mm256_set_pd(0.25, 0.25, 0.25, 0.25))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_20), _mm256_set_pd(0.25, 0.25, 0.25, 0.25))), _mm256_mul_pd(_mm256_mul_pd(xi_20, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(xi_3, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_3), _mm256_set_pd(-0.125, -0.125, -0.125, -0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_20), _mm256_set_pd(-0.125, -0.125, -0.125, -0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, 
omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256d forceTerm_17 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_12, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_mul_pd(xi_3, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_3), _mm256_set_pd(0.25, 0.25, 0.25, 0.25))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_12), _mm256_set_pd(0.25, 0.25, 0.25, 0.25))), _mm256_mul_pd(_mm256_mul_pd(xi_12, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(xi_3, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_3), _mm256_set_pd(-0.125, -0.125, -0.125, -0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_12), _mm256_set_pd(-0.125, -0.125, -0.125, -0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, 
omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256d forceTerm_18 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_12, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_mul_pd(xi_3, _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(xi_3, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_mul_pd(_mm256_mul_pd(u_0, xi_3), _mm256_set_pd(-0.25, -0.25, -0.25, -0.25))), _mm256_mul_pd(_mm256_mul_pd(u_2, xi_12), _mm256_set_pd(-0.25, -0.25, -0.25, -0.25))), _mm256_mul_pd(_mm256_mul_pd(xi_12, _mm256_set_pd(-0.041666666666666664, -0.041666666666666664, -0.041666666666666664, -0.041666666666666664)), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0))), _mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_3), _mm256_set_pd(0.125, 0.125, 0.125, 0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_12), _mm256_set_pd(0.125, 0.125, 0.125, 0.125)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_1, xi_20), _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, 
omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_0, xi_12), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(u_2, xi_3), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)));
           const __m256d u0Mu1 = _mm256_add_pd(_mm256_mul_pd(u_1, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), u_0);
           const __m256d u0Pu1 = _mm256_add_pd(u_0, u_1);
           const __m256d u1Pu2 = _mm256_add_pd(u_1, u_2);
@@ -178,124 +133,124 @@ static FUNC_PREFIX void collidesweepdoubleprecisionleesedwardsavx_collidesweepdo
           const __m256d u0Mu2 = _mm256_add_pd(_mm256_mul_pd(u_2, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), u_0);
           const __m256d u0Pu2 = _mm256_add_pd(u_0, u_2);
           const __m256d f_eq_common = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_mul_pd(rho, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(u_0, u_0)), _mm256_mul_pd(_mm256_mul_pd(rho, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(u_1, u_1))), _mm256_mul_pd(_mm256_mul_pd(rho, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(u_2, u_2))), rho);
-          _mm256_store_pd(&_data_pdfs_20_30_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(xi_12, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(f_eq_common, _mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331))), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)), forceTerm_0), xi_12));
-          _mm256_store_pd(&_data_pdfs_20_31_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_23, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_10, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u_1), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666)), _mm256_mul_pd(xi_10, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_23, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331), _mm256_mul_pd(u_1, u_1)), _mm256_set_pd(-0.1111111111111111, -0.1111111111111111, -0.1111111111111111, -0.1111111111111111)))))), _mm256_blendv_pd(_mm256_set_pd(0.0, 0.0, 0.0, 0.0), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(rho, _mm256_add_pd(_mm256_mul_pd(u_0, _mm256_set_pd(2.0, 2.0, 2.0, 2.0)), _mm256_set_pd(v_s, v_s, v_s, v_s))), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666)), _mm256_set_pd(v_s, v_s, v_s, v_s)), _mm256_cmp_pd(_mm256_set_pd(-1.0, -1.0, -1.0, -1.0), _mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(-1.0, -1.0, -1.0, -1.0), _mm256_set_pd(grid_size, grid_size, grid_size, grid_size)), _mm256_set_pd(((double)(ctr_1)), ((double)(ctr_1)), ((double)(ctr_1)), ((double)(ctr_1)))), _CMP_LE_OQ))), forceTerm_1), xi_10));
-          _mm256_store_pd(&_data_pdfs_20_32_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_10, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_23, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u_1), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666)), _mm256_mul_pd(xi_10, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_23, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331), _mm256_mul_pd(u_1, u_1)), _mm256_set_pd(-0.1111111111111111, -0.1111111111111111, -0.1111111111111111, -0.1111111111111111)))))), _mm256_blendv_pd(_mm256_set_pd(0.0, 0.0, 0.0, 0.0), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(rho, _mm256_add_pd(_mm256_mul_pd(u_0, _mm256_set_pd(-2.0, -2.0, -2.0, -2.0)), _mm256_set_pd(v_s, v_s, v_s, v_s))), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666)), _mm256_set_pd(v_s, v_s, v_s, v_s)), _mm256_cmp_pd(_mm256_set_pd(0.0, 0.0, 0.0, 0.0), _mm256_set_pd(((double)(ctr_1)), ((double)(ctr_1)), ((double)(ctr_1)), ((double)(ctr_1))), _CMP_GE_OQ))), forceTerm_2), xi_23));
-          _mm256_store_pd(&_data_pdfs_20_33_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_3, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_17, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u_0), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666)), _mm256_mul_pd(xi_17, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_3, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331), _mm256_mul_pd(u_0, u_0)), _mm256_set_pd(-0.1111111111111111, -0.1111111111111111, -0.1111111111111111, -0.1111111111111111)))))), forceTerm_3), xi_17));
-          _mm256_store_pd(&_data_pdfs_20_34_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_17, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_3, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u_0), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666)), _mm256_mul_pd(xi_17, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_3, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331), _mm256_mul_pd(u_0, u_0)), _mm256_set_pd(-0.1111111111111111, -0.1111111111111111, -0.1111111111111111, -0.1111111111111111)))))), forceTerm_4), xi_3));
-          _mm256_store_pd(&_data_pdfs_20_35_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_4, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_24, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u_2), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666)), _mm256_mul_pd(xi_24, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_4, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331), _mm256_mul_pd(u_2, u_2)), _mm256_set_pd(-0.1111111111111111, -0.1111111111111111, -0.1111111111111111, -0.1111111111111111)))))), forceTerm_5), xi_24));
-          _mm256_store_pd(&_data_pdfs_20_36_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_24, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_4, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u_2), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666)), _mm256_mul_pd(xi_24, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_4, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331), _mm256_mul_pd(u_2, u_2)), _mm256_set_pd(-0.1111111111111111, -0.1111111111111111, -0.1111111111111111, -0.1111111111111111)))))), forceTerm_6), xi_4));
-          _mm256_store_pd(&_data_pdfs_20_37_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_6, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_11, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u0Mu1), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_mul_pd(xi_11, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_6, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.125, 0.125, 0.125, 0.125), _mm256_mul_pd(u0Mu1, u0Mu1)), _mm256_mul_pd(_mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664), _mm256_mul_pd(u_2, u_2))), _mm256_set_pd(-0.013888888888888888, -0.013888888888888888, -0.013888888888888888, -0.013888888888888888)))))), _mm256_blendv_pd(_mm256_set_pd(0.0, 0.0, 0.0, 0.0), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(-1.0, -1.0, -1.0, -1.0), _mm256_set_pd(v_s, v_s, v_s, v_s)), _mm256_mul_pd(u_1, _mm256_set_pd(3.0, 3.0, 3.0, 3.0))), _mm256_mul_pd(u_0, _mm256_set_pd(-2.0, -2.0, -2.0, -2.0))), _mm256_set_pd(1.0, 1.0, 1.0, 1.0))), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(v_s, v_s, v_s, v_s)), _mm256_cmp_pd(_mm256_set_pd(-1.0, -1.0, -1.0, -1.0), _mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(-1.0, -1.0, -1.0, -1.0), _mm256_set_pd(grid_size, grid_size, grid_size, grid_size)), _mm256_set_pd(((double)(ctr_1)), ((double)(ctr_1)), 
((double)(ctr_1)), ((double)(ctr_1)))), _CMP_LE_OQ))), forceTerm_7), xi_11));
-          _mm256_store_pd(&_data_pdfs_20_38_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_9, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_15, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u0Pu1), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_mul_pd(xi_15, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_9, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.125, 0.125, 0.125, 0.125), _mm256_mul_pd(u0Pu1, u0Pu1)), _mm256_mul_pd(_mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664), _mm256_mul_pd(u_2, u_2))), _mm256_set_pd(-0.013888888888888888, -0.013888888888888888, -0.013888888888888888, -0.013888888888888888)))))), _mm256_blendv_pd(_mm256_set_pd(0.0, 0.0, 0.0, 0.0), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(u_0, _mm256_set_pd(2.0, 2.0, 2.0, 2.0)), _mm256_mul_pd(u_1, _mm256_set_pd(3.0, 3.0, 3.0, 3.0))), _mm256_set_pd(1.0, 1.0, 1.0, 1.0)), _mm256_set_pd(v_s, v_s, v_s, v_s))), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(v_s, v_s, v_s, v_s)), _mm256_cmp_pd(_mm256_set_pd(-1.0, -1.0, -1.0, -1.0), _mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(-1.0, -1.0, -1.0, -1.0), _mm256_set_pd(grid_size, grid_size, grid_size, grid_size)), _mm256_set_pd(((double)(ctr_1)), ((double)(ctr_1)), ((double)(ctr_1)), ((double)(ctr_1)))), _CMP_LE_OQ))), forceTerm_8), 
xi_15));
-          _mm256_store_pd(&_data_pdfs_20_39_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_15, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_9, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u0Pu1), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_mul_pd(xi_15, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_9, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.125, 0.125, 0.125, 0.125), _mm256_mul_pd(u0Pu1, u0Pu1)), _mm256_mul_pd(_mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664), _mm256_mul_pd(u_2, u_2))), _mm256_set_pd(-0.013888888888888888, -0.013888888888888888, -0.013888888888888888, -0.013888888888888888)))))), _mm256_blendv_pd(_mm256_set_pd(0.0, 0.0, 0.0, 0.0), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(-1.0, -1.0, -1.0, -1.0), _mm256_set_pd(v_s, v_s, v_s, v_s)), _mm256_mul_pd(u_0, _mm256_set_pd(2.0, 2.0, 2.0, 2.0))), _mm256_mul_pd(u_1, _mm256_set_pd(3.0, 3.0, 3.0, 3.0))), _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(v_s, v_s, v_s, v_s)), _mm256_cmp_pd(_mm256_set_pd(0.0, 0.0, 0.0, 0.0), _mm256_set_pd(((double)(ctr_1)), ((double)(ctr_1)), ((double)(ctr_1)), ((double)(ctr_1))), _CMP_GE_OQ))), forceTerm_9), xi_9));
-          _mm256_store_pd(&_data_pdfs_20_310_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_11, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_6, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u0Mu1), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_mul_pd(xi_11, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_6, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.125, 0.125, 0.125, 0.125), _mm256_mul_pd(u0Mu1, u0Mu1)), _mm256_mul_pd(_mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664), _mm256_mul_pd(u_2, u_2))), _mm256_set_pd(-0.013888888888888888, -0.013888888888888888, -0.013888888888888888, -0.013888888888888888)))))), _mm256_blendv_pd(_mm256_set_pd(0.0, 0.0, 0.0, 0.0), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(-1.0, -1.0, -1.0, -1.0), _mm256_set_pd(v_s, v_s, v_s, v_s)), _mm256_mul_pd(u_0, _mm256_set_pd(2.0, 2.0, 2.0, 2.0))), _mm256_mul_pd(u_1, _mm256_set_pd(-3.0, -3.0, -3.0, -3.0))), _mm256_set_pd(1.0, 1.0, 1.0, 1.0))), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(v_s, v_s, v_s, v_s)), _mm256_cmp_pd(_mm256_set_pd(0.0, 0.0, 0.0, 0.0), _mm256_set_pd(((double)(ctr_1)), ((double)(ctr_1)), ((double)(ctr_1)), ((double)(ctr_1))), _CMP_GE_OQ))), forceTerm_10), xi_6));
-          _mm256_store_pd(&_data_pdfs_20_311_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_14, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_22, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u1Pu2), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_mul_pd(xi_14, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_22, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.125, 0.125, 0.125, 0.125), _mm256_mul_pd(u1Pu2, u1Pu2)), _mm256_mul_pd(_mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664), _mm256_mul_pd(u_0, u_0))), _mm256_set_pd(-0.013888888888888888, -0.013888888888888888, -0.013888888888888888, -0.013888888888888888)))))), forceTerm_11), xi_22));
-          _mm256_store_pd(&_data_pdfs_20_312_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_5, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_7, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u1Mu2), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_mul_pd(xi_5, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_7, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.125, 0.125, 0.125, 0.125), _mm256_mul_pd(u1Mu2, u1Mu2)), _mm256_mul_pd(_mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664), _mm256_mul_pd(u_0, u_0))), _mm256_set_pd(-0.013888888888888888, -0.013888888888888888, -0.013888888888888888, -0.013888888888888888)))))), forceTerm_12), xi_7));
-          _mm256_store_pd(&_data_pdfs_20_313_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_8, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_16, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u0Mu2), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_mul_pd(xi_16, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_8, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.125, 0.125, 0.125, 0.125), _mm256_mul_pd(u0Mu2, u0Mu2)), _mm256_mul_pd(_mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664), _mm256_mul_pd(u_1, u_1))), _mm256_set_pd(-0.013888888888888888, -0.013888888888888888, -0.013888888888888888, -0.013888888888888888)))))), forceTerm_13), xi_16));
-          _mm256_store_pd(&_data_pdfs_20_314_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_21, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_19, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u0Pu2), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_mul_pd(xi_19, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_21, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.125, 0.125, 0.125, 0.125), _mm256_mul_pd(u0Pu2, u0Pu2)), _mm256_mul_pd(_mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664), _mm256_mul_pd(u_1, u_1))), _mm256_set_pd(-0.013888888888888888, -0.013888888888888888, -0.013888888888888888, -0.013888888888888888)))))), forceTerm_14), xi_19));
-          _mm256_store_pd(&_data_pdfs_20_315_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_7, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_5, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u1Mu2), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_mul_pd(xi_5, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_7, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.125, 0.125, 0.125, 0.125), _mm256_mul_pd(u1Mu2, u1Mu2)), _mm256_mul_pd(_mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664), _mm256_mul_pd(u_0, u_0))), _mm256_set_pd(-0.013888888888888888, -0.013888888888888888, -0.013888888888888888, -0.013888888888888888)))))), forceTerm_15), xi_5));
-          _mm256_store_pd(&_data_pdfs_20_316_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_22, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_14, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u1Pu2), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_mul_pd(xi_14, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_22, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.125, 0.125, 0.125, 0.125), _mm256_mul_pd(u1Pu2, u1Pu2)), _mm256_mul_pd(_mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664), _mm256_mul_pd(u_0, u_0))), _mm256_set_pd(-0.013888888888888888, -0.013888888888888888, -0.013888888888888888, -0.013888888888888888)))))), forceTerm_16), xi_14));
-          _mm256_store_pd(&_data_pdfs_20_317_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_19, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_21, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u0Pu2), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_mul_pd(xi_19, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_21, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.125, 0.125, 0.125, 0.125), _mm256_mul_pd(u0Pu2, u0Pu2)), _mm256_mul_pd(_mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664), _mm256_mul_pd(u_1, u_1))), _mm256_set_pd(-0.013888888888888888, -0.013888888888888888, -0.013888888888888888, -0.013888888888888888)))))), forceTerm_17), xi_21));
-          _mm256_store_pd(&_data_pdfs_20_318_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_16, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_8, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u0Mu2), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_mul_pd(xi_16, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_8, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.125, 0.125, 0.125, 0.125), _mm256_mul_pd(u0Mu2, u0Mu2)), _mm256_mul_pd(_mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664), _mm256_mul_pd(u_1, u_1))), _mm256_set_pd(-0.013888888888888888, -0.013888888888888888, -0.013888888888888888, -0.013888888888888888)))))), forceTerm_18), xi_8));
+          _mm256_store_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331)), _mm256_mul_pd(xi_13, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear)), forceTerm_0), xi_13));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_11, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_24, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u_1), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666)), _mm256_mul_pd(xi_11, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_24, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331), _mm256_mul_pd(u_1, u_1)), _mm256_set_pd(-0.1111111111111111, -0.1111111111111111, -0.1111111111111111, -0.1111111111111111)))))), _mm256_blendv_pd(_mm256_set_pd(0.0, 0.0, 0.0, 0.0), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(rho, _mm256_add_pd(_mm256_mul_pd(u_0, _mm256_set_pd(2.0, 2.0, 2.0, 2.0)), _mm256_set_pd(v_s, v_s, v_s, v_s))), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666)), _mm256_set_pd(v_s, v_s, v_s, v_s)), _mm256_cmp_pd(_mm256_set_pd(-1.0, -1.0, -1.0, -1.0), _mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(-1.0, -1.0, -1.0, -1.0), _mm256_set_pd(grid_size, grid_size, grid_size, grid_size)), _mm256_set_pd(((double)(ctr_1)), ((double)(ctr_1)), ((double)(ctr_1)), ((double)(ctr_1)))), _CMP_LE_OQ))), forceTerm_1), xi_24));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_24, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_11, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u_1), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666)), _mm256_mul_pd(xi_11, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_24, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331), _mm256_mul_pd(u_1, u_1)), _mm256_set_pd(-0.1111111111111111, -0.1111111111111111, -0.1111111111111111, -0.1111111111111111)))))), _mm256_blendv_pd(_mm256_set_pd(0.0, 0.0, 0.0, 0.0), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(rho, _mm256_add_pd(_mm256_mul_pd(u_0, _mm256_set_pd(-2.0, -2.0, -2.0, -2.0)), _mm256_set_pd(v_s, v_s, v_s, v_s))), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666)), _mm256_set_pd(v_s, v_s, v_s, v_s)), _mm256_cmp_pd(_mm256_set_pd(0.0, 0.0, 0.0, 0.0), _mm256_set_pd(((double)(ctr_1)), ((double)(ctr_1)), ((double)(ctr_1)), ((double)(ctr_1))), _CMP_GE_OQ))), forceTerm_2), xi_11));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_10, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_5, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u_0), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666)), _mm256_mul_pd(xi_10, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_5, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331), _mm256_mul_pd(u_0, u_0)), _mm256_set_pd(-0.1111111111111111, -0.1111111111111111, -0.1111111111111111, -0.1111111111111111)))))), forceTerm_3), xi_5));
+          _mm256_store_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_5, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_10, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u_0), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666)), _mm256_mul_pd(xi_10, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_5, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331), _mm256_mul_pd(u_0, u_0)), _mm256_set_pd(-0.1111111111111111, -0.1111111111111111, -0.1111111111111111, -0.1111111111111111)))))), forceTerm_4), xi_10));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_8, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_16, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u_2), _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666)), _mm256_mul_pd(xi_16, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_8, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331), _mm256_mul_pd(u_2, u_2)), _mm256_set_pd(-0.1111111111111111, -0.1111111111111111, -0.1111111111111111, -0.1111111111111111)))))), forceTerm_5), xi_16));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_16, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_8, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u_2), _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666)), _mm256_mul_pd(xi_16, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_8, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331), _mm256_mul_pd(u_2, u_2)), _mm256_set_pd(-0.1111111111111111, -0.1111111111111111, -0.1111111111111111, -0.1111111111111111)))))), forceTerm_6), xi_8));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_9, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_6, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u0Mu1), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_mul_pd(xi_6, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_9, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.125, 0.125, 0.125, 0.125), _mm256_mul_pd(u0Mu1, u0Mu1)), _mm256_mul_pd(_mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664), _mm256_mul_pd(u_2, u_2))), _mm256_set_pd(-0.013888888888888888, -0.013888888888888888, -0.013888888888888888, -0.013888888888888888)))))), _mm256_blendv_pd(_mm256_set_pd(0.0, 0.0, 0.0, 0.0), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(-1.0, -1.0, -1.0, -1.0), _mm256_set_pd(v_s, v_s, v_s, v_s)), _mm256_mul_pd(u_1, _mm256_set_pd(3.0, 3.0, 3.0, 3.0))), _mm256_mul_pd(u_0, _mm256_set_pd(-2.0, -2.0, -2.0, -2.0))), _mm256_set_pd(1.0, 1.0, 1.0, 1.0))), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(v_s, v_s, v_s, v_s)), _mm256_cmp_pd(_mm256_set_pd(-1.0, -1.0, -1.0, -1.0), _mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(-1.0, -1.0, -1.0, -1.0), _mm256_set_pd(grid_size, grid_size, grid_size, grid_size)), 
_mm256_set_pd(((double)(ctr_1)), ((double)(ctr_1)), ((double)(ctr_1)), ((double)(ctr_1)))), _CMP_LE_OQ))), forceTerm_7), xi_6));
+          _mm256_store_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_14, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_22, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u0Pu1), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_mul_pd(xi_14, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_22, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.125, 0.125, 0.125, 0.125), _mm256_mul_pd(u0Pu1, u0Pu1)), _mm256_mul_pd(_mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664), _mm256_mul_pd(u_2, u_2))), _mm256_set_pd(-0.013888888888888888, -0.013888888888888888, -0.013888888888888888, -0.013888888888888888)))))), _mm256_blendv_pd(_mm256_set_pd(0.0, 0.0, 0.0, 0.0), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(u_0, _mm256_set_pd(2.0, 2.0, 2.0, 2.0)), _mm256_mul_pd(u_1, _mm256_set_pd(3.0, 3.0, 3.0, 3.0))), _mm256_set_pd(1.0, 1.0, 1.0, 1.0)), _mm256_set_pd(v_s, v_s, v_s, v_s))), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329)), _mm256_set_pd(v_s, v_s, v_s, v_s)), _mm256_cmp_pd(_mm256_set_pd(-1.0, -1.0, -1.0, -1.0), _mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(-1.0, -1.0, -1.0, -1.0), _mm256_set_pd(grid_size, grid_size, grid_size, grid_size)), _mm256_set_pd(((double)(ctr_1)), ((double)(ctr_1)), 
((double)(ctr_1)), ((double)(ctr_1)))), _CMP_LE_OQ))), forceTerm_8), xi_22));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_22, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_14, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u0Pu1), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_mul_pd(xi_14, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_22, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.125, 0.125, 0.125, 0.125), _mm256_mul_pd(u0Pu1, u0Pu1)), _mm256_mul_pd(_mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664), _mm256_mul_pd(u_2, u_2))), _mm256_set_pd(-0.013888888888888888, -0.013888888888888888, -0.013888888888888888, -0.013888888888888888)))))), _mm256_blendv_pd(_mm256_set_pd(0.0, 0.0, 0.0, 0.0), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(-1.0, -1.0, -1.0, -1.0), _mm256_set_pd(v_s, v_s, v_s, v_s)), _mm256_mul_pd(u_0, _mm256_set_pd(2.0, 2.0, 2.0, 2.0))), _mm256_mul_pd(u_1, _mm256_set_pd(3.0, 3.0, 3.0, 3.0))), _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(v_s, v_s, v_s, v_s)), _mm256_cmp_pd(_mm256_set_pd(0.0, 0.0, 0.0, 0.0), _mm256_set_pd(((double)(ctr_1)), ((double)(ctr_1)), ((double)(ctr_1)), ((double)(ctr_1))), _CMP_GE_OQ))), forceTerm_9), xi_14));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_6, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_9, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u0Mu1), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_mul_pd(xi_6, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_9, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.125, 0.125, 0.125, 0.125), _mm256_mul_pd(u0Mu1, u0Mu1)), _mm256_mul_pd(_mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664), _mm256_mul_pd(u_2, u_2))), _mm256_set_pd(-0.013888888888888888, -0.013888888888888888, -0.013888888888888888, -0.013888888888888888)))))), _mm256_blendv_pd(_mm256_set_pd(0.0, 0.0, 0.0, 0.0), _mm256_mul_pd(_mm256_mul_pd(_mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(-1.0, -1.0, -1.0, -1.0), _mm256_set_pd(v_s, v_s, v_s, v_s)), _mm256_mul_pd(u_0, _mm256_set_pd(2.0, 2.0, 2.0, 2.0))), _mm256_mul_pd(u_1, _mm256_set_pd(-3.0, -3.0, -3.0, -3.0))), _mm256_set_pd(1.0, 1.0, 1.0, 1.0))), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_set_pd(v_s, v_s, v_s, v_s)), _mm256_cmp_pd(_mm256_set_pd(0.0, 0.0, 0.0, 0.0), _mm256_set_pd(((double)(ctr_1)), ((double)(ctr_1)), ((double)(ctr_1)), ((double)(ctr_1))), _CMP_GE_OQ))), forceTerm_10), xi_9));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_17, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_4, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u1Pu2), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_mul_pd(xi_17, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_4, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.125, 0.125, 0.125, 0.125), _mm256_mul_pd(u1Pu2, u1Pu2)), _mm256_mul_pd(_mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664), _mm256_mul_pd(u_0, u_0))), _mm256_set_pd(-0.013888888888888888, -0.013888888888888888, -0.013888888888888888, -0.013888888888888888)))))), forceTerm_11), xi_4));
+          _mm256_store_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_18, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_7, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u1Mu2), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_mul_pd(xi_18, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_7, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.125, 0.125, 0.125, 0.125), _mm256_mul_pd(u1Mu2, u1Mu2)), _mm256_mul_pd(_mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664), _mm256_mul_pd(u_0, u_0))), _mm256_set_pd(-0.013888888888888888, -0.013888888888888888, -0.013888888888888888, -0.013888888888888888)))))), forceTerm_12), xi_7));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_19, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_23, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u0Mu2), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_mul_pd(xi_19, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_23, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.125, 0.125, 0.125, 0.125), _mm256_mul_pd(u0Mu2, u0Mu2)), _mm256_mul_pd(_mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664), _mm256_mul_pd(u_1, u_1))), _mm256_set_pd(-0.013888888888888888, -0.013888888888888888, -0.013888888888888888, -0.013888888888888888)))))), forceTerm_13), xi_23));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_15, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_21, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u0Pu2), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_mul_pd(xi_15, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_21, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.125, 0.125, 0.125, 0.125), _mm256_mul_pd(u0Pu2, u0Pu2)), _mm256_mul_pd(_mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664), _mm256_mul_pd(u_1, u_1))), _mm256_set_pd(-0.013888888888888888, -0.013888888888888888, -0.013888888888888888, -0.013888888888888888)))))), forceTerm_14), xi_21));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_7, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_18, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u1Mu2), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_mul_pd(xi_18, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_7, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.125, 0.125, 0.125, 0.125), _mm256_mul_pd(u1Mu2, u1Mu2)), _mm256_mul_pd(_mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664), _mm256_mul_pd(u_0, u_0))), _mm256_set_pd(-0.013888888888888888, -0.013888888888888888, -0.013888888888888888, -0.013888888888888888)))))), forceTerm_15), xi_18));
+          _mm256_store_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_4, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_17, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u1Pu2), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_mul_pd(xi_17, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_4, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.125, 0.125, 0.125, 0.125), _mm256_mul_pd(u1Pu2, u1Pu2)), _mm256_mul_pd(_mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664), _mm256_mul_pd(u_0, u_0))), _mm256_set_pd(-0.013888888888888888, -0.013888888888888888, -0.013888888888888888, -0.013888888888888888)))))), forceTerm_16), xi_17));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_21, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_15, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u0Pu2), _mm256_set_pd(-0.083333333333333329, -0.083333333333333329, -0.083333333333333329, -0.083333333333333329))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_mul_pd(xi_15, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_21, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.125, 0.125, 0.125, 0.125), _mm256_mul_pd(u0Pu2, u0Pu2)), _mm256_mul_pd(_mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664), _mm256_mul_pd(u_1, u_1))), _mm256_set_pd(-0.013888888888888888, -0.013888888888888888, -0.013888888888888888, -0.013888888888888888)))))), forceTerm_17), xi_15));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_23, _mm256_set_pd(0.5, 0.5, 0.5, 0.5)), _mm256_mul_pd(xi_19, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(_mm256_mul_pd(rho, u0Mu2), _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329))), _mm256_set_pd(rr_0, rr_0, rr_0, rr_0)), _mm256_mul_pd(_mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(f_eq_common, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_mul_pd(xi_19, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_23, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(rho, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(0.125, 0.125, 0.125, 0.125), _mm256_mul_pd(u0Mu2, u0Mu2)), _mm256_mul_pd(_mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664), _mm256_mul_pd(u_1, u_1))), _mm256_set_pd(-0.013888888888888888, -0.013888888888888888, -0.013888888888888888, -0.013888888888888888)))))), forceTerm_18), xi_19));
         }
         for (int64_t ctr_0 = (int64_t)((_size_force_0) / (4)) * (4); ctr_0 < _size_force_0; ctr_0 += 1) {
-          const double xi_25 = _data_pdfs_20_34_10[ctr_0];
-          const double xi_26 = _data_pdfs_20_36_10[ctr_0];
-          const double xi_27 = _data_pdfs_20_315_10[ctr_0];
-          const double xi_28 = _data_pdfs_20_310_10[ctr_0];
-          const double xi_29 = _data_pdfs_20_312_10[ctr_0];
-          const double xi_30 = _data_pdfs_20_318_10[ctr_0];
-          const double xi_31 = _data_pdfs_20_39_10[ctr_0];
-          const double xi_32 = _data_pdfs_20_31_10[ctr_0];
-          const double xi_33 = _data_pdfs_20_37_10[ctr_0];
-          const double xi_34 = _data_pdfs_20_30_10[ctr_0];
-          const double xi_35 = _data_force_20_31_10[ctr_0];
-          const double xi_36 = _data_pdfs_20_316_10[ctr_0];
-          const double xi_37 = _data_pdfs_20_38_10[ctr_0];
-          const double xi_38 = _data_pdfs_20_313_10[ctr_0];
-          const double xi_39 = _data_pdfs_20_33_10[ctr_0];
-          const double xi_40 = _data_force_20_32_10[ctr_0];
-          const double xi_41 = _data_pdfs_20_314_10[ctr_0];
-          const double xi_42 = _data_force_20_30_10[ctr_0];
-          const double xi_43 = _data_pdfs_20_317_10[ctr_0];
-          const double xi_44 = _data_pdfs_20_311_10[ctr_0];
-          const double xi_45 = _data_pdfs_20_32_10[ctr_0];
-          const double xi_46 = _data_pdfs_20_35_10[ctr_0];
-          const double xi_3 = xi_25;
-          const double xi_4 = xi_26;
-          const double xi_5 = xi_27;
-          const double xi_6 = xi_28;
-          const double xi_7 = xi_29;
-          const double xi_8 = xi_30;
-          const double xi_9 = xi_31;
-          const double xi_10 = xi_32;
-          const double xi_11 = xi_33;
-          const double xi_12 = xi_34;
-          const double xi_13 = xi_35;
-          const double xi_14 = xi_36;
-          const double xi_15 = xi_37;
-          const double xi_16 = xi_38;
-          const double xi_17 = xi_39;
-          const double xi_18 = xi_40;
-          const double xi_19 = xi_41;
-          const double xi_20 = xi_42;
-          const double xi_21 = xi_43;
-          const double xi_22 = xi_44;
-          const double xi_23 = xi_45;
-          const double xi_24 = xi_46;
-          const double vel0Term = xi_15 + xi_19 + xi_3 + xi_6 + xi_8;
-          const double vel1Term = xi_10 + xi_11 + xi_22 + xi_5;
-          const double vel2Term = xi_16 + xi_24 + xi_7;
-          const double rho = vel0Term + vel1Term + vel2Term + xi_12 + xi_14 + xi_17 + xi_21 + xi_23 + xi_4 + xi_9;
+          const double xi_25 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3 + ctr_0];
+          const double xi_26 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3 + ctr_0];
+          const double xi_27 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3 + ctr_0];
+          const double xi_28 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3 + ctr_0];
+          const double xi_29 = _data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + ctr_0];
+          const double xi_30 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3 + ctr_0];
+          const double xi_31 = _data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3 + ctr_0];
+          const double xi_32 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3 + ctr_0];
+          const double xi_33 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3 + ctr_0];
+          const double xi_34 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3 + ctr_0];
+          const double xi_35 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3 + ctr_0];
+          const double xi_36 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3 + ctr_0];
+          const double xi_37 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3 + ctr_0];
+          const double xi_38 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3 + ctr_0];
+          const double xi_39 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3 + ctr_0];
+          const double xi_40 = _data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3 + ctr_0];
+          const double xi_41 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3 + ctr_0];
+          const double xi_42 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3 + ctr_0];
+          const double xi_43 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3 + ctr_0];
+          const double xi_44 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3 + ctr_0];
+          const double xi_45 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + ctr_0];
+          const double xi_46 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3 + ctr_0];
+          const double xi_3 = xi_40;
+          const double xi_4 = xi_44;
+          const double xi_5 = xi_37;
+          const double xi_6 = xi_35;
+          const double xi_7 = xi_46;
+          const double xi_8 = xi_43;
+          const double xi_9 = xi_28;
+          const double xi_10 = xi_30;
+          const double xi_11 = xi_41;
+          const double xi_12 = xi_29;
+          const double xi_13 = xi_45;
+          const double xi_14 = xi_38;
+          const double xi_15 = xi_42;
+          const double xi_16 = xi_39;
+          const double xi_17 = xi_34;
+          const double xi_18 = xi_36;
+          const double xi_19 = xi_27;
+          const double xi_20 = xi_31;
+          const double xi_21 = xi_32;
+          const double xi_22 = xi_26;
+          const double xi_23 = xi_33;
+          const double xi_24 = xi_25;
+          const double vel0Term = xi_10 + xi_19 + xi_21 + xi_22 + xi_9;
+          const double vel1Term = xi_18 + xi_24 + xi_4 + xi_6;
+          const double vel2Term = xi_16 + xi_23 + xi_7;
+          const double rho = vel0Term + vel1Term + vel2Term + xi_11 + xi_13 + xi_14 + xi_15 + xi_17 + xi_5 + xi_8;
           const double xi_1 = ((1.0) / (rho));
-          const double u_0 = xi_1 * xi_20 * 0.5 + xi_1 * (vel0Term + xi_11 * -1.0 + xi_16 * -1.0 + xi_17 * -1.0 + xi_21 * -1.0 + xi_9 * -1.0);
-          const double u_1 = xi_1 * xi_13 * 0.5 + xi_1 * (vel1Term + xi_14 * -1.0 + xi_15 + xi_23 * -1.0 + xi_6 * -1.0 + xi_7 * -1.0 + xi_9 * -1.0);
-          const double u_2 = xi_1 * xi_18 * 0.5 + xi_1 * (vel2Term + xi_14 * -1.0 + xi_19 + xi_21 * -1.0 + xi_22 + xi_4 * -1.0 + xi_5 * -1.0 + xi_8 * -1.0);
-          const double forceTerm_0 = omega_shear * u_0 * xi_20 * 0.5 + omega_shear * u_1 * xi_13 * 0.5 + omega_shear * u_2 * xi_18 * 0.5 + u_0 * xi_20 * -1.0 + u_1 * xi_13 * -1.0 + u_2 * xi_18 * -1.0;
-          const double forceTerm_1 = omega_shear * u_0 * xi_20 * 0.083333333333333329 + omega_shear * u_1 * xi_13 * -0.16666666666666666 + omega_shear * u_2 * xi_18 * 0.083333333333333329 + rr_0 * xi_13 * -0.083333333333333329 + u_0 * xi_20 * -0.16666666666666666 + u_1 * xi_13 * 0.33333333333333331 + u_2 * xi_18 * -0.16666666666666666 + xi_13 * 0.16666666666666666;
-          const double forceTerm_2 = omega_shear * u_0 * xi_20 * 0.083333333333333329 + omega_shear * u_1 * xi_13 * -0.16666666666666666 + omega_shear * u_2 * xi_18 * 0.083333333333333329 + rr_0 * xi_13 * 0.083333333333333329 + u_0 * xi_20 * -0.16666666666666666 + u_1 * xi_13 * 0.33333333333333331 + u_2 * xi_18 * -0.16666666666666666 + xi_13 * -0.16666666666666666;
-          const double forceTerm_3 = omega_shear * u_0 * xi_20 * -0.16666666666666666 + omega_shear * u_1 * xi_13 * 0.083333333333333329 + omega_shear * u_2 * xi_18 * 0.083333333333333329 + rr_0 * xi_20 * 0.083333333333333329 + u_0 * xi_20 * 0.33333333333333331 + u_1 * xi_13 * -0.16666666666666666 + u_2 * xi_18 * -0.16666666666666666 + xi_20 * -0.16666666666666666;
-          const double forceTerm_4 = omega_shear * u_0 * xi_20 * -0.16666666666666666 + omega_shear * u_1 * xi_13 * 0.083333333333333329 + omega_shear * u_2 * xi_18 * 0.083333333333333329 + rr_0 * xi_20 * -0.083333333333333329 + u_0 * xi_20 * 0.33333333333333331 + u_1 * xi_13 * -0.16666666666666666 + u_2 * xi_18 * -0.16666666666666666 + xi_20 * 0.16666666666666666;
-          const double forceTerm_5 = omega_shear * u_0 * xi_20 * 0.083333333333333329 + omega_shear * u_1 * xi_13 * 0.083333333333333329 + omega_shear * u_2 * xi_18 * -0.16666666666666666 + rr_0 * xi_18 * -0.083333333333333329 + u_0 * xi_20 * -0.16666666666666666 + u_1 * xi_13 * -0.16666666666666666 + u_2 * xi_18 * 0.33333333333333331 + xi_18 * 0.16666666666666666;
-          const double forceTerm_6 = omega_shear * u_0 * xi_20 * 0.083333333333333329 + omega_shear * u_1 * xi_13 * 0.083333333333333329 + omega_shear * u_2 * xi_18 * -0.16666666666666666 + rr_0 * xi_18 * 0.083333333333333329 + u_0 * xi_20 * -0.16666666666666666 + u_1 * xi_13 * -0.16666666666666666 + u_2 * xi_18 * 0.33333333333333331 + xi_18 * -0.16666666666666666;
-          const double forceTerm_7 = omega_shear * u_0 * xi_13 * 0.125 + omega_shear * u_0 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_13 * -0.083333333333333329 + omega_shear * u_1 * xi_20 * 0.125 + omega_shear * u_2 * xi_18 * 0.041666666666666664 + rr_0 * xi_13 * -0.041666666666666664 + rr_0 * xi_20 * 0.041666666666666664 + u_0 * xi_13 * -0.25 + u_0 * xi_20 * 0.16666666666666666 + u_1 * xi_13 * 0.16666666666666666 + u_1 * xi_20 * -0.25 + u_2 * xi_18 * -0.083333333333333329 + xi_13 * 0.083333333333333329 + xi_20 * -0.083333333333333329;
-          const double forceTerm_8 = omega_shear * u_0 * xi_13 * -0.125 + omega_shear * u_0 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_13 * -0.083333333333333329 + omega_shear * u_1 * xi_20 * -0.125 + omega_shear * u_2 * xi_18 * 0.041666666666666664 + rr_0 * xi_13 * -0.041666666666666664 + rr_0 * xi_20 * -0.041666666666666664 + u_0 * xi_13 * 0.25 + u_0 * xi_20 * 0.16666666666666666 + u_1 * xi_13 * 0.16666666666666666 + u_1 * xi_20 * 0.25 + u_2 * xi_18 * -0.083333333333333329 + xi_13 * 0.083333333333333329 + xi_20 * 0.083333333333333329;
-          const double forceTerm_9 = omega_shear * u_0 * xi_13 * -0.125 + omega_shear * u_0 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_13 * -0.083333333333333329 + omega_shear * u_1 * xi_20 * -0.125 + omega_shear * u_2 * xi_18 * 0.041666666666666664 + rr_0 * xi_13 * 0.041666666666666664 + rr_0 * xi_20 * 0.041666666666666664 + u_0 * xi_13 * 0.25 + u_0 * xi_20 * 0.16666666666666666 + u_1 * xi_13 * 0.16666666666666666 + u_1 * xi_20 * 0.25 + u_2 * xi_18 * -0.083333333333333329 + xi_13 * -0.083333333333333329 + xi_20 * -0.083333333333333329;
-          const double forceTerm_10 = omega_shear * u_0 * xi_13 * 0.125 + omega_shear * u_0 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_13 * -0.083333333333333329 + omega_shear * u_1 * xi_20 * 0.125 + omega_shear * u_2 * xi_18 * 0.041666666666666664 + rr_0 * xi_13 * 0.041666666666666664 + rr_0 * xi_20 * -0.041666666666666664 + u_0 * xi_13 * -0.25 + u_0 * xi_20 * 0.16666666666666666 + u_1 * xi_13 * 0.16666666666666666 + u_1 * xi_20 * -0.25 + u_2 * xi_18 * -0.083333333333333329 + xi_13 * -0.083333333333333329 + xi_20 * 0.083333333333333329;
-          const double forceTerm_11 = omega_shear * u_0 * xi_20 * 0.041666666666666664 + omega_shear * u_1 * xi_13 * -0.083333333333333329 + omega_shear * u_1 * xi_18 * -0.125 + omega_shear * u_2 * xi_13 * -0.125 + omega_shear * u_2 * xi_18 * -0.083333333333333329 + rr_0 * xi_13 * -0.041666666666666664 + rr_0 * xi_18 * -0.041666666666666664 + u_0 * xi_20 * -0.083333333333333329 + u_1 * xi_13 * 0.16666666666666666 + u_1 * xi_18 * 0.25 + u_2 * xi_13 * 0.25 + u_2 * xi_18 * 0.16666666666666666 + xi_13 * 0.083333333333333329 + xi_18 * 0.083333333333333329;
-          const double forceTerm_12 = omega_shear * u_0 * xi_20 * 0.041666666666666664 + omega_shear * u_1 * xi_13 * -0.083333333333333329 + omega_shear * u_1 * xi_18 * 0.125 + omega_shear * u_2 * xi_13 * 0.125 + omega_shear * u_2 * xi_18 * -0.083333333333333329 + rr_0 * xi_13 * 0.041666666666666664 + rr_0 * xi_18 * -0.041666666666666664 + u_0 * xi_20 * -0.083333333333333329 + u_1 * xi_13 * 0.16666666666666666 + u_1 * xi_18 * -0.25 + u_2 * xi_13 * -0.25 + u_2 * xi_18 * 0.16666666666666666 + xi_13 * -0.083333333333333329 + xi_18 * 0.083333333333333329;
-          const double forceTerm_13 = omega_shear * u_0 * xi_18 * 0.125 + omega_shear * u_0 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_13 * 0.041666666666666664 + omega_shear * u_2 * xi_18 * -0.083333333333333329 + omega_shear * u_2 * xi_20 * 0.125 + rr_0 * xi_18 * -0.041666666666666664 + rr_0 * xi_20 * 0.041666666666666664 + u_0 * xi_18 * -0.25 + u_0 * xi_20 * 0.16666666666666666 + u_1 * xi_13 * -0.083333333333333329 + u_2 * xi_18 * 0.16666666666666666 + u_2 * xi_20 * -0.25 + xi_18 * 0.083333333333333329 + xi_20 * -0.083333333333333329;
-          const double forceTerm_14 = omega_shear * u_0 * xi_18 * -0.125 + omega_shear * u_0 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_13 * 0.041666666666666664 + omega_shear * u_2 * xi_18 * -0.083333333333333329 + omega_shear * u_2 * xi_20 * -0.125 + rr_0 * xi_18 * -0.041666666666666664 + rr_0 * xi_20 * -0.041666666666666664 + u_0 * xi_18 * 0.25 + u_0 * xi_20 * 0.16666666666666666 + u_1 * xi_13 * -0.083333333333333329 + u_2 * xi_18 * 0.16666666666666666 + u_2 * xi_20 * 0.25 + xi_18 * 0.083333333333333329 + xi_20 * 0.083333333333333329;
-          const double forceTerm_15 = omega_shear * u_0 * xi_20 * 0.041666666666666664 + omega_shear * u_1 * xi_13 * -0.083333333333333329 + omega_shear * u_1 * xi_18 * 0.125 + omega_shear * u_2 * xi_13 * 0.125 + omega_shear * u_2 * xi_18 * -0.083333333333333329 + rr_0 * xi_13 * -0.041666666666666664 + rr_0 * xi_18 * 0.041666666666666664 + u_0 * xi_20 * -0.083333333333333329 + u_1 * xi_13 * 0.16666666666666666 + u_1 * xi_18 * -0.25 + u_2 * xi_13 * -0.25 + u_2 * xi_18 * 0.16666666666666666 + xi_13 * 0.083333333333333329 + xi_18 * -0.083333333333333329;
-          const double forceTerm_16 = omega_shear * u_0 * xi_20 * 0.041666666666666664 + omega_shear * u_1 * xi_13 * -0.083333333333333329 + omega_shear * u_1 * xi_18 * -0.125 + omega_shear * u_2 * xi_13 * -0.125 + omega_shear * u_2 * xi_18 * -0.083333333333333329 + rr_0 * xi_13 * 0.041666666666666664 + rr_0 * xi_18 * 0.041666666666666664 + u_0 * xi_20 * -0.083333333333333329 + u_1 * xi_13 * 0.16666666666666666 + u_1 * xi_18 * 0.25 + u_2 * xi_13 * 0.25 + u_2 * xi_18 * 0.16666666666666666 + xi_13 * -0.083333333333333329 + xi_18 * -0.083333333333333329;
-          const double forceTerm_17 = omega_shear * u_0 * xi_18 * -0.125 + omega_shear * u_0 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_13 * 0.041666666666666664 + omega_shear * u_2 * xi_18 * -0.083333333333333329 + omega_shear * u_2 * xi_20 * -0.125 + rr_0 * xi_18 * 0.041666666666666664 + rr_0 * xi_20 * 0.041666666666666664 + u_0 * xi_18 * 0.25 + u_0 * xi_20 * 0.16666666666666666 + u_1 * xi_13 * -0.083333333333333329 + u_2 * xi_18 * 0.16666666666666666 + u_2 * xi_20 * 0.25 + xi_18 * -0.083333333333333329 + xi_20 * -0.083333333333333329;
-          const double forceTerm_18 = omega_shear * u_0 * xi_18 * 0.125 + omega_shear * u_0 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_13 * 0.041666666666666664 + omega_shear * u_2 * xi_18 * -0.083333333333333329 + omega_shear * u_2 * xi_20 * 0.125 + rr_0 * xi_18 * 0.041666666666666664 + rr_0 * xi_20 * -0.041666666666666664 + u_0 * xi_18 * -0.25 + u_0 * xi_20 * 0.16666666666666666 + u_1 * xi_13 * -0.083333333333333329 + u_2 * xi_18 * 0.16666666666666666 + u_2 * xi_20 * -0.25 + xi_18 * -0.083333333333333329 + xi_20 * 0.083333333333333329;
-          const double u0Mu1 = u_0 + u_1 * -1.0;
+          const double u_0 = xi_1 * xi_12 * 0.5 + xi_1 * (vel0Term - xi_14 - xi_15 - xi_23 - xi_5 - xi_6);
+          const double u_1 = xi_1 * xi_20 * 0.5 + xi_1 * (vel1Term - xi_11 - xi_14 - xi_17 + xi_22 - xi_7 - xi_9);
+          const double u_2 = xi_1 * xi_3 * 0.5 + xi_1 * (vel2Term - xi_15 - xi_17 - xi_18 - xi_19 + xi_21 + xi_4 - xi_8);
+          const double forceTerm_0 = omega_shear * u_0 * xi_12 * 0.5 + omega_shear * u_1 * xi_20 * 0.5 + omega_shear * u_2 * xi_3 * 0.5 - u_0 * xi_12 - u_1 * xi_20 - u_2 * xi_3;
+          const double forceTerm_1 = omega_shear * u_0 * xi_12 * 0.083333333333333329 + omega_shear * u_1 * xi_20 * -0.16666666666666666 + omega_shear * u_2 * xi_3 * 0.083333333333333329 + rr_0 * xi_20 * -0.083333333333333329 + u_0 * xi_12 * -0.16666666666666666 + u_1 * xi_20 * 0.33333333333333331 + u_2 * xi_3 * -0.16666666666666666 + xi_20 * 0.16666666666666666;
+          const double forceTerm_2 = omega_shear * u_0 * xi_12 * 0.083333333333333329 + omega_shear * u_1 * xi_20 * -0.16666666666666666 + omega_shear * u_2 * xi_3 * 0.083333333333333329 + rr_0 * xi_20 * 0.083333333333333329 + u_0 * xi_12 * -0.16666666666666666 + u_1 * xi_20 * 0.33333333333333331 + u_2 * xi_3 * -0.16666666666666666 + xi_20 * -0.16666666666666666;
+          const double forceTerm_3 = omega_shear * u_0 * xi_12 * -0.16666666666666666 + omega_shear * u_1 * xi_20 * 0.083333333333333329 + omega_shear * u_2 * xi_3 * 0.083333333333333329 + rr_0 * xi_12 * 0.083333333333333329 + u_0 * xi_12 * 0.33333333333333331 + u_1 * xi_20 * -0.16666666666666666 + u_2 * xi_3 * -0.16666666666666666 + xi_12 * -0.16666666666666666;
+          const double forceTerm_4 = omega_shear * u_0 * xi_12 * -0.16666666666666666 + omega_shear * u_1 * xi_20 * 0.083333333333333329 + omega_shear * u_2 * xi_3 * 0.083333333333333329 + rr_0 * xi_12 * -0.083333333333333329 + u_0 * xi_12 * 0.33333333333333331 + u_1 * xi_20 * -0.16666666666666666 + u_2 * xi_3 * -0.16666666666666666 + xi_12 * 0.16666666666666666;
+          const double forceTerm_5 = omega_shear * u_0 * xi_12 * 0.083333333333333329 + omega_shear * u_1 * xi_20 * 0.083333333333333329 + omega_shear * u_2 * xi_3 * -0.16666666666666666 + rr_0 * xi_3 * -0.083333333333333329 + u_0 * xi_12 * -0.16666666666666666 + u_1 * xi_20 * -0.16666666666666666 + u_2 * xi_3 * 0.33333333333333331 + xi_3 * 0.16666666666666666;
+          const double forceTerm_6 = omega_shear * u_0 * xi_12 * 0.083333333333333329 + omega_shear * u_1 * xi_20 * 0.083333333333333329 + omega_shear * u_2 * xi_3 * -0.16666666666666666 + rr_0 * xi_3 * 0.083333333333333329 + u_0 * xi_12 * -0.16666666666666666 + u_1 * xi_20 * -0.16666666666666666 + u_2 * xi_3 * 0.33333333333333331 + xi_3 * -0.16666666666666666;
+          const double forceTerm_7 = omega_shear * u_0 * xi_12 * -0.083333333333333329 + omega_shear * u_0 * xi_20 * 0.125 + omega_shear * u_1 * xi_12 * 0.125 + omega_shear * u_1 * xi_20 * -0.083333333333333329 + omega_shear * u_2 * xi_3 * 0.041666666666666664 + rr_0 * xi_12 * 0.041666666666666664 + rr_0 * xi_20 * -0.041666666666666664 + u_0 * xi_12 * 0.16666666666666666 + u_0 * xi_20 * -0.25 + u_1 * xi_12 * -0.25 + u_1 * xi_20 * 0.16666666666666666 + u_2 * xi_3 * -0.083333333333333329 + xi_12 * -0.083333333333333329 + xi_20 * 0.083333333333333329;
+          const double forceTerm_8 = omega_shear * u_0 * xi_12 * -0.083333333333333329 + omega_shear * u_0 * xi_20 * -0.125 + omega_shear * u_1 * xi_12 * -0.125 + omega_shear * u_1 * xi_20 * -0.083333333333333329 + omega_shear * u_2 * xi_3 * 0.041666666666666664 + rr_0 * xi_12 * -0.041666666666666664 + rr_0 * xi_20 * -0.041666666666666664 + u_0 * xi_12 * 0.16666666666666666 + u_0 * xi_20 * 0.25 + u_1 * xi_12 * 0.25 + u_1 * xi_20 * 0.16666666666666666 + u_2 * xi_3 * -0.083333333333333329 + xi_12 * 0.083333333333333329 + xi_20 * 0.083333333333333329;
+          const double forceTerm_9 = omega_shear * u_0 * xi_12 * -0.083333333333333329 + omega_shear * u_0 * xi_20 * -0.125 + omega_shear * u_1 * xi_12 * -0.125 + omega_shear * u_1 * xi_20 * -0.083333333333333329 + omega_shear * u_2 * xi_3 * 0.041666666666666664 + rr_0 * xi_12 * 0.041666666666666664 + rr_0 * xi_20 * 0.041666666666666664 + u_0 * xi_12 * 0.16666666666666666 + u_0 * xi_20 * 0.25 + u_1 * xi_12 * 0.25 + u_1 * xi_20 * 0.16666666666666666 + u_2 * xi_3 * -0.083333333333333329 + xi_12 * -0.083333333333333329 + xi_20 * -0.083333333333333329;
+          const double forceTerm_10 = omega_shear * u_0 * xi_12 * -0.083333333333333329 + omega_shear * u_0 * xi_20 * 0.125 + omega_shear * u_1 * xi_12 * 0.125 + omega_shear * u_1 * xi_20 * -0.083333333333333329 + omega_shear * u_2 * xi_3 * 0.041666666666666664 + rr_0 * xi_12 * -0.041666666666666664 + rr_0 * xi_20 * 0.041666666666666664 + u_0 * xi_12 * 0.16666666666666666 + u_0 * xi_20 * -0.25 + u_1 * xi_12 * -0.25 + u_1 * xi_20 * 0.16666666666666666 + u_2 * xi_3 * -0.083333333333333329 + xi_12 * 0.083333333333333329 + xi_20 * -0.083333333333333329;
+          const double forceTerm_11 = omega_shear * u_0 * xi_12 * 0.041666666666666664 + omega_shear * u_1 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_3 * -0.125 + omega_shear * u_2 * xi_20 * -0.125 + omega_shear * u_2 * xi_3 * -0.083333333333333329 + rr_0 * xi_20 * -0.041666666666666664 + rr_0 * xi_3 * -0.041666666666666664 + u_0 * xi_12 * -0.083333333333333329 + u_1 * xi_20 * 0.16666666666666666 + u_1 * xi_3 * 0.25 + u_2 * xi_20 * 0.25 + u_2 * xi_3 * 0.16666666666666666 + xi_20 * 0.083333333333333329 + xi_3 * 0.083333333333333329;
+          const double forceTerm_12 = omega_shear * u_0 * xi_12 * 0.041666666666666664 + omega_shear * u_1 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_3 * 0.125 + omega_shear * u_2 * xi_20 * 0.125 + omega_shear * u_2 * xi_3 * -0.083333333333333329 + rr_0 * xi_20 * 0.041666666666666664 + rr_0 * xi_3 * -0.041666666666666664 + u_0 * xi_12 * -0.083333333333333329 + u_1 * xi_20 * 0.16666666666666666 + u_1 * xi_3 * -0.25 + u_2 * xi_20 * -0.25 + u_2 * xi_3 * 0.16666666666666666 + xi_20 * -0.083333333333333329 + xi_3 * 0.083333333333333329;
+          const double forceTerm_13 = omega_shear * u_0 * xi_12 * -0.083333333333333329 + omega_shear * u_0 * xi_3 * 0.125 + omega_shear * u_1 * xi_20 * 0.041666666666666664 + omega_shear * u_2 * xi_12 * 0.125 + omega_shear * u_2 * xi_3 * -0.083333333333333329 + rr_0 * xi_12 * 0.041666666666666664 + rr_0 * xi_3 * -0.041666666666666664 + u_0 * xi_12 * 0.16666666666666666 + u_0 * xi_3 * -0.25 + u_1 * xi_20 * -0.083333333333333329 + u_2 * xi_12 * -0.25 + u_2 * xi_3 * 0.16666666666666666 + xi_12 * -0.083333333333333329 + xi_3 * 0.083333333333333329;
+          const double forceTerm_14 = omega_shear * u_0 * xi_12 * -0.083333333333333329 + omega_shear * u_0 * xi_3 * -0.125 + omega_shear * u_1 * xi_20 * 0.041666666666666664 + omega_shear * u_2 * xi_12 * -0.125 + omega_shear * u_2 * xi_3 * -0.083333333333333329 + rr_0 * xi_12 * -0.041666666666666664 + rr_0 * xi_3 * -0.041666666666666664 + u_0 * xi_12 * 0.16666666666666666 + u_0 * xi_3 * 0.25 + u_1 * xi_20 * -0.083333333333333329 + u_2 * xi_12 * 0.25 + u_2 * xi_3 * 0.16666666666666666 + xi_12 * 0.083333333333333329 + xi_3 * 0.083333333333333329;
+          const double forceTerm_15 = omega_shear * u_0 * xi_12 * 0.041666666666666664 + omega_shear * u_1 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_3 * 0.125 + omega_shear * u_2 * xi_20 * 0.125 + omega_shear * u_2 * xi_3 * -0.083333333333333329 + rr_0 * xi_20 * -0.041666666666666664 + rr_0 * xi_3 * 0.041666666666666664 + u_0 * xi_12 * -0.083333333333333329 + u_1 * xi_20 * 0.16666666666666666 + u_1 * xi_3 * -0.25 + u_2 * xi_20 * -0.25 + u_2 * xi_3 * 0.16666666666666666 + xi_20 * 0.083333333333333329 + xi_3 * -0.083333333333333329;
+          const double forceTerm_16 = omega_shear * u_0 * xi_12 * 0.041666666666666664 + omega_shear * u_1 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_3 * -0.125 + omega_shear * u_2 * xi_20 * -0.125 + omega_shear * u_2 * xi_3 * -0.083333333333333329 + rr_0 * xi_20 * 0.041666666666666664 + rr_0 * xi_3 * 0.041666666666666664 + u_0 * xi_12 * -0.083333333333333329 + u_1 * xi_20 * 0.16666666666666666 + u_1 * xi_3 * 0.25 + u_2 * xi_20 * 0.25 + u_2 * xi_3 * 0.16666666666666666 + xi_20 * -0.083333333333333329 + xi_3 * -0.083333333333333329;
+          const double forceTerm_17 = omega_shear * u_0 * xi_12 * -0.083333333333333329 + omega_shear * u_0 * xi_3 * -0.125 + omega_shear * u_1 * xi_20 * 0.041666666666666664 + omega_shear * u_2 * xi_12 * -0.125 + omega_shear * u_2 * xi_3 * -0.083333333333333329 + rr_0 * xi_12 * 0.041666666666666664 + rr_0 * xi_3 * 0.041666666666666664 + u_0 * xi_12 * 0.16666666666666666 + u_0 * xi_3 * 0.25 + u_1 * xi_20 * -0.083333333333333329 + u_2 * xi_12 * 0.25 + u_2 * xi_3 * 0.16666666666666666 + xi_12 * -0.083333333333333329 + xi_3 * -0.083333333333333329;
+          const double forceTerm_18 = omega_shear * u_0 * xi_12 * -0.083333333333333329 + omega_shear * u_0 * xi_3 * 0.125 + omega_shear * u_1 * xi_20 * 0.041666666666666664 + omega_shear * u_2 * xi_12 * 0.125 + omega_shear * u_2 * xi_3 * -0.083333333333333329 + rr_0 * xi_12 * -0.041666666666666664 + rr_0 * xi_3 * 0.041666666666666664 + u_0 * xi_12 * 0.16666666666666666 + u_0 * xi_3 * -0.25 + u_1 * xi_20 * -0.083333333333333329 + u_2 * xi_12 * -0.25 + u_2 * xi_3 * 0.16666666666666666 + xi_12 * 0.083333333333333329 + xi_3 * -0.083333333333333329;
+          const double u0Mu1 = u_0 - u_1;
           const double u0Pu1 = u_0 + u_1;
           const double u1Pu2 = u_1 + u_2;
-          const double u1Mu2 = u_1 + u_2 * -1.0;
-          const double u0Mu2 = u_0 + u_2 * -1.0;
+          const double u1Mu2 = u_1 - u_2;
+          const double u0Mu2 = u_0 - u_2;
           const double u0Pu2 = u_0 + u_2;
-          const double f_eq_common = rho * -1.0 * u_0 * u_0 + rho * -1.0 * u_1 * u_1 + rho * -1.0 * u_2 * u_2 + rho;
-          _data_pdfs_20_30_10[ctr_0] = forceTerm_0 + omega_shear * (f_eq_common * 0.33333333333333331 + xi_12 * -1.0) + xi_12;
-          _data_pdfs_20_31_10[ctr_0] = forceTerm_1 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * u_1 * u_1) + xi_10 * -0.5 + xi_23 * -0.5) + rr_0 * (rho * u_1 * 0.16666666666666666 + xi_10 * -0.5 + xi_23 * 0.5) + xi_10 + ((-1.0 <= grid_size * -1.0 + ((double)(ctr_1))) ? (rho * v_s * (u_0 * 2.0 + v_s) * 0.16666666666666666) : (0.0));
-          _data_pdfs_20_32_10[ctr_0] = forceTerm_2 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * u_1 * u_1) + xi_10 * -0.5 + xi_23 * -0.5) + rr_0 * (rho * u_1 * -0.16666666666666666 + xi_10 * 0.5 + xi_23 * -0.5) + xi_23 + ((0.0 >= ((double)(ctr_1))) ? (rho * v_s * (u_0 * -2.0 + v_s) * 0.16666666666666666) : (0.0));
-          _data_pdfs_20_33_10[ctr_0] = forceTerm_3 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * u_0 * u_0) + xi_17 * -0.5 + xi_3 * -0.5) + rr_0 * (rho * u_0 * -0.16666666666666666 + xi_17 * -0.5 + xi_3 * 0.5) + xi_17;
-          _data_pdfs_20_34_10[ctr_0] = forceTerm_4 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * u_0 * u_0) + xi_17 * -0.5 + xi_3 * -0.5) + rr_0 * (rho * u_0 * 0.16666666666666666 + xi_17 * 0.5 + xi_3 * -0.5) + xi_3;
-          _data_pdfs_20_35_10[ctr_0] = forceTerm_5 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * u_2 * u_2) + xi_24 * -0.5 + xi_4 * -0.5) + rr_0 * (rho * u_2 * 0.16666666666666666 + xi_24 * -0.5 + xi_4 * 0.5) + xi_24;
-          _data_pdfs_20_36_10[ctr_0] = forceTerm_6 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * u_2 * u_2) + xi_24 * -0.5 + xi_4 * -0.5) + rr_0 * (rho * u_2 * -0.16666666666666666 + xi_24 * 0.5 + xi_4 * -0.5) + xi_4;
-          _data_pdfs_20_37_10[ctr_0] = forceTerm_7 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * u_2 * u_2 + 0.125 * u0Mu1 * u0Mu1) + xi_11 * -0.5 + xi_6 * -0.5) + rr_0 * (rho * u0Mu1 * -0.083333333333333329 + xi_11 * -0.5 + xi_6 * 0.5) + xi_11 + ((-1.0 <= grid_size * -1.0 + ((double)(ctr_1))) ? (rho * v_s * (u_0 * -2.0 + u_1 * 3.0 + v_s * -1.0 + 1.0) * 0.083333333333333329) : (0.0));
-          _data_pdfs_20_38_10[ctr_0] = forceTerm_8 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * u_2 * u_2 + 0.125 * u0Pu1 * u0Pu1) + xi_15 * -0.5 + xi_9 * -0.5) + rr_0 * (rho * u0Pu1 * 0.083333333333333329 + xi_15 * -0.5 + xi_9 * 0.5) + xi_15 + ((-1.0 <= grid_size * -1.0 + ((double)(ctr_1))) ? (rho * v_s * (u_0 * 2.0 + u_1 * 3.0 + v_s + 1.0) * -0.083333333333333329) : (0.0));
-          _data_pdfs_20_39_10[ctr_0] = forceTerm_9 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * u_2 * u_2 + 0.125 * u0Pu1 * u0Pu1) + xi_15 * -0.5 + xi_9 * -0.5) + rr_0 * (rho * u0Pu1 * -0.083333333333333329 + xi_15 * 0.5 + xi_9 * -0.5) + xi_9 + ((0.0 >= ((double)(ctr_1))) ? (rho * v_s * (u_0 * 2.0 + u_1 * 3.0 + v_s * -1.0 - 1.0) * 0.083333333333333329) : (0.0));
-          _data_pdfs_20_310_10[ctr_0] = forceTerm_10 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * u_2 * u_2 + 0.125 * u0Mu1 * u0Mu1) + xi_11 * -0.5 + xi_6 * -0.5) + rr_0 * (rho * u0Mu1 * 0.083333333333333329 + xi_11 * 0.5 + xi_6 * -0.5) + xi_6 + ((0.0 >= ((double)(ctr_1))) ? (rho * v_s * (u_0 * 2.0 + u_1 * -3.0 + v_s * -1.0 + 1.0) * 0.083333333333333329) : (0.0));
-          _data_pdfs_20_311_10[ctr_0] = forceTerm_11 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * u_0 * u_0 + 0.125 * u1Pu2 * u1Pu2) + xi_14 * -0.5 + xi_22 * -0.5) + rr_0 * (rho * u1Pu2 * 0.083333333333333329 + xi_14 * 0.5 + xi_22 * -0.5) + xi_22;
-          _data_pdfs_20_312_10[ctr_0] = forceTerm_12 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * u_0 * u_0 + 0.125 * u1Mu2 * u1Mu2) + xi_5 * -0.5 + xi_7 * -0.5) + rr_0 * (rho * u1Mu2 * -0.083333333333333329 + xi_5 * 0.5 + xi_7 * -0.5) + xi_7;
-          _data_pdfs_20_313_10[ctr_0] = forceTerm_13 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * u_1 * u_1 + 0.125 * u0Mu2 * u0Mu2) + xi_16 * -0.5 + xi_8 * -0.5) + rr_0 * (rho * u0Mu2 * -0.083333333333333329 + xi_16 * -0.5 + xi_8 * 0.5) + xi_16;
-          _data_pdfs_20_314_10[ctr_0] = forceTerm_14 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * u_1 * u_1 + 0.125 * u0Pu2 * u0Pu2) + xi_19 * -0.5 + xi_21 * -0.5) + rr_0 * (rho * u0Pu2 * 0.083333333333333329 + xi_19 * -0.5 + xi_21 * 0.5) + xi_19;
-          _data_pdfs_20_315_10[ctr_0] = forceTerm_15 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * u_0 * u_0 + 0.125 * u1Mu2 * u1Mu2) + xi_5 * -0.5 + xi_7 * -0.5) + rr_0 * (rho * u1Mu2 * 0.083333333333333329 + xi_5 * -0.5 + xi_7 * 0.5) + xi_5;
-          _data_pdfs_20_316_10[ctr_0] = forceTerm_16 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * u_0 * u_0 + 0.125 * u1Pu2 * u1Pu2) + xi_14 * -0.5 + xi_22 * -0.5) + rr_0 * (rho * u1Pu2 * -0.083333333333333329 + xi_14 * -0.5 + xi_22 * 0.5) + xi_14;
-          _data_pdfs_20_317_10[ctr_0] = forceTerm_17 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * u_1 * u_1 + 0.125 * u0Pu2 * u0Pu2) + xi_19 * -0.5 + xi_21 * -0.5) + rr_0 * (rho * u0Pu2 * -0.083333333333333329 + xi_19 * 0.5 + xi_21 * -0.5) + xi_21;
-          _data_pdfs_20_318_10[ctr_0] = forceTerm_18 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * u_1 * u_1 + 0.125 * u0Mu2 * u0Mu2) + xi_16 * -0.5 + xi_8 * -0.5) + rr_0 * (rho * u0Mu2 * 0.083333333333333329 + xi_16 * 0.5 + xi_8 * -0.5) + xi_8;
+          const double f_eq_common = rho - rho * (u_0 * u_0) - rho * (u_1 * u_1) - rho * (u_2 * u_2);
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + ctr_0] = forceTerm_0 + omega_shear * (f_eq_common * 0.33333333333333331 - xi_13) + xi_13;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3 + ctr_0] = forceTerm_1 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * (u_1 * u_1)) + xi_11 * -0.5 + xi_24 * -0.5) + rr_0 * (rho * u_1 * 0.16666666666666666 + xi_11 * 0.5 + xi_24 * -0.5) + xi_24 + ((-1.0 <= -grid_size + ((double)(ctr_1))) ? (rho * v_s * (u_0 * 2.0 + v_s) * 0.16666666666666666) : (0.0));
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3 + ctr_0] = forceTerm_2 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * (u_1 * u_1)) + xi_11 * -0.5 + xi_24 * -0.5) + rr_0 * (rho * u_1 * -0.16666666666666666 + xi_11 * -0.5 + xi_24 * 0.5) + xi_11 + ((0.0 >= ((double)(ctr_1))) ? (rho * v_s * (u_0 * -2.0 + v_s) * 0.16666666666666666) : (0.0));
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3 + ctr_0] = forceTerm_3 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * (u_0 * u_0)) + xi_10 * -0.5 + xi_5 * -0.5) + rr_0 * (rho * u_0 * -0.16666666666666666 + xi_10 * 0.5 + xi_5 * -0.5) + xi_5;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3 + ctr_0] = forceTerm_4 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * (u_0 * u_0)) + xi_10 * -0.5 + xi_5 * -0.5) + rr_0 * (rho * u_0 * 0.16666666666666666 + xi_10 * -0.5 + xi_5 * 0.5) + xi_10;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3 + ctr_0] = forceTerm_5 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * (u_2 * u_2)) + xi_16 * -0.5 + xi_8 * -0.5) + rr_0 * (rho * u_2 * 0.16666666666666666 + xi_16 * -0.5 + xi_8 * 0.5) + xi_16;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3 + ctr_0] = forceTerm_6 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * (u_2 * u_2)) + xi_16 * -0.5 + xi_8 * -0.5) + rr_0 * (rho * u_2 * -0.16666666666666666 + xi_16 * 0.5 + xi_8 * -0.5) + xi_8;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3 + ctr_0] = forceTerm_7 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_2 * u_2) + 0.125 * (u0Mu1 * u0Mu1)) + xi_6 * -0.5 + xi_9 * -0.5) + rr_0 * (rho * u0Mu1 * -0.083333333333333329 + xi_6 * -0.5 + xi_9 * 0.5) + xi_6 + ((-1.0 <= -grid_size + ((double)(ctr_1))) ? (rho * v_s * (u_0 * -2.0 + u_1 * 3.0 - v_s + 1.0) * 0.083333333333333329) : (0.0));
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3 + ctr_0] = forceTerm_8 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_2 * u_2) + 0.125 * (u0Pu1 * u0Pu1)) + xi_14 * -0.5 + xi_22 * -0.5) + rr_0 * (rho * u0Pu1 * 0.083333333333333329 + xi_14 * 0.5 + xi_22 * -0.5) + xi_22 + ((-1.0 <= -grid_size + ((double)(ctr_1))) ? (rho * v_s * (u_0 * 2.0 + u_1 * 3.0 + v_s + 1.0) * -0.083333333333333329) : (0.0));
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3 + ctr_0] = forceTerm_9 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_2 * u_2) + 0.125 * (u0Pu1 * u0Pu1)) + xi_14 * -0.5 + xi_22 * -0.5) + rr_0 * (rho * u0Pu1 * -0.083333333333333329 + xi_14 * -0.5 + xi_22 * 0.5) + xi_14 + ((0.0 >= ((double)(ctr_1))) ? (rho * v_s * (u_0 * 2.0 + u_1 * 3.0 - v_s - 1.0) * 0.083333333333333329) : (0.0));
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3 + ctr_0] = forceTerm_10 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_2 * u_2) + 0.125 * (u0Mu1 * u0Mu1)) + xi_6 * -0.5 + xi_9 * -0.5) + rr_0 * (rho * u0Mu1 * 0.083333333333333329 + xi_6 * 0.5 + xi_9 * -0.5) + xi_9 + ((0.0 >= ((double)(ctr_1))) ? (rho * v_s * (u_0 * 2.0 + u_1 * -3.0 - v_s + 1.0) * 0.083333333333333329) : (0.0));
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3 + ctr_0] = forceTerm_11 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_0 * u_0) + 0.125 * (u1Pu2 * u1Pu2)) + xi_17 * -0.5 + xi_4 * -0.5) + rr_0 * (rho * u1Pu2 * 0.083333333333333329 + xi_17 * 0.5 + xi_4 * -0.5) + xi_4;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3 + ctr_0] = forceTerm_12 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_0 * u_0) + 0.125 * (u1Mu2 * u1Mu2)) + xi_18 * -0.5 + xi_7 * -0.5) + rr_0 * (rho * u1Mu2 * -0.083333333333333329 + xi_18 * 0.5 + xi_7 * -0.5) + xi_7;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3 + ctr_0] = forceTerm_13 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_1 * u_1) + 0.125 * (u0Mu2 * u0Mu2)) + xi_19 * -0.5 + xi_23 * -0.5) + rr_0 * (rho * u0Mu2 * -0.083333333333333329 + xi_19 * 0.5 + xi_23 * -0.5) + xi_23;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3 + ctr_0] = forceTerm_14 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_1 * u_1) + 0.125 * (u0Pu2 * u0Pu2)) + xi_15 * -0.5 + xi_21 * -0.5) + rr_0 * (rho * u0Pu2 * 0.083333333333333329 + xi_15 * 0.5 + xi_21 * -0.5) + xi_21;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3 + ctr_0] = forceTerm_15 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_0 * u_0) + 0.125 * (u1Mu2 * u1Mu2)) + xi_18 * -0.5 + xi_7 * -0.5) + rr_0 * (rho * u1Mu2 * 0.083333333333333329 + xi_18 * -0.5 + xi_7 * 0.5) + xi_18;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3 + ctr_0] = forceTerm_16 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_0 * u_0) + 0.125 * (u1Pu2 * u1Pu2)) + xi_17 * -0.5 + xi_4 * -0.5) + rr_0 * (rho * u1Pu2 * -0.083333333333333329 + xi_17 * -0.5 + xi_4 * 0.5) + xi_17;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3 + ctr_0] = forceTerm_17 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_1 * u_1) + 0.125 * (u0Pu2 * u0Pu2)) + xi_15 * -0.5 + xi_21 * -0.5) + rr_0 * (rho * u0Pu2 * -0.083333333333333329 + xi_15 * -0.5 + xi_21 * 0.5) + xi_15;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3 + ctr_0] = forceTerm_18 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_1 * u_1) + 0.125 * (u0Mu2 * u0Mu2)) + xi_19 * -0.5 + xi_23 * -0.5) + rr_0 * (rho * u0Mu2 * 0.083333333333333329 + xi_19 * -0.5 + xi_23 * 0.5) + xi_19;
         }
       }
     }
@@ -304,32 +259,33 @@ static FUNC_PREFIX void collidesweepdoubleprecisionleesedwardsavx_collidesweepdo
 } // namespace internal_f11a519921c681cbc9d0b2f51454c920
 
 void CollideSweepDoublePrecisionLeesEdwardsAVX::run(IBlock *block) {
+
   auto force = block->getData<field::GhostLayerField<double, 3>>(forceID);
   auto pdfs = block->getData<field::GhostLayerField<double, 19>>(pdfsID);
 
+  auto &grid_size = this->grid_size_;
   auto &v_s = this->v_s_;
   auto &omega_shear = this->omega_shear_;
-  auto &grid_size = this->grid_size_;
-  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()))
   double *RESTRICT const _data_force = force->dataAt(0, 0, 0, 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()))
   double *RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(force->xSize()) + 0));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(force->xSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(force->ySize()) + 0));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(force->ySize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(force->zSize()) + 0));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(force->zSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(force->xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(force->xSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(force->ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(force->ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(force->zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(force->zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
   const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
@@ -340,6 +296,7 @@ void CollideSweepDoublePrecisionLeesEdwardsAVX::run(IBlock *block) {
 }
 
 void CollideSweepDoublePrecisionLeesEdwardsAVX::runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block) {
+
   CellInterval ci = globalCellInterval;
   CellInterval blockBB = blocks->getBlockCellBB(*block);
   blockBB.expand(ghostLayers);
@@ -351,33 +308,33 @@ void CollideSweepDoublePrecisionLeesEdwardsAVX::runOnCellInterval(const shared_p
   auto force = block->getData<field::GhostLayerField<double, 3>>(forceID);
   auto pdfs = block->getData<field::GhostLayerField<double, 19>>(pdfsID);
 
+  auto &grid_size = this->grid_size_;
   auto &v_s = this->v_s_;
   auto &omega_shear = this->omega_shear_;
-  auto &grid_size = this->grid_size_;
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()))
   double *RESTRICT const _data_force = force->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()))
   double *RESTRICT _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(ci.xSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(ci.ySize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(ci.zSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(ci.xSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(ci.ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(ci.zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
   const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
@@ -396,4 +353,4 @@ void CollideSweepDoublePrecisionLeesEdwardsAVX::runOnCellInterval(const shared_p
 
 #if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
 #pragma warning pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsAVX.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsAVX.h
index 272d0555606..dfd7bcdde63 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsAVX.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsAVX.h
@@ -17,12 +17,13 @@
 //! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2,
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
 // lbmpy_walberla/pystencils_walberla from waLBerla commit
-// 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #pragma once
 #include "core/DataTypes.h"
+#include "core/logging/Logging.h"
 
 #include "domain_decomposition/BlockDataID.h"
 #include "domain_decomposition/IBlock.h"
@@ -93,6 +94,9 @@ class CollideSweepDoublePrecisionLeesEdwardsAVX {
     };
   }
 
+  void configure(const shared_ptr<StructuredBlockStorage> &blocks,
+                 IBlock *block) {} // no-op: the body is empty and both arguments are ignored
+
   BlockDataID forceID;
   BlockDataID pdfsID;
   double grid_size_;
@@ -106,4 +110,4 @@ class CollideSweepDoublePrecisionLeesEdwardsAVX {
 #if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
     (defined WALBERLA_CXX_COMPILER_IS_CLANG)
 #pragma GCC diagnostic pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsCUDA.cu b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsCUDA.cu
new file mode 100644
index 00000000000..c5797f8990d
--- /dev/null
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsCUDA.cu
@@ -0,0 +1,250 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file CollideSweepDoublePrecisionLeesEdwardsCUDA.cu
+//! \\author pystencils
+//======================================================================================================================
+
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
+
+#include <cmath>
+
+#include "CollideSweepDoublePrecisionLeesEdwardsCUDA.h"
+#include "core/DataTypes.h"
+#include "core/Macros.h"
+
+#define FUNC_PREFIX __global__
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) || (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wfloat-equal"
+#pragma GCC diagnostic ignored "-Wshadow"
+#pragma GCC diagnostic ignored "-Wconversion"
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#endif
+
+#if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
+#pragma warning push
+#pragma warning(disable : 1599)
+#endif
+
+using namespace std;
+
+namespace walberla {
+namespace pystencils {
+
+namespace internal_collidesweepdoubleprecisionleesedwardscuda_collidesweepdoubleprecisionleesedwardscuda {
+static FUNC_PREFIX __launch_bounds__(256) void collidesweepdoubleprecisionleesedwardscuda_collidesweepdoubleprecisionleesedwardscuda(double *RESTRICT const _data_force, double *RESTRICT _data_pdfs, int64_t const _size_force_0, int64_t const _size_force_1, int64_t const _size_force_2, int64_t const _stride_force_0, int64_t const _stride_force_1, int64_t const _stride_force_2, int64_t const _stride_force_3, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, double grid_size, double omega_shear, double v_s) {
+  if (blockDim.x * blockIdx.x + threadIdx.x < _size_force_0 && blockDim.y * blockIdx.y + threadIdx.y < _size_force_1 && blockDim.z * blockIdx.z + threadIdx.z < _size_force_2) { // bounds guard: threads whose global index lies outside the _size_force_{0,1,2} extent do nothing
+    const int64_t ctr_0 = blockDim.x * blockIdx.x + threadIdx.x; // global thread index along x, used below as the x cell index
+    const int64_t ctr_1 = blockDim.y * blockIdx.y + threadIdx.y; // global thread index along y, used below as the y cell index
+    const int64_t ctr_2 = blockDim.z * blockIdx.z + threadIdx.z; // global thread index along z, used below as the z cell index
+    const double xi_25 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2]; // xi_25..xi_46: f-slots 0..18 of pdfs and f-slots 0..2 of force are read for this cell before any write-back
+    const double xi_26 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3];
+    const double xi_27 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3];
+    const double xi_28 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3];
+    const double xi_29 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3];
+    const double xi_30 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3];
+    const double xi_31 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3];
+    const double xi_32 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3];
+    const double xi_33 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3];
+    const double xi_34 = _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3];
+    const double xi_35 = _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2];
+    const double xi_36 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3];
+    const double xi_37 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3];
+    const double xi_38 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3];
+    const double xi_39 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3];
+    const double xi_40 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3];
+    const double xi_41 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3];
+    const double xi_42 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3];
+    const double xi_43 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3];
+    const double xi_44 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3];
+    const double xi_45 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3];
+    const double xi_46 = _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3];
+    const double xi_3 = xi_34; // xi_3..xi_24 are plain copies of the values loaded above
+    const double xi_4 = xi_32;
+    const double xi_5 = xi_43;
+    const double xi_6 = xi_36;
+    const double xi_7 = xi_38;
+    const double xi_8 = xi_45;
+    const double xi_9 = xi_40;
+    const double xi_10 = xi_41;
+    const double xi_11 = xi_33;
+    const double xi_12 = xi_35;
+    const double xi_13 = xi_25;
+    const double xi_14 = xi_29;
+    const double xi_15 = xi_44;
+    const double xi_16 = xi_28;
+    const double xi_17 = xi_37;
+    const double xi_18 = xi_39;
+    const double xi_19 = xi_26;
+    const double xi_20 = xi_46;
+    const double xi_21 = xi_30;
+    const double xi_22 = xi_31;
+    const double xi_23 = xi_27;
+    const double xi_24 = xi_42;
+    const double xi_0 = ((1.0) / (omega_shear * -0.25 + 2.0));
+    const double rr_0 = xi_0 * (omega_shear * -2.0 + 4.0);
+    const double vel0Term = xi_10 + xi_19 + xi_21 + xi_22 + xi_9;
+    const double vel1Term = xi_18 + xi_24 + xi_4 + xi_6;
+    const double vel2Term = xi_16 + xi_23 + xi_7;
+    const double rho = vel0Term + vel1Term + vel2Term + xi_11 + xi_13 + xi_14 + xi_15 + xi_17 + xi_5 + xi_8; // sum of the 19 pdf values loaded above
+    const double xi_1 = ((1.0) / (rho)); // reciprocal of rho; rho == 0 is not guarded against
+    const double u_0 = xi_1 * xi_12 * 0.5 + xi_1 * (vel0Term - xi_14 - xi_15 - xi_23 - xi_5 - xi_6);
+    const double u_1 = xi_1 * xi_20 * 0.5 + xi_1 * (vel1Term - xi_11 - xi_14 - xi_17 + xi_22 - xi_7 - xi_9);
+    const double u_2 = xi_1 * xi_3 * 0.5 + xi_1 * (vel2Term - xi_15 - xi_17 - xi_18 - xi_19 + xi_21 + xi_4 - xi_8);
+    const double forceTerm_0 = omega_shear * u_0 * xi_12 * 0.5 + omega_shear * u_1 * xi_20 * 0.5 + omega_shear * u_2 * xi_3 * 0.5 - u_0 * xi_12 - u_1 * xi_20 - u_2 * xi_3;
+    const double forceTerm_1 = omega_shear * u_0 * xi_12 * 0.083333333333333329 + omega_shear * u_1 * xi_20 * -0.16666666666666666 + omega_shear * u_2 * xi_3 * 0.083333333333333329 + rr_0 * xi_20 * -0.083333333333333329 + u_0 * xi_12 * -0.16666666666666666 + u_1 * xi_20 * 0.33333333333333331 + u_2 * xi_3 * -0.16666666666666666 + xi_20 * 0.16666666666666666;
+    const double forceTerm_2 = omega_shear * u_0 * xi_12 * 0.083333333333333329 + omega_shear * u_1 * xi_20 * -0.16666666666666666 + omega_shear * u_2 * xi_3 * 0.083333333333333329 + rr_0 * xi_20 * 0.083333333333333329 + u_0 * xi_12 * -0.16666666666666666 + u_1 * xi_20 * 0.33333333333333331 + u_2 * xi_3 * -0.16666666666666666 + xi_20 * -0.16666666666666666;
+    const double forceTerm_3 = omega_shear * u_0 * xi_12 * -0.16666666666666666 + omega_shear * u_1 * xi_20 * 0.083333333333333329 + omega_shear * u_2 * xi_3 * 0.083333333333333329 + rr_0 * xi_12 * 0.083333333333333329 + u_0 * xi_12 * 0.33333333333333331 + u_1 * xi_20 * -0.16666666666666666 + u_2 * xi_3 * -0.16666666666666666 + xi_12 * -0.16666666666666666;
+    const double forceTerm_4 = omega_shear * u_0 * xi_12 * -0.16666666666666666 + omega_shear * u_1 * xi_20 * 0.083333333333333329 + omega_shear * u_2 * xi_3 * 0.083333333333333329 + rr_0 * xi_12 * -0.083333333333333329 + u_0 * xi_12 * 0.33333333333333331 + u_1 * xi_20 * -0.16666666666666666 + u_2 * xi_3 * -0.16666666666666666 + xi_12 * 0.16666666666666666;
+    const double forceTerm_5 = omega_shear * u_0 * xi_12 * 0.083333333333333329 + omega_shear * u_1 * xi_20 * 0.083333333333333329 + omega_shear * u_2 * xi_3 * -0.16666666666666666 + rr_0 * xi_3 * -0.083333333333333329 + u_0 * xi_12 * -0.16666666666666666 + u_1 * xi_20 * -0.16666666666666666 + u_2 * xi_3 * 0.33333333333333331 + xi_3 * 0.16666666666666666;
+    const double forceTerm_6 = omega_shear * u_0 * xi_12 * 0.083333333333333329 + omega_shear * u_1 * xi_20 * 0.083333333333333329 + omega_shear * u_2 * xi_3 * -0.16666666666666666 + rr_0 * xi_3 * 0.083333333333333329 + u_0 * xi_12 * -0.16666666666666666 + u_1 * xi_20 * -0.16666666666666666 + u_2 * xi_3 * 0.33333333333333331 + xi_3 * -0.16666666666666666;
+    const double forceTerm_7 = omega_shear * u_0 * xi_12 * -0.083333333333333329 + omega_shear * u_0 * xi_20 * 0.125 + omega_shear * u_1 * xi_12 * 0.125 + omega_shear * u_1 * xi_20 * -0.083333333333333329 + omega_shear * u_2 * xi_3 * 0.041666666666666664 + rr_0 * xi_12 * 0.041666666666666664 + rr_0 * xi_20 * -0.041666666666666664 + u_0 * xi_12 * 0.16666666666666666 + u_0 * xi_20 * -0.25 + u_1 * xi_12 * -0.25 + u_1 * xi_20 * 0.16666666666666666 + u_2 * xi_3 * -0.083333333333333329 + xi_12 * -0.083333333333333329 + xi_20 * 0.083333333333333329;
+    const double forceTerm_8 = omega_shear * u_0 * xi_12 * -0.083333333333333329 + omega_shear * u_0 * xi_20 * -0.125 + omega_shear * u_1 * xi_12 * -0.125 + omega_shear * u_1 * xi_20 * -0.083333333333333329 + omega_shear * u_2 * xi_3 * 0.041666666666666664 + rr_0 * xi_12 * -0.041666666666666664 + rr_0 * xi_20 * -0.041666666666666664 + u_0 * xi_12 * 0.16666666666666666 + u_0 * xi_20 * 0.25 + u_1 * xi_12 * 0.25 + u_1 * xi_20 * 0.16666666666666666 + u_2 * xi_3 * -0.083333333333333329 + xi_12 * 0.083333333333333329 + xi_20 * 0.083333333333333329;
+    const double forceTerm_9 = omega_shear * u_0 * xi_12 * -0.083333333333333329 + omega_shear * u_0 * xi_20 * -0.125 + omega_shear * u_1 * xi_12 * -0.125 + omega_shear * u_1 * xi_20 * -0.083333333333333329 + omega_shear * u_2 * xi_3 * 0.041666666666666664 + rr_0 * xi_12 * 0.041666666666666664 + rr_0 * xi_20 * 0.041666666666666664 + u_0 * xi_12 * 0.16666666666666666 + u_0 * xi_20 * 0.25 + u_1 * xi_12 * 0.25 + u_1 * xi_20 * 0.16666666666666666 + u_2 * xi_3 * -0.083333333333333329 + xi_12 * -0.083333333333333329 + xi_20 * -0.083333333333333329;
+    const double forceTerm_10 = omega_shear * u_0 * xi_12 * -0.083333333333333329 + omega_shear * u_0 * xi_20 * 0.125 + omega_shear * u_1 * xi_12 * 0.125 + omega_shear * u_1 * xi_20 * -0.083333333333333329 + omega_shear * u_2 * xi_3 * 0.041666666666666664 + rr_0 * xi_12 * -0.041666666666666664 + rr_0 * xi_20 * 0.041666666666666664 + u_0 * xi_12 * 0.16666666666666666 + u_0 * xi_20 * -0.25 + u_1 * xi_12 * -0.25 + u_1 * xi_20 * 0.16666666666666666 + u_2 * xi_3 * -0.083333333333333329 + xi_12 * 0.083333333333333329 + xi_20 * -0.083333333333333329;
+    const double forceTerm_11 = omega_shear * u_0 * xi_12 * 0.041666666666666664 + omega_shear * u_1 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_3 * -0.125 + omega_shear * u_2 * xi_20 * -0.125 + omega_shear * u_2 * xi_3 * -0.083333333333333329 + rr_0 * xi_20 * -0.041666666666666664 + rr_0 * xi_3 * -0.041666666666666664 + u_0 * xi_12 * -0.083333333333333329 + u_1 * xi_20 * 0.16666666666666666 + u_1 * xi_3 * 0.25 + u_2 * xi_20 * 0.25 + u_2 * xi_3 * 0.16666666666666666 + xi_20 * 0.083333333333333329 + xi_3 * 0.083333333333333329;
+    const double forceTerm_12 = omega_shear * u_0 * xi_12 * 0.041666666666666664 + omega_shear * u_1 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_3 * 0.125 + omega_shear * u_2 * xi_20 * 0.125 + omega_shear * u_2 * xi_3 * -0.083333333333333329 + rr_0 * xi_20 * 0.041666666666666664 + rr_0 * xi_3 * -0.041666666666666664 + u_0 * xi_12 * -0.083333333333333329 + u_1 * xi_20 * 0.16666666666666666 + u_1 * xi_3 * -0.25 + u_2 * xi_20 * -0.25 + u_2 * xi_3 * 0.16666666666666666 + xi_20 * -0.083333333333333329 + xi_3 * 0.083333333333333329;
+    const double forceTerm_13 = omega_shear * u_0 * xi_12 * -0.083333333333333329 + omega_shear * u_0 * xi_3 * 0.125 + omega_shear * u_1 * xi_20 * 0.041666666666666664 + omega_shear * u_2 * xi_12 * 0.125 + omega_shear * u_2 * xi_3 * -0.083333333333333329 + rr_0 * xi_12 * 0.041666666666666664 + rr_0 * xi_3 * -0.041666666666666664 + u_0 * xi_12 * 0.16666666666666666 + u_0 * xi_3 * -0.25 + u_1 * xi_20 * -0.083333333333333329 + u_2 * xi_12 * -0.25 + u_2 * xi_3 * 0.16666666666666666 + xi_12 * -0.083333333333333329 + xi_3 * 0.083333333333333329;
+    const double forceTerm_14 = omega_shear * u_0 * xi_12 * -0.083333333333333329 + omega_shear * u_0 * xi_3 * -0.125 + omega_shear * u_1 * xi_20 * 0.041666666666666664 + omega_shear * u_2 * xi_12 * -0.125 + omega_shear * u_2 * xi_3 * -0.083333333333333329 + rr_0 * xi_12 * -0.041666666666666664 + rr_0 * xi_3 * -0.041666666666666664 + u_0 * xi_12 * 0.16666666666666666 + u_0 * xi_3 * 0.25 + u_1 * xi_20 * -0.083333333333333329 + u_2 * xi_12 * 0.25 + u_2 * xi_3 * 0.16666666666666666 + xi_12 * 0.083333333333333329 + xi_3 * 0.083333333333333329;
+    const double forceTerm_15 = omega_shear * u_0 * xi_12 * 0.041666666666666664 + omega_shear * u_1 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_3 * 0.125 + omega_shear * u_2 * xi_20 * 0.125 + omega_shear * u_2 * xi_3 * -0.083333333333333329 + rr_0 * xi_20 * -0.041666666666666664 + rr_0 * xi_3 * 0.041666666666666664 + u_0 * xi_12 * -0.083333333333333329 + u_1 * xi_20 * 0.16666666666666666 + u_1 * xi_3 * -0.25 + u_2 * xi_20 * -0.25 + u_2 * xi_3 * 0.16666666666666666 + xi_20 * 0.083333333333333329 + xi_3 * -0.083333333333333329;
+    const double forceTerm_16 = omega_shear * u_0 * xi_12 * 0.041666666666666664 + omega_shear * u_1 * xi_20 * -0.083333333333333329 + omega_shear * u_1 * xi_3 * -0.125 + omega_shear * u_2 * xi_20 * -0.125 + omega_shear * u_2 * xi_3 * -0.083333333333333329 + rr_0 * xi_20 * 0.041666666666666664 + rr_0 * xi_3 * 0.041666666666666664 + u_0 * xi_12 * -0.083333333333333329 + u_1 * xi_20 * 0.16666666666666666 + u_1 * xi_3 * 0.25 + u_2 * xi_20 * 0.25 + u_2 * xi_3 * 0.16666666666666666 + xi_20 * -0.083333333333333329 + xi_3 * -0.083333333333333329;
+    const double forceTerm_17 = omega_shear * u_0 * xi_12 * -0.083333333333333329 + omega_shear * u_0 * xi_3 * -0.125 + omega_shear * u_1 * xi_20 * 0.041666666666666664 + omega_shear * u_2 * xi_12 * -0.125 + omega_shear * u_2 * xi_3 * -0.083333333333333329 + rr_0 * xi_12 * 0.041666666666666664 + rr_0 * xi_3 * 0.041666666666666664 + u_0 * xi_12 * 0.16666666666666666 + u_0 * xi_3 * 0.25 + u_1 * xi_20 * -0.083333333333333329 + u_2 * xi_12 * 0.25 + u_2 * xi_3 * 0.16666666666666666 + xi_12 * -0.083333333333333329 + xi_3 * -0.083333333333333329;
+    const double forceTerm_18 = omega_shear * u_0 * xi_12 * -0.083333333333333329 + omega_shear * u_0 * xi_3 * 0.125 + omega_shear * u_1 * xi_20 * 0.041666666666666664 + omega_shear * u_2 * xi_12 * 0.125 + omega_shear * u_2 * xi_3 * -0.083333333333333329 + rr_0 * xi_12 * -0.041666666666666664 + rr_0 * xi_3 * 0.041666666666666664 + u_0 * xi_12 * 0.16666666666666666 + u_0 * xi_3 * -0.25 + u_1 * xi_20 * -0.083333333333333329 + u_2 * xi_12 * -0.25 + u_2 * xi_3 * 0.16666666666666666 + xi_12 * 0.083333333333333329 + xi_3 * -0.083333333333333329;
+    const double u0Mu1 = u_0 - u_1;
+    const double u0Pu1 = u_0 + u_1;
+    const double u1Pu2 = u_1 + u_2;
+    const double u1Mu2 = u_1 - u_2;
+    const double u0Mu2 = u_0 - u_2;
+    const double u0Pu2 = u_0 + u_2;
+    const double f_eq_common = rho - rho * u_0 * u_0 - rho * u_1 * u_1 - rho * u_2 * u_2;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2] = forceTerm_0 + omega_shear * (f_eq_common * 0.33333333333333331 - xi_13) + xi_13;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3] = forceTerm_1 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * (u_1 * u_1)) + xi_11 * -0.5 + xi_24 * -0.5) + rr_0 * (rho * u_1 * 0.16666666666666666 + xi_11 * 0.5 + xi_24 * -0.5) + xi_24 + ((-1.0 <= -grid_size + ((double)(ctr_1))) ? (rho * v_s * (u_0 * 2.0 + v_s) * 0.16666666666666666) : (0.0)); // trailing v_s term is added only when ctr_1 >= grid_size - 1
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3] = forceTerm_2 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * (u_1 * u_1)) + xi_11 * -0.5 + xi_24 * -0.5) + rr_0 * (rho * u_1 * -0.16666666666666666 + xi_11 * -0.5 + xi_24 * 0.5) + xi_11 + ((0.0 >= ((double)(ctr_1))) ? (rho * v_s * (u_0 * -2.0 + v_s) * 0.16666666666666666) : (0.0)); // trailing v_s term is added only when ctr_1 <= 0
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3] = forceTerm_3 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * (u_0 * u_0)) + xi_10 * -0.5 + xi_5 * -0.5) + rr_0 * (rho * u_0 * -0.16666666666666666 + xi_10 * 0.5 + xi_5 * -0.5) + xi_5;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3] = forceTerm_4 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * (u_0 * u_0)) + xi_10 * -0.5 + xi_5 * -0.5) + rr_0 * (rho * u_0 * 0.16666666666666666 + xi_10 * -0.5 + xi_5 * 0.5) + xi_10;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3] = forceTerm_5 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * (u_2 * u_2)) + xi_16 * -0.5 + xi_8 * -0.5) + rr_0 * (rho * u_2 * 0.16666666666666666 + xi_16 * -0.5 + xi_8 * 0.5) + xi_16;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3] = forceTerm_6 + omega_shear * (f_eq_common * 0.16666666666666666 + rho * (-0.1111111111111111 + 0.33333333333333331 * (u_2 * u_2)) + xi_16 * -0.5 + xi_8 * -0.5) + rr_0 * (rho * u_2 * -0.16666666666666666 + xi_16 * 0.5 + xi_8 * -0.5) + xi_8;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3] = forceTerm_7 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_2 * u_2) + 0.125 * (u0Mu1 * u0Mu1)) + xi_6 * -0.5 + xi_9 * -0.5) + rr_0 * (rho * u0Mu1 * -0.083333333333333329 + xi_6 * -0.5 + xi_9 * 0.5) + xi_6 + ((-1.0 <= -grid_size + ((double)(ctr_1))) ? (rho * v_s * (u_0 * -2.0 + u_1 * 3.0 - v_s + 1.0) * 0.083333333333333329) : (0.0)); // trailing v_s term is added only when ctr_1 >= grid_size - 1
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3] = forceTerm_8 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_2 * u_2) + 0.125 * (u0Pu1 * u0Pu1)) + xi_14 * -0.5 + xi_22 * -0.5) + rr_0 * (rho * u0Pu1 * 0.083333333333333329 + xi_14 * 0.5 + xi_22 * -0.5) + xi_22 + ((-1.0 <= -grid_size + ((double)(ctr_1))) ? (rho * v_s * (u_0 * 2.0 + u_1 * 3.0 + v_s + 1.0) * -0.083333333333333329) : (0.0)); // trailing v_s term is added only when ctr_1 >= grid_size - 1
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3] = forceTerm_9 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_2 * u_2) + 0.125 * (u0Pu1 * u0Pu1)) + xi_14 * -0.5 + xi_22 * -0.5) + rr_0 * (rho * u0Pu1 * -0.083333333333333329 + xi_14 * -0.5 + xi_22 * 0.5) + xi_14 + ((0.0 >= ((double)(ctr_1))) ? (rho * v_s * (u_0 * 2.0 + u_1 * 3.0 - v_s - 1.0) * 0.083333333333333329) : (0.0)); // trailing v_s term is added only when ctr_1 <= 0
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3] = forceTerm_10 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_2 * u_2) + 0.125 * (u0Mu1 * u0Mu1)) + xi_6 * -0.5 + xi_9 * -0.5) + rr_0 * (rho * u0Mu1 * 0.083333333333333329 + xi_6 * 0.5 + xi_9 * -0.5) + xi_9 + ((0.0 >= ((double)(ctr_1))) ? (rho * v_s * (u_0 * 2.0 + u_1 * -3.0 - v_s + 1.0) * 0.083333333333333329) : (0.0)); // trailing v_s term is added only when ctr_1 <= 0
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3] = forceTerm_11 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_0 * u_0) + 0.125 * (u1Pu2 * u1Pu2)) + xi_17 * -0.5 + xi_4 * -0.5) + rr_0 * (rho * u1Pu2 * 0.083333333333333329 + xi_17 * 0.5 + xi_4 * -0.5) + xi_4;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3] = forceTerm_12 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_0 * u_0) + 0.125 * (u1Mu2 * u1Mu2)) + xi_18 * -0.5 + xi_7 * -0.5) + rr_0 * (rho * u1Mu2 * -0.083333333333333329 + xi_18 * 0.5 + xi_7 * -0.5) + xi_7;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3] = forceTerm_13 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_1 * u_1) + 0.125 * (u0Mu2 * u0Mu2)) + xi_19 * -0.5 + xi_23 * -0.5) + rr_0 * (rho * u0Mu2 * -0.083333333333333329 + xi_19 * 0.5 + xi_23 * -0.5) + xi_23;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3] = forceTerm_14 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_1 * u_1) + 0.125 * (u0Pu2 * u0Pu2)) + xi_15 * -0.5 + xi_21 * -0.5) + rr_0 * (rho * u0Pu2 * 0.083333333333333329 + xi_15 * 0.5 + xi_21 * -0.5) + xi_21;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3] = forceTerm_15 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_0 * u_0) + 0.125 * (u1Mu2 * u1Mu2)) + xi_18 * -0.5 + xi_7 * -0.5) + rr_0 * (rho * u1Mu2 * 0.083333333333333329 + xi_18 * -0.5 + xi_7 * 0.5) + xi_18;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3] = forceTerm_16 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_0 * u_0) + 0.125 * (u1Pu2 * u1Pu2)) + xi_17 * -0.5 + xi_4 * -0.5) + rr_0 * (rho * u1Pu2 * -0.083333333333333329 + xi_17 * -0.5 + xi_4 * 0.5) + xi_17;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3] = forceTerm_17 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_1 * u_1) + 0.125 * (u0Pu2 * u0Pu2)) + xi_15 * -0.5 + xi_21 * -0.5) + rr_0 * (rho * u0Pu2 * -0.083333333333333329 + xi_15 * -0.5 + xi_21 * 0.5) + xi_15;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3] = forceTerm_18 + omega_shear * (f_eq_common * 0.041666666666666664 + rho * (-0.013888888888888888 + 0.041666666666666664 * (u_1 * u_1) + 0.125 * (u0Mu2 * u0Mu2)) + xi_19 * -0.5 + xi_23 * -0.5) + rr_0 * (rho * u0Mu2 * 0.083333333333333329 + xi_19 * -0.5 + xi_23 * 0.5) + xi_19;
+  }
+}
+} // namespace internal_collidesweepdoubleprecisionleesedwardscuda_collidesweepdoubleprecisionleesedwardscuda
+
+void CollideSweepDoublePrecisionLeesEdwardsCUDA::run(IBlock *block, gpuStream_t stream) { // launches the collide kernel on `stream` for cells (0,0,0) up to force->xSize()/ySize()/zSize() of `block`
+
+  auto force = block->getData<gpu::GPUField<double>>(forceID); // GPU fields looked up on the block via the stored BlockDataIDs
+  auto pdfs = block->getData<gpu::GPUField<double>>(pdfsID);
+
+  auto &omega_shear = this->omega_shear_; // scalar kernel arguments, read from the members
+  auto &grid_size = this->grid_size_;
+  auto &v_s = this->v_s_;
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()))
+  double *RESTRICT const _data_force = force->dataAt(0, 0, 0, 0); // base pointer: cell (0,0,0), f = 0
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx) // both fields are asserted to use the fzyx layout
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()))
+  double *RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0); // base pointer: cell (0,0,0), f = 0
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(force->xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(force->xSize()) + 0); // iteration extents in x/y/z are taken from the force field
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(force->ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(force->ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(force->zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(force->zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  const int64_t _stride_force_0 = int64_t(force->xStride()); // element strides per axis; index 3 is the f (component) axis
+  const int64_t _stride_force_1 = int64_t(force->yStride());
+  const int64_t _stride_force_2 = int64_t(force->zStride());
+  const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
+  const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
+  const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
+  const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
+  const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
+  dim3 _block(uint32_c(((128 < _size_force_0) ? 128 : _size_force_0)), uint32_c(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))), uint32_c(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))))); // threads per block (integer division): x = min(128, size_0); y = min(1024, size_1, 2 * (128 / x)); z = min(64, size_2, 256 / (x * min(size_1, 2 * (128 / x)))). NOTE(review): a zero extent would divide by zero here and nothing in this function guards against it; presumably fields are never empty, confirm
+  dim3 _grid(uint32_c(((_size_force_0) % (((128 < _size_force_0) ? 128 : _size_force_0)) == 0 ? (int64_t)(_size_force_0) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)) : ((int64_t)(_size_force_0) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))) + 1)), uint32_c(((_size_force_1) % (((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))) == 0 ? (int64_t)(_size_force_1) / (int64_t)(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))) : ((int64_t)(_size_force_1) / (int64_t)(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) + 1)), uint32_c(((_size_force_2) % (((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? 
_size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))))) == 0 ? (int64_t)(_size_force_2) / (int64_t)(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 
128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))))) : ((int64_t)(_size_force_2) / (int64_t)(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))))) + 1))); // blocks per grid: on each axis, extent / block dimension rounded up (the block dimension is recomputed inline with the same formula as _block)
+  internal_collidesweepdoubleprecisionleesedwardscuda_collidesweepdoubleprecisionleesedwardscuda::collidesweepdoubleprecisionleesedwardscuda_collidesweepdoubleprecisionleesedwardscuda<<<_grid, _block, 0, stream>>>(_data_force, _data_pdfs, _size_force_0, _size_force_1, _size_force_2, _stride_force_0, _stride_force_1, _stride_force_2, _stride_force_3, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, grid_size, omega_shear, v_s); // launch on `stream`; the third launch argument (dynamic shared memory size) is 0
+}
+
+void CollideSweepDoublePrecisionLeesEdwardsCUDA::runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block, gpuStream_t stream) { // launches the collide kernel on `stream` for the part of globalCellInterval that overlaps `block`
+
+  CellInterval ci = globalCellInterval; // work on a copy; the caller's interval is left untouched
+  CellInterval blockBB = blocks->getBlockCellBB(*block);
+  blockBB.expand(ghostLayers); // let the interval reach `ghostLayers` cells beyond the block's cell bounding box
+  ci.intersect(blockBB);
+  blocks->transformGlobalToBlockLocalCellInterval(ci, *block); // from here on `ci` is used with block-local indices
+  if (ci.empty()) // no overlap with this block: no kernel launch
+    return;
+
+  auto force = block->getData<gpu::GPUField<double>>(forceID); // GPU fields looked up on the block via the stored BlockDataIDs
+  auto pdfs = block->getData<gpu::GPUField<double>>(pdfsID);
+
+  auto &omega_shear = this->omega_shear_; // scalar kernel arguments, read from the members
+  auto &grid_size = this->grid_size_;
+  auto &v_s = this->v_s_;
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers())) // the interval may start inside the ghost layers but not before them
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()))
+  double *RESTRICT const _data_force = force->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); // base pointer: minimum corner of the interval, f = 0
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx) // both fields are asserted to use the fzyx layout
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()))
+  double *RESTRICT _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); // base pointer: minimum corner of the interval, f = 0
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(ci.xSize()) + 0); // iteration extents in x/y/z come from the clipped interval, not from the field size
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(ci.ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(ci.zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  const int64_t _stride_force_0 = int64_t(force->xStride()); // element strides per axis; index 3 is the f (component) axis
+  const int64_t _stride_force_1 = int64_t(force->yStride());
+  const int64_t _stride_force_2 = int64_t(force->zStride());
+  const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
+  const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
+  const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
+  const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
+  const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
+  dim3 _block(uint32_c(((128 < _size_force_0) ? 128 : _size_force_0)), uint32_c(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))), uint32_c(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))))); // threads per block (integer division): x = min(128, size_0); y = min(1024, size_1, 2 * (128 / x)); z = min(64, size_2, 256 / (x * min(size_1, 2 * (128 / x))))
+  dim3 _grid(uint32_c(((_size_force_0) % (((128 < _size_force_0) ? 128 : _size_force_0)) == 0 ? (int64_t)(_size_force_0) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)) : ((int64_t)(_size_force_0) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))) + 1)), uint32_c(((_size_force_1) % (((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))) == 0 ? (int64_t)(_size_force_1) / (int64_t)(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))) : ((int64_t)(_size_force_1) / (int64_t)(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) + 1)), uint32_c(((_size_force_2) % (((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? 
_size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))))) == 0 ? (int64_t)(_size_force_2) / (int64_t)(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 
128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))))) : ((int64_t)(_size_force_2) / (int64_t)(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))))) + 1))); // blocks per grid: on each axis, extent / block dimension rounded up (the block dimension is recomputed inline with the same formula as _block)
+  internal_collidesweepdoubleprecisionleesedwardscuda_collidesweepdoubleprecisionleesedwardscuda::collidesweepdoubleprecisionleesedwardscuda_collidesweepdoubleprecisionleesedwardscuda<<<_grid, _block, 0, stream>>>(_data_force, _data_pdfs, _size_force_0, _size_force_1, _size_force_2, _stride_force_0, _stride_force_1, _stride_force_2, _stride_force_3, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, grid_size, omega_shear, v_s); // launch on `stream`; the third launch argument (dynamic shared memory size) is 0
+}
+
+} // namespace pystencils
+} // namespace walberla
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) || (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic pop
+#endif
+
+#if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
+#pragma warning pop
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsCUDA.h
new file mode 100644
index 00000000000..276592774e1
--- /dev/null
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionLeesEdwardsCUDA.h
@@ -0,0 +1,122 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file CollideSweepDoublePrecisionLeesEdwardsCUDA.h
+//! \\author pystencils
+//======================================================================================================================
+
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
+// lbmpy_walberla/pystencils_walberla from waLBerla commit
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
+
+#pragma once
+#include "core/DataTypes.h"
+#include "core/logging/Logging.h"
+
+#include "gpu/GPUField.h"
+#include "gpu/GPUWrapper.h"
+
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+#include "domain_decomposition/StructuredBlockStorage.h"
+#include "field/SwapableCompare.h"
+#include <set>
+
+#ifdef __GNUC__
+#define RESTRICT __restrict__
+#elif _MSC_VER
+#define RESTRICT __restrict
+#else
+#define RESTRICT
+#endif
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
+    (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wreorder"
+#endif
+
+namespace walberla {
+namespace pystencils {
+
+class CollideSweepDoublePrecisionLeesEdwardsCUDA { // host-side wrapper: holds two field IDs and three scalars, and exposes run()/runOnCellInterval() plus std::function adapters
+public:
+  CollideSweepDoublePrecisionLeesEdwardsCUDA(BlockDataID forceID_,
+                                             BlockDataID pdfsID_,
+                                             double grid_size,
+                                             double omega_shear, double v_s)
+      : forceID(forceID_), pdfsID(pdfsID_), grid_size_(grid_size),
+        omega_shear_(omega_shear), v_s_(v_s){}; // only stores its arguments
+
+  void run(IBlock *block, gpuStream_t stream = nullptr); // defined out of line; stream defaults to nullptr
+
+  void runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks,
+                         const CellInterval &globalCellInterval,
+                         cell_idx_t ghostLayers, IBlock *block,
+                         gpuStream_t stream = nullptr); // defined out of line; stream defaults to nullptr
+
+  void operator()(IBlock *block, gpuStream_t stream = nullptr) { // functor form, forwards to run()
+    run(block, stream);
+  }
+
+  static std::function<void(IBlock *)> getSweep(
+      const shared_ptr<CollideSweepDoublePrecisionLeesEdwardsCUDA> &kernel) {
+    return [kernel](IBlock *b) { kernel->run(b); }; // shared_ptr captured by value keeps the kernel alive; run() is called with its default stream
+  }
+
+  static std::function<void(IBlock *, gpuStream_t)> getSweepOnCellInterval(
+      const shared_ptr<CollideSweepDoublePrecisionLeesEdwardsCUDA> &kernel,
+      const shared_ptr<StructuredBlockStorage> &blocks,
+      const CellInterval &globalCellInterval, cell_idx_t ghostLayers = 1) {
+    return [kernel, blocks, globalCellInterval,
+            ghostLayers](IBlock *b, gpuStream_t stream = nullptr) { // NOTE: this default is not usable through the returned std::function, whose signature requires the stream argument
+      kernel->runOnCellInterval(blocks, globalCellInterval, ghostLayers, b,
+                                stream);
+    };
+  }
+
+  std::function<void(IBlock *)> getSweep(gpuStream_t stream = nullptr) {
+    return [this, stream](IBlock *b) { this->run(b, stream); }; // captures `this`: the returned callable must not outlive this object
+  }
+
+  std::function<void(IBlock *)>
+  getSweepOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks,
+                         const CellInterval &globalCellInterval,
+                         cell_idx_t ghostLayers = 1,
+                         gpuStream_t stream = nullptr) {
+    return [this, blocks, globalCellInterval, ghostLayers, stream](IBlock *b) { // captures `this`: the returned callable must not outlive this object
+      this->runOnCellInterval(blocks, globalCellInterval, ghostLayers, b,
+                              stream);
+    };
+  }
+
+  void configure(const shared_ptr<StructuredBlockStorage> &blocks,
+                 IBlock *block) {} // intentionally empty; both parameters are unused
+
+  BlockDataID forceID; // ID used to fetch the gpu::GPUField<double> force field from a block
+  BlockDataID pdfsID; // ID used to fetch the gpu::GPUField<double> pdfs field from a block
+  double grid_size_; // passed to the kernel as `grid_size`
+  double omega_shear_; // passed to the kernel as `omega_shear`
+  double v_s_; // passed to the kernel as `v_s`
+};
+
+} // namespace pystencils
+} // namespace walberla
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
+    (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic pop
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalized.cpp b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalized.cpp
index 94629fa8141..01d09d8366a 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalized.cpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalized.cpp
@@ -14,11 +14,10 @@
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
 //! \\file CollideSweepDoublePrecisionThermalized.cpp
-//! \\ingroup lbm
-//! \\author lbmpy
+//! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2, lbmpy_walberla/pystencils_walberla from waLBerla commit 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #include <cmath>
 
@@ -63,74 +62,30 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalized_collidesweepdoubl
   const double rr_0 = 0.0;
   const double xi_53 = rr_0 * 0.041666666666666664;
   for (int64_t ctr_2 = 0; ctr_2 < _size_force_2; ctr_2 += 1) {
-    double *RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_36 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_315 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_312 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_318 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2 * ctr_2;
-    double *RESTRICT _data_force_20_31 = _data_force + _stride_force_2 * ctr_2 + _stride_force_3;
-    double *RESTRICT _data_pdfs_20_316 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_313 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3;
-    double *RESTRICT _data_force_20_32 = _data_force + _stride_force_2 * ctr_2 + 2 * _stride_force_3;
-    double *RESTRICT _data_pdfs_20_314 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3;
-    double *RESTRICT _data_force_20_30 = _data_force + _stride_force_2 * ctr_2;
-    double *RESTRICT _data_pdfs_20_317 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_311 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_35 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3;
     for (int64_t ctr_1 = 0; ctr_1 < _size_force_1; ctr_1 += 1) {
-      double *RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_34;
-      double *RESTRICT _data_pdfs_20_36_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_36;
-      double *RESTRICT _data_pdfs_20_315_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_315;
-      double *RESTRICT _data_pdfs_20_310_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_310;
-      double *RESTRICT _data_pdfs_20_312_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_312;
-      double *RESTRICT _data_pdfs_20_318_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_318;
-      double *RESTRICT _data_pdfs_20_39_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_39;
-      double *RESTRICT _data_pdfs_20_31_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_31;
-      double *RESTRICT _data_pdfs_20_37_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_37;
-      double *RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_30;
-      double *RESTRICT _data_force_20_31_10 = _stride_force_1 * ctr_1 + _data_force_20_31;
-      double *RESTRICT _data_pdfs_20_316_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_316;
-      double *RESTRICT _data_pdfs_20_313_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_313;
-      double *RESTRICT _data_pdfs_20_38_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_38;
-      double *RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_33;
-      double *RESTRICT _data_force_20_32_10 = _stride_force_1 * ctr_1 + _data_force_20_32;
-      double *RESTRICT _data_pdfs_20_314_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_314;
-      double *RESTRICT _data_force_20_30_10 = _stride_force_1 * ctr_1 + _data_force_20_30;
-      double *RESTRICT _data_pdfs_20_317_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_317;
-      double *RESTRICT _data_pdfs_20_311_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_311;
-      double *RESTRICT _data_pdfs_20_32_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_32;
-      double *RESTRICT _data_pdfs_20_35_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_35;
       for (int64_t ctr_0 = 0; ctr_0 < _size_force_0; ctr_0 += 1) {
-        const double xi_244 = _data_pdfs_20_34_10[_stride_pdfs_0 * ctr_0];
-        const double xi_245 = _data_pdfs_20_36_10[_stride_pdfs_0 * ctr_0];
-        const double xi_246 = _data_pdfs_20_315_10[_stride_pdfs_0 * ctr_0];
-        const double xi_247 = _data_pdfs_20_310_10[_stride_pdfs_0 * ctr_0];
-        const double xi_248 = _data_pdfs_20_312_10[_stride_pdfs_0 * ctr_0];
-        const double xi_249 = _data_pdfs_20_318_10[_stride_pdfs_0 * ctr_0];
-        const double xi_250 = _data_pdfs_20_39_10[_stride_pdfs_0 * ctr_0];
-        const double xi_251 = _data_pdfs_20_31_10[_stride_pdfs_0 * ctr_0];
-        const double xi_252 = _data_pdfs_20_37_10[_stride_pdfs_0 * ctr_0];
-        const double xi_253 = _data_pdfs_20_30_10[_stride_pdfs_0 * ctr_0];
-        const double xi_254 = _data_force_20_31_10[_stride_force_0 * ctr_0];
-        const double xi_255 = _data_pdfs_20_316_10[_stride_pdfs_0 * ctr_0];
-        const double xi_256 = _data_pdfs_20_313_10[_stride_pdfs_0 * ctr_0];
-        const double xi_257 = _data_pdfs_20_38_10[_stride_pdfs_0 * ctr_0];
-        const double xi_258 = _data_pdfs_20_33_10[_stride_pdfs_0 * ctr_0];
-        const double xi_259 = _data_force_20_32_10[_stride_force_0 * ctr_0];
-        const double xi_260 = _data_pdfs_20_314_10[_stride_pdfs_0 * ctr_0];
-        const double xi_261 = _data_force_20_30_10[_stride_force_0 * ctr_0];
-        const double xi_262 = _data_pdfs_20_317_10[_stride_pdfs_0 * ctr_0];
-        const double xi_263 = _data_pdfs_20_311_10[_stride_pdfs_0 * ctr_0];
-        const double xi_264 = _data_pdfs_20_32_10[_stride_pdfs_0 * ctr_0];
-        const double xi_265 = _data_pdfs_20_35_10[_stride_pdfs_0 * ctr_0];
+        const double xi_244 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3];
+        const double xi_245 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3];
+        const double xi_246 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3];
+        const double xi_247 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3];
+        const double xi_248 = _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2];
+        const double xi_249 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3];
+        const double xi_250 = _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3];
+        const double xi_251 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3];
+        const double xi_252 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3];
+        const double xi_253 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3];
+        const double xi_254 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3];
+        const double xi_255 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3];
+        const double xi_256 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3];
+        const double xi_257 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3];
+        const double xi_258 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3];
+        const double xi_259 = _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3];
+        const double xi_260 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3];
+        const double xi_261 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3];
+        const double xi_262 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3];
+        const double xi_263 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3];
+        const double xi_264 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2];
+        const double xi_265 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3];
 
         double random_7_0{};
         double random_7_1{};
@@ -179,69 +134,69 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalized_collidesweepdoubl
         if (kT > 0.) {
           philox_double2(time_step, block_offset_0 + ctr_0, block_offset_1 + ctr_1, block_offset_2 + ctr_2, 0, seed, random_0_0, random_0_1);
         }
-        const double xi_2 = xi_249 + xi_260;
-        const double xi_3 = xi_2 + xi_244;
-        const double xi_4 = xi_246 + xi_251 + xi_263;
-        const double xi_5 = xi_248 + xi_265;
-        const double xi_6 = xi_245 + xi_262;
-        const double xi_8 = xi_250 * -1.0;
-        const double xi_9 = xi_252 * -1.0;
-        const double xi_10 = xi_262 * -1.0;
-        const double xi_11 = xi_256 * -1.0;
-        const double xi_12 = xi_258 * -1.0;
+        const double xi_2 = xi_246 + xi_251;
+        const double xi_3 = xi_2 + xi_249;
+        const double xi_4 = xi_244 + xi_255 + xi_263;
+        const double xi_5 = xi_256 + xi_265;
+        const double xi_6 = xi_261 + xi_262;
+        const double xi_8 = -xi_257;
+        const double xi_9 = -xi_254;
+        const double xi_10 = -xi_261;
+        const double xi_11 = -xi_252;
+        const double xi_12 = -xi_258;
         const double xi_13 = xi_10 + xi_11 + xi_12;
-        const double xi_14 = xi_264 * -1.0;
-        const double xi_15 = xi_247 * -1.0;
+        const double xi_14 = -xi_260;
+        const double xi_15 = -xi_247;
         const double xi_16 = xi_14 + xi_15;
-        const double xi_17 = xi_255 * -1.0;
-        const double xi_18 = xi_248 * -1.0;
+        const double xi_17 = -xi_253;
+        const double xi_18 = -xi_265;
         const double xi_19 = xi_17 + xi_18;
-        const double xi_20 = xi_249 * -1.0;
+        const double xi_20 = -xi_246;
         const double xi_21 = xi_10 + xi_20;
-        const double xi_22 = xi_246 * -1.0;
-        const double xi_23 = xi_245 * -1.0;
+        const double xi_22 = -xi_255;
+        const double xi_23 = -xi_262;
         const double xi_24 = xi_17 + xi_22 + xi_23 + xi_263;
-        const double xi_29 = xi_254 * 0.16666666666666666;
-        const double xi_30 = xi_254 * 0.083333333333333329;
-        const double xi_42 = xi_261 * 0.16666666666666666;
-        const double xi_43 = xi_261 * 0.083333333333333329;
+        const double xi_29 = xi_250 * 0.16666666666666666;
+        const double xi_30 = xi_250 * 0.083333333333333329;
+        const double xi_42 = xi_248 * 0.16666666666666666;
+        const double xi_43 = xi_248 * 0.083333333333333329;
         const double xi_49 = xi_259 * 0.16666666666666666;
         const double xi_50 = xi_259 * 0.083333333333333329;
-        const double xi_67 = xi_254 * 0.25;
-        const double xi_72 = xi_254 * xi_71;
-        const double xi_114 = xi_253 * -1.0;
-        const double xi_118 = xi_263 * -1.0;
+        const double xi_67 = xi_250 * 0.25;
+        const double xi_72 = xi_250 * xi_71;
+        const double xi_114 = -xi_264;
+        const double xi_118 = -xi_263;
         const double xi_119 = xi_118 + xi_18;
-        const double xi_120 = xi_257 * -1.0 + xi_8;
-        const double xi_122 = xi_260 * -1.0;
+        const double xi_120 = -xi_245 + xi_8;
+        const double xi_122 = -xi_251;
         const double xi_123 = xi_11 + xi_122 + xi_15 + xi_21;
-        const double xi_125 = xi_246 * 2.0 + xi_248 * 2.0 + xi_255 * 2.0 + xi_263 * 2.0;
-        const double xi_126 = xi_125 + xi_244 * 5.0 + xi_258 * 5.0;
-        const double xi_128 = xi_256 * 2.0;
-        const double xi_129 = xi_260 * 2.0;
-        const double xi_130 = xi_249 * 2.0 + xi_262 * 2.0;
-        const double xi_132 = xi_118 + xi_248;
-        const double xi_133 = xi_132 + xi_14 + xi_22 + xi_251 + xi_255;
+        const double xi_125 = xi_253 * 2.0 + xi_255 * 2.0 + xi_263 * 2.0 + xi_265 * 2.0;
+        const double xi_126 = xi_125 + xi_249 * 5.0 + xi_258 * 5.0;
+        const double xi_128 = xi_252 * 2.0;
+        const double xi_129 = xi_251 * 2.0;
+        const double xi_130 = xi_246 * 2.0 + xi_261 * 2.0;
+        const double xi_132 = xi_118 + xi_265;
+        const double xi_133 = xi_132 + xi_14 + xi_22 + xi_244 + xi_253;
         const double xi_135 = xi_133 * xi_134;
         const double xi_136 = random_5_1 - 0.5;
-        const double xi_141 = xi_252 * 2.0;
+        const double xi_141 = xi_254 * 2.0;
         const double xi_142 = xi_247 * 2.0;
-        const double xi_143 = xi_250 * 2.0 + xi_257 * -2.0;
-        const double xi_144 = xi_14 + xi_141 * -1.0 + xi_142 + xi_143 + xi_19 + xi_4;
+        const double xi_143 = xi_245 * -2.0 + xi_257 * 2.0;
+        const double xi_144 = xi_14 - xi_141 + xi_142 + xi_143 + xi_19 + xi_4;
         const double xi_146 = xi_144 * xi_145;
         const double xi_147 = random_3_0 - 0.5;
         const double xi_152 = random_0_1 - 0.5;
-        const double xi_166 = xi_122 + xi_256;
-        const double xi_167 = xi_12 + xi_166 + xi_20 + xi_244 + xi_262;
+        const double xi_166 = xi_122 + xi_252;
+        const double xi_167 = xi_12 + xi_166 + xi_20 + xi_249 + xi_261;
         const double xi_168 = xi_134 * xi_167;
         const double xi_169 = random_4_1 - 0.5;
-        const double xi_171 = xi_13 + xi_141 + xi_142 * -1.0 + xi_143 + xi_3;
+        const double xi_171 = xi_13 + xi_141 - xi_142 + xi_143 + xi_3;
         const double xi_172 = xi_145 * xi_171;
         const double xi_173 = random_4_0 - 0.5;
-        const double xi_178 = xi_119 + xi_23 + xi_246 + xi_255 + xi_265;
+        const double xi_178 = xi_119 + xi_23 + xi_253 + xi_255 + xi_256;
         const double xi_179 = xi_134 * xi_178;
         const double xi_180 = random_5_0 - 0.5;
-        const double xi_182 = xi_128 * -1.0 + xi_129 * -1.0 + xi_130 + xi_24 + xi_5;
+        const double xi_182 = -xi_128 - xi_129 + xi_130 + xi_24 + xi_5;
         const double xi_183 = xi_145 * xi_182;
         const double xi_184 = random_3_1 - 0.5;
         const double xi_212 = xi_182 * xi_211;
@@ -253,28 +208,28 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalized_collidesweepdoubl
         const double xi_31 = rr_0 * xi_30;
         const double xi_44 = rr_0 * xi_43;
         const double xi_51 = rr_0 * xi_50;
-        const double xi_54 = xi_261 * xi_53;
-        const double xi_59 = xi_254 * xi_53;
+        const double xi_54 = xi_248 * xi_53;
+        const double xi_59 = xi_250 * xi_53;
         const double xi_81 = xi_259 * xi_53;
-        const double vel0Term = xi_247 + xi_257 + xi_3;
-        const double vel1Term = xi_252 + xi_4;
-        const double vel2Term = xi_256 + xi_5;
-        const double rho = vel0Term + vel1Term + vel2Term + xi_250 + xi_253 + xi_255 + xi_258 + xi_264 + xi_6;
+        const double vel0Term = xi_245 + xi_247 + xi_3;
+        const double vel1Term = xi_254 + xi_4;
+        const double vel2Term = xi_252 + xi_5;
+        const double rho = vel0Term + vel1Term + vel2Term + xi_253 + xi_257 + xi_258 + xi_260 + xi_264 + xi_6;
         const double xi_105 = kT * rho;
-        const double xi_106 = pow(xi_105 * (-1.0 * ((omega_even * -1.0 + 1.0) * (omega_even * -1.0 + 1.0)) + 1.0), 0.5);
+        const double xi_106 = pow(xi_105 * (1.0 - (-omega_even + 1.0) * (-omega_even + 1.0)), 0.5);
         const double xi_107 = xi_106 * (random_6_0 - 0.5) * 3.7416573867739413;
         const double xi_108 = xi_106 * (random_7_0 - 0.5) * 5.4772255750516612;
-        const double xi_110 = xi_109 * (random_2_1 - 0.5) * pow(xi_105 * (-1.0 * ((omega_bulk * -1.0 + 1.0) * (omega_bulk * -1.0 + 1.0)) + 1.0), 0.5);
+        const double xi_110 = xi_109 * (random_2_1 - 0.5) * pow(xi_105 * (1.0 - (-omega_bulk + 1.0) * (-omega_bulk + 1.0)), 0.5);
         const double xi_111 = xi_106 * (random_6_1 - 0.5) * 8.3666002653407556;
-        const double xi_137 = pow(xi_105 * (-1.0 * ((omega_odd * -1.0 + 1.0) * (omega_odd * -1.0 + 1.0)) + 1.0), 0.5);
+        const double xi_137 = pow(xi_105 * (1.0 - (-omega_odd + 1.0) * (-omega_odd + 1.0)), 0.5);
         const double xi_138 = xi_137 * 1.4142135623730951;
         const double xi_139 = xi_138 * 0.5;
         const double xi_140 = xi_136 * xi_139;
         const double xi_148 = xi_109 * xi_137;
         const double xi_149 = xi_148 * 0.16666666666666666;
         const double xi_150 = xi_147 * xi_149;
-        const double xi_151 = xi_146 * -1.0 + xi_150 * -1.0;
-        const double xi_153 = pow(xi_105 * (-1.0 * ((omega_shear * -1.0 + 1.0) * (omega_shear * -1.0 + 1.0)) + 1.0), 0.5);
+        const double xi_151 = -xi_146 - xi_150;
+        const double xi_153 = pow(xi_105 * (1.0 - (-omega_shear + 1.0) * (-omega_shear + 1.0)), 0.5);
         const double xi_154 = xi_153 * 0.5;
         const double xi_155 = xi_152 * xi_154;
         const double xi_161 = xi_153 * (random_0_0 - 0.5) * 1.7320508075688772;
@@ -282,10 +237,10 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalized_collidesweepdoubl
         const double xi_170 = xi_139 * xi_169;
         const double xi_174 = xi_149 * xi_173;
         const double xi_175 = xi_172 + xi_174;
-        const double xi_177 = xi_172 * -1.0 + xi_174 * -1.0;
+        const double xi_177 = -xi_172 - xi_174;
         const double xi_181 = xi_139 * xi_180;
         const double xi_185 = xi_149 * xi_184;
-        const double xi_186 = xi_183 * -1.0 + xi_185 * -1.0;
+        const double xi_186 = -xi_183 - xi_185;
         const double xi_188 = xi_183 + xi_185;
         const double xi_189 = xi_152 * xi_153 * 0.25;
         const double xi_192 = xi_107 * 0.083333333333333329;
@@ -297,108 +252,108 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalized_collidesweepdoubl
         const double xi_216 = xi_184 * xi_215;
         const double xi_217 = xi_138 * 0.25;
         const double xi_218 = xi_180 * xi_217;
-        const double xi_219 = xi_212 * -1.0 + xi_214 + xi_216 * -1.0 + xi_218;
+        const double xi_219 = -xi_212 + xi_214 - xi_216 + xi_218;
         const double xi_222 = xi_147 * xi_215;
         const double xi_223 = xi_136 * xi_217;
-        const double xi_224 = xi_220 * -1.0 + xi_221 + xi_222 * -1.0 + xi_223;
-        const double xi_225 = xi_220 + xi_221 * -1.0 + xi_222 + xi_223 * -1.0;
-        const double xi_227 = xi_189 * -1.0;
+        const double xi_224 = -xi_220 + xi_221 - xi_222 + xi_223;
+        const double xi_225 = xi_220 - xi_221 + xi_222 - xi_223;
+        const double xi_227 = -xi_189;
         const double xi_230 = xi_111 * 0.035714285714285712;
         const double xi_232 = xi_154 * (random_1_1 - 0.5);
         const double xi_237 = xi_169 * xi_217;
         const double xi_238 = xi_173 * xi_215;
-        const double xi_239 = xi_235 * -1.0 + xi_236 + xi_237 * -1.0 + xi_238;
-        const double xi_241 = xi_235 + xi_236 * -1.0 + xi_237 + xi_238 * -1.0;
-        const double xi_242 = xi_212 + xi_214 * -1.0 + xi_216 + xi_218 * -1.0;
+        const double xi_239 = -xi_235 + xi_236 - xi_237 + xi_238;
+        const double xi_241 = xi_235 - xi_236 + xi_237 - xi_238;
+        const double xi_242 = xi_212 - xi_214 + xi_216 - xi_218;
         const double xi_0 = ((1.0) / (rho));
         const double xi_7 = xi_0 * 0.5;
-        const double u_0 = xi_0 * (vel0Term + xi_13 + xi_8 + xi_9) + xi_261 * xi_7;
-        const double xi_25 = u_0 * xi_261;
+        const double u_0 = xi_0 * (vel0Term + xi_13 + xi_8 + xi_9) + xi_248 * xi_7;
+        const double xi_25 = u_0 * xi_248;
         const double xi_37 = xi_25 * 0.16666666666666666;
         const double xi_38 = xi_25 * 0.083333333333333329;
         const double xi_39 = omega_shear * xi_38;
-        const double xi_40 = xi_37 * -1.0 + xi_39;
-        const double xi_56 = xi_25 * xi_55 * -1.0 + xi_37;
-        const double xi_57 = xi_43 * -1.0 + xi_54 + xi_56;
-        const double xi_61 = xi_25 * xi_60 * -1.0;
+        const double xi_40 = -xi_37 + xi_39;
+        const double xi_56 = -xi_25 * xi_55 + xi_37;
+        const double xi_57 = -xi_43 + xi_54 + xi_56;
+        const double xi_61 = -xi_25 * xi_60;
         const double xi_68 = u_0 * xi_67;
         const double xi_73 = u_0 * xi_72;
-        const double xi_77 = xi_43 + xi_54 * -1.0 + xi_56;
-        const double xi_84 = xi_38 * -1.0;
+        const double xi_77 = xi_43 - xi_54 + xi_56;
+        const double xi_84 = -xi_38;
         const double xi_95 = u_0 * xi_259;
         const double xi_96 = xi_95 * 0.25;
         const double xi_99 = xi_71 * xi_95;
         const double xi_113 = rho * (u_0 * u_0);
-        const double u_1 = xi_0 * (vel1Term + xi_16 + xi_19 + xi_257 + xi_8) + xi_254 * xi_7;
-        const double xi_26 = u_1 * xi_254;
+        const double u_1 = xi_0 * (vel1Term + xi_16 + xi_19 + xi_245 + xi_8) + xi_250 * xi_7;
+        const double xi_26 = u_1 * xi_250;
         const double xi_32 = xi_26 * 0.16666666666666666;
         const double xi_45 = xi_26 * 0.083333333333333329;
         const double xi_46 = omega_shear * xi_45;
-        const double xi_47 = xi_32 * -1.0 + xi_46;
-        const double xi_62 = xi_26 * xi_60 * -1.0;
+        const double xi_47 = -xi_32 + xi_46;
+        const double xi_62 = -xi_26 * xi_60;
         const double xi_69 = u_1 * 0.25;
-        const double xi_70 = xi_261 * xi_69;
+        const double xi_70 = xi_248 * xi_69;
         const double xi_74 = u_1 * xi_71;
-        const double xi_75 = xi_261 * xi_74;
-        const double xi_76 = xi_68 * -1.0 + xi_70 * -1.0 + xi_73 + xi_75;
-        const double xi_78 = xi_68 + xi_70 + xi_73 * -1.0 + xi_75 * -1.0;
+        const double xi_75 = xi_248 * xi_74;
+        const double xi_76 = -xi_68 - xi_70 + xi_73 + xi_75;
+        const double xi_78 = xi_68 + xi_70 - xi_73 - xi_75;
         const double xi_86 = xi_259 * xi_69;
         const double xi_88 = xi_259 * xi_74;
-        const double xi_93 = xi_45 * -1.0;
+        const double xi_93 = -xi_45;
         const double xi_112 = rho * (u_1 * u_1);
         const double xi_121 = xi_112 + xi_120 + xi_9;
         const double xi_197 = rho * u_1;
-        const double xi_199 = xi_198 * (u_0 * xi_197 + xi_120 + xi_247 + xi_252);
-        const double xi_200 = xi_196 * -1.0 + xi_199 * -1.0;
+        const double xi_199 = xi_198 * (u_0 * xi_197 + xi_120 + xi_247 + xi_254);
+        const double xi_200 = -xi_196 - xi_199;
         const double xi_201 = xi_196 + xi_199;
-        const double u_2 = xi_0 * (vel2Term + xi_21 + xi_24 + xi_260) + xi_259 * xi_7;
+        const double u_2 = xi_0 * (vel2Term + xi_21 + xi_24 + xi_251) + xi_259 * xi_7;
         const double xi_27 = u_2 * xi_259;
         const double xi_33 = xi_27 * 0.16666666666666666;
         const double xi_34 = xi_27 * 0.083333333333333329;
         const double xi_35 = omega_shear * xi_34;
-        const double xi_36 = xi_33 * -1.0 + xi_35;
-        const double xi_41 = omega_shear * xi_32 * -1.0 + xi_26 * 0.33333333333333331 + xi_36 + xi_40;
-        const double xi_48 = omega_shear * xi_37 * -1.0 + xi_25 * 0.33333333333333331 + xi_36 + xi_47;
-        const double xi_52 = omega_shear * xi_33 * -1.0 + xi_27 * 0.33333333333333331 + xi_40 + xi_47;
-        const double xi_58 = xi_34 * -1.0;
-        const double xi_63 = xi_27 * xi_60 * -1.0;
-        const double xi_64 = xi_26 * xi_55 * -1.0 + xi_32 + xi_61 + xi_62 + xi_63;
-        const double xi_65 = xi_30 + xi_59 * -1.0 + xi_64;
+        const double xi_36 = -xi_33 + xi_35;
+        const double xi_41 = -omega_shear * xi_32 + xi_26 * 0.33333333333333331 + xi_36 + xi_40;
+        const double xi_48 = -omega_shear * xi_37 + xi_25 * 0.33333333333333331 + xi_36 + xi_47;
+        const double xi_52 = -omega_shear * xi_33 + xi_27 * 0.33333333333333331 + xi_40 + xi_47;
+        const double xi_58 = -xi_34;
+        const double xi_63 = -xi_27 * xi_60;
+        const double xi_64 = -xi_26 * xi_55 + xi_32 + xi_61 + xi_62 + xi_63;
+        const double xi_65 = xi_30 - xi_59 + xi_64;
         const double xi_66 = xi_35 + xi_58 + xi_65;
-        const double xi_79 = xi_30 * -1.0 + xi_59 + xi_64;
+        const double xi_79 = -xi_30 + xi_59 + xi_64;
         const double xi_80 = xi_35 + xi_58 + xi_79;
-        const double xi_82 = xi_27 * xi_55 * -1.0 + xi_33;
-        const double xi_83 = xi_50 + xi_81 * -1.0 + xi_82;
+        const double xi_82 = -xi_27 * xi_55 + xi_33;
+        const double xi_83 = xi_50 - xi_81 + xi_82;
         const double xi_85 = xi_39 + xi_65 + xi_84;
         const double xi_87 = u_2 * xi_67;
         const double xi_89 = u_2 * xi_72;
-        const double xi_90 = xi_86 + xi_87 + xi_88 * -1.0 + xi_89 * -1.0;
+        const double xi_90 = xi_86 + xi_87 - xi_88 - xi_89;
         const double xi_91 = xi_39 + xi_79 + xi_84;
-        const double xi_92 = xi_86 * -1.0 + xi_87 * -1.0 + xi_88 + xi_89;
+        const double xi_92 = -xi_86 - xi_87 + xi_88 + xi_89;
         const double xi_94 = xi_46 + xi_61 + xi_62 + xi_63 + xi_83 + xi_93;
-        const double xi_97 = u_2 * xi_261;
+        const double xi_97 = u_2 * xi_248;
         const double xi_98 = xi_97 * 0.25;
         const double xi_100 = xi_71 * xi_97;
-        const double xi_101 = xi_100 + xi_96 * -1.0 + xi_98 * -1.0 + xi_99;
-        const double xi_102 = xi_100 * -1.0 + xi_96 + xi_98 + xi_99 * -1.0;
-        const double xi_103 = xi_50 * -1.0 + xi_81 + xi_82;
+        const double xi_101 = xi_100 - xi_96 - xi_98 + xi_99;
+        const double xi_102 = -xi_100 + xi_96 + xi_98 - xi_99;
+        const double xi_103 = -xi_50 + xi_81 + xi_82;
         const double xi_104 = xi_103 + xi_46 + xi_61 + xi_62 + xi_63 + xi_93;
         const double xi_115 = rho * (u_2 * u_2);
-        const double xi_116 = xi_114 + xi_115 * 0.66666666666666663 + xi_245 * 3.0 + xi_265 * 3.0;
-        const double xi_117 = omega_even * (xi_112 * 0.66666666666666663 + xi_113 * 1.6666666666666667 + xi_116 + xi_246 * -3.0 + xi_248 * -3.0 + xi_251 * 3.0 + xi_255 * -3.0 + xi_263 * -3.0 + xi_264 * 3.0);
-        const double xi_124 = omega_bulk * (xi_113 + xi_115 + xi_119 + xi_121 + xi_123 + xi_17 + xi_22 + xi_253);
-        const double xi_127 = omega_even * (xi_112 * 2.3333333333333335 + xi_116 + xi_126 + xi_249 * -5.0 + xi_251 * -2.0 + xi_256 * -5.0 + xi_260 * -5.0 + xi_262 * -5.0 + xi_264 * -2.0);
-        const double xi_131 = omega_even * (xi_114 + xi_115 * 3.0 + xi_126 + xi_128 + xi_129 + xi_130 + xi_245 * -4.0 + xi_247 * -7.0 + xi_250 * -7.0 + xi_251 * 5.0 + xi_252 * -7.0 + xi_257 * -7.0 + xi_264 * 5.0 + xi_265 * -4.0);
-        const double xi_156 = xi_115 * -1.0 + xi_265;
-        const double xi_157 = omega_shear * (xi_121 + xi_156 + xi_16 + xi_2 + xi_251 * -1.0 + xi_256 + xi_6);
+        const double xi_116 = xi_114 + xi_115 * 0.66666666666666663 + xi_256 * 3.0 + xi_262 * 3.0;
+        const double xi_117 = omega_even * (xi_112 * 0.66666666666666663 + xi_113 * 1.6666666666666667 + xi_116 + xi_244 * 3.0 + xi_253 * -3.0 + xi_255 * -3.0 + xi_260 * 3.0 + xi_263 * -3.0 + xi_265 * -3.0);
+        const double xi_124 = omega_bulk * (xi_113 + xi_115 + xi_119 + xi_121 + xi_123 + xi_17 + xi_22 + xi_264);
+        const double xi_127 = omega_even * (xi_112 * 2.3333333333333335 + xi_116 + xi_126 + xi_244 * -2.0 + xi_246 * -5.0 + xi_251 * -5.0 + xi_252 * -5.0 + xi_260 * -2.0 + xi_261 * -5.0);
+        const double xi_131 = omega_even * (xi_114 + xi_115 * 3.0 + xi_126 + xi_128 + xi_129 + xi_130 + xi_244 * 5.0 + xi_245 * -7.0 + xi_247 * -7.0 + xi_254 * -7.0 + xi_256 * -4.0 + xi_257 * -7.0 + xi_260 * 5.0 + xi_262 * -4.0);
+        const double xi_156 = -xi_115 + xi_256;
+        const double xi_157 = omega_shear * (xi_121 + xi_156 + xi_16 + xi_2 - xi_244 + xi_252 + xi_6);
         const double xi_158 = xi_157 * 0.125;
         const double xi_159 = xi_107 * -0.11904761904761904 + xi_131 * -0.01984126984126984;
-        const double xi_160 = omega_shear * (xi_112 * -1.0 + xi_113 * 2.0 + xi_120 + xi_123 + xi_125 + xi_156 + xi_244 * -2.0 + xi_245 + xi_251 + xi_258 * -2.0 + xi_264 + xi_9);
+        const double xi_160 = omega_shear * (-xi_112 + xi_113 * 2.0 + xi_120 + xi_123 + xi_125 + xi_156 + xi_244 + xi_249 * -2.0 + xi_258 * -2.0 + xi_260 + xi_262 + xi_9);
         const double xi_162 = xi_160 * -0.041666666666666664 + xi_161 * -0.16666666666666666;
         const double xi_163 = xi_108 * -0.10000000000000001 + xi_117 * -0.050000000000000003 + xi_162;
         const double xi_164 = xi_111 * 0.028571428571428571 + xi_127 * 0.014285714285714285 + xi_155 + xi_158 + xi_159 + xi_163;
         const double xi_176 = xi_111 * -0.071428571428571425 + xi_127 * -0.035714285714285712 + xi_159 + xi_160 * 0.083333333333333329 + xi_161 * 0.33333333333333331;
-        const double xi_187 = xi_107 * 0.095238095238095233 + xi_111 * -0.042857142857142858 + xi_127 * -0.021428571428571429 + xi_131 * 0.015873015873015872 + xi_155 * -1.0 + xi_158 * -1.0 + xi_163;
+        const double xi_187 = xi_107 * 0.095238095238095233 + xi_111 * -0.042857142857142858 + xi_127 * -0.021428571428571429 + xi_131 * 0.015873015873015872 - xi_155 - xi_158 + xi_163;
         const double xi_190 = xi_157 * 0.0625;
         const double xi_191 = xi_131 * 0.013888888888888888;
         const double xi_193 = xi_110 * 0.083333333333333329 + xi_124 * 0.041666666666666664;
@@ -406,25 +361,25 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalized_collidesweepdoubl
         const double xi_195 = xi_165 + xi_189 + xi_190 + xi_191 + xi_192 + xi_194;
         const double xi_202 = xi_151 + xi_189 + xi_190 + xi_191 + xi_192 + xi_194;
         const double xi_204 = xi_127 * -0.0071428571428571426;
-        const double xi_205 = xi_198 * (u_2 * xi_197 + xi_132 + xi_17 + xi_246);
+        const double xi_205 = xi_198 * (u_2 * xi_197 + xi_132 + xi_17 + xi_255);
         const double xi_206 = xi_117 * 0.025000000000000001;
         const double xi_209 = xi_107 * -0.023809523809523808 + xi_131 * -0.003968253968253968;
         const double xi_210 = xi_162 + xi_193 + xi_203 + xi_204 + xi_205 + xi_206 + xi_207 + xi_208 + xi_209;
-        const double xi_226 = xi_162 + xi_193 + xi_203 * -1.0 + xi_204 + xi_205 * -1.0 + xi_206 + xi_207 + xi_208 + xi_209;
-        const double xi_228 = xi_190 * -1.0;
+        const double xi_226 = xi_162 + xi_193 - xi_203 + xi_204 - xi_205 + xi_206 + xi_207 + xi_208 + xi_209;
+        const double xi_228 = -xi_190;
         const double xi_229 = xi_127 * 0.017857142857142856;
         const double xi_231 = xi_188 + xi_194 + xi_209 + xi_227 + xi_228 + xi_229 + xi_230;
-        const double xi_233 = xi_198 * (rho * u_0 * u_2 + xi_10 + xi_166 + xi_249);
-        const double xi_234 = xi_232 * -1.0 + xi_233 * -1.0;
+        const double xi_233 = xi_198 * (rho * u_0 * u_2 + xi_10 + xi_166 + xi_246);
+        const double xi_234 = -xi_232 - xi_233;
         const double xi_240 = xi_232 + xi_233;
         const double xi_243 = xi_186 + xi_194 + xi_209 + xi_227 + xi_228 + xi_229 + xi_230;
-        const double forceTerm_0 = xi_25 * xi_28 + xi_25 * -1.0 + xi_26 * xi_28 + xi_26 * -1.0 + xi_27 * xi_28 + xi_27 * -1.0;
-        const double forceTerm_1 = xi_29 + xi_31 * -1.0 + xi_41;
-        const double forceTerm_2 = xi_29 * -1.0 + xi_31 + xi_41;
-        const double forceTerm_3 = xi_42 * -1.0 + xi_44 + xi_48;
-        const double forceTerm_4 = xi_42 + xi_44 * -1.0 + xi_48;
-        const double forceTerm_5 = xi_49 + xi_51 * -1.0 + xi_52;
-        const double forceTerm_6 = xi_49 * -1.0 + xi_51 + xi_52;
+        const double forceTerm_0 = xi_25 * xi_28 - xi_25 + xi_26 * xi_28 - xi_26 + xi_27 * xi_28 - xi_27;
+        const double forceTerm_1 = xi_29 - xi_31 + xi_41;
+        const double forceTerm_2 = -xi_29 + xi_31 + xi_41;
+        const double forceTerm_3 = -xi_42 + xi_44 + xi_48;
+        const double forceTerm_4 = xi_42 - xi_44 + xi_48;
+        const double forceTerm_5 = xi_49 - xi_51 + xi_52;
+        const double forceTerm_6 = -xi_49 + xi_51 + xi_52;
         const double forceTerm_7 = xi_57 + xi_66 + xi_76;
         const double forceTerm_8 = xi_66 + xi_77 + xi_78;
         const double forceTerm_9 = xi_57 + xi_78 + xi_80;
@@ -437,25 +392,25 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalized_collidesweepdoubl
         const double forceTerm_16 = xi_103 + xi_90 + xi_91;
         const double forceTerm_17 = xi_102 + xi_104 + xi_57;
         const double forceTerm_18 = xi_101 + xi_104 + xi_77;
-        _data_pdfs_20_30_10[_stride_pdfs_0 * ctr_0] = forceTerm_0 + xi_107 * 0.14285714285714285 + xi_108 * 0.20000000000000001 + xi_110 * -1.0 + xi_111 * 0.085714285714285715 + xi_117 * 0.10000000000000001 + xi_124 * -0.5 + xi_127 * 0.042857142857142858 + xi_131 * 0.023809523809523808 + xi_253;
-        _data_pdfs_20_31_10[_stride_pdfs_0 * ctr_0] = forceTerm_1 + xi_135 * -1.0 + xi_140 * -1.0 + xi_151 + xi_164 + xi_251;
-        _data_pdfs_20_32_10[_stride_pdfs_0 * ctr_0] = forceTerm_2 + xi_135 + xi_140 + xi_164 + xi_165 + xi_264;
-        _data_pdfs_20_33_10[_stride_pdfs_0 * ctr_0] = forceTerm_3 + xi_168 + xi_170 + xi_175 + xi_176 + xi_258;
-        _data_pdfs_20_34_10[_stride_pdfs_0 * ctr_0] = forceTerm_4 + xi_168 * -1.0 + xi_170 * -1.0 + xi_176 + xi_177 + xi_244;
-        _data_pdfs_20_35_10[_stride_pdfs_0 * ctr_0] = forceTerm_5 + xi_179 * -1.0 + xi_181 * -1.0 + xi_186 + xi_187 + xi_265;
-        _data_pdfs_20_36_10[_stride_pdfs_0 * ctr_0] = forceTerm_6 + xi_179 + xi_181 + xi_187 + xi_188 + xi_245;
-        _data_pdfs_20_37_10[_stride_pdfs_0 * ctr_0] = forceTerm_7 + xi_177 + xi_195 + xi_200 + xi_252;
-        _data_pdfs_20_38_10[_stride_pdfs_0 * ctr_0] = forceTerm_8 + xi_175 + xi_195 + xi_201 + xi_257;
-        _data_pdfs_20_39_10[_stride_pdfs_0 * ctr_0] = forceTerm_9 + xi_177 + xi_201 + xi_202 + xi_250;
-        _data_pdfs_20_310_10[_stride_pdfs_0 * ctr_0] = forceTerm_10 + xi_175 + xi_200 + xi_202 + xi_247;
-        _data_pdfs_20_311_10[_stride_pdfs_0 * ctr_0] = forceTerm_11 + xi_210 + xi_219 + xi_224 + xi_263;
-        _data_pdfs_20_312_10[_stride_pdfs_0 * ctr_0] = forceTerm_12 + xi_219 + xi_225 + xi_226 + xi_248;
-        _data_pdfs_20_313_10[_stride_pdfs_0 * ctr_0] = forceTerm_13 + xi_231 + xi_234 + xi_239 + xi_256;
-        _data_pdfs_20_314_10[_stride_pdfs_0 * ctr_0] = forceTerm_14 + xi_231 + xi_240 + xi_241 + xi_260;
-        _data_pdfs_20_315_10[_stride_pdfs_0 * ctr_0] = forceTerm_15 + xi_224 + xi_226 + xi_242 + xi_246;
-        _data_pdfs_20_316_10[_stride_pdfs_0 * ctr_0] = forceTerm_16 + xi_210 + xi_225 + xi_242 + xi_255;
-        _data_pdfs_20_317_10[_stride_pdfs_0 * ctr_0] = forceTerm_17 + xi_239 + xi_240 + xi_243 + xi_262;
-        _data_pdfs_20_318_10[_stride_pdfs_0 * ctr_0] = forceTerm_18 + xi_234 + xi_241 + xi_243 + xi_249;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2] = forceTerm_0 + xi_107 * 0.14285714285714285 + xi_108 * 0.20000000000000001 - xi_110 + xi_111 * 0.085714285714285715 + xi_117 * 0.10000000000000001 + xi_124 * -0.5 + xi_127 * 0.042857142857142858 + xi_131 * 0.023809523809523808 + xi_264;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3] = forceTerm_1 - xi_135 - xi_140 + xi_151 + xi_164 + xi_244;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3] = forceTerm_2 + xi_135 + xi_140 + xi_164 + xi_165 + xi_260;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3] = forceTerm_3 + xi_168 + xi_170 + xi_175 + xi_176 + xi_258;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3] = forceTerm_4 - xi_168 - xi_170 + xi_176 + xi_177 + xi_249;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3] = forceTerm_5 - xi_179 - xi_181 + xi_186 + xi_187 + xi_256;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3] = forceTerm_6 + xi_179 + xi_181 + xi_187 + xi_188 + xi_262;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3] = forceTerm_7 + xi_177 + xi_195 + xi_200 + xi_254;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3] = forceTerm_8 + xi_175 + xi_195 + xi_201 + xi_245;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3] = forceTerm_9 + xi_177 + xi_201 + xi_202 + xi_257;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3] = forceTerm_10 + xi_175 + xi_200 + xi_202 + xi_247;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3] = forceTerm_11 + xi_210 + xi_219 + xi_224 + xi_263;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3] = forceTerm_12 + xi_219 + xi_225 + xi_226 + xi_265;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3] = forceTerm_13 + xi_231 + xi_234 + xi_239 + xi_252;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3] = forceTerm_14 + xi_231 + xi_240 + xi_241 + xi_251;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3] = forceTerm_15 + xi_224 + xi_226 + xi_242 + xi_255;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3] = forceTerm_16 + xi_210 + xi_225 + xi_242 + xi_253;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3] = forceTerm_17 + xi_239 + xi_240 + xi_243 + xi_261;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3] = forceTerm_18 + xi_234 + xi_241 + xi_243 + xi_246;
       }
     }
   }
@@ -463,35 +418,37 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalized_collidesweepdoubl
 } // namespace internal_0d943397135d13b4628c5752888935d7
 
 void CollideSweepDoublePrecisionThermalized::run(IBlock *block) {
+  if (!this->configured_)
+    WALBERLA_ABORT("This Sweep contains a configure function that needs to be called manually")
+
   auto force = block->getData<field::GhostLayerField<double, 3>>(forceID);
   auto pdfs = block->getData<field::GhostLayerField<double, 19>>(pdfsID);
 
+  auto &time_step = this->time_step_;
+  auto &block_offset_1 = this->block_offset_1_;
+  auto &omega_even = this->omega_even_;
   auto &omega_bulk = this->omega_bulk_;
-  auto block_offset_1 = this->block_offset_1_;
   auto &seed = this->seed_;
-  auto &omega_even = this->omega_even_;
   auto &kT = this->kT_;
-  auto &omega_odd = this->omega_odd_;
-  auto block_offset_2 = this->block_offset_2_;
-  auto &time_step = this->time_step_;
-  auto block_offset_0 = this->block_offset_0_;
   auto &omega_shear = this->omega_shear_;
-  block_offset_generator(block, block_offset_0, block_offset_1, block_offset_2);
-  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()));
+  auto &omega_odd = this->omega_odd_;
+  auto &block_offset_0 = this->block_offset_0_;
+  auto &block_offset_2 = this->block_offset_2_;
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()))
   double *RESTRICT const _data_force = force->dataAt(0, 0, 0, 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()))
   double *RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(force->xSize()) + 0));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(force->xSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(force->ySize()) + 0));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(force->ySize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(force->zSize()) + 0));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(force->zSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(force->xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(force->xSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(force->ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(force->ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(force->zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(force->zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
   const int64_t _stride_force_0 = int64_t(force->xStride());
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
@@ -504,6 +461,9 @@ void CollideSweepDoublePrecisionThermalized::run(IBlock *block) {
 }
 
 void CollideSweepDoublePrecisionThermalized::runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block) {
+  if (!this->configured_)
+    WALBERLA_ABORT("This Sweep contains a configure function that needs to be called manually")
+
   CellInterval ci = globalCellInterval;
   CellInterval blockBB = blocks->getBlockCellBB(*block);
   blockBB.expand(ghostLayers);
@@ -515,36 +475,35 @@ void CollideSweepDoublePrecisionThermalized::runOnCellInterval(const shared_ptr<
   auto force = block->getData<field::GhostLayerField<double, 3>>(forceID);
   auto pdfs = block->getData<field::GhostLayerField<double, 19>>(pdfsID);
 
+  auto &time_step = this->time_step_;
+  auto &block_offset_1 = this->block_offset_1_;
+  auto &omega_even = this->omega_even_;
   auto &omega_bulk = this->omega_bulk_;
-  auto block_offset_1 = this->block_offset_1_;
   auto &seed = this->seed_;
-  auto &omega_even = this->omega_even_;
   auto &kT = this->kT_;
-  auto &omega_odd = this->omega_odd_;
-  auto block_offset_2 = this->block_offset_2_;
-  auto &time_step = this->time_step_;
-  auto block_offset_0 = this->block_offset_0_;
   auto &omega_shear = this->omega_shear_;
-  block_offset_generator(block, block_offset_0, block_offset_1, block_offset_2);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()));
+  auto &omega_odd = this->omega_odd_;
+  auto &block_offset_0 = this->block_offset_0_;
+  auto &block_offset_2 = this->block_offset_2_;
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()))
   double *RESTRICT const _data_force = force->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()))
   double *RESTRICT _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(ci.xSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(ci.ySize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(ci.zSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(ci.xSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(ci.ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(ci.zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
   const int64_t _stride_force_0 = int64_t(force->xStride());
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
@@ -565,4 +524,4 @@ void CollideSweepDoublePrecisionThermalized::runOnCellInterval(const shared_ptr<
 
 #if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
 #pragma warning pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalized.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalized.h
index d5b207b7b3d..192b2fcf077 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalized.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalized.h
@@ -17,12 +17,13 @@
 //! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2,
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
 // lbmpy_walberla/pystencils_walberla from waLBerla commit
-// 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #pragma once
 #include "core/DataTypes.h"
+#include "core/logging/Logging.h"
 
 #include "domain_decomposition/BlockDataID.h"
 #include "domain_decomposition/IBlock.h"
@@ -51,16 +52,15 @@ namespace pystencils {
 
 class CollideSweepDoublePrecisionThermalized {
 public:
-  CollideSweepDoublePrecisionThermalized(
-      BlockDataID forceID_, BlockDataID pdfsID_, uint32_t block_offset_0,
-      uint32_t block_offset_1, uint32_t block_offset_2, double kT,
-      double omega_bulk, double omega_even, double omega_odd,
-      double omega_shear, uint32_t seed, uint32_t time_step)
-      : forceID(forceID_), pdfsID(pdfsID_), block_offset_0_(block_offset_0),
-        block_offset_1_(block_offset_1), block_offset_2_(block_offset_2),
-        kT_(kT), omega_bulk_(omega_bulk), omega_even_(omega_even),
-        omega_odd_(omega_odd), omega_shear_(omega_shear), seed_(seed),
-        time_step_(time_step){};
+  CollideSweepDoublePrecisionThermalized(BlockDataID forceID_,
+                                         BlockDataID pdfsID_, double kT,
+                                         double omega_bulk, double omega_even,
+                                         double omega_odd, double omega_shear,
+                                         uint32_t seed, uint32_t time_step)
+      : forceID(forceID_), pdfsID(pdfsID_), kT_(kT), omega_bulk_(omega_bulk),
+        omega_even_(omega_even), omega_odd_(omega_odd),
+        omega_shear_(omega_shear), seed_(seed), time_step_(time_step),
+        configured_(false){};
 
   void run(IBlock *block);
 
@@ -97,6 +97,15 @@ class CollideSweepDoublePrecisionThermalized {
     };
   }
 
+  void configure(const shared_ptr<StructuredBlockStorage> &blocks,
+                 IBlock *block) {
+    Cell BlockCellBB = blocks->getBlockCellBB(*block).min();
+    block_offset_0_ = uint32_t(BlockCellBB[0]);
+    block_offset_1_ = uint32_t(BlockCellBB[1]);
+    block_offset_2_ = uint32_t(BlockCellBB[2]);
+    configured_ = true;
+  }
+
   BlockDataID forceID;
   BlockDataID pdfsID;
   uint32_t block_offset_0_;
@@ -109,9 +118,7 @@ class CollideSweepDoublePrecisionThermalized {
   double omega_shear_;
   uint32_t seed_;
   uint32_t time_step_;
-  std::function<void(IBlock *, uint32_t &, uint32_t &, uint32_t &)>
-      block_offset_generator =
-          [](IBlock *const, uint32_t &, uint32_t &, uint32_t &) {};
+  bool configured_;
 };
 
 } // namespace pystencils
@@ -120,4 +127,4 @@ class CollideSweepDoublePrecisionThermalized {
 #if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
     (defined WALBERLA_CXX_COMPILER_IS_CLANG)
 #pragma GCC diagnostic pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedAVX.cpp b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedAVX.cpp
index 18b00c96698..dffc06cbc68 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedAVX.cpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedAVX.cpp
@@ -14,11 +14,10 @@
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
 //! \\file CollideSweepDoublePrecisionThermalizedAVX.cpp
-//! \\ingroup lbm
-//! \\author lbmpy
+//! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2, lbmpy_walberla/pystencils_walberla from waLBerla commit 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #include <cmath>
 
@@ -26,10 +25,10 @@
 #include "core/DataTypes.h"
 #include "core/Macros.h"
 
-#include <immintrin.h>
-
 #include "philox_rand.h"
 
+#include <immintrin.h>
+
 #define FUNC_PREFIX
 
 #if (defined WALBERLA_CXX_COMPILER_IS_GNU) || (defined WALBERLA_CXX_COMPILER_IS_CLANG)
@@ -65,75 +64,31 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalizedavx_collidesweepdo
   const double rr_0 = 0.0;
   const double xi_53 = rr_0 * 0.041666666666666664;
   for (int64_t ctr_2 = 0; ctr_2 < _size_force_2; ctr_2 += 1) {
-    double *RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_36 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_315 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_312 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_318 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2 * ctr_2;
-    double *RESTRICT _data_force_20_31 = _data_force + _stride_force_2 * ctr_2 + _stride_force_3;
-    double *RESTRICT _data_pdfs_20_316 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_313 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3;
-    double *RESTRICT _data_force_20_32 = _data_force + _stride_force_2 * ctr_2 + 2 * _stride_force_3;
-    double *RESTRICT _data_pdfs_20_314 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3;
-    double *RESTRICT _data_force_20_30 = _data_force + _stride_force_2 * ctr_2;
-    double *RESTRICT _data_pdfs_20_317 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_311 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_35 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3;
     for (int64_t ctr_1 = 0; ctr_1 < _size_force_1; ctr_1 += 1) {
-      double *RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_34;
-      double *RESTRICT _data_pdfs_20_36_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_36;
-      double *RESTRICT _data_pdfs_20_315_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_315;
-      double *RESTRICT _data_pdfs_20_310_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_310;
-      double *RESTRICT _data_pdfs_20_312_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_312;
-      double *RESTRICT _data_pdfs_20_318_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_318;
-      double *RESTRICT _data_pdfs_20_39_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_39;
-      double *RESTRICT _data_pdfs_20_31_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_31;
-      double *RESTRICT _data_pdfs_20_37_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_37;
-      double *RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_30;
-      double *RESTRICT _data_force_20_31_10 = _stride_force_1 * ctr_1 + _data_force_20_31;
-      double *RESTRICT _data_pdfs_20_316_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_316;
-      double *RESTRICT _data_pdfs_20_313_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_313;
-      double *RESTRICT _data_pdfs_20_38_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_38;
-      double *RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_33;
-      double *RESTRICT _data_force_20_32_10 = _stride_force_1 * ctr_1 + _data_force_20_32;
-      double *RESTRICT _data_pdfs_20_314_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_314;
-      double *RESTRICT _data_force_20_30_10 = _stride_force_1 * ctr_1 + _data_force_20_30;
-      double *RESTRICT _data_pdfs_20_317_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_317;
-      double *RESTRICT _data_pdfs_20_311_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_311;
-      double *RESTRICT _data_pdfs_20_32_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_32;
-      double *RESTRICT _data_pdfs_20_35_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_35;
       {
         for (int64_t ctr_0 = 0; ctr_0 < (int64_t)((_size_force_0) / (4)) * (4); ctr_0 += 4) {
-          const __m256d xi_244 = _mm256_load_pd(&_data_pdfs_20_34_10[ctr_0]);
-          const __m256d xi_245 = _mm256_load_pd(&_data_pdfs_20_36_10[ctr_0]);
-          const __m256d xi_246 = _mm256_load_pd(&_data_pdfs_20_315_10[ctr_0]);
-          const __m256d xi_247 = _mm256_load_pd(&_data_pdfs_20_310_10[ctr_0]);
-          const __m256d xi_248 = _mm256_load_pd(&_data_pdfs_20_312_10[ctr_0]);
-          const __m256d xi_249 = _mm256_load_pd(&_data_pdfs_20_318_10[ctr_0]);
-          const __m256d xi_250 = _mm256_load_pd(&_data_pdfs_20_39_10[ctr_0]);
-          const __m256d xi_251 = _mm256_load_pd(&_data_pdfs_20_31_10[ctr_0]);
-          const __m256d xi_252 = _mm256_load_pd(&_data_pdfs_20_37_10[ctr_0]);
-          const __m256d xi_253 = _mm256_load_pd(&_data_pdfs_20_30_10[ctr_0]);
-          const __m256d xi_254 = _mm256_load_pd(&_data_force_20_31_10[ctr_0]);
-          const __m256d xi_255 = _mm256_load_pd(&_data_pdfs_20_316_10[ctr_0]);
-          const __m256d xi_256 = _mm256_load_pd(&_data_pdfs_20_313_10[ctr_0]);
-          const __m256d xi_257 = _mm256_load_pd(&_data_pdfs_20_38_10[ctr_0]);
-          const __m256d xi_258 = _mm256_load_pd(&_data_pdfs_20_33_10[ctr_0]);
-          const __m256d xi_259 = _mm256_load_pd(&_data_force_20_32_10[ctr_0]);
-          const __m256d xi_260 = _mm256_load_pd(&_data_pdfs_20_314_10[ctr_0]);
-          const __m256d xi_261 = _mm256_load_pd(&_data_force_20_30_10[ctr_0]);
-          const __m256d xi_262 = _mm256_load_pd(&_data_pdfs_20_317_10[ctr_0]);
-          const __m256d xi_263 = _mm256_load_pd(&_data_pdfs_20_311_10[ctr_0]);
-          const __m256d xi_264 = _mm256_load_pd(&_data_pdfs_20_32_10[ctr_0]);
-          const __m256d xi_265 = _mm256_load_pd(&_data_pdfs_20_35_10[ctr_0]);
+          const __m256d xi_244 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_245 = _mm256_load_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_246 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_247 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_248 = _mm256_load_pd(&_data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + ctr_0]);
+          const __m256d xi_249 = _mm256_load_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_250 = _mm256_loadu_pd(&_data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3 + ctr_0]);
+          const __m256d xi_251 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_252 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_253 = _mm256_load_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_254 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_255 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_256 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_257 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_258 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_259 = _mm256_loadu_pd(&_data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3 + ctr_0]);
+          const __m256d xi_260 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_261 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_262 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_263 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3 + ctr_0]);
+          const __m256d xi_264 = _mm256_load_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + ctr_0]);
+          const __m256d xi_265 = _mm256_load_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3 + ctr_0]);
 
           __m256d random_7_0{};
           __m256d random_7_1{};
@@ -182,66 +137,66 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalizedavx_collidesweepdo
           if (kT > 0.) {
             philox_double2(time_step, _mm256_add_epi32(_mm256_add_epi32(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), _mm256_set_epi32(ctr_0, ctr_0, ctr_0, ctr_0, ctr_0, ctr_0, ctr_0, ctr_0)), _mm256_set_epi32(((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)))), block_offset_1 + ctr_1, block_offset_2 + ctr_2, 0, seed, random_0_0, random_0_1);
           }
-          const __m256d xi_2 = _mm256_add_pd(xi_249, xi_260);
-          const __m256d xi_3 = _mm256_add_pd(xi_2, xi_244);
-          const __m256d xi_4 = _mm256_add_pd(_mm256_add_pd(xi_246, xi_251), xi_263);
-          const __m256d xi_5 = _mm256_add_pd(xi_248, xi_265);
-          const __m256d xi_6 = _mm256_add_pd(xi_245, xi_262);
-          const __m256d xi_8 = _mm256_mul_pd(xi_250, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
-          const __m256d xi_9 = _mm256_mul_pd(xi_252, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
-          const __m256d xi_10 = _mm256_mul_pd(xi_262, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
-          const __m256d xi_11 = _mm256_mul_pd(xi_256, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
+          const __m256d xi_2 = _mm256_add_pd(xi_246, xi_251);
+          const __m256d xi_3 = _mm256_add_pd(xi_2, xi_249);
+          const __m256d xi_4 = _mm256_add_pd(_mm256_add_pd(xi_244, xi_255), xi_263);
+          const __m256d xi_5 = _mm256_add_pd(xi_256, xi_265);
+          const __m256d xi_6 = _mm256_add_pd(xi_261, xi_262);
+          const __m256d xi_8 = _mm256_mul_pd(xi_257, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
+          const __m256d xi_9 = _mm256_mul_pd(xi_254, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
+          const __m256d xi_10 = _mm256_mul_pd(xi_261, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
+          const __m256d xi_11 = _mm256_mul_pd(xi_252, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
           const __m256d xi_12 = _mm256_mul_pd(xi_258, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
           const __m256d xi_13 = _mm256_add_pd(_mm256_add_pd(xi_10, xi_11), xi_12);
-          const __m256d xi_14 = _mm256_mul_pd(xi_264, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
+          const __m256d xi_14 = _mm256_mul_pd(xi_260, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
           const __m256d xi_15 = _mm256_mul_pd(xi_247, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
           const __m256d xi_16 = _mm256_add_pd(xi_14, xi_15);
-          const __m256d xi_17 = _mm256_mul_pd(xi_255, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
-          const __m256d xi_18 = _mm256_mul_pd(xi_248, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
+          const __m256d xi_17 = _mm256_mul_pd(xi_253, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
+          const __m256d xi_18 = _mm256_mul_pd(xi_265, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
           const __m256d xi_19 = _mm256_add_pd(xi_17, xi_18);
-          const __m256d xi_20 = _mm256_mul_pd(xi_249, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
+          const __m256d xi_20 = _mm256_mul_pd(xi_246, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
           const __m256d xi_21 = _mm256_add_pd(xi_10, xi_20);
-          const __m256d xi_22 = _mm256_mul_pd(xi_246, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
-          const __m256d xi_23 = _mm256_mul_pd(xi_245, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
+          const __m256d xi_22 = _mm256_mul_pd(xi_255, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
+          const __m256d xi_23 = _mm256_mul_pd(xi_262, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
           const __m256d xi_24 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(xi_17, xi_22), xi_23), xi_263);
-          const __m256d xi_29 = _mm256_mul_pd(xi_254, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666));
-          const __m256d xi_30 = _mm256_mul_pd(xi_254, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329));
-          const __m256d xi_42 = _mm256_mul_pd(xi_261, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666));
-          const __m256d xi_43 = _mm256_mul_pd(xi_261, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329));
+          const __m256d xi_29 = _mm256_mul_pd(xi_250, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666));
+          const __m256d xi_30 = _mm256_mul_pd(xi_250, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329));
+          const __m256d xi_42 = _mm256_mul_pd(xi_248, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666));
+          const __m256d xi_43 = _mm256_mul_pd(xi_248, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329));
           const __m256d xi_49 = _mm256_mul_pd(xi_259, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666));
           const __m256d xi_50 = _mm256_mul_pd(xi_259, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329));
-          const __m256d xi_67 = _mm256_mul_pd(xi_254, _mm256_set_pd(0.25, 0.25, 0.25, 0.25));
-          const __m256d xi_72 = _mm256_mul_pd(xi_254, _mm256_set_pd(xi_71, xi_71, xi_71, xi_71));
-          const __m256d xi_114 = _mm256_mul_pd(xi_253, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
+          const __m256d xi_67 = _mm256_mul_pd(xi_250, _mm256_set_pd(0.25, 0.25, 0.25, 0.25));
+          const __m256d xi_72 = _mm256_mul_pd(xi_250, _mm256_set_pd(xi_71, xi_71, xi_71, xi_71));
+          const __m256d xi_114 = _mm256_mul_pd(xi_264, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
           const __m256d xi_118 = _mm256_mul_pd(xi_263, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
           const __m256d xi_119 = _mm256_add_pd(xi_118, xi_18);
-          const __m256d xi_120 = _mm256_add_pd(_mm256_mul_pd(xi_257, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), xi_8);
-          const __m256d xi_122 = _mm256_mul_pd(xi_260, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
+          const __m256d xi_120 = _mm256_add_pd(_mm256_mul_pd(xi_245, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), xi_8);
+          const __m256d xi_122 = _mm256_mul_pd(xi_251, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
           const __m256d xi_123 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(xi_11, xi_122), xi_15), xi_21);
-          const __m256d xi_125 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_246, _mm256_set_pd(2.0, 2.0, 2.0, 2.0)), _mm256_mul_pd(xi_248, _mm256_set_pd(2.0, 2.0, 2.0, 2.0))), _mm256_mul_pd(xi_255, _mm256_set_pd(2.0, 2.0, 2.0, 2.0))), _mm256_mul_pd(xi_263, _mm256_set_pd(2.0, 2.0, 2.0, 2.0)));
-          const __m256d xi_126 = _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_244, _mm256_set_pd(5.0, 5.0, 5.0, 5.0)), _mm256_mul_pd(xi_258, _mm256_set_pd(5.0, 5.0, 5.0, 5.0))), xi_125);
-          const __m256d xi_128 = _mm256_mul_pd(xi_256, _mm256_set_pd(2.0, 2.0, 2.0, 2.0));
-          const __m256d xi_129 = _mm256_mul_pd(xi_260, _mm256_set_pd(2.0, 2.0, 2.0, 2.0));
-          const __m256d xi_130 = _mm256_add_pd(_mm256_mul_pd(xi_249, _mm256_set_pd(2.0, 2.0, 2.0, 2.0)), _mm256_mul_pd(xi_262, _mm256_set_pd(2.0, 2.0, 2.0, 2.0)));
-          const __m256d xi_132 = _mm256_add_pd(xi_118, xi_248);
-          const __m256d xi_133 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(xi_132, xi_14), xi_22), xi_251), xi_255);
+          const __m256d xi_125 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_253, _mm256_set_pd(2.0, 2.0, 2.0, 2.0)), _mm256_mul_pd(xi_255, _mm256_set_pd(2.0, 2.0, 2.0, 2.0))), _mm256_mul_pd(xi_263, _mm256_set_pd(2.0, 2.0, 2.0, 2.0))), _mm256_mul_pd(xi_265, _mm256_set_pd(2.0, 2.0, 2.0, 2.0)));
+          const __m256d xi_126 = _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_249, _mm256_set_pd(5.0, 5.0, 5.0, 5.0)), _mm256_mul_pd(xi_258, _mm256_set_pd(5.0, 5.0, 5.0, 5.0))), xi_125);
+          const __m256d xi_128 = _mm256_mul_pd(xi_252, _mm256_set_pd(2.0, 2.0, 2.0, 2.0));
+          const __m256d xi_129 = _mm256_mul_pd(xi_251, _mm256_set_pd(2.0, 2.0, 2.0, 2.0));
+          const __m256d xi_130 = _mm256_add_pd(_mm256_mul_pd(xi_246, _mm256_set_pd(2.0, 2.0, 2.0, 2.0)), _mm256_mul_pd(xi_261, _mm256_set_pd(2.0, 2.0, 2.0, 2.0)));
+          const __m256d xi_132 = _mm256_add_pd(xi_118, xi_265);
+          const __m256d xi_133 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(xi_132, xi_14), xi_22), xi_244), xi_253);
           const __m256d xi_135 = _mm256_mul_pd(xi_133, _mm256_set_pd(xi_134, xi_134, xi_134, xi_134));
           const __m256d xi_136 = _mm256_add_pd(_mm256_set_pd(-0.5, -0.5, -0.5, -0.5), random_5_1);
-          const __m256d xi_141 = _mm256_mul_pd(xi_252, _mm256_set_pd(2.0, 2.0, 2.0, 2.0));
+          const __m256d xi_141 = _mm256_mul_pd(xi_254, _mm256_set_pd(2.0, 2.0, 2.0, 2.0));
           const __m256d xi_142 = _mm256_mul_pd(xi_247, _mm256_set_pd(2.0, 2.0, 2.0, 2.0));
-          const __m256d xi_143 = _mm256_add_pd(_mm256_mul_pd(xi_250, _mm256_set_pd(2.0, 2.0, 2.0, 2.0)), _mm256_mul_pd(xi_257, _mm256_set_pd(-2.0, -2.0, -2.0, -2.0)));
+          const __m256d xi_143 = _mm256_add_pd(_mm256_mul_pd(xi_257, _mm256_set_pd(2.0, 2.0, 2.0, 2.0)), _mm256_mul_pd(xi_245, _mm256_set_pd(-2.0, -2.0, -2.0, -2.0)));
           const __m256d xi_144 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_141, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), xi_14), xi_142), xi_143), xi_19), xi_4);
           const __m256d xi_146 = _mm256_mul_pd(xi_144, _mm256_set_pd(xi_145, xi_145, xi_145, xi_145));
           const __m256d xi_147 = _mm256_add_pd(_mm256_set_pd(-0.5, -0.5, -0.5, -0.5), random_3_0);
           const __m256d xi_152 = _mm256_add_pd(_mm256_set_pd(-0.5, -0.5, -0.5, -0.5), random_0_1);
-          const __m256d xi_166 = _mm256_add_pd(xi_122, xi_256);
-          const __m256d xi_167 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(xi_12, xi_166), xi_20), xi_244), xi_262);
+          const __m256d xi_166 = _mm256_add_pd(xi_122, xi_252);
+          const __m256d xi_167 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(xi_12, xi_166), xi_20), xi_249), xi_261);
           const __m256d xi_168 = _mm256_mul_pd(xi_167, _mm256_set_pd(xi_134, xi_134, xi_134, xi_134));
           const __m256d xi_169 = _mm256_add_pd(_mm256_set_pd(-0.5, -0.5, -0.5, -0.5), random_4_1);
           const __m256d xi_171 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_142, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), xi_13), xi_141), xi_143), xi_3);
           const __m256d xi_172 = _mm256_mul_pd(xi_171, _mm256_set_pd(xi_145, xi_145, xi_145, xi_145));
           const __m256d xi_173 = _mm256_add_pd(_mm256_set_pd(-0.5, -0.5, -0.5, -0.5), random_4_0);
-          const __m256d xi_178 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(xi_119, xi_23), xi_246), xi_255), xi_265);
+          const __m256d xi_178 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(xi_119, xi_23), xi_253), xi_255), xi_256);
           const __m256d xi_179 = _mm256_mul_pd(xi_178, _mm256_set_pd(xi_134, xi_134, xi_134, xi_134));
           const __m256d xi_180 = _mm256_add_pd(_mm256_set_pd(-0.5, -0.5, -0.5, -0.5), random_5_0);
           const __m256d xi_182 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_128, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(xi_129, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), xi_130), xi_24), xi_5);
@@ -256,13 +211,13 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalizedavx_collidesweepdo
           const __m256d xi_31 = _mm256_mul_pd(xi_30, _mm256_set_pd(rr_0, rr_0, rr_0, rr_0));
           const __m256d xi_44 = _mm256_mul_pd(xi_43, _mm256_set_pd(rr_0, rr_0, rr_0, rr_0));
           const __m256d xi_51 = _mm256_mul_pd(xi_50, _mm256_set_pd(rr_0, rr_0, rr_0, rr_0));
-          const __m256d xi_54 = _mm256_mul_pd(xi_261, _mm256_set_pd(xi_53, xi_53, xi_53, xi_53));
-          const __m256d xi_59 = _mm256_mul_pd(xi_254, _mm256_set_pd(xi_53, xi_53, xi_53, xi_53));
+          const __m256d xi_54 = _mm256_mul_pd(xi_248, _mm256_set_pd(xi_53, xi_53, xi_53, xi_53));
+          const __m256d xi_59 = _mm256_mul_pd(xi_250, _mm256_set_pd(xi_53, xi_53, xi_53, xi_53));
           const __m256d xi_81 = _mm256_mul_pd(xi_259, _mm256_set_pd(xi_53, xi_53, xi_53, xi_53));
-          const __m256d vel0Term = _mm256_add_pd(_mm256_add_pd(xi_247, xi_257), xi_3);
-          const __m256d vel1Term = _mm256_add_pd(xi_252, xi_4);
-          const __m256d vel2Term = _mm256_add_pd(xi_256, xi_5);
-          const __m256d rho = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(vel0Term, vel1Term), vel2Term), xi_250), xi_253), xi_255), xi_258), xi_264), xi_6);
+          const __m256d vel0Term = _mm256_add_pd(_mm256_add_pd(xi_245, xi_247), xi_3);
+          const __m256d vel1Term = _mm256_add_pd(xi_254, xi_4);
+          const __m256d vel2Term = _mm256_add_pd(xi_252, xi_5);
+          const __m256d rho = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(vel0Term, vel1Term), vel2Term), xi_253), xi_257), xi_258), xi_260), xi_264), xi_6);
           const __m256d xi_105 = _mm256_mul_pd(rho, _mm256_set_pd(kT, kT, kT, kT));
           const __m256d xi_106 = _mm256_sqrt_pd(_mm256_mul_pd(xi_105, _mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(-1.0, -1.0, -1.0, -1.0), _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(-1.0, -1.0, -1.0, -1.0), _mm256_set_pd(omega_even, omega_even, omega_even, omega_even)), _mm256_set_pd(1.0, 1.0, 1.0, 1.0)), _mm256_add_pd(_mm256_mul_pd(_mm256_set_pd(-1.0, -1.0, -1.0, -1.0), _mm256_set_pd(omega_even, omega_even, omega_even, omega_even)), _mm256_set_pd(1.0, 1.0, 1.0, 1.0)))), _mm256_set_pd(1.0, 1.0, 1.0, 1.0))));
           const __m256d xi_107 = _mm256_mul_pd(_mm256_mul_pd(xi_106, _mm256_add_pd(_mm256_set_pd(-0.5, -0.5, -0.5, -0.5), random_6_0)), _mm256_set_pd(3.7416573867739413, 3.7416573867739413, 3.7416573867739413, 3.7416573867739413));
@@ -315,8 +270,8 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalizedavx_collidesweepdo
           const __m256d xi_242 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_214, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(xi_218, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), xi_212), xi_216);
           const __m256d xi_0 = _mm256_div_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), rho);
           const __m256d xi_7 = _mm256_mul_pd(xi_0, _mm256_set_pd(0.5, 0.5, 0.5, 0.5));
-          const __m256d u_0 = _mm256_add_pd(_mm256_mul_pd(xi_0, _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(vel0Term, xi_13), xi_8), xi_9)), _mm256_mul_pd(xi_261, xi_7));
-          const __m256d xi_25 = _mm256_mul_pd(u_0, xi_261);
+          const __m256d u_0 = _mm256_add_pd(_mm256_mul_pd(xi_0, _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(vel0Term, xi_13), xi_8), xi_9)), _mm256_mul_pd(xi_248, xi_7));
+          const __m256d xi_25 = _mm256_mul_pd(u_0, xi_248);
           const __m256d xi_37 = _mm256_mul_pd(xi_25, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666));
           const __m256d xi_38 = _mm256_mul_pd(xi_25, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329));
           const __m256d xi_39 = _mm256_mul_pd(xi_38, _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear));
@@ -332,17 +287,17 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalizedavx_collidesweepdo
           const __m256d xi_96 = _mm256_mul_pd(xi_95, _mm256_set_pd(0.25, 0.25, 0.25, 0.25));
           const __m256d xi_99 = _mm256_mul_pd(xi_95, _mm256_set_pd(xi_71, xi_71, xi_71, xi_71));
           const __m256d xi_113 = _mm256_mul_pd(rho, _mm256_mul_pd(u_0, u_0));
-          const __m256d u_1 = _mm256_add_pd(_mm256_mul_pd(xi_0, _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(vel1Term, xi_16), xi_19), xi_257), xi_8)), _mm256_mul_pd(xi_254, xi_7));
-          const __m256d xi_26 = _mm256_mul_pd(u_1, xi_254);
+          const __m256d u_1 = _mm256_add_pd(_mm256_mul_pd(xi_0, _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(vel1Term, xi_16), xi_19), xi_245), xi_8)), _mm256_mul_pd(xi_250, xi_7));
+          const __m256d xi_26 = _mm256_mul_pd(u_1, xi_250);
           const __m256d xi_32 = _mm256_mul_pd(xi_26, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666));
           const __m256d xi_45 = _mm256_mul_pd(xi_26, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329));
           const __m256d xi_46 = _mm256_mul_pd(xi_45, _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear));
           const __m256d xi_47 = _mm256_add_pd(_mm256_mul_pd(xi_32, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), xi_46);
           const __m256d xi_62 = _mm256_mul_pd(_mm256_mul_pd(xi_26, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_set_pd(xi_60, xi_60, xi_60, xi_60));
           const __m256d xi_69 = _mm256_mul_pd(u_1, _mm256_set_pd(0.25, 0.25, 0.25, 0.25));
-          const __m256d xi_70 = _mm256_mul_pd(xi_261, xi_69);
+          const __m256d xi_70 = _mm256_mul_pd(xi_248, xi_69);
           const __m256d xi_74 = _mm256_mul_pd(u_1, _mm256_set_pd(xi_71, xi_71, xi_71, xi_71));
-          const __m256d xi_75 = _mm256_mul_pd(xi_261, xi_74);
+          const __m256d xi_75 = _mm256_mul_pd(xi_248, xi_74);
           const __m256d xi_76 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_68, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(xi_70, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), xi_73), xi_75);
           const __m256d xi_78 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_73, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(xi_75, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), xi_68), xi_70);
           const __m256d xi_86 = _mm256_mul_pd(xi_259, xi_69);
@@ -351,10 +306,10 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalizedavx_collidesweepdo
           const __m256d xi_112 = _mm256_mul_pd(rho, _mm256_mul_pd(u_1, u_1));
           const __m256d xi_121 = _mm256_add_pd(_mm256_add_pd(xi_112, xi_120), xi_9);
           const __m256d xi_197 = _mm256_mul_pd(rho, u_1);
-          const __m256d xi_199 = _mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(u_0, xi_197), xi_120), xi_247), xi_252), _mm256_set_pd(xi_198, xi_198, xi_198, xi_198));
+          const __m256d xi_199 = _mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(u_0, xi_197), xi_120), xi_247), xi_254), _mm256_set_pd(xi_198, xi_198, xi_198, xi_198));
           const __m256d xi_200 = _mm256_add_pd(_mm256_mul_pd(xi_196, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(xi_199, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)));
           const __m256d xi_201 = _mm256_add_pd(xi_196, xi_199);
-          const __m256d u_2 = _mm256_add_pd(_mm256_mul_pd(xi_0, _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(vel2Term, xi_21), xi_24), xi_260)), _mm256_mul_pd(xi_259, xi_7));
+          const __m256d u_2 = _mm256_add_pd(_mm256_mul_pd(xi_0, _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(vel2Term, xi_21), xi_24), xi_251)), _mm256_mul_pd(xi_259, xi_7));
           const __m256d xi_27 = _mm256_mul_pd(u_2, xi_259);
           const __m256d xi_33 = _mm256_mul_pd(xi_27, _mm256_set_pd(0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666));
           const __m256d xi_34 = _mm256_mul_pd(xi_27, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329));
@@ -379,7 +334,7 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalizedavx_collidesweepdo
           const __m256d xi_91 = _mm256_add_pd(_mm256_add_pd(xi_39, xi_79), xi_84);
           const __m256d xi_92 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_86, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(xi_87, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), xi_88), xi_89);
           const __m256d xi_94 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(xi_46, xi_61), xi_62), xi_63), xi_83), xi_93);
-          const __m256d xi_97 = _mm256_mul_pd(u_2, xi_261);
+          const __m256d xi_97 = _mm256_mul_pd(u_2, xi_248);
           const __m256d xi_98 = _mm256_mul_pd(xi_97, _mm256_set_pd(0.25, 0.25, 0.25, 0.25));
           const __m256d xi_100 = _mm256_mul_pd(xi_97, _mm256_set_pd(xi_71, xi_71, xi_71, xi_71));
           const __m256d xi_101 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_96, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(xi_98, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), xi_100), xi_99);
@@ -387,21 +342,21 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalizedavx_collidesweepdo
           const __m256d xi_103 = _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_50, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), xi_81), xi_82);
           const __m256d xi_104 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(xi_103, xi_46), xi_61), xi_62), xi_63), xi_93);
           const __m256d xi_115 = _mm256_mul_pd(rho, _mm256_mul_pd(u_2, u_2));
-          const __m256d xi_116 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_245, _mm256_set_pd(3.0, 3.0, 3.0, 3.0)), _mm256_mul_pd(xi_265, _mm256_set_pd(3.0, 3.0, 3.0, 3.0))), _mm256_mul_pd(xi_115, _mm256_set_pd(0.66666666666666663, 0.66666666666666663, 0.66666666666666663, 0.66666666666666663))), xi_114);
-          const __m256d xi_117 = _mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_251, _mm256_set_pd(3.0, 3.0, 3.0, 3.0)), _mm256_mul_pd(xi_264, _mm256_set_pd(3.0, 3.0, 3.0, 3.0))), _mm256_mul_pd(xi_112, _mm256_set_pd(0.66666666666666663, 0.66666666666666663, 0.66666666666666663, 0.66666666666666663))), _mm256_mul_pd(xi_113, _mm256_set_pd(1.6666666666666667, 1.6666666666666667, 1.6666666666666667, 1.6666666666666667))), _mm256_mul_pd(xi_246, _mm256_set_pd(-3.0, -3.0, -3.0, -3.0))), _mm256_mul_pd(xi_248, _mm256_set_pd(-3.0, -3.0, -3.0, -3.0))), _mm256_mul_pd(xi_255, _mm256_set_pd(-3.0, -3.0, -3.0, -3.0))), _mm256_mul_pd(xi_263, _mm256_set_pd(-3.0, -3.0, -3.0, -3.0))), xi_116), _mm256_set_pd(omega_even, omega_even, omega_even, omega_even));
-          const __m256d xi_124 = _mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(xi_113, xi_115), xi_119), xi_121), xi_123), xi_17), xi_22), xi_253), _mm256_set_pd(omega_bulk, omega_bulk, omega_bulk, omega_bulk));
-          const __m256d xi_127 = _mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_112, _mm256_set_pd(2.3333333333333335, 2.3333333333333335, 2.3333333333333335, 2.3333333333333335)), _mm256_mul_pd(xi_251, _mm256_set_pd(-2.0, -2.0, -2.0, -2.0))), _mm256_mul_pd(xi_264, _mm256_set_pd(-2.0, -2.0, -2.0, -2.0))), _mm256_mul_pd(xi_249, _mm256_set_pd(-5.0, -5.0, -5.0, -5.0))), _mm256_mul_pd(xi_256, _mm256_set_pd(-5.0, -5.0, -5.0, -5.0))), _mm256_mul_pd(xi_260, _mm256_set_pd(-5.0, -5.0, -5.0, -5.0))), _mm256_mul_pd(xi_262, _mm256_set_pd(-5.0, -5.0, -5.0, -5.0))), xi_116), xi_126), _mm256_set_pd(omega_even, omega_even, omega_even, omega_even));
-          const __m256d xi_131 = _mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_115, _mm256_set_pd(3.0, 3.0, 3.0, 3.0)), _mm256_mul_pd(xi_251, _mm256_set_pd(5.0, 5.0, 5.0, 5.0))), _mm256_mul_pd(xi_264, _mm256_set_pd(5.0, 5.0, 5.0, 5.0))), _mm256_mul_pd(xi_245, _mm256_set_pd(-4.0, -4.0, -4.0, -4.0))), _mm256_mul_pd(xi_265, _mm256_set_pd(-4.0, -4.0, -4.0, -4.0))), _mm256_mul_pd(xi_247, _mm256_set_pd(-7.0, -7.0, -7.0, -7.0))), _mm256_mul_pd(xi_250, _mm256_set_pd(-7.0, -7.0, -7.0, -7.0))), _mm256_mul_pd(xi_252, _mm256_set_pd(-7.0, -7.0, -7.0, -7.0))), _mm256_mul_pd(xi_257, _mm256_set_pd(-7.0, -7.0, -7.0, -7.0))), xi_114), xi_126), xi_128), xi_129), xi_130), _mm256_set_pd(omega_even, omega_even, omega_even, omega_even));
-          const __m256d xi_156 = _mm256_add_pd(_mm256_mul_pd(xi_115, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), xi_265);
-          const __m256d xi_157 = _mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_251, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), xi_121), xi_156), xi_16), xi_2), xi_256), xi_6), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear));
+          const __m256d xi_116 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_256, _mm256_set_pd(3.0, 3.0, 3.0, 3.0)), _mm256_mul_pd(xi_262, _mm256_set_pd(3.0, 3.0, 3.0, 3.0))), _mm256_mul_pd(xi_115, _mm256_set_pd(0.66666666666666663, 0.66666666666666663, 0.66666666666666663, 0.66666666666666663))), xi_114);
+          const __m256d xi_117 = _mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_244, _mm256_set_pd(3.0, 3.0, 3.0, 3.0)), _mm256_mul_pd(xi_260, _mm256_set_pd(3.0, 3.0, 3.0, 3.0))), _mm256_mul_pd(xi_112, _mm256_set_pd(0.66666666666666663, 0.66666666666666663, 0.66666666666666663, 0.66666666666666663))), _mm256_mul_pd(xi_113, _mm256_set_pd(1.6666666666666667, 1.6666666666666667, 1.6666666666666667, 1.6666666666666667))), _mm256_mul_pd(xi_253, _mm256_set_pd(-3.0, -3.0, -3.0, -3.0))), _mm256_mul_pd(xi_255, _mm256_set_pd(-3.0, -3.0, -3.0, -3.0))), _mm256_mul_pd(xi_263, _mm256_set_pd(-3.0, -3.0, -3.0, -3.0))), _mm256_mul_pd(xi_265, _mm256_set_pd(-3.0, -3.0, -3.0, -3.0))), xi_116), _mm256_set_pd(omega_even, omega_even, omega_even, omega_even));
+          const __m256d xi_124 = _mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(xi_113, xi_115), xi_119), xi_121), xi_123), xi_17), xi_22), xi_264), _mm256_set_pd(omega_bulk, omega_bulk, omega_bulk, omega_bulk));
+          const __m256d xi_127 = _mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_112, _mm256_set_pd(2.3333333333333335, 2.3333333333333335, 2.3333333333333335, 2.3333333333333335)), _mm256_mul_pd(xi_244, _mm256_set_pd(-2.0, -2.0, -2.0, -2.0))), _mm256_mul_pd(xi_260, _mm256_set_pd(-2.0, -2.0, -2.0, -2.0))), _mm256_mul_pd(xi_246, _mm256_set_pd(-5.0, -5.0, -5.0, -5.0))), _mm256_mul_pd(xi_251, _mm256_set_pd(-5.0, -5.0, -5.0, -5.0))), _mm256_mul_pd(xi_252, _mm256_set_pd(-5.0, -5.0, -5.0, -5.0))), _mm256_mul_pd(xi_261, _mm256_set_pd(-5.0, -5.0, -5.0, -5.0))), xi_116), xi_126), _mm256_set_pd(omega_even, omega_even, omega_even, omega_even));
+          const __m256d xi_131 = _mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_115, _mm256_set_pd(3.0, 3.0, 3.0, 3.0)), _mm256_mul_pd(xi_244, _mm256_set_pd(5.0, 5.0, 5.0, 5.0))), _mm256_mul_pd(xi_260, _mm256_set_pd(5.0, 5.0, 5.0, 5.0))), _mm256_mul_pd(xi_256, _mm256_set_pd(-4.0, -4.0, -4.0, -4.0))), _mm256_mul_pd(xi_262, _mm256_set_pd(-4.0, -4.0, -4.0, -4.0))), _mm256_mul_pd(xi_245, _mm256_set_pd(-7.0, -7.0, -7.0, -7.0))), _mm256_mul_pd(xi_247, _mm256_set_pd(-7.0, -7.0, -7.0, -7.0))), _mm256_mul_pd(xi_254, _mm256_set_pd(-7.0, -7.0, -7.0, -7.0))), _mm256_mul_pd(xi_257, _mm256_set_pd(-7.0, -7.0, -7.0, -7.0))), xi_114), xi_126), xi_128), xi_129), xi_130), _mm256_set_pd(omega_even, omega_even, omega_even, omega_even));
+          const __m256d xi_156 = _mm256_add_pd(_mm256_mul_pd(xi_115, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), xi_256);
+          const __m256d xi_157 = _mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_244, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), xi_121), xi_156), xi_16), xi_2), xi_252), xi_6), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear));
           const __m256d xi_158 = _mm256_mul_pd(xi_157, _mm256_set_pd(0.125, 0.125, 0.125, 0.125));
           const __m256d xi_159 = _mm256_add_pd(_mm256_mul_pd(xi_131, _mm256_set_pd(-0.01984126984126984, -0.01984126984126984, -0.01984126984126984, -0.01984126984126984)), _mm256_mul_pd(xi_107, _mm256_set_pd(-0.11904761904761904, -0.11904761904761904, -0.11904761904761904, -0.11904761904761904)));
-          const __m256d xi_160 = _mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_112, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(xi_113, _mm256_set_pd(2.0, 2.0, 2.0, 2.0))), _mm256_mul_pd(xi_244, _mm256_set_pd(-2.0, -2.0, -2.0, -2.0))), _mm256_mul_pd(xi_258, _mm256_set_pd(-2.0, -2.0, -2.0, -2.0))), xi_120), xi_123), xi_125), xi_156), xi_245), xi_251), xi_264), xi_9), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear));
+          const __m256d xi_160 = _mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_113, _mm256_set_pd(2.0, 2.0, 2.0, 2.0)), _mm256_mul_pd(xi_112, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(xi_249, _mm256_set_pd(-2.0, -2.0, -2.0, -2.0))), _mm256_mul_pd(xi_258, _mm256_set_pd(-2.0, -2.0, -2.0, -2.0))), xi_120), xi_123), xi_125), xi_156), xi_244), xi_260), xi_262), xi_9), _mm256_set_pd(omega_shear, omega_shear, omega_shear, omega_shear));
           const __m256d xi_162 = _mm256_add_pd(_mm256_mul_pd(xi_160, _mm256_set_pd(-0.041666666666666664, -0.041666666666666664, -0.041666666666666664, -0.041666666666666664)), _mm256_mul_pd(xi_161, _mm256_set_pd(-0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666)));
           const __m256d xi_163 = _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_117, _mm256_set_pd(-0.050000000000000003, -0.050000000000000003, -0.050000000000000003, -0.050000000000000003)), _mm256_mul_pd(xi_108, _mm256_set_pd(-0.10000000000000001, -0.10000000000000001, -0.10000000000000001, -0.10000000000000001))), xi_162);
           const __m256d xi_164 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_127, _mm256_set_pd(0.014285714285714285, 0.014285714285714285, 0.014285714285714285, 0.014285714285714285)), _mm256_mul_pd(xi_111, _mm256_set_pd(0.028571428571428571, 0.028571428571428571, 0.028571428571428571, 0.028571428571428571))), xi_155), xi_158), xi_159), xi_163);
           const __m256d xi_176 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_160, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)), _mm256_mul_pd(xi_161, _mm256_set_pd(0.33333333333333331, 0.33333333333333331, 0.33333333333333331, 0.33333333333333331))), _mm256_mul_pd(xi_127, _mm256_set_pd(-0.035714285714285712, -0.035714285714285712, -0.035714285714285712, -0.035714285714285712))), _mm256_mul_pd(xi_111, _mm256_set_pd(-0.071428571428571425, -0.071428571428571425, -0.071428571428571425, -0.071428571428571425))), xi_159);
-          const __m256d xi_187 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_155, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(xi_158, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(xi_131, _mm256_set_pd(0.015873015873015872, 0.015873015873015872, 0.015873015873015872, 0.015873015873015872))), _mm256_mul_pd(xi_107, _mm256_set_pd(0.095238095238095233, 0.095238095238095233, 0.095238095238095233, 0.095238095238095233))), _mm256_mul_pd(xi_127, _mm256_set_pd(-0.021428571428571429, -0.021428571428571429, -0.021428571428571429, -0.021428571428571429))), _mm256_mul_pd(xi_111, _mm256_set_pd(-0.042857142857142858, -0.042857142857142858, -0.042857142857142858, -0.042857142857142858))), xi_163);
+          const __m256d xi_187 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_131, _mm256_set_pd(0.015873015873015872, 0.015873015873015872, 0.015873015873015872, 0.015873015873015872)), _mm256_mul_pd(xi_107, _mm256_set_pd(0.095238095238095233, 0.095238095238095233, 0.095238095238095233, 0.095238095238095233))), _mm256_mul_pd(xi_155, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(xi_158, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(xi_127, _mm256_set_pd(-0.021428571428571429, -0.021428571428571429, -0.021428571428571429, -0.021428571428571429))), _mm256_mul_pd(xi_111, _mm256_set_pd(-0.042857142857142858, -0.042857142857142858, -0.042857142857142858, -0.042857142857142858))), xi_163);
           const __m256d xi_190 = _mm256_mul_pd(xi_157, _mm256_set_pd(0.0625, 0.0625, 0.0625, 0.0625));
           const __m256d xi_191 = _mm256_mul_pd(xi_131, _mm256_set_pd(0.013888888888888888, 0.013888888888888888, 0.013888888888888888, 0.013888888888888888));
           const __m256d xi_193 = _mm256_add_pd(_mm256_mul_pd(xi_124, _mm256_set_pd(0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664)), _mm256_mul_pd(xi_110, _mm256_set_pd(0.083333333333333329, 0.083333333333333329, 0.083333333333333329, 0.083333333333333329)));
@@ -409,7 +364,7 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalizedavx_collidesweepdo
           const __m256d xi_195 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(xi_165, xi_189), xi_190), xi_191), xi_192), xi_194);
           const __m256d xi_202 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(xi_151, xi_189), xi_190), xi_191), xi_192), xi_194);
           const __m256d xi_204 = _mm256_mul_pd(xi_127, _mm256_set_pd(-0.0071428571428571426, -0.0071428571428571426, -0.0071428571428571426, -0.0071428571428571426));
-          const __m256d xi_205 = _mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(u_2, xi_197), xi_132), xi_17), xi_246), _mm256_set_pd(xi_198, xi_198, xi_198, xi_198));
+          const __m256d xi_205 = _mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(u_2, xi_197), xi_132), xi_17), xi_255), _mm256_set_pd(xi_198, xi_198, xi_198, xi_198));
           const __m256d xi_206 = _mm256_mul_pd(xi_117, _mm256_set_pd(0.025000000000000001, 0.025000000000000001, 0.025000000000000001, 0.025000000000000001));
           const __m256d xi_209 = _mm256_add_pd(_mm256_mul_pd(xi_131, _mm256_set_pd(-0.003968253968253968, -0.003968253968253968, -0.003968253968253968, -0.003968253968253968)), _mm256_mul_pd(xi_107, _mm256_set_pd(-0.023809523809523808, -0.023809523809523808, -0.023809523809523808, -0.023809523809523808)));
           const __m256d xi_210 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(xi_162, xi_193), xi_203), xi_204), xi_205), xi_206), xi_207), xi_208), xi_209);
@@ -417,7 +372,7 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalizedavx_collidesweepdo
           const __m256d xi_228 = _mm256_mul_pd(xi_190, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0));
           const __m256d xi_229 = _mm256_mul_pd(xi_127, _mm256_set_pd(0.017857142857142856, 0.017857142857142856, 0.017857142857142856, 0.017857142857142856));
           const __m256d xi_231 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(xi_188, xi_194), xi_209), xi_227), xi_228), xi_229), xi_230);
-          const __m256d xi_233 = _mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_mul_pd(rho, u_0), u_2), xi_10), xi_166), xi_249), _mm256_set_pd(xi_198, xi_198, xi_198, xi_198));
+          const __m256d xi_233 = _mm256_mul_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_mul_pd(rho, u_0), u_2), xi_10), xi_166), xi_246), _mm256_set_pd(xi_198, xi_198, xi_198, xi_198));
           const __m256d xi_234 = _mm256_add_pd(_mm256_mul_pd(xi_232, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(xi_233, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)));
           const __m256d xi_240 = _mm256_add_pd(xi_232, xi_233);
           const __m256d xi_243 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(xi_186, xi_194), xi_209), xi_227), xi_228), xi_229), xi_230);
@@ -440,49 +395,49 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalizedavx_collidesweepdo
           const __m256d forceTerm_16 = _mm256_add_pd(_mm256_add_pd(xi_103, xi_90), xi_91);
           const __m256d forceTerm_17 = _mm256_add_pd(_mm256_add_pd(xi_102, xi_104), xi_57);
           const __m256d forceTerm_18 = _mm256_add_pd(_mm256_add_pd(xi_101, xi_104), xi_77);
-          _mm256_store_pd(&_data_pdfs_20_30_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_110, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(xi_131, _mm256_set_pd(0.023809523809523808, 0.023809523809523808, 0.023809523809523808, 0.023809523809523808))), _mm256_mul_pd(xi_107, _mm256_set_pd(0.14285714285714285, 0.14285714285714285, 0.14285714285714285, 0.14285714285714285))), _mm256_mul_pd(xi_127, _mm256_set_pd(0.042857142857142858, 0.042857142857142858, 0.042857142857142858, 0.042857142857142858))), _mm256_mul_pd(xi_111, _mm256_set_pd(0.085714285714285715, 0.085714285714285715, 0.085714285714285715, 0.085714285714285715))), _mm256_mul_pd(xi_117, _mm256_set_pd(0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001))), _mm256_mul_pd(xi_108, _mm256_set_pd(0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001))), _mm256_mul_pd(xi_124, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), forceTerm_0), xi_253));
-          _mm256_store_pd(&_data_pdfs_20_31_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_135, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(xi_140, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), forceTerm_1), xi_151), xi_164), xi_251));
-          _mm256_store_pd(&_data_pdfs_20_32_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_2, xi_135), xi_140), xi_164), xi_165), xi_264));
-          _mm256_store_pd(&_data_pdfs_20_33_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_3, xi_168), xi_170), xi_175), xi_176), xi_258));
-          _mm256_store_pd(&_data_pdfs_20_34_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_168, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(xi_170, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), forceTerm_4), xi_176), xi_177), xi_244));
-          _mm256_store_pd(&_data_pdfs_20_35_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_179, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(xi_181, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), forceTerm_5), xi_186), xi_187), xi_265));
-          _mm256_store_pd(&_data_pdfs_20_36_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_6, xi_179), xi_181), xi_187), xi_188), xi_245));
-          _mm256_store_pd(&_data_pdfs_20_37_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_7, xi_177), xi_195), xi_200), xi_252));
-          _mm256_store_pd(&_data_pdfs_20_38_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_8, xi_175), xi_195), xi_201), xi_257));
-          _mm256_store_pd(&_data_pdfs_20_39_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_9, xi_177), xi_201), xi_202), xi_250));
-          _mm256_store_pd(&_data_pdfs_20_310_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_10, xi_175), xi_200), xi_202), xi_247));
-          _mm256_store_pd(&_data_pdfs_20_311_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_11, xi_210), xi_219), xi_224), xi_263));
-          _mm256_store_pd(&_data_pdfs_20_312_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_12, xi_219), xi_225), xi_226), xi_248));
-          _mm256_store_pd(&_data_pdfs_20_313_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_13, xi_231), xi_234), xi_239), xi_256));
-          _mm256_store_pd(&_data_pdfs_20_314_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_14, xi_231), xi_240), xi_241), xi_260));
-          _mm256_store_pd(&_data_pdfs_20_315_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_15, xi_224), xi_226), xi_242), xi_246));
-          _mm256_store_pd(&_data_pdfs_20_316_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_16, xi_210), xi_225), xi_242), xi_255));
-          _mm256_store_pd(&_data_pdfs_20_317_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_17, xi_239), xi_240), xi_243), xi_262));
-          _mm256_store_pd(&_data_pdfs_20_318_10[ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_18, xi_234), xi_241), xi_243), xi_249));
+          _mm256_store_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_131, _mm256_set_pd(0.023809523809523808, 0.023809523809523808, 0.023809523809523808, 0.023809523809523808)), _mm256_mul_pd(xi_107, _mm256_set_pd(0.14285714285714285, 0.14285714285714285, 0.14285714285714285, 0.14285714285714285))), _mm256_mul_pd(xi_127, _mm256_set_pd(0.042857142857142858, 0.042857142857142858, 0.042857142857142858, 0.042857142857142858))), _mm256_mul_pd(xi_111, _mm256_set_pd(0.085714285714285715, 0.085714285714285715, 0.085714285714285715, 0.085714285714285715))), _mm256_mul_pd(xi_117, _mm256_set_pd(0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001))), _mm256_mul_pd(xi_108, _mm256_set_pd(0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001))), _mm256_mul_pd(xi_124, _mm256_set_pd(-0.5, -0.5, -0.5, -0.5))), _mm256_mul_pd(xi_110, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), forceTerm_0), xi_264));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_135, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(xi_140, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), forceTerm_1), xi_151), xi_164), xi_244));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_2, xi_135), xi_140), xi_164), xi_165), xi_260));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_3, xi_168), xi_170), xi_175), xi_176), xi_258));
+          _mm256_store_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_168, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(xi_170, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), forceTerm_4), xi_176), xi_177), xi_249));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(xi_179, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(xi_181, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), forceTerm_5), xi_186), xi_187), xi_256));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_6, xi_179), xi_181), xi_187), xi_188), xi_262));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_7, xi_177), xi_195), xi_200), xi_254));
+          _mm256_store_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_8, xi_175), xi_195), xi_201), xi_245));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_9, xi_177), xi_201), xi_202), xi_257));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_10, xi_175), xi_200), xi_202), xi_247));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_11, xi_210), xi_219), xi_224), xi_263));
+          _mm256_store_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_12, xi_219), xi_225), xi_226), xi_265));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_13, xi_231), xi_234), xi_239), xi_252));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_14, xi_231), xi_240), xi_241), xi_251));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_15, xi_224), xi_226), xi_242), xi_255));
+          _mm256_store_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_16, xi_210), xi_225), xi_242), xi_253));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_17, xi_239), xi_240), xi_243), xi_261));
+          _mm256_storeu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3 + ctr_0], _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(forceTerm_18, xi_234), xi_241), xi_243), xi_246));
         }
         for (int64_t ctr_0 = (int64_t)((_size_force_0) / (4)) * (4); ctr_0 < _size_force_0; ctr_0 += 1) {
-          const double xi_244 = _data_pdfs_20_34_10[ctr_0];
-          const double xi_245 = _data_pdfs_20_36_10[ctr_0];
-          const double xi_246 = _data_pdfs_20_315_10[ctr_0];
-          const double xi_247 = _data_pdfs_20_310_10[ctr_0];
-          const double xi_248 = _data_pdfs_20_312_10[ctr_0];
-          const double xi_249 = _data_pdfs_20_318_10[ctr_0];
-          const double xi_250 = _data_pdfs_20_39_10[ctr_0];
-          const double xi_251 = _data_pdfs_20_31_10[ctr_0];
-          const double xi_252 = _data_pdfs_20_37_10[ctr_0];
-          const double xi_253 = _data_pdfs_20_30_10[ctr_0];
-          const double xi_254 = _data_force_20_31_10[ctr_0];
-          const double xi_255 = _data_pdfs_20_316_10[ctr_0];
-          const double xi_256 = _data_pdfs_20_313_10[ctr_0];
-          const double xi_257 = _data_pdfs_20_38_10[ctr_0];
-          const double xi_258 = _data_pdfs_20_33_10[ctr_0];
-          const double xi_259 = _data_force_20_32_10[ctr_0];
-          const double xi_260 = _data_pdfs_20_314_10[ctr_0];
-          const double xi_261 = _data_force_20_30_10[ctr_0];
-          const double xi_262 = _data_pdfs_20_317_10[ctr_0];
-          const double xi_263 = _data_pdfs_20_311_10[ctr_0];
-          const double xi_264 = _data_pdfs_20_32_10[ctr_0];
-          const double xi_265 = _data_pdfs_20_35_10[ctr_0];
+          const double xi_244 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3 + ctr_0];
+          const double xi_245 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3 + ctr_0];
+          const double xi_246 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3 + ctr_0];
+          const double xi_247 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3 + ctr_0];
+          const double xi_248 = _data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + ctr_0];
+          const double xi_249 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3 + ctr_0];
+          const double xi_250 = _data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3 + ctr_0];
+          const double xi_251 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3 + ctr_0];
+          const double xi_252 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3 + ctr_0];
+          const double xi_253 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3 + ctr_0];
+          const double xi_254 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3 + ctr_0];
+          const double xi_255 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3 + ctr_0];
+          const double xi_256 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3 + ctr_0];
+          const double xi_257 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3 + ctr_0];
+          const double xi_258 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3 + ctr_0];
+          const double xi_259 = _data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3 + ctr_0];
+          const double xi_260 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3 + ctr_0];
+          const double xi_261 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3 + ctr_0];
+          const double xi_262 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3 + ctr_0];
+          const double xi_263 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3 + ctr_0];
+          const double xi_264 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + ctr_0];
+          const double xi_265 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3 + ctr_0];
 
           double random_7_0{};
           double random_7_1{};
@@ -531,69 +486,69 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalizedavx_collidesweepdo
           if (kT > 0.) {
             philox_double2(time_step, block_offset_0 + ctr_0, block_offset_1 + ctr_1, block_offset_2 + ctr_2, 0, seed, random_0_0, random_0_1);
           }
-          const double xi_2 = xi_249 + xi_260;
-          const double xi_3 = xi_2 + xi_244;
-          const double xi_4 = xi_246 + xi_251 + xi_263;
-          const double xi_5 = xi_248 + xi_265;
-          const double xi_6 = xi_245 + xi_262;
-          const double xi_8 = xi_250 * -1.0;
-          const double xi_9 = xi_252 * -1.0;
-          const double xi_10 = xi_262 * -1.0;
-          const double xi_11 = xi_256 * -1.0;
-          const double xi_12 = xi_258 * -1.0;
+          const double xi_2 = xi_246 + xi_251;
+          const double xi_3 = xi_2 + xi_249;
+          const double xi_4 = xi_244 + xi_255 + xi_263;
+          const double xi_5 = xi_256 + xi_265;
+          const double xi_6 = xi_261 + xi_262;
+          const double xi_8 = -xi_257;
+          const double xi_9 = -xi_254;
+          const double xi_10 = -xi_261;
+          const double xi_11 = -xi_252;
+          const double xi_12 = -xi_258;
           const double xi_13 = xi_10 + xi_11 + xi_12;
-          const double xi_14 = xi_264 * -1.0;
-          const double xi_15 = xi_247 * -1.0;
+          const double xi_14 = -xi_260;
+          const double xi_15 = -xi_247;
           const double xi_16 = xi_14 + xi_15;
-          const double xi_17 = xi_255 * -1.0;
-          const double xi_18 = xi_248 * -1.0;
+          const double xi_17 = -xi_253;
+          const double xi_18 = -xi_265;
           const double xi_19 = xi_17 + xi_18;
-          const double xi_20 = xi_249 * -1.0;
+          const double xi_20 = -xi_246;
           const double xi_21 = xi_10 + xi_20;
-          const double xi_22 = xi_246 * -1.0;
-          const double xi_23 = xi_245 * -1.0;
+          const double xi_22 = -xi_255;
+          const double xi_23 = -xi_262;
           const double xi_24 = xi_17 + xi_22 + xi_23 + xi_263;
-          const double xi_29 = xi_254 * 0.16666666666666666;
-          const double xi_30 = xi_254 * 0.083333333333333329;
-          const double xi_42 = xi_261 * 0.16666666666666666;
-          const double xi_43 = xi_261 * 0.083333333333333329;
+          const double xi_29 = xi_250 * 0.16666666666666666;
+          const double xi_30 = xi_250 * 0.083333333333333329;
+          const double xi_42 = xi_248 * 0.16666666666666666;
+          const double xi_43 = xi_248 * 0.083333333333333329;
           const double xi_49 = xi_259 * 0.16666666666666666;
           const double xi_50 = xi_259 * 0.083333333333333329;
-          const double xi_67 = xi_254 * 0.25;
-          const double xi_72 = xi_254 * xi_71;
-          const double xi_114 = xi_253 * -1.0;
-          const double xi_118 = xi_263 * -1.0;
+          const double xi_67 = xi_250 * 0.25;
+          const double xi_72 = xi_250 * xi_71;
+          const double xi_114 = -xi_264;
+          const double xi_118 = -xi_263;
           const double xi_119 = xi_118 + xi_18;
-          const double xi_120 = xi_257 * -1.0 + xi_8;
-          const double xi_122 = xi_260 * -1.0;
+          const double xi_120 = -xi_245 + xi_8;
+          const double xi_122 = -xi_251;
           const double xi_123 = xi_11 + xi_122 + xi_15 + xi_21;
-          const double xi_125 = xi_246 * 2.0 + xi_248 * 2.0 + xi_255 * 2.0 + xi_263 * 2.0;
-          const double xi_126 = xi_125 + xi_244 * 5.0 + xi_258 * 5.0;
-          const double xi_128 = xi_256 * 2.0;
-          const double xi_129 = xi_260 * 2.0;
-          const double xi_130 = xi_249 * 2.0 + xi_262 * 2.0;
-          const double xi_132 = xi_118 + xi_248;
-          const double xi_133 = xi_132 + xi_14 + xi_22 + xi_251 + xi_255;
+          const double xi_125 = xi_253 * 2.0 + xi_255 * 2.0 + xi_263 * 2.0 + xi_265 * 2.0;
+          const double xi_126 = xi_125 + xi_249 * 5.0 + xi_258 * 5.0;
+          const double xi_128 = xi_252 * 2.0;
+          const double xi_129 = xi_251 * 2.0;
+          const double xi_130 = xi_246 * 2.0 + xi_261 * 2.0;
+          const double xi_132 = xi_118 + xi_265;
+          const double xi_133 = xi_132 + xi_14 + xi_22 + xi_244 + xi_253;
           const double xi_135 = xi_133 * xi_134;
           const double xi_136 = random_5_1 - 0.5;
-          const double xi_141 = xi_252 * 2.0;
+          const double xi_141 = xi_254 * 2.0;
           const double xi_142 = xi_247 * 2.0;
-          const double xi_143 = xi_250 * 2.0 + xi_257 * -2.0;
-          const double xi_144 = xi_14 + xi_141 * -1.0 + xi_142 + xi_143 + xi_19 + xi_4;
+          const double xi_143 = xi_245 * -2.0 + xi_257 * 2.0;
+          const double xi_144 = xi_14 - xi_141 + xi_142 + xi_143 + xi_19 + xi_4;
           const double xi_146 = xi_144 * xi_145;
           const double xi_147 = random_3_0 - 0.5;
           const double xi_152 = random_0_1 - 0.5;
-          const double xi_166 = xi_122 + xi_256;
-          const double xi_167 = xi_12 + xi_166 + xi_20 + xi_244 + xi_262;
+          const double xi_166 = xi_122 + xi_252;
+          const double xi_167 = xi_12 + xi_166 + xi_20 + xi_249 + xi_261;
           const double xi_168 = xi_134 * xi_167;
           const double xi_169 = random_4_1 - 0.5;
-          const double xi_171 = xi_13 + xi_141 + xi_142 * -1.0 + xi_143 + xi_3;
+          const double xi_171 = xi_13 + xi_141 - xi_142 + xi_143 + xi_3;
           const double xi_172 = xi_145 * xi_171;
           const double xi_173 = random_4_0 - 0.5;
-          const double xi_178 = xi_119 + xi_23 + xi_246 + xi_255 + xi_265;
+          const double xi_178 = xi_119 + xi_23 + xi_253 + xi_255 + xi_256;
           const double xi_179 = xi_134 * xi_178;
           const double xi_180 = random_5_0 - 0.5;
-          const double xi_182 = xi_128 * -1.0 + xi_129 * -1.0 + xi_130 + xi_24 + xi_5;
+          const double xi_182 = -xi_128 - xi_129 + xi_130 + xi_24 + xi_5;
           const double xi_183 = xi_145 * xi_182;
           const double xi_184 = random_3_1 - 0.5;
           const double xi_212 = xi_182 * xi_211;
@@ -605,28 +560,28 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalizedavx_collidesweepdo
           const double xi_31 = rr_0 * xi_30;
           const double xi_44 = rr_0 * xi_43;
           const double xi_51 = rr_0 * xi_50;
-          const double xi_54 = xi_261 * xi_53;
-          const double xi_59 = xi_254 * xi_53;
+          const double xi_54 = xi_248 * xi_53;
+          const double xi_59 = xi_250 * xi_53;
           const double xi_81 = xi_259 * xi_53;
-          const double vel0Term = xi_247 + xi_257 + xi_3;
-          const double vel1Term = xi_252 + xi_4;
-          const double vel2Term = xi_256 + xi_5;
-          const double rho = vel0Term + vel1Term + vel2Term + xi_250 + xi_253 + xi_255 + xi_258 + xi_264 + xi_6;
+          const double vel0Term = xi_245 + xi_247 + xi_3;
+          const double vel1Term = xi_254 + xi_4;
+          const double vel2Term = xi_252 + xi_5;
+          const double rho = vel0Term + vel1Term + vel2Term + xi_253 + xi_257 + xi_258 + xi_260 + xi_264 + xi_6;
           const double xi_105 = kT * rho;
-          const double xi_106 = pow(xi_105 * (-1.0 * (omega_even * -1.0 + 1.0) * (omega_even * -1.0 + 1.0) + 1.0), 0.5);
+          const double xi_106 = pow(xi_105 * (1.0 - ((-omega_even + 1.0) * (-omega_even + 1.0))), 0.5);
           const double xi_107 = xi_106 * (random_6_0 - 0.5) * 3.7416573867739413;
           const double xi_108 = xi_106 * (random_7_0 - 0.5) * 5.4772255750516612;
-          const double xi_110 = xi_109 * (random_2_1 - 0.5) * pow(xi_105 * (-1.0 * (omega_bulk * -1.0 + 1.0) * (omega_bulk * -1.0 + 1.0) + 1.0), 0.5);
+          const double xi_110 = xi_109 * (random_2_1 - 0.5) * pow(xi_105 * (1.0 - ((-omega_bulk + 1.0) * (-omega_bulk + 1.0))), 0.5);
           const double xi_111 = xi_106 * (random_6_1 - 0.5) * 8.3666002653407556;
-          const double xi_137 = pow(xi_105 * (-1.0 * (omega_odd * -1.0 + 1.0) * (omega_odd * -1.0 + 1.0) + 1.0), 0.5);
+          const double xi_137 = pow(xi_105 * (1.0 - ((-omega_odd + 1.0) * (-omega_odd + 1.0))), 0.5);
           const double xi_138 = xi_137 * 1.4142135623730951;
           const double xi_139 = xi_138 * 0.5;
           const double xi_140 = xi_136 * xi_139;
           const double xi_148 = xi_109 * xi_137;
           const double xi_149 = xi_148 * 0.16666666666666666;
           const double xi_150 = xi_147 * xi_149;
-          const double xi_151 = xi_146 * -1.0 + xi_150 * -1.0;
-          const double xi_153 = pow(xi_105 * (-1.0 * (omega_shear * -1.0 + 1.0) * (omega_shear * -1.0 + 1.0) + 1.0), 0.5);
+          const double xi_151 = -xi_146 - xi_150;
+          const double xi_153 = pow(xi_105 * (1.0 - ((-omega_shear + 1.0) * (-omega_shear + 1.0))), 0.5);
           const double xi_154 = xi_153 * 0.5;
           const double xi_155 = xi_152 * xi_154;
           const double xi_161 = xi_153 * (random_0_0 - 0.5) * 1.7320508075688772;
@@ -634,10 +589,10 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalizedavx_collidesweepdo
           const double xi_170 = xi_139 * xi_169;
           const double xi_174 = xi_149 * xi_173;
           const double xi_175 = xi_172 + xi_174;
-          const double xi_177 = xi_172 * -1.0 + xi_174 * -1.0;
+          const double xi_177 = -xi_172 - xi_174;
           const double xi_181 = xi_139 * xi_180;
           const double xi_185 = xi_149 * xi_184;
-          const double xi_186 = xi_183 * -1.0 + xi_185 * -1.0;
+          const double xi_186 = -xi_183 - xi_185;
           const double xi_188 = xi_183 + xi_185;
           const double xi_189 = xi_152 * xi_153 * 0.25;
           const double xi_192 = xi_107 * 0.083333333333333329;
@@ -649,108 +604,108 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalizedavx_collidesweepdo
           const double xi_216 = xi_184 * xi_215;
           const double xi_217 = xi_138 * 0.25;
           const double xi_218 = xi_180 * xi_217;
-          const double xi_219 = xi_212 * -1.0 + xi_214 + xi_216 * -1.0 + xi_218;
+          const double xi_219 = -xi_212 + xi_214 - xi_216 + xi_218;
           const double xi_222 = xi_147 * xi_215;
           const double xi_223 = xi_136 * xi_217;
-          const double xi_224 = xi_220 * -1.0 + xi_221 + xi_222 * -1.0 + xi_223;
-          const double xi_225 = xi_220 + xi_221 * -1.0 + xi_222 + xi_223 * -1.0;
-          const double xi_227 = xi_189 * -1.0;
+          const double xi_224 = -xi_220 + xi_221 - xi_222 + xi_223;
+          const double xi_225 = xi_220 - xi_221 + xi_222 - xi_223;
+          const double xi_227 = -xi_189;
           const double xi_230 = xi_111 * 0.035714285714285712;
           const double xi_232 = xi_154 * (random_1_1 - 0.5);
           const double xi_237 = xi_169 * xi_217;
           const double xi_238 = xi_173 * xi_215;
-          const double xi_239 = xi_235 * -1.0 + xi_236 + xi_237 * -1.0 + xi_238;
-          const double xi_241 = xi_235 + xi_236 * -1.0 + xi_237 + xi_238 * -1.0;
-          const double xi_242 = xi_212 + xi_214 * -1.0 + xi_216 + xi_218 * -1.0;
+          const double xi_239 = -xi_235 + xi_236 - xi_237 + xi_238;
+          const double xi_241 = xi_235 - xi_236 + xi_237 - xi_238;
+          const double xi_242 = xi_212 - xi_214 + xi_216 - xi_218;
           const double xi_0 = ((1.0) / (rho));
           const double xi_7 = xi_0 * 0.5;
-          const double u_0 = xi_0 * (vel0Term + xi_13 + xi_8 + xi_9) + xi_261 * xi_7;
-          const double xi_25 = u_0 * xi_261;
+          const double u_0 = xi_0 * (vel0Term + xi_13 + xi_8 + xi_9) + xi_248 * xi_7;
+          const double xi_25 = u_0 * xi_248;
           const double xi_37 = xi_25 * 0.16666666666666666;
           const double xi_38 = xi_25 * 0.083333333333333329;
           const double xi_39 = omega_shear * xi_38;
-          const double xi_40 = xi_37 * -1.0 + xi_39;
-          const double xi_56 = xi_25 * xi_55 * -1.0 + xi_37;
-          const double xi_57 = xi_43 * -1.0 + xi_54 + xi_56;
-          const double xi_61 = xi_25 * xi_60 * -1.0;
+          const double xi_40 = -xi_37 + xi_39;
+          const double xi_56 = -xi_25 * xi_55 + xi_37;
+          const double xi_57 = -xi_43 + xi_54 + xi_56;
+          const double xi_61 = -xi_25 * xi_60;
           const double xi_68 = u_0 * xi_67;
           const double xi_73 = u_0 * xi_72;
-          const double xi_77 = xi_43 + xi_54 * -1.0 + xi_56;
-          const double xi_84 = xi_38 * -1.0;
+          const double xi_77 = xi_43 - xi_54 + xi_56;
+          const double xi_84 = -xi_38;
           const double xi_95 = u_0 * xi_259;
           const double xi_96 = xi_95 * 0.25;
           const double xi_99 = xi_71 * xi_95;
-          const double xi_113 = rho * u_0 * u_0;
-          const double u_1 = xi_0 * (vel1Term + xi_16 + xi_19 + xi_257 + xi_8) + xi_254 * xi_7;
-          const double xi_26 = u_1 * xi_254;
+          const double xi_113 = rho * (u_0 * u_0);
+          const double u_1 = xi_0 * (vel1Term + xi_16 + xi_19 + xi_245 + xi_8) + xi_250 * xi_7;
+          const double xi_26 = u_1 * xi_250;
           const double xi_32 = xi_26 * 0.16666666666666666;
           const double xi_45 = xi_26 * 0.083333333333333329;
           const double xi_46 = omega_shear * xi_45;
-          const double xi_47 = xi_32 * -1.0 + xi_46;
-          const double xi_62 = xi_26 * xi_60 * -1.0;
+          const double xi_47 = -xi_32 + xi_46;
+          const double xi_62 = -xi_26 * xi_60;
           const double xi_69 = u_1 * 0.25;
-          const double xi_70 = xi_261 * xi_69;
+          const double xi_70 = xi_248 * xi_69;
           const double xi_74 = u_1 * xi_71;
-          const double xi_75 = xi_261 * xi_74;
-          const double xi_76 = xi_68 * -1.0 + xi_70 * -1.0 + xi_73 + xi_75;
-          const double xi_78 = xi_68 + xi_70 + xi_73 * -1.0 + xi_75 * -1.0;
+          const double xi_75 = xi_248 * xi_74;
+          const double xi_76 = -xi_68 - xi_70 + xi_73 + xi_75;
+          const double xi_78 = xi_68 + xi_70 - xi_73 - xi_75;
           const double xi_86 = xi_259 * xi_69;
           const double xi_88 = xi_259 * xi_74;
-          const double xi_93 = xi_45 * -1.0;
-          const double xi_112 = rho * u_1 * u_1;
+          const double xi_93 = -xi_45;
+          const double xi_112 = rho * (u_1 * u_1);
           const double xi_121 = xi_112 + xi_120 + xi_9;
           const double xi_197 = rho * u_1;
-          const double xi_199 = xi_198 * (u_0 * xi_197 + xi_120 + xi_247 + xi_252);
-          const double xi_200 = xi_196 * -1.0 + xi_199 * -1.0;
+          const double xi_199 = xi_198 * (u_0 * xi_197 + xi_120 + xi_247 + xi_254);
+          const double xi_200 = -xi_196 - xi_199;
           const double xi_201 = xi_196 + xi_199;
-          const double u_2 = xi_0 * (vel2Term + xi_21 + xi_24 + xi_260) + xi_259 * xi_7;
+          const double u_2 = xi_0 * (vel2Term + xi_21 + xi_24 + xi_251) + xi_259 * xi_7;
           const double xi_27 = u_2 * xi_259;
           const double xi_33 = xi_27 * 0.16666666666666666;
           const double xi_34 = xi_27 * 0.083333333333333329;
           const double xi_35 = omega_shear * xi_34;
-          const double xi_36 = xi_33 * -1.0 + xi_35;
-          const double xi_41 = omega_shear * xi_32 * -1.0 + xi_26 * 0.33333333333333331 + xi_36 + xi_40;
-          const double xi_48 = omega_shear * xi_37 * -1.0 + xi_25 * 0.33333333333333331 + xi_36 + xi_47;
-          const double xi_52 = omega_shear * xi_33 * -1.0 + xi_27 * 0.33333333333333331 + xi_40 + xi_47;
-          const double xi_58 = xi_34 * -1.0;
-          const double xi_63 = xi_27 * xi_60 * -1.0;
-          const double xi_64 = xi_26 * xi_55 * -1.0 + xi_32 + xi_61 + xi_62 + xi_63;
-          const double xi_65 = xi_30 + xi_59 * -1.0 + xi_64;
+          const double xi_36 = -xi_33 + xi_35;
+          const double xi_41 = -omega_shear * xi_32 + xi_26 * 0.33333333333333331 + xi_36 + xi_40;
+          const double xi_48 = -omega_shear * xi_37 + xi_25 * 0.33333333333333331 + xi_36 + xi_47;
+          const double xi_52 = -omega_shear * xi_33 + xi_27 * 0.33333333333333331 + xi_40 + xi_47;
+          const double xi_58 = -xi_34;
+          const double xi_63 = -xi_27 * xi_60;
+          const double xi_64 = -xi_26 * xi_55 + xi_32 + xi_61 + xi_62 + xi_63;
+          const double xi_65 = xi_30 - xi_59 + xi_64;
           const double xi_66 = xi_35 + xi_58 + xi_65;
-          const double xi_79 = xi_30 * -1.0 + xi_59 + xi_64;
+          const double xi_79 = -xi_30 + xi_59 + xi_64;
           const double xi_80 = xi_35 + xi_58 + xi_79;
-          const double xi_82 = xi_27 * xi_55 * -1.0 + xi_33;
-          const double xi_83 = xi_50 + xi_81 * -1.0 + xi_82;
+          const double xi_82 = -xi_27 * xi_55 + xi_33;
+          const double xi_83 = xi_50 - xi_81 + xi_82;
           const double xi_85 = xi_39 + xi_65 + xi_84;
           const double xi_87 = u_2 * xi_67;
           const double xi_89 = u_2 * xi_72;
-          const double xi_90 = xi_86 + xi_87 + xi_88 * -1.0 + xi_89 * -1.0;
+          const double xi_90 = xi_86 + xi_87 - xi_88 - xi_89;
           const double xi_91 = xi_39 + xi_79 + xi_84;
-          const double xi_92 = xi_86 * -1.0 + xi_87 * -1.0 + xi_88 + xi_89;
+          const double xi_92 = -xi_86 - xi_87 + xi_88 + xi_89;
           const double xi_94 = xi_46 + xi_61 + xi_62 + xi_63 + xi_83 + xi_93;
-          const double xi_97 = u_2 * xi_261;
+          const double xi_97 = u_2 * xi_248;
           const double xi_98 = xi_97 * 0.25;
           const double xi_100 = xi_71 * xi_97;
-          const double xi_101 = xi_100 + xi_96 * -1.0 + xi_98 * -1.0 + xi_99;
-          const double xi_102 = xi_100 * -1.0 + xi_96 + xi_98 + xi_99 * -1.0;
-          const double xi_103 = xi_50 * -1.0 + xi_81 + xi_82;
+          const double xi_101 = xi_100 - xi_96 - xi_98 + xi_99;
+          const double xi_102 = -xi_100 + xi_96 + xi_98 - xi_99;
+          const double xi_103 = -xi_50 + xi_81 + xi_82;
           const double xi_104 = xi_103 + xi_46 + xi_61 + xi_62 + xi_63 + xi_93;
-          const double xi_115 = rho * u_2 * u_2;
-          const double xi_116 = xi_114 + xi_115 * 0.66666666666666663 + xi_245 * 3.0 + xi_265 * 3.0;
-          const double xi_117 = omega_even * (xi_112 * 0.66666666666666663 + xi_113 * 1.6666666666666667 + xi_116 + xi_246 * -3.0 + xi_248 * -3.0 + xi_251 * 3.0 + xi_255 * -3.0 + xi_263 * -3.0 + xi_264 * 3.0);
-          const double xi_124 = omega_bulk * (xi_113 + xi_115 + xi_119 + xi_121 + xi_123 + xi_17 + xi_22 + xi_253);
-          const double xi_127 = omega_even * (xi_112 * 2.3333333333333335 + xi_116 + xi_126 + xi_249 * -5.0 + xi_251 * -2.0 + xi_256 * -5.0 + xi_260 * -5.0 + xi_262 * -5.0 + xi_264 * -2.0);
-          const double xi_131 = omega_even * (xi_114 + xi_115 * 3.0 + xi_126 + xi_128 + xi_129 + xi_130 + xi_245 * -4.0 + xi_247 * -7.0 + xi_250 * -7.0 + xi_251 * 5.0 + xi_252 * -7.0 + xi_257 * -7.0 + xi_264 * 5.0 + xi_265 * -4.0);
-          const double xi_156 = xi_115 * -1.0 + xi_265;
-          const double xi_157 = omega_shear * (xi_121 + xi_156 + xi_16 + xi_2 + xi_251 * -1.0 + xi_256 + xi_6);
+          const double xi_115 = rho * (u_2 * u_2);
+          const double xi_116 = xi_114 + xi_115 * 0.66666666666666663 + xi_256 * 3.0 + xi_262 * 3.0;
+          const double xi_117 = omega_even * (xi_112 * 0.66666666666666663 + xi_113 * 1.6666666666666667 + xi_116 + xi_244 * 3.0 + xi_253 * -3.0 + xi_255 * -3.0 + xi_260 * 3.0 + xi_263 * -3.0 + xi_265 * -3.0);
+          const double xi_124 = omega_bulk * (xi_113 + xi_115 + xi_119 + xi_121 + xi_123 + xi_17 + xi_22 + xi_264);
+          const double xi_127 = omega_even * (xi_112 * 2.3333333333333335 + xi_116 + xi_126 + xi_244 * -2.0 + xi_246 * -5.0 + xi_251 * -5.0 + xi_252 * -5.0 + xi_260 * -2.0 + xi_261 * -5.0);
+          const double xi_131 = omega_even * (xi_114 + xi_115 * 3.0 + xi_126 + xi_128 + xi_129 + xi_130 + xi_244 * 5.0 + xi_245 * -7.0 + xi_247 * -7.0 + xi_254 * -7.0 + xi_256 * -4.0 + xi_257 * -7.0 + xi_260 * 5.0 + xi_262 * -4.0);
+          const double xi_156 = -xi_115 + xi_256;
+          const double xi_157 = omega_shear * (xi_121 + xi_156 + xi_16 + xi_2 - xi_244 + xi_252 + xi_6);
           const double xi_158 = xi_157 * 0.125;
           const double xi_159 = xi_107 * -0.11904761904761904 + xi_131 * -0.01984126984126984;
-          const double xi_160 = omega_shear * (xi_112 * -1.0 + xi_113 * 2.0 + xi_120 + xi_123 + xi_125 + xi_156 + xi_244 * -2.0 + xi_245 + xi_251 + xi_258 * -2.0 + xi_264 + xi_9);
+          const double xi_160 = omega_shear * (-xi_112 + xi_113 * 2.0 + xi_120 + xi_123 + xi_125 + xi_156 + xi_244 + xi_249 * -2.0 + xi_258 * -2.0 + xi_260 + xi_262 + xi_9);
           const double xi_162 = xi_160 * -0.041666666666666664 + xi_161 * -0.16666666666666666;
           const double xi_163 = xi_108 * -0.10000000000000001 + xi_117 * -0.050000000000000003 + xi_162;
           const double xi_164 = xi_111 * 0.028571428571428571 + xi_127 * 0.014285714285714285 + xi_155 + xi_158 + xi_159 + xi_163;
           const double xi_176 = xi_111 * -0.071428571428571425 + xi_127 * -0.035714285714285712 + xi_159 + xi_160 * 0.083333333333333329 + xi_161 * 0.33333333333333331;
-          const double xi_187 = xi_107 * 0.095238095238095233 + xi_111 * -0.042857142857142858 + xi_127 * -0.021428571428571429 + xi_131 * 0.015873015873015872 + xi_155 * -1.0 + xi_158 * -1.0 + xi_163;
+          const double xi_187 = xi_107 * 0.095238095238095233 + xi_111 * -0.042857142857142858 + xi_127 * -0.021428571428571429 + xi_131 * 0.015873015873015872 - xi_155 - xi_158 + xi_163;
           const double xi_190 = xi_157 * 0.0625;
           const double xi_191 = xi_131 * 0.013888888888888888;
           const double xi_193 = xi_110 * 0.083333333333333329 + xi_124 * 0.041666666666666664;
@@ -758,25 +713,25 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalizedavx_collidesweepdo
           const double xi_195 = xi_165 + xi_189 + xi_190 + xi_191 + xi_192 + xi_194;
           const double xi_202 = xi_151 + xi_189 + xi_190 + xi_191 + xi_192 + xi_194;
           const double xi_204 = xi_127 * -0.0071428571428571426;
-          const double xi_205 = xi_198 * (u_2 * xi_197 + xi_132 + xi_17 + xi_246);
+          const double xi_205 = xi_198 * (u_2 * xi_197 + xi_132 + xi_17 + xi_255);
           const double xi_206 = xi_117 * 0.025000000000000001;
           const double xi_209 = xi_107 * -0.023809523809523808 + xi_131 * -0.003968253968253968;
           const double xi_210 = xi_162 + xi_193 + xi_203 + xi_204 + xi_205 + xi_206 + xi_207 + xi_208 + xi_209;
-          const double xi_226 = xi_162 + xi_193 + xi_203 * -1.0 + xi_204 + xi_205 * -1.0 + xi_206 + xi_207 + xi_208 + xi_209;
-          const double xi_228 = xi_190 * -1.0;
+          const double xi_226 = xi_162 + xi_193 - xi_203 + xi_204 - xi_205 + xi_206 + xi_207 + xi_208 + xi_209;
+          const double xi_228 = -xi_190;
           const double xi_229 = xi_127 * 0.017857142857142856;
           const double xi_231 = xi_188 + xi_194 + xi_209 + xi_227 + xi_228 + xi_229 + xi_230;
-          const double xi_233 = xi_198 * (rho * u_0 * u_2 + xi_10 + xi_166 + xi_249);
-          const double xi_234 = xi_232 * -1.0 + xi_233 * -1.0;
+          const double xi_233 = xi_198 * (rho * u_0 * u_2 + xi_10 + xi_166 + xi_246);
+          const double xi_234 = -xi_232 - xi_233;
           const double xi_240 = xi_232 + xi_233;
           const double xi_243 = xi_186 + xi_194 + xi_209 + xi_227 + xi_228 + xi_229 + xi_230;
-          const double forceTerm_0 = xi_25 * xi_28 + xi_25 * -1.0 + xi_26 * xi_28 + xi_26 * -1.0 + xi_27 * xi_28 + xi_27 * -1.0;
-          const double forceTerm_1 = xi_29 + xi_31 * -1.0 + xi_41;
-          const double forceTerm_2 = xi_29 * -1.0 + xi_31 + xi_41;
-          const double forceTerm_3 = xi_42 * -1.0 + xi_44 + xi_48;
-          const double forceTerm_4 = xi_42 + xi_44 * -1.0 + xi_48;
-          const double forceTerm_5 = xi_49 + xi_51 * -1.0 + xi_52;
-          const double forceTerm_6 = xi_49 * -1.0 + xi_51 + xi_52;
+          const double forceTerm_0 = xi_25 * xi_28 - xi_25 + xi_26 * xi_28 - xi_26 + xi_27 * xi_28 - xi_27;
+          const double forceTerm_1 = xi_29 - xi_31 + xi_41;
+          const double forceTerm_2 = -xi_29 + xi_31 + xi_41;
+          const double forceTerm_3 = -xi_42 + xi_44 + xi_48;
+          const double forceTerm_4 = xi_42 - xi_44 + xi_48;
+          const double forceTerm_5 = xi_49 - xi_51 + xi_52;
+          const double forceTerm_6 = -xi_49 + xi_51 + xi_52;
           const double forceTerm_7 = xi_57 + xi_66 + xi_76;
           const double forceTerm_8 = xi_66 + xi_77 + xi_78;
           const double forceTerm_9 = xi_57 + xi_78 + xi_80;
@@ -789,25 +744,25 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalizedavx_collidesweepdo
           const double forceTerm_16 = xi_103 + xi_90 + xi_91;
           const double forceTerm_17 = xi_102 + xi_104 + xi_57;
           const double forceTerm_18 = xi_101 + xi_104 + xi_77;
-          _data_pdfs_20_30_10[ctr_0] = forceTerm_0 + xi_107 * 0.14285714285714285 + xi_108 * 0.20000000000000001 + xi_110 * -1.0 + xi_111 * 0.085714285714285715 + xi_117 * 0.10000000000000001 + xi_124 * -0.5 + xi_127 * 0.042857142857142858 + xi_131 * 0.023809523809523808 + xi_253;
-          _data_pdfs_20_31_10[ctr_0] = forceTerm_1 + xi_135 * -1.0 + xi_140 * -1.0 + xi_151 + xi_164 + xi_251;
-          _data_pdfs_20_32_10[ctr_0] = forceTerm_2 + xi_135 + xi_140 + xi_164 + xi_165 + xi_264;
-          _data_pdfs_20_33_10[ctr_0] = forceTerm_3 + xi_168 + xi_170 + xi_175 + xi_176 + xi_258;
-          _data_pdfs_20_34_10[ctr_0] = forceTerm_4 + xi_168 * -1.0 + xi_170 * -1.0 + xi_176 + xi_177 + xi_244;
-          _data_pdfs_20_35_10[ctr_0] = forceTerm_5 + xi_179 * -1.0 + xi_181 * -1.0 + xi_186 + xi_187 + xi_265;
-          _data_pdfs_20_36_10[ctr_0] = forceTerm_6 + xi_179 + xi_181 + xi_187 + xi_188 + xi_245;
-          _data_pdfs_20_37_10[ctr_0] = forceTerm_7 + xi_177 + xi_195 + xi_200 + xi_252;
-          _data_pdfs_20_38_10[ctr_0] = forceTerm_8 + xi_175 + xi_195 + xi_201 + xi_257;
-          _data_pdfs_20_39_10[ctr_0] = forceTerm_9 + xi_177 + xi_201 + xi_202 + xi_250;
-          _data_pdfs_20_310_10[ctr_0] = forceTerm_10 + xi_175 + xi_200 + xi_202 + xi_247;
-          _data_pdfs_20_311_10[ctr_0] = forceTerm_11 + xi_210 + xi_219 + xi_224 + xi_263;
-          _data_pdfs_20_312_10[ctr_0] = forceTerm_12 + xi_219 + xi_225 + xi_226 + xi_248;
-          _data_pdfs_20_313_10[ctr_0] = forceTerm_13 + xi_231 + xi_234 + xi_239 + xi_256;
-          _data_pdfs_20_314_10[ctr_0] = forceTerm_14 + xi_231 + xi_240 + xi_241 + xi_260;
-          _data_pdfs_20_315_10[ctr_0] = forceTerm_15 + xi_224 + xi_226 + xi_242 + xi_246;
-          _data_pdfs_20_316_10[ctr_0] = forceTerm_16 + xi_210 + xi_225 + xi_242 + xi_255;
-          _data_pdfs_20_317_10[ctr_0] = forceTerm_17 + xi_239 + xi_240 + xi_243 + xi_262;
-          _data_pdfs_20_318_10[ctr_0] = forceTerm_18 + xi_234 + xi_241 + xi_243 + xi_249;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + ctr_0] = forceTerm_0 + xi_107 * 0.14285714285714285 + xi_108 * 0.20000000000000001 - xi_110 + xi_111 * 0.085714285714285715 + xi_117 * 0.10000000000000001 + xi_124 * -0.5 + xi_127 * 0.042857142857142858 + xi_131 * 0.023809523809523808 + xi_264;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3 + ctr_0] = forceTerm_1 - xi_135 - xi_140 + xi_151 + xi_164 + xi_244;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3 + ctr_0] = forceTerm_2 + xi_135 + xi_140 + xi_164 + xi_165 + xi_260;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3 + ctr_0] = forceTerm_3 + xi_168 + xi_170 + xi_175 + xi_176 + xi_258;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3 + ctr_0] = forceTerm_4 - xi_168 - xi_170 + xi_176 + xi_177 + xi_249;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3 + ctr_0] = forceTerm_5 - xi_179 - xi_181 + xi_186 + xi_187 + xi_256;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3 + ctr_0] = forceTerm_6 + xi_179 + xi_181 + xi_187 + xi_188 + xi_262;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3 + ctr_0] = forceTerm_7 + xi_177 + xi_195 + xi_200 + xi_254;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3 + ctr_0] = forceTerm_8 + xi_175 + xi_195 + xi_201 + xi_245;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3 + ctr_0] = forceTerm_9 + xi_177 + xi_201 + xi_202 + xi_257;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3 + ctr_0] = forceTerm_10 + xi_175 + xi_200 + xi_202 + xi_247;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3 + ctr_0] = forceTerm_11 + xi_210 + xi_219 + xi_224 + xi_263;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3 + ctr_0] = forceTerm_12 + xi_219 + xi_225 + xi_226 + xi_265;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3 + ctr_0] = forceTerm_13 + xi_231 + xi_234 + xi_239 + xi_252;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3 + ctr_0] = forceTerm_14 + xi_231 + xi_240 + xi_241 + xi_251;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3 + ctr_0] = forceTerm_15 + xi_224 + xi_226 + xi_242 + xi_255;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3 + ctr_0] = forceTerm_16 + xi_210 + xi_225 + xi_242 + xi_253;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3 + ctr_0] = forceTerm_17 + xi_239 + xi_240 + xi_243 + xi_261;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3 + ctr_0] = forceTerm_18 + xi_234 + xi_241 + xi_243 + xi_246;
         }
       }
     }
@@ -816,40 +771,42 @@ static FUNC_PREFIX void collidesweepdoubleprecisionthermalizedavx_collidesweepdo
 } // namespace internal_25bc51f30ec2c20f3ee9796f7dcb65c6
 
 void CollideSweepDoublePrecisionThermalizedAVX::run(IBlock *block) {
+  if (!this->configured_)
+    WALBERLA_ABORT("This Sweep contains a configure function that needs to be called manually")
+
   auto force = block->getData<field::GhostLayerField<double, 3>>(forceID);
   auto pdfs = block->getData<field::GhostLayerField<double, 19>>(pdfsID);
 
+  auto &time_step = this->time_step_;
+  auto &block_offset_1 = this->block_offset_1_;
+  auto &omega_even = this->omega_even_;
   auto &omega_bulk = this->omega_bulk_;
-  auto block_offset_1 = this->block_offset_1_;
   auto &seed = this->seed_;
-  auto &omega_even = this->omega_even_;
   auto &kT = this->kT_;
-  auto &omega_odd = this->omega_odd_;
-  auto block_offset_2 = this->block_offset_2_;
-  auto &time_step = this->time_step_;
-  auto block_offset_0 = this->block_offset_0_;
   auto &omega_shear = this->omega_shear_;
-  block_offset_generator(block, block_offset_0, block_offset_1, block_offset_2);
-  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()));
+  auto &omega_odd = this->omega_odd_;
+  auto &block_offset_0 = this->block_offset_0_;
+  auto &block_offset_2 = this->block_offset_2_;
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()))
   double *RESTRICT const _data_force = force->dataAt(0, 0, 0, 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()))
   double *RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(force->xSize()) + 0));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(force->xSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(force->ySize()) + 0));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(force->ySize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(force->zSize()) + 0));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(force->zSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(force->xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(force->xSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(force->ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(force->ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(force->zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(force->zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
   const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
@@ -860,6 +817,9 @@ void CollideSweepDoublePrecisionThermalizedAVX::run(IBlock *block) {
 }
 
 void CollideSweepDoublePrecisionThermalizedAVX::runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block) {
+  if (!this->configured_)
+    WALBERLA_ABORT("This Sweep contains a configure function that needs to be called manually")
+
   CellInterval ci = globalCellInterval;
   CellInterval blockBB = blocks->getBlockCellBB(*block);
   blockBB.expand(ghostLayers);
@@ -871,41 +831,40 @@ void CollideSweepDoublePrecisionThermalizedAVX::runOnCellInterval(const shared_p
   auto force = block->getData<field::GhostLayerField<double, 3>>(forceID);
   auto pdfs = block->getData<field::GhostLayerField<double, 19>>(pdfsID);
 
+  auto &time_step = this->time_step_;
+  auto &block_offset_1 = this->block_offset_1_;
+  auto &omega_even = this->omega_even_;
   auto &omega_bulk = this->omega_bulk_;
-  auto block_offset_1 = this->block_offset_1_;
   auto &seed = this->seed_;
-  auto &omega_even = this->omega_even_;
   auto &kT = this->kT_;
-  auto &omega_odd = this->omega_odd_;
-  auto block_offset_2 = this->block_offset_2_;
-  auto &time_step = this->time_step_;
-  auto block_offset_0 = this->block_offset_0_;
   auto &omega_shear = this->omega_shear_;
-  block_offset_generator(block, block_offset_0, block_offset_1, block_offset_2);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()));
+  auto &omega_odd = this->omega_odd_;
+  auto &block_offset_0 = this->block_offset_0_;
+  auto &block_offset_2 = this->block_offset_2_;
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()))
   double *RESTRICT const _data_force = force->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()))
   double *RESTRICT _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(ci.xSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(ci.ySize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(ci.zSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(ci.xSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(ci.ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(ci.zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
   const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
@@ -924,4 +883,4 @@ void CollideSweepDoublePrecisionThermalizedAVX::runOnCellInterval(const shared_p
 
 #if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
 #pragma warning pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedAVX.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedAVX.h
index f822ebf2955..00c0575773c 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedAVX.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedAVX.h
@@ -17,12 +17,13 @@
 //! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2,
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
 // lbmpy_walberla/pystencils_walberla from waLBerla commit
-// 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #pragma once
 #include "core/DataTypes.h"
+#include "core/logging/Logging.h"
 
 #include "domain_decomposition/BlockDataID.h"
 #include "domain_decomposition/IBlock.h"
@@ -51,16 +52,16 @@ namespace pystencils {
 
 class CollideSweepDoublePrecisionThermalizedAVX {
 public:
-  CollideSweepDoublePrecisionThermalizedAVX(
-      BlockDataID forceID_, BlockDataID pdfsID_, uint32_t block_offset_0,
-      uint32_t block_offset_1, uint32_t block_offset_2, double kT,
-      double omega_bulk, double omega_even, double omega_odd,
-      double omega_shear, uint32_t seed, uint32_t time_step)
-      : forceID(forceID_), pdfsID(pdfsID_), block_offset_0_(block_offset_0),
-        block_offset_1_(block_offset_1), block_offset_2_(block_offset_2),
-        kT_(kT), omega_bulk_(omega_bulk), omega_even_(omega_even),
-        omega_odd_(omega_odd), omega_shear_(omega_shear), seed_(seed),
-        time_step_(time_step){};
+  CollideSweepDoublePrecisionThermalizedAVX(BlockDataID forceID_, // stores field IDs and scalar parameters only
+                                            BlockDataID pdfsID_, double kT,
+                                            double omega_bulk,
+                                            double omega_even, double omega_odd,
+                                            double omega_shear, uint32_t seed,
+                                            uint32_t time_step)
+      : forceID(forceID_), pdfsID(pdfsID_), kT_(kT), omega_bulk_(omega_bulk), // block_offset_{0,1,2}_ are not initialised here; configure() assigns them
+        omega_even_(omega_even), omega_odd_(omega_odd),
+        omega_shear_(omega_shear), seed_(seed), time_step_(time_step),
+        configured_(false){}; // stays false until configure() has been called
 
   void run(IBlock *block);
 
@@ -97,6 +98,15 @@ class CollideSweepDoublePrecisionThermalizedAVX {
     };
   }
 
+  void configure(const shared_ptr<StructuredBlockStorage> &blocks, // sets the three block offsets from the given block's position and marks the sweep configured
+                 IBlock *block) {
+    Cell BlockCellBB = blocks->getBlockCellBB(*block).min(); // min() corner of the cell bounding box that `blocks` reports for `block`
+    block_offset_0_ = uint32_t(BlockCellBB[0]); // NOTE(review): cast to unsigned assumes the coordinate is non-negative - confirm
+    block_offset_1_ = uint32_t(BlockCellBB[1]);
+    block_offset_2_ = uint32_t(BlockCellBB[2]);
+    configured_ = true; // run() and runOnCellInterval() call WALBERLA_ABORT while this flag is false
+  }
+
   BlockDataID forceID;
   BlockDataID pdfsID;
   uint32_t block_offset_0_;
@@ -109,9 +119,7 @@ class CollideSweepDoublePrecisionThermalizedAVX {
   double omega_shear_;
   uint32_t seed_;
   uint32_t time_step_;
-  std::function<void(IBlock *, uint32_t &, uint32_t &, uint32_t &)>
-      block_offset_generator =
-          [](IBlock *const, uint32_t &, uint32_t &, uint32_t &) {};
+  bool configured_;
 };
 
 } // namespace pystencils
@@ -120,4 +128,4 @@ class CollideSweepDoublePrecisionThermalizedAVX {
 #if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
     (defined WALBERLA_CXX_COMPILER_IS_CLANG)
 #pragma GCC diagnostic pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedCUDA.cu b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedCUDA.cu
new file mode 100644
index 00000000000..c0f39f81582
--- /dev/null
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedCUDA.cu
@@ -0,0 +1,530 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file CollideSweepDoublePrecisionThermalizedCUDA.cu
+//! \author pystencils
+//======================================================================================================================
+
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
+
+#include <cmath>
+
+#include "CollideSweepDoublePrecisionThermalizedCUDA.h"
+#include "core/DataTypes.h"
+#include "core/Macros.h"
+
+#include "philox_rand.h"
+
+#define FUNC_PREFIX __global__
+// FUNC_PREFIX above qualifies the generated kernel definition; the pragmas below save the GCC/Clang diagnostic state and silence four warnings for the generated code.
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) || (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wfloat-equal"
+#pragma GCC diagnostic ignored "-Wshadow"
+#pragma GCC diagnostic ignored "-Wconversion"
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#endif
+// Intel compilers: save the warning state and disable diagnostic 1599.
+#if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
+#pragma warning push
+#pragma warning(disable : 1599)
+#endif
+
+using namespace std;
+
+namespace walberla {
+namespace pystencils {
+
+namespace internal_collidesweepdoubleprecisionthermalizedcuda_collidesweepdoubleprecisionthermalizedcuda {
+static FUNC_PREFIX __launch_bounds__(256) void collidesweepdoubleprecisionthermalizedcuda_collidesweepdoubleprecisionthermalizedcuda(double *RESTRICT const _data_force, double *RESTRICT _data_pdfs, int64_t const _size_force_0, int64_t const _size_force_1, int64_t const _size_force_2, int64_t const _stride_force_0, int64_t const _stride_force_1, int64_t const _stride_force_2, int64_t const _stride_force_3, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, uint32_t block_offset_0, uint32_t block_offset_1, uint32_t block_offset_2, double kT, double omega_bulk, double omega_even, double omega_odd, double omega_shear, uint32_t seed, uint32_t time_step) { // Generated kernel: each thread handles one cell, reading its 19 pdfs entries and 3 force entries, computing updated values (with random contributions when kT > 0) and storing the 19 pdfs entries back in place.
+  if (blockDim.x * blockIdx.x + threadIdx.x < _size_force_0 && blockDim.y * blockIdx.y + threadIdx.y < _size_force_1 && blockDim.z * blockIdx.z + threadIdx.z < _size_force_2) { // threads whose index lies outside the _size_force_* extent do nothing
+    const int64_t ctr_0 = blockDim.x * blockIdx.x + threadIdx.x;
+    const int64_t ctr_1 = blockDim.y * blockIdx.y + threadIdx.y;
+    const int64_t ctr_2 = blockDim.z * blockIdx.z + threadIdx.z;
+    const double xi_244 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2]; // xi_244..xi_265: the pdfs entries at f-offsets 0..18 (multiples of _stride_pdfs_3) and the force entries at f-offsets 0..2 for this cell
+    const double xi_245 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3];
+    const double xi_246 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3];
+    const double xi_247 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3];
+    const double xi_248 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3];
+    const double xi_249 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3];
+    const double xi_250 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3];
+    const double xi_251 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3];
+    const double xi_252 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3];
+    const double xi_253 = _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3];
+    const double xi_254 = _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2];
+    const double xi_255 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3];
+    const double xi_256 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3];
+    const double xi_257 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3];
+    const double xi_258 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3];
+    const double xi_259 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3];
+    const double xi_260 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3];
+    const double xi_261 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3];
+    const double xi_262 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3];
+    const double xi_263 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3];
+    const double xi_264 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3];
+    const double xi_265 = _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3];
+    // Eight philox_double2 calls (counter arguments 7..0) fill sixteen doubles from time_step, the cell index shifted by block_offset_*, and seed; the doubles stay zero-initialized unless kT > 0.
+    double random_7_0{};
+    double random_7_1{};
+    if (kT > 0.) {
+      philox_double2(time_step, block_offset_0 + ctr_0, block_offset_1 + ctr_1, block_offset_2 + ctr_2, 7, seed, random_7_0, random_7_1);
+    }
+
+    double random_6_0{};
+    double random_6_1{};
+    if (kT > 0.) {
+      philox_double2(time_step, block_offset_0 + ctr_0, block_offset_1 + ctr_1, block_offset_2 + ctr_2, 6, seed, random_6_0, random_6_1);
+    }
+
+    double random_5_0{};
+    double random_5_1{};
+    if (kT > 0.) {
+      philox_double2(time_step, block_offset_0 + ctr_0, block_offset_1 + ctr_1, block_offset_2 + ctr_2, 5, seed, random_5_0, random_5_1);
+    }
+
+    double random_4_0{};
+    double random_4_1{};
+    if (kT > 0.) {
+      philox_double2(time_step, block_offset_0 + ctr_0, block_offset_1 + ctr_1, block_offset_2 + ctr_2, 4, seed, random_4_0, random_4_1);
+    }
+
+    double random_3_0{};
+    double random_3_1{};
+    if (kT > 0.) {
+      philox_double2(time_step, block_offset_0 + ctr_0, block_offset_1 + ctr_1, block_offset_2 + ctr_2, 3, seed, random_3_0, random_3_1);
+    }
+
+    double random_2_0{};
+    double random_2_1{};
+    if (kT > 0.) {
+      philox_double2(time_step, block_offset_0 + ctr_0, block_offset_1 + ctr_1, block_offset_2 + ctr_2, 2, seed, random_2_0, random_2_1);
+    }
+
+    double random_1_0{};
+    double random_1_1{};
+    if (kT > 0.) {
+      philox_double2(time_step, block_offset_0 + ctr_0, block_offset_1 + ctr_1, block_offset_2 + ctr_2, 1, seed, random_1_0, random_1_1);
+    }
+
+    double random_0_0{};
+    double random_0_1{};
+    if (kT > 0.) {
+      philox_double2(time_step, block_offset_0 + ctr_0, block_offset_1 + ctr_1, block_offset_2 + ctr_2, 0, seed, random_0_0, random_0_1);
+    }
+    const double xi_2 = xi_245 + xi_249;
+    const double xi_3 = xi_2 + xi_260;
+    const double xi_4 = xi_251 + xi_258 + xi_261;
+    const double xi_5 = xi_247 + xi_257;
+    const double xi_6 = xi_263 + xi_264;
+    const double xi_8 = -xi_248;
+    const double xi_9 = -xi_255;
+    const double xi_10 = -xi_263;
+    const double xi_11 = -xi_246;
+    const double xi_12 = -xi_262;
+    const double xi_13 = xi_10 + xi_11 + xi_12;
+    const double xi_14 = -xi_252;
+    const double xi_15 = -xi_259;
+    const double xi_16 = xi_14 + xi_15;
+    const double xi_17 = -xi_256;
+    const double xi_18 = -xi_257;
+    const double xi_19 = xi_17 + xi_18;
+    const double xi_20 = -xi_245;
+    const double xi_21 = xi_10 + xi_20;
+    const double xi_22 = -xi_258;
+    const double xi_23 = -xi_264;
+    const double xi_24 = xi_17 + xi_22 + xi_23 + xi_251;
+    const double xi_28 = omega_bulk * 0.5;
+    const double xi_29 = xi_265 * 0.16666666666666666;
+    const double xi_30 = xi_265 * 0.083333333333333329;
+    const double xi_42 = xi_254 * 0.16666666666666666;
+    const double xi_43 = xi_254 * 0.083333333333333329;
+    const double xi_49 = xi_253 * 0.16666666666666666;
+    const double xi_50 = xi_253 * 0.083333333333333329;
+    const double xi_55 = omega_shear * 0.041666666666666664;
+    const double xi_60 = omega_bulk * 0.041666666666666664;
+    const double xi_67 = xi_265 * 0.25;
+    const double xi_71 = omega_shear * 0.125;
+    const double xi_72 = xi_265 * xi_71;
+    const double xi_109 = 2.4494897427831779; // numerically sqrt(6)
+    const double xi_114 = -xi_244;
+    const double xi_118 = -xi_251;
+    const double xi_119 = xi_118 + xi_18;
+    const double xi_120 = -xi_250 + xi_8;
+    const double xi_122 = -xi_249;
+    const double xi_123 = xi_11 + xi_122 + xi_15 + xi_21;
+    const double xi_125 = xi_251 * 2.0 + xi_256 * 2.0 + xi_257 * 2.0 + xi_258 * 2.0;
+    const double xi_126 = xi_125 + xi_260 * 5.0 + xi_262 * 5.0;
+    const double xi_128 = xi_246 * 2.0;
+    const double xi_129 = xi_249 * 2.0;
+    const double xi_130 = xi_245 * 2.0 + xi_263 * 2.0;
+    const double xi_132 = xi_118 + xi_257;
+    const double xi_133 = xi_132 + xi_14 + xi_22 + xi_256 + xi_261;
+    const double xi_134 = omega_odd * 0.25;
+    const double xi_135 = xi_133 * xi_134;
+    const double xi_136 = random_5_1 - 0.5;
+    const double xi_141 = xi_255 * 2.0;
+    const double xi_142 = xi_259 * 2.0;
+    const double xi_143 = xi_248 * 2.0 + xi_250 * -2.0;
+    const double xi_144 = xi_14 - xi_141 + xi_142 + xi_143 + xi_19 + xi_4;
+    const double xi_145 = omega_odd * 0.083333333333333329;
+    const double xi_146 = xi_144 * xi_145;
+    const double xi_147 = random_3_0 - 0.5;
+    const double xi_152 = random_0_1 - 0.5;
+    const double xi_166 = xi_122 + xi_246;
+    const double xi_167 = xi_12 + xi_166 + xi_20 + xi_260 + xi_263;
+    const double xi_168 = xi_134 * xi_167;
+    const double xi_169 = random_4_1 - 0.5;
+    const double xi_171 = xi_13 + xi_141 - xi_142 + xi_143 + xi_3;
+    const double xi_172 = xi_145 * xi_171;
+    const double xi_173 = random_4_0 - 0.5;
+    const double xi_178 = xi_119 + xi_23 + xi_247 + xi_256 + xi_258;
+    const double xi_179 = xi_134 * xi_178;
+    const double xi_180 = random_5_0 - 0.5;
+    const double xi_182 = -xi_128 - xi_129 + xi_130 + xi_24 + xi_5;
+    const double xi_183 = xi_145 * xi_182;
+    const double xi_184 = random_3_1 - 0.5;
+    const double xi_198 = omega_shear * 0.25;
+    const double xi_211 = omega_odd * 0.041666666666666664;
+    const double xi_212 = xi_182 * xi_211;
+    const double xi_213 = omega_odd * 0.125;
+    const double xi_214 = xi_178 * xi_213;
+    const double xi_220 = xi_144 * xi_211;
+    const double xi_221 = xi_133 * xi_213;
+    const double xi_235 = xi_167 * xi_213;
+    const double xi_236 = xi_171 * xi_211;
+    const double rr_0 = 0.0; // constant zero factor multiplied into xi_31, xi_44, xi_51 and xi_53 below
+    const double xi_31 = rr_0 * xi_30;
+    const double xi_44 = rr_0 * xi_43;
+    const double xi_51 = rr_0 * xi_50;
+    const double xi_53 = rr_0 * 0.041666666666666664;
+    const double xi_54 = xi_254 * xi_53;
+    const double xi_59 = xi_265 * xi_53;
+    const double xi_81 = xi_253 * xi_53;
+    const double vel0Term = xi_250 + xi_259 + xi_3;
+    const double vel1Term = xi_255 + xi_4;
+    const double vel2Term = xi_246 + xi_5;
+    const double rho = vel0Term + vel1Term + vel2Term + xi_244 + xi_248 + xi_252 + xi_256 + xi_262 + xi_6; // sum of all 19 pdfs entries loaded above
+    const double xi_105 = kT * rho;
+    const double xi_106 = pow(xi_105 * (1.0 - (-omega_even + 1.0) * (-omega_even + 1.0)), 0.5); // square root of kT * rho * (1 - (1 - omega_even)^2); xi_110, xi_137 and xi_153 use the same form with omega_bulk, omega_odd and omega_shear
+    const double xi_107 = xi_106 * (random_6_0 - 0.5) * 3.7416573867739413; // constant is numerically sqrt(14)
+    const double xi_108 = xi_106 * (random_7_0 - 0.5) * 5.4772255750516612; // constant is numerically sqrt(30)
+    const double xi_110 = xi_109 * (random_2_1 - 0.5) * pow(xi_105 * (1.0 - (-omega_bulk + 1.0) * (-omega_bulk + 1.0)), 0.5);
+    const double xi_111 = xi_106 * (random_6_1 - 0.5) * 8.3666002653407556; // constant is numerically sqrt(70)
+    const double xi_137 = pow(xi_105 * (1.0 - (-omega_odd + 1.0) * (-omega_odd + 1.0)), 0.5);
+    const double xi_138 = xi_137 * 1.4142135623730951; // constant is numerically sqrt(2)
+    const double xi_139 = xi_138 * 0.5;
+    const double xi_140 = xi_136 * xi_139;
+    const double xi_148 = xi_109 * xi_137;
+    const double xi_149 = xi_148 * 0.16666666666666666;
+    const double xi_150 = xi_147 * xi_149;
+    const double xi_151 = -xi_146 - xi_150;
+    const double xi_153 = pow(xi_105 * (1.0 - (-omega_shear + 1.0) * (-omega_shear + 1.0)), 0.5);
+    const double xi_154 = xi_153 * 0.5;
+    const double xi_155 = xi_152 * xi_154;
+    const double xi_161 = xi_153 * (random_0_0 - 0.5) * 1.7320508075688772; // constant is numerically sqrt(3)
+    const double xi_165 = xi_146 + xi_150;
+    const double xi_170 = xi_139 * xi_169;
+    const double xi_174 = xi_149 * xi_173;
+    const double xi_175 = xi_172 + xi_174;
+    const double xi_177 = -xi_172 - xi_174;
+    const double xi_181 = xi_139 * xi_180;
+    const double xi_185 = xi_149 * xi_184;
+    const double xi_186 = -xi_183 - xi_185;
+    const double xi_188 = xi_183 + xi_185;
+    const double xi_189 = xi_152 * xi_153 * 0.25;
+    const double xi_192 = xi_107 * 0.083333333333333329;
+    const double xi_196 = xi_154 * (random_1_0 - 0.5);
+    const double xi_203 = xi_154 * (random_2_0 - 0.5);
+    const double xi_207 = xi_111 * -0.014285714285714285;
+    const double xi_208 = xi_108 * 0.050000000000000003;
+    const double xi_215 = xi_148 * 0.083333333333333329;
+    const double xi_216 = xi_184 * xi_215;
+    const double xi_217 = xi_138 * 0.25;
+    const double xi_218 = xi_180 * xi_217;
+    const double xi_219 = -xi_212 + xi_214 - xi_216 + xi_218;
+    const double xi_222 = xi_147 * xi_215;
+    const double xi_223 = xi_136 * xi_217;
+    const double xi_224 = -xi_220 + xi_221 - xi_222 + xi_223;
+    const double xi_225 = xi_220 - xi_221 + xi_222 - xi_223;
+    const double xi_227 = -xi_189;
+    const double xi_230 = xi_111 * 0.035714285714285712;
+    const double xi_232 = xi_154 * (random_1_1 - 0.5);
+    const double xi_237 = xi_169 * xi_217;
+    const double xi_238 = xi_173 * xi_215;
+    const double xi_239 = -xi_235 + xi_236 - xi_237 + xi_238;
+    const double xi_241 = xi_235 - xi_236 + xi_237 - xi_238;
+    const double xi_242 = xi_212 - xi_214 + xi_216 - xi_218;
+    const double xi_0 = ((1.0) / (rho)); // reciprocal of rho; NOTE(review): no guard against rho == 0 is visible in this kernel
+    const double xi_7 = xi_0 * 0.5;
+    const double u_0 = xi_0 * (vel0Term + xi_13 + xi_8 + xi_9) + xi_254 * xi_7; // signed pdfs sum divided by rho, plus 0.5 * (force entry 0) / rho
+    const double xi_25 = u_0 * xi_254;
+    const double xi_37 = xi_25 * 0.16666666666666666;
+    const double xi_38 = xi_25 * 0.083333333333333329;
+    const double xi_39 = omega_shear * xi_38;
+    const double xi_40 = -xi_37 + xi_39;
+    const double xi_56 = -xi_25 * xi_55 + xi_37;
+    const double xi_57 = -xi_43 + xi_54 + xi_56;
+    const double xi_61 = -xi_25 * xi_60;
+    const double xi_68 = u_0 * xi_67;
+    const double xi_73 = u_0 * xi_72;
+    const double xi_77 = xi_43 - xi_54 + xi_56;
+    const double xi_84 = -xi_38;
+    const double xi_95 = u_0 * xi_253;
+    const double xi_96 = xi_95 * 0.25;
+    const double xi_99 = xi_71 * xi_95;
+    const double xi_113 = rho * (u_0 * u_0);
+    const double u_1 = xi_0 * (vel1Term + xi_16 + xi_19 + xi_250 + xi_8) + xi_265 * xi_7; // same form as u_0, using force entry 1
+    const double xi_26 = u_1 * xi_265;
+    const double xi_32 = xi_26 * 0.16666666666666666;
+    const double xi_45 = xi_26 * 0.083333333333333329;
+    const double xi_46 = omega_shear * xi_45;
+    const double xi_47 = -xi_32 + xi_46;
+    const double xi_62 = -xi_26 * xi_60;
+    const double xi_69 = u_1 * 0.25;
+    const double xi_70 = xi_254 * xi_69;
+    const double xi_74 = u_1 * xi_71;
+    const double xi_75 = xi_254 * xi_74;
+    const double xi_76 = -xi_68 - xi_70 + xi_73 + xi_75;
+    const double xi_78 = xi_68 + xi_70 - xi_73 - xi_75;
+    const double xi_86 = xi_253 * xi_69;
+    const double xi_88 = xi_253 * xi_74;
+    const double xi_93 = -xi_45;
+    const double xi_112 = rho * (u_1 * u_1);
+    const double xi_121 = xi_112 + xi_120 + xi_9;
+    const double xi_197 = rho * u_1;
+    const double xi_199 = xi_198 * (u_0 * xi_197 + xi_120 + xi_255 + xi_259);
+    const double xi_200 = -xi_196 - xi_199;
+    const double xi_201 = xi_196 + xi_199;
+    const double u_2 = xi_0 * (vel2Term + xi_21 + xi_24 + xi_249) + xi_253 * xi_7; // same form as u_0, using force entry 2
+    const double xi_27 = u_2 * xi_253;
+    const double xi_33 = xi_27 * 0.16666666666666666;
+    const double xi_34 = xi_27 * 0.083333333333333329;
+    const double xi_35 = omega_shear * xi_34;
+    const double xi_36 = -xi_33 + xi_35;
+    const double xi_41 = -omega_shear * xi_32 + xi_26 * 0.33333333333333331 + xi_36 + xi_40;
+    const double xi_48 = -omega_shear * xi_37 + xi_25 * 0.33333333333333331 + xi_36 + xi_47;
+    const double xi_52 = -omega_shear * xi_33 + xi_27 * 0.33333333333333331 + xi_40 + xi_47;
+    const double xi_58 = -xi_34;
+    const double xi_63 = -xi_27 * xi_60;
+    const double xi_64 = -xi_26 * xi_55 + xi_32 + xi_61 + xi_62 + xi_63;
+    const double xi_65 = xi_30 - xi_59 + xi_64;
+    const double xi_66 = xi_35 + xi_58 + xi_65;
+    const double xi_79 = -xi_30 + xi_59 + xi_64;
+    const double xi_80 = xi_35 + xi_58 + xi_79;
+    const double xi_82 = -xi_27 * xi_55 + xi_33;
+    const double xi_83 = xi_50 - xi_81 + xi_82;
+    const double xi_85 = xi_39 + xi_65 + xi_84;
+    const double xi_87 = u_2 * xi_67;
+    const double xi_89 = u_2 * xi_72;
+    const double xi_90 = xi_86 + xi_87 - xi_88 - xi_89;
+    const double xi_91 = xi_39 + xi_79 + xi_84;
+    const double xi_92 = -xi_86 - xi_87 + xi_88 + xi_89;
+    const double xi_94 = xi_46 + xi_61 + xi_62 + xi_63 + xi_83 + xi_93;
+    const double xi_97 = u_2 * xi_254;
+    const double xi_98 = xi_97 * 0.25;
+    const double xi_100 = xi_71 * xi_97;
+    const double xi_101 = xi_100 - xi_96 - xi_98 + xi_99;
+    const double xi_102 = -xi_100 + xi_96 + xi_98 - xi_99;
+    const double xi_103 = -xi_50 + xi_81 + xi_82;
+    const double xi_104 = xi_103 + xi_46 + xi_61 + xi_62 + xi_63 + xi_93;
+    const double xi_115 = rho * (u_2 * u_2);
+    const double xi_116 = xi_114 + xi_115 * 0.66666666666666663 + xi_247 * 3.0 + xi_264 * 3.0;
+    const double xi_117 = omega_even * (xi_112 * 0.66666666666666663 + xi_113 * 1.6666666666666667 + xi_116 + xi_251 * -3.0 + xi_252 * 3.0 + xi_256 * -3.0 + xi_257 * -3.0 + xi_258 * -3.0 + xi_261 * 3.0);
+    const double xi_124 = omega_bulk * (xi_113 + xi_115 + xi_119 + xi_121 + xi_123 + xi_17 + xi_22 + xi_244);
+    const double xi_127 = omega_even * (xi_112 * 2.3333333333333335 + xi_116 + xi_126 + xi_245 * -5.0 + xi_246 * -5.0 + xi_249 * -5.0 + xi_252 * -2.0 + xi_261 * -2.0 + xi_263 * -5.0);
+    const double xi_131 = omega_even * (xi_114 + xi_115 * 3.0 + xi_126 + xi_128 + xi_129 + xi_130 + xi_247 * -4.0 + xi_248 * -7.0 + xi_250 * -7.0 + xi_252 * 5.0 + xi_255 * -7.0 + xi_259 * -7.0 + xi_261 * 5.0 + xi_264 * -4.0);
+    const double xi_156 = -xi_115 + xi_247;
+    const double xi_157 = omega_shear * (xi_121 + xi_156 + xi_16 + xi_2 + xi_246 - xi_261 + xi_6);
+    const double xi_158 = xi_157 * 0.125;
+    const double xi_159 = xi_107 * -0.11904761904761904 + xi_131 * -0.01984126984126984;
+    const double xi_160 = omega_shear * (-xi_112 + xi_113 * 2.0 + xi_120 + xi_123 + xi_125 + xi_156 + xi_252 + xi_260 * -2.0 + xi_261 + xi_262 * -2.0 + xi_264 + xi_9);
+    const double xi_162 = xi_160 * -0.041666666666666664 + xi_161 * -0.16666666666666666;
+    const double xi_163 = xi_108 * -0.10000000000000001 + xi_117 * -0.050000000000000003 + xi_162;
+    const double xi_164 = xi_111 * 0.028571428571428571 + xi_127 * 0.014285714285714285 + xi_155 + xi_158 + xi_159 + xi_163;
+    const double xi_176 = xi_111 * -0.071428571428571425 + xi_127 * -0.035714285714285712 + xi_159 + xi_160 * 0.083333333333333329 + xi_161 * 0.33333333333333331;
+    const double xi_187 = xi_107 * 0.095238095238095233 + xi_111 * -0.042857142857142858 + xi_127 * -0.021428571428571429 + xi_131 * 0.015873015873015872 - xi_155 - xi_158 + xi_163;
+    const double xi_190 = xi_157 * 0.0625;
+    const double xi_191 = xi_131 * 0.013888888888888888;
+    const double xi_193 = xi_110 * 0.083333333333333329 + xi_124 * 0.041666666666666664;
+    const double xi_194 = xi_160 * 0.020833333333333332 + xi_161 * 0.083333333333333329 + xi_193;
+    const double xi_195 = xi_165 + xi_189 + xi_190 + xi_191 + xi_192 + xi_194;
+    const double xi_202 = xi_151 + xi_189 + xi_190 + xi_191 + xi_192 + xi_194;
+    const double xi_204 = xi_127 * -0.0071428571428571426;
+    const double xi_205 = xi_198 * (u_2 * xi_197 + xi_132 + xi_17 + xi_258);
+    const double xi_206 = xi_117 * 0.025000000000000001;
+    const double xi_209 = xi_107 * -0.023809523809523808 + xi_131 * -0.003968253968253968;
+    const double xi_210 = xi_162 + xi_193 + xi_203 + xi_204 + xi_205 + xi_206 + xi_207 + xi_208 + xi_209;
+    const double xi_226 = xi_162 + xi_193 - xi_203 + xi_204 - xi_205 + xi_206 + xi_207 + xi_208 + xi_209;
+    const double xi_228 = -xi_190;
+    const double xi_229 = xi_127 * 0.017857142857142856;
+    const double xi_231 = xi_188 + xi_194 + xi_209 + xi_227 + xi_228 + xi_229 + xi_230;
+    const double xi_233 = xi_198 * (rho * u_0 * u_2 + xi_10 + xi_166 + xi_245);
+    const double xi_234 = -xi_232 - xi_233;
+    const double xi_240 = xi_232 + xi_233;
+    const double xi_243 = xi_186 + xi_194 + xi_209 + xi_227 + xi_228 + xi_229 + xi_230;
+    const double forceTerm_0 = xi_25 * xi_28 - xi_25 + xi_26 * xi_28 - xi_26 + xi_27 * xi_28 - xi_27; // forceTerm_q is added into the pdfs entry at f-offset q in the stores below
+    const double forceTerm_1 = xi_29 - xi_31 + xi_41;
+    const double forceTerm_2 = -xi_29 + xi_31 + xi_41;
+    const double forceTerm_3 = -xi_42 + xi_44 + xi_48;
+    const double forceTerm_4 = xi_42 - xi_44 + xi_48;
+    const double forceTerm_5 = xi_49 - xi_51 + xi_52;
+    const double forceTerm_6 = -xi_49 + xi_51 + xi_52;
+    const double forceTerm_7 = xi_57 + xi_66 + xi_76;
+    const double forceTerm_8 = xi_66 + xi_77 + xi_78;
+    const double forceTerm_9 = xi_57 + xi_78 + xi_80;
+    const double forceTerm_10 = xi_76 + xi_77 + xi_80;
+    const double forceTerm_11 = xi_83 + xi_85 + xi_90;
+    const double forceTerm_12 = xi_83 + xi_91 + xi_92;
+    const double forceTerm_13 = xi_101 + xi_57 + xi_94;
+    const double forceTerm_14 = xi_102 + xi_77 + xi_94;
+    const double forceTerm_15 = xi_103 + xi_85 + xi_92;
+    const double forceTerm_16 = xi_103 + xi_90 + xi_91;
+    const double forceTerm_17 = xi_102 + xi_104 + xi_57;
+    const double forceTerm_18 = xi_101 + xi_104 + xi_77;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2] = forceTerm_0 + xi_107 * 0.14285714285714285 + xi_108 * 0.20000000000000001 - xi_110 + xi_111 * 0.085714285714285715 + xi_117 * 0.10000000000000001 + xi_124 * -0.5 + xi_127 * 0.042857142857142858 + xi_131 * 0.023809523809523808 + xi_244; // each store overwrites the same pdfs location that was loaded above, adding the computed terms to the old value
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3] = forceTerm_1 - xi_135 - xi_140 + xi_151 + xi_164 + xi_261;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3] = forceTerm_2 + xi_135 + xi_140 + xi_164 + xi_165 + xi_252;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3] = forceTerm_3 + xi_168 + xi_170 + xi_175 + xi_176 + xi_262;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3] = forceTerm_4 - xi_168 - xi_170 + xi_176 + xi_177 + xi_260;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3] = forceTerm_5 - xi_179 - xi_181 + xi_186 + xi_187 + xi_247;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3] = forceTerm_6 + xi_179 + xi_181 + xi_187 + xi_188 + xi_264;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3] = forceTerm_7 + xi_177 + xi_195 + xi_200 + xi_255;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3] = forceTerm_8 + xi_175 + xi_195 + xi_201 + xi_250;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3] = forceTerm_9 + xi_177 + xi_201 + xi_202 + xi_248;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3] = forceTerm_10 + xi_175 + xi_200 + xi_202 + xi_259;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3] = forceTerm_11 + xi_210 + xi_219 + xi_224 + xi_251;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3] = forceTerm_12 + xi_219 + xi_225 + xi_226 + xi_257;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3] = forceTerm_13 + xi_231 + xi_234 + xi_239 + xi_246;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3] = forceTerm_14 + xi_231 + xi_240 + xi_241 + xi_249;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3] = forceTerm_15 + xi_224 + xi_226 + xi_242 + xi_258;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3] = forceTerm_16 + xi_210 + xi_225 + xi_242 + xi_256;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3] = forceTerm_17 + xi_239 + xi_240 + xi_243 + xi_263;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3] = forceTerm_18 + xi_234 + xi_241 + xi_243 + xi_245;
+  }
+}
+} // namespace internal_collidesweepdoubleprecisionthermalizedcuda_collidesweepdoubleprecisionthermalizedcuda
+
+void CollideSweepDoublePrecisionThermalizedCUDA::run(IBlock *block, gpuStream_t stream) {
+  if (!this->configured_)
+    WALBERLA_ABORT("This Sweep contains a configure function that needs to be called manually")
+
+  auto force = block->getData<gpu::GPUField<double>>(forceID);
+  auto pdfs = block->getData<gpu::GPUField<double>>(pdfsID);
+
+  auto &block_offset_2 = this->block_offset_2_;
+  auto &omega_even = this->omega_even_;
+  auto &time_step = this->time_step_;
+  auto &block_offset_0 = this->block_offset_0_;
+  auto &omega_bulk = this->omega_bulk_;
+  auto &omega_shear = this->omega_shear_;
+  auto &omega_odd = this->omega_odd_;
+  auto &kT = this->kT_;
+  auto &seed = this->seed_;
+  auto &block_offset_1 = this->block_offset_1_;
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()))
+  double *RESTRICT const _data_force = force->dataAt(0, 0, 0, 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()))
+  double *RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(force->xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(force->xSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(force->ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(force->ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(force->zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(force->zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  const int64_t _stride_force_0 = int64_t(force->xStride());
+  const int64_t _stride_force_1 = int64_t(force->yStride());
+  const int64_t _stride_force_2 = int64_t(force->zStride());
+  const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
+  const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
+  const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
+  const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
+  const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
+  dim3 _block(uint32_c(((128 < _size_force_0) ? 128 : _size_force_0)), uint32_c(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))), uint32_c(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))))));
+  dim3 _grid(uint32_c(((_size_force_0) % (((128 < _size_force_0) ? 128 : _size_force_0)) == 0 ? (int64_t)(_size_force_0) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)) : ((int64_t)(_size_force_0) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))) + 1)), uint32_c(((_size_force_1) % (((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))) == 0 ? (int64_t)(_size_force_1) / (int64_t)(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))) : ((int64_t)(_size_force_1) / (int64_t)(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) + 1)), uint32_c(((_size_force_2) % (((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? 
_size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))))) == 0 ? (int64_t)(_size_force_2) / (int64_t)(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 
128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))))) : ((int64_t)(_size_force_2) / (int64_t)(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))))) + 1)));
+  internal_collidesweepdoubleprecisionthermalizedcuda_collidesweepdoubleprecisionthermalizedcuda::collidesweepdoubleprecisionthermalizedcuda_collidesweepdoubleprecisionthermalizedcuda<<<_grid, _block, 0, stream>>>(_data_force, _data_pdfs, _size_force_0, _size_force_1, _size_force_2, _stride_force_0, _stride_force_1, _stride_force_2, _stride_force_3, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, block_offset_0, block_offset_1, block_offset_2, kT, omega_bulk, omega_even, omega_odd, omega_shear, seed, time_step);
+}
+
+void CollideSweepDoublePrecisionThermalizedCUDA::runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block, gpuStream_t stream) {
+  // Launch the collision kernel on the part of globalCellInterval that overlaps this block's cell bounding box grown by ghostLayers.
+  if (!configured_)
+    WALBERLA_ABORT("This Sweep contains a configure function that needs to be called manually")
+
+  CellInterval blockBB = blocks->getBlockCellBB(*block);
+  blockBB.expand(ghostLayers);
+  CellInterval ci = globalCellInterval;
+  ci.intersect(blockBB);
+  blocks->transformGlobalToBlockLocalCellInterval(ci, *block);
+  if (ci.empty())
+    return; // the requested interval does not overlap this block
+
+  auto force = block->getData<gpu::GPUField<double>>(forceID);
+  auto pdfs = block->getData<gpu::GPUField<double>>(pdfsID);
+
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()))
+  double *RESTRICT const _data_force = force->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); // base pointer at the interval's minimum corner, component 0
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()))
+  double *RESTRICT _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
+  const int64_t _size_force_0 = int64_c(ci.xSize());
+  const int64_t _size_force_1 = int64_c(ci.ySize());
+  const int64_t _size_force_2 = int64_c(ci.zSize());
+  const int64_t _stride_force_0 = static_cast<int64_t>(force->xStride());
+  const int64_t _stride_force_1 = static_cast<int64_t>(force->yStride());
+  const int64_t _stride_force_2 = static_cast<int64_t>(force->zStride());
+  const int64_t _stride_force_3 = static_cast<int64_t>(force->fStride());
+  const int64_t _stride_pdfs_0 = static_cast<int64_t>(pdfs->xStride());
+  const int64_t _stride_pdfs_1 = static_cast<int64_t>(pdfs->yStride());
+  const int64_t _stride_pdfs_2 = static_cast<int64_t>(pdfs->zStride());
+  const int64_t _stride_pdfs_3 = static_cast<int64_t>(pdfs->fStride());
+
+  // Threads per block: x = min(128, size_x); y = min(1024, size_y, 2 * (128 / x)); z = min(64, size_z, 256 / (x * y)) with y taken before the 1024 cap.
+  const int64_t threadsX = (128 < _size_force_0) ? 128 : _size_force_0;
+  const int64_t yBudget = 2 * ((int64_t)(128) / threadsX);
+  const int64_t yUncapped = (_size_force_1 < yBudget) ? _size_force_1 : yBudget;
+  const int64_t threadsY = (1024 < yUncapped) ? 1024 : yUncapped;
+  const int64_t zBudget = (int64_t)(256) / (threadsX * yUncapped);
+  const int64_t zUncapped = (_size_force_2 < zBudget) ? _size_force_2 : zBudget;
+  const int64_t threadsZ = (64 < zUncapped) ? 64 : zUncapped;
+  const int64_t blocksX = (_size_force_0 % threadsX == 0) ? _size_force_0 / threadsX : _size_force_0 / threadsX + 1; // blocks per axis: size / threads, rounded up
+  const int64_t blocksY = (_size_force_1 % threadsY == 0) ? _size_force_1 / threadsY : _size_force_1 / threadsY + 1;
+  const int64_t blocksZ = (_size_force_2 % threadsZ == 0) ? _size_force_2 / threadsZ : _size_force_2 / threadsZ + 1;
+  dim3 threadsPerBlock(uint32_c(threadsX), uint32_c(threadsY), uint32_c(threadsZ));
+  dim3 blocksPerGrid(uint32_c(blocksX), uint32_c(blocksY), uint32_c(blocksZ));
+  internal_collidesweepdoubleprecisionthermalizedcuda_collidesweepdoubleprecisionthermalizedcuda::collidesweepdoubleprecisionthermalizedcuda_collidesweepdoubleprecisionthermalizedcuda<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(_data_force, _data_pdfs, _size_force_0, _size_force_1, _size_force_2, _stride_force_0, _stride_force_1, _stride_force_2, _stride_force_3, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, block_offset_0_, block_offset_1_, block_offset_2_, kT_, omega_bulk_, omega_even_, omega_odd_, omega_shear_, seed_, time_step_);
+}
+
+} // namespace pystencils
+} // namespace walberla
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) || (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic pop
+#endif
+
+#if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
+#pragma warning pop
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedCUDA.h
new file mode 100644
index 00000000000..bc7ac5f69e7
--- /dev/null
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepDoublePrecisionThermalizedCUDA.h
@@ -0,0 +1,138 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file CollideSweepDoublePrecisionThermalizedCUDA.h
+//! \author pystencils
+//======================================================================================================================
+
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
+// lbmpy_walberla/pystencils_walberla from waLBerla commit
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
+
+#pragma once
+#include "core/DataTypes.h"
+#include "core/logging/Logging.h"
+
+#include "gpu/GPUField.h"
+#include "gpu/GPUWrapper.h"
+
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+#include "domain_decomposition/StructuredBlockStorage.h"
+#include "field/SwapableCompare.h"
+#include <set>
+
+#ifdef __GNUC__
+#define RESTRICT __restrict__
+#elif _MSC_VER
+#define RESTRICT __restrict
+#else
+#define RESTRICT
+#endif
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
+    (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wreorder"
+#endif
+
+namespace walberla {
+namespace pystencils {
+
+class CollideSweepDoublePrecisionThermalizedCUDA {
+public:
+  // Stores field IDs and kernel parameters; call configure() before running.
+  CollideSweepDoublePrecisionThermalizedCUDA(
+      BlockDataID forceID_, BlockDataID pdfsID_, double kT, double omega_bulk,
+      double omega_even, double omega_odd, double omega_shear, uint32_t seed,
+      uint32_t time_step)
+      : forceID(forceID_), pdfsID(pdfsID_), kT_(kT), omega_bulk_(omega_bulk),
+        omega_even_(omega_even), omega_odd_(omega_odd),
+        omega_shear_(omega_shear), seed_(seed), time_step_(time_step) {}
+  // Applies the sweep to one block on the given stream.
+  void run(IBlock *block, gpuStream_t stream = nullptr);
+  // Like run(), but only on globalCellInterval clipped to the block + ghosts.
+  void runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks,
+                         const CellInterval &globalCellInterval,
+                         cell_idx_t ghostLayers, IBlock *block,
+                         gpuStream_t stream = nullptr);
+  // Call operator forwarding to run().
+  void operator()(IBlock *block, gpuStream_t stream = nullptr) {
+    run(block, stream);
+  }
+  // Block functor sharing ownership of `kernel`; uses run()'s default stream.
+  static std::function<void(IBlock *)> getSweep(
+      const shared_ptr<CollideSweepDoublePrecisionThermalizedCUDA> &kernel) {
+    return [kernel](IBlock *b) { kernel->run(b); };
+  }
+  // Same for runOnCellInterval(); the functor keeps its own interval copy.
+  static std::function<void(IBlock *, gpuStream_t)> getSweepOnCellInterval(
+      const shared_ptr<CollideSweepDoublePrecisionThermalizedCUDA> &kernel,
+      const shared_ptr<StructuredBlockStorage> &blocks,
+      const CellInterval &globalCellInterval, cell_idx_t ghostLayers = 1) {
+    return [kernel, blocks, globalCellInterval,
+            ghostLayers](IBlock *b, gpuStream_t stream = nullptr) {
+      kernel->runOnCellInterval(blocks, globalCellInterval, ghostLayers, b,
+                                stream);
+    };
+  }
+  // Captures `this` by pointer: the functor must not outlive this object.
+  std::function<void(IBlock *)> getSweep(gpuStream_t stream = nullptr) {
+    return [this, stream](IBlock *b) { this->run(b, stream); };
+  }
+  // Cell-interval variant of getSweep(stream); same lifetime requirement.
+  std::function<void(IBlock *)>
+  getSweepOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks,
+                         const CellInterval &globalCellInterval,
+                         cell_idx_t ghostLayers = 1,
+                         gpuStream_t stream = nullptr) {
+    return [this, blocks, globalCellInterval, ghostLayers, stream](IBlock *b) {
+      this->runOnCellInterval(blocks, globalCellInterval, ghostLayers, b,
+                              stream);
+    };
+  }
+  // Stores the minimum corner of the block's cell box as the block offsets.
+  void configure(const shared_ptr<StructuredBlockStorage> &blocks,
+                 IBlock *block) {
+    Cell BlockCellBB = blocks->getBlockCellBB(*block).min();
+    block_offset_0_ = uint32_t(BlockCellBB[0]);
+    block_offset_1_ = uint32_t(BlockCellBB[1]);
+    block_offset_2_ = uint32_t(BlockCellBB[2]);
+    configured_ = true;
+  }
+  // Public state; the offsets start at zero until configure() sets them.
+  BlockDataID forceID;
+  BlockDataID pdfsID;
+  uint32_t block_offset_0_{0u};
+  uint32_t block_offset_1_{0u};
+  uint32_t block_offset_2_{0u};
+  double kT_;
+  double omega_bulk_;
+  double omega_even_;
+  double omega_odd_;
+  double omega_shear_;
+  uint32_t seed_;
+  uint32_t time_step_;
+  bool configured_{false}; // set by configure(); checked before launching
+};
+
+} // namespace pystencils
+} // namespace walberla
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
+    (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic pop
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwards.cpp b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwards.cpp
index 560fbac086b..c78c87f5e36 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwards.cpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwards.cpp
@@ -14,11 +14,10 @@
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
 //! \\file CollideSweepSinglePrecisionLeesEdwards.cpp
-//! \\ingroup lbm
-//! \\author lbmpy
+//! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2, lbmpy_walberla/pystencils_walberla from waLBerla commit 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #include <cmath>
 
@@ -51,149 +50,105 @@ static FUNC_PREFIX void collidesweepsingleprecisionleesedwards_collidesweepsingl
   const float xi_0 = ((1.0f) / (omega_shear * -0.25f + 2.0f));
   const float rr_0 = xi_0 * (omega_shear * -2.0f + 4.0f);
   for (int64_t ctr_2 = 0; ctr_2 < _size_force_2; ctr_2 += 1) {
-    float *RESTRICT _data_pdfs_20_36 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3;
-    float *RESTRICT _data_force_20_32 = _data_force + _stride_force_2 * ctr_2 + 2 * _stride_force_3;
-    float *RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_311 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_318 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_313 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_317 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3;
-    float *RESTRICT _data_force_20_30 = _data_force + _stride_force_2 * ctr_2;
-    float *RESTRICT _data_pdfs_20_35 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_314 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_312 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_316 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_315 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3;
-    float *RESTRICT _data_force_20_31 = _data_force + _stride_force_2 * ctr_2 + _stride_force_3;
-    float *RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2 * ctr_2;
-    float *RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3;
     for (int64_t ctr_1 = 0; ctr_1 < _size_force_1; ctr_1 += 1) {
-      float *RESTRICT _data_pdfs_20_36_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_36;
-      float *RESTRICT _data_force_20_32_10 = _stride_force_1 * ctr_1 + _data_force_20_32;
-      float *RESTRICT _data_pdfs_20_31_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_31;
-      float *RESTRICT _data_pdfs_20_32_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_32;
-      float *RESTRICT _data_pdfs_20_311_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_311;
-      float *RESTRICT _data_pdfs_20_318_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_318;
-      float *RESTRICT _data_pdfs_20_313_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_313;
-      float *RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_34;
-      float *RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_33;
-      float *RESTRICT _data_pdfs_20_317_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_317;
-      float *RESTRICT _data_force_20_30_10 = _stride_force_1 * ctr_1 + _data_force_20_30;
-      float *RESTRICT _data_pdfs_20_35_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_35;
-      float *RESTRICT _data_pdfs_20_314_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_314;
-      float *RESTRICT _data_pdfs_20_38_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_38;
-      float *RESTRICT _data_pdfs_20_312_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_312;
-      float *RESTRICT _data_pdfs_20_316_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_316;
-      float *RESTRICT _data_pdfs_20_39_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_39;
-      float *RESTRICT _data_pdfs_20_315_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_315;
-      float *RESTRICT _data_force_20_31_10 = _stride_force_1 * ctr_1 + _data_force_20_31;
-      float *RESTRICT _data_pdfs_20_310_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_310;
-      float *RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_30;
-      float *RESTRICT _data_pdfs_20_37_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_37;
       for (int64_t ctr_0 = 0; ctr_0 < _size_force_0; ctr_0 += 1) {
-        const float xi_25 = _data_pdfs_20_36_10[_stride_pdfs_0 * ctr_0];
-        const float xi_26 = _data_force_20_32_10[_stride_force_0 * ctr_0];
-        const float xi_27 = _data_pdfs_20_31_10[_stride_pdfs_0 * ctr_0];
-        const float xi_28 = _data_pdfs_20_32_10[_stride_pdfs_0 * ctr_0];
-        const float xi_29 = _data_pdfs_20_311_10[_stride_pdfs_0 * ctr_0];
-        const float xi_30 = _data_pdfs_20_318_10[_stride_pdfs_0 * ctr_0];
-        const float xi_31 = _data_pdfs_20_313_10[_stride_pdfs_0 * ctr_0];
-        const float xi_32 = _data_pdfs_20_34_10[_stride_pdfs_0 * ctr_0];
-        const float xi_33 = _data_pdfs_20_33_10[_stride_pdfs_0 * ctr_0];
-        const float xi_34 = _data_pdfs_20_317_10[_stride_pdfs_0 * ctr_0];
-        const float xi_35 = _data_force_20_30_10[_stride_force_0 * ctr_0];
-        const float xi_36 = _data_pdfs_20_35_10[_stride_pdfs_0 * ctr_0];
-        const float xi_37 = _data_pdfs_20_314_10[_stride_pdfs_0 * ctr_0];
-        const float xi_38 = _data_pdfs_20_38_10[_stride_pdfs_0 * ctr_0];
-        const float xi_39 = _data_pdfs_20_312_10[_stride_pdfs_0 * ctr_0];
-        const float xi_40 = _data_pdfs_20_316_10[_stride_pdfs_0 * ctr_0];
-        const float xi_41 = _data_pdfs_20_39_10[_stride_pdfs_0 * ctr_0];
-        const float xi_42 = _data_pdfs_20_315_10[_stride_pdfs_0 * ctr_0];
-        const float xi_43 = _data_force_20_31_10[_stride_force_0 * ctr_0];
-        const float xi_44 = _data_pdfs_20_310_10[_stride_pdfs_0 * ctr_0];
-        const float xi_45 = _data_pdfs_20_30_10[_stride_pdfs_0 * ctr_0];
-        const float xi_46 = _data_pdfs_20_37_10[_stride_pdfs_0 * ctr_0];
-        const float xi_3 = xi_25;
-        const float xi_4 = xi_26;
-        const float xi_5 = xi_27;
-        const float xi_6 = xi_28;
-        const float xi_7 = xi_29;
-        const float xi_8 = xi_30;
-        const float xi_9 = xi_31;
-        const float xi_10 = xi_45;
-        const float xi_11 = xi_32;
-        const float xi_12 = xi_33;
-        const float xi_13 = xi_34;
-        const float xi_14 = xi_35;
-        const float xi_15 = xi_36;
-        const float xi_16 = xi_37;
-        const float xi_17 = xi_38;
-        const float xi_18 = xi_39;
-        const float xi_19 = xi_40;
-        const float xi_20 = xi_42;
-        const float xi_21 = xi_43;
+        const float xi_25 = _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3];
+        const float xi_26 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3];
+        const float xi_27 = _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2];
+        const float xi_28 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3];
+        const float xi_29 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3];
+        const float xi_30 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3];
+        const float xi_31 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2];
+        const float xi_32 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3];
+        const float xi_33 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3];
+        const float xi_34 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3];
+        const float xi_35 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3];
+        const float xi_36 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3];
+        const float xi_37 = _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3];
+        const float xi_38 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3];
+        const float xi_39 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3];
+        const float xi_40 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3];
+        const float xi_41 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3];
+        const float xi_42 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3];
+        const float xi_43 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3];
+        const float xi_44 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3];
+        const float xi_45 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3];
+        const float xi_46 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3];
+        const float xi_3 = xi_26;
+        const float xi_4 = xi_28;
+        const float xi_5 = xi_37;
+        const float xi_6 = xi_29;
+        const float xi_7 = xi_36;
+        const float xi_8 = xi_32;
+        const float xi_9 = xi_25;
+        const float xi_10 = xi_27;
+        const float xi_11 = xi_43;
+        const float xi_12 = xi_30;
+        const float xi_13 = xi_42;
+        const float xi_14 = xi_46;
+        const float xi_15 = xi_40;
+        const float xi_16 = xi_41;
+        const float xi_17 = xi_31;
+        const float xi_18 = xi_38;
+        const float xi_19 = xi_33;
+        const float xi_20 = xi_34;
+        const float xi_21 = xi_45;
         const float xi_22 = xi_44;
-        const float xi_23 = xi_41;
-        const float xi_24 = xi_46;
-        const float vel0Term = xi_11 + xi_16 + xi_17 + xi_22 + xi_8;
-        const float vel1Term = xi_20 + xi_24 + xi_5 + xi_7;
-        const float vel2Term = xi_15 + xi_18 + xi_9;
-        const float rho = vel0Term + vel1Term + vel2Term + xi_10 + xi_12 + xi_13 + xi_19 + xi_23 + xi_3 + xi_6;
+        const float xi_23 = xi_35;
+        const float xi_24 = xi_39;
+        const float vel0Term = xi_11 + xi_12 + xi_14 + xi_18 + xi_24;
+        const float vel1Term = xi_20 + xi_21 + xi_7 + xi_8;
+        const float vel2Term = xi_15 + xi_22 + xi_23;
+        const float rho = vel0Term + vel1Term + vel2Term + xi_13 + xi_16 + xi_17 + xi_19 + xi_3 + xi_4 + xi_6;
         const float xi_1 = ((1.0f) / (rho));
-        const float u_0 = xi_1 * xi_14 * 0.5f + xi_1 * (vel0Term + xi_12 * -1.0f + xi_13 * -1.0f + xi_23 * -1.0f + xi_24 * -1.0f + xi_9 * -1.0f);
-        const float u_1 = xi_1 * xi_21 * 0.5f + xi_1 * (vel1Term + xi_17 + xi_18 * -1.0f + xi_19 * -1.0f + xi_22 * -1.0f + xi_23 * -1.0f + xi_6 * -1.0f);
-        const float u_2 = xi_1 * xi_4 * 0.5f + xi_1 * (vel2Term + xi_13 * -1.0f + xi_16 + xi_19 * -1.0f + xi_20 * -1.0f + xi_3 * -1.0f + xi_7 + xi_8 * -1.0f);
-        const float forceTerm_0 = omega_shear * u_0 * xi_14 * 0.5f + omega_shear * u_1 * xi_21 * 0.5f + omega_shear * u_2 * xi_4 * 0.5f + u_0 * xi_14 * -1.0f + u_1 * xi_21 * -1.0f + u_2 * xi_4 * -1.0f;
-        const float forceTerm_1 = omega_shear * u_0 * xi_14 * 0.083333333333333329f + omega_shear * u_1 * xi_21 * -0.16666666666666666f + omega_shear * u_2 * xi_4 * 0.083333333333333329f + rr_0 * xi_21 * -0.083333333333333329f + u_0 * xi_14 * -0.16666666666666666f + u_1 * xi_21 * 0.33333333333333331f + u_2 * xi_4 * -0.16666666666666666f + xi_21 * 0.16666666666666666f;
-        const float forceTerm_2 = omega_shear * u_0 * xi_14 * 0.083333333333333329f + omega_shear * u_1 * xi_21 * -0.16666666666666666f + omega_shear * u_2 * xi_4 * 0.083333333333333329f + rr_0 * xi_21 * 0.083333333333333329f + u_0 * xi_14 * -0.16666666666666666f + u_1 * xi_21 * 0.33333333333333331f + u_2 * xi_4 * -0.16666666666666666f + xi_21 * -0.16666666666666666f;
-        const float forceTerm_3 = omega_shear * u_0 * xi_14 * -0.16666666666666666f + omega_shear * u_1 * xi_21 * 0.083333333333333329f + omega_shear * u_2 * xi_4 * 0.083333333333333329f + rr_0 * xi_14 * 0.083333333333333329f + u_0 * xi_14 * 0.33333333333333331f + u_1 * xi_21 * -0.16666666666666666f + u_2 * xi_4 * -0.16666666666666666f + xi_14 * -0.16666666666666666f;
-        const float forceTerm_4 = omega_shear * u_0 * xi_14 * -0.16666666666666666f + omega_shear * u_1 * xi_21 * 0.083333333333333329f + omega_shear * u_2 * xi_4 * 0.083333333333333329f + rr_0 * xi_14 * -0.083333333333333329f + u_0 * xi_14 * 0.33333333333333331f + u_1 * xi_21 * -0.16666666666666666f + u_2 * xi_4 * -0.16666666666666666f + xi_14 * 0.16666666666666666f;
-        const float forceTerm_5 = omega_shear * u_0 * xi_14 * 0.083333333333333329f + omega_shear * u_1 * xi_21 * 0.083333333333333329f + omega_shear * u_2 * xi_4 * -0.16666666666666666f + rr_0 * xi_4 * -0.083333333333333329f + u_0 * xi_14 * -0.16666666666666666f + u_1 * xi_21 * -0.16666666666666666f + u_2 * xi_4 * 0.33333333333333331f + xi_4 * 0.16666666666666666f;
-        const float forceTerm_6 = omega_shear * u_0 * xi_14 * 0.083333333333333329f + omega_shear * u_1 * xi_21 * 0.083333333333333329f + omega_shear * u_2 * xi_4 * -0.16666666666666666f + rr_0 * xi_4 * 0.083333333333333329f + u_0 * xi_14 * -0.16666666666666666f + u_1 * xi_21 * -0.16666666666666666f + u_2 * xi_4 * 0.33333333333333331f + xi_4 * -0.16666666666666666f;
-        const float forceTerm_7 = omega_shear * u_0 * xi_14 * -0.083333333333333329f + omega_shear * u_0 * xi_21 * 0.125f + omega_shear * u_1 * xi_14 * 0.125f + omega_shear * u_1 * xi_21 * -0.083333333333333329f + omega_shear * u_2 * xi_4 * 0.041666666666666664f + rr_0 * xi_14 * 0.041666666666666664f + rr_0 * xi_21 * -0.041666666666666664f + u_0 * xi_14 * 0.16666666666666666f + u_0 * xi_21 * -0.25f + u_1 * xi_14 * -0.25f + u_1 * xi_21 * 0.16666666666666666f + u_2 * xi_4 * -0.083333333333333329f + xi_14 * -0.083333333333333329f + xi_21 * 0.083333333333333329f;
-        const float forceTerm_8 = omega_shear * u_0 * xi_14 * -0.083333333333333329f + omega_shear * u_0 * xi_21 * -0.125f + omega_shear * u_1 * xi_14 * -0.125f + omega_shear * u_1 * xi_21 * -0.083333333333333329f + omega_shear * u_2 * xi_4 * 0.041666666666666664f + rr_0 * xi_14 * -0.041666666666666664f + rr_0 * xi_21 * -0.041666666666666664f + u_0 * xi_14 * 0.16666666666666666f + u_0 * xi_21 * 0.25f + u_1 * xi_14 * 0.25f + u_1 * xi_21 * 0.16666666666666666f + u_2 * xi_4 * -0.083333333333333329f + xi_14 * 0.083333333333333329f + xi_21 * 0.083333333333333329f;
-        const float forceTerm_9 = omega_shear * u_0 * xi_14 * -0.083333333333333329f + omega_shear * u_0 * xi_21 * -0.125f + omega_shear * u_1 * xi_14 * -0.125f + omega_shear * u_1 * xi_21 * -0.083333333333333329f + omega_shear * u_2 * xi_4 * 0.041666666666666664f + rr_0 * xi_14 * 0.041666666666666664f + rr_0 * xi_21 * 0.041666666666666664f + u_0 * xi_14 * 0.16666666666666666f + u_0 * xi_21 * 0.25f + u_1 * xi_14 * 0.25f + u_1 * xi_21 * 0.16666666666666666f + u_2 * xi_4 * -0.083333333333333329f + xi_14 * -0.083333333333333329f + xi_21 * -0.083333333333333329f;
-        const float forceTerm_10 = omega_shear * u_0 * xi_14 * -0.083333333333333329f + omega_shear * u_0 * xi_21 * 0.125f + omega_shear * u_1 * xi_14 * 0.125f + omega_shear * u_1 * xi_21 * -0.083333333333333329f + omega_shear * u_2 * xi_4 * 0.041666666666666664f + rr_0 * xi_14 * -0.041666666666666664f + rr_0 * xi_21 * 0.041666666666666664f + u_0 * xi_14 * 0.16666666666666666f + u_0 * xi_21 * -0.25f + u_1 * xi_14 * -0.25f + u_1 * xi_21 * 0.16666666666666666f + u_2 * xi_4 * -0.083333333333333329f + xi_14 * 0.083333333333333329f + xi_21 * -0.083333333333333329f;
-        const float forceTerm_11 = omega_shear * u_0 * xi_14 * 0.041666666666666664f + omega_shear * u_1 * xi_21 * -0.083333333333333329f + omega_shear * u_1 * xi_4 * -0.125f + omega_shear * u_2 * xi_21 * -0.125f + omega_shear * u_2 * xi_4 * -0.083333333333333329f + rr_0 * xi_21 * -0.041666666666666664f + rr_0 * xi_4 * -0.041666666666666664f + u_0 * xi_14 * -0.083333333333333329f + u_1 * xi_21 * 0.16666666666666666f + u_1 * xi_4 * 0.25f + u_2 * xi_21 * 0.25f + u_2 * xi_4 * 0.16666666666666666f + xi_21 * 0.083333333333333329f + xi_4 * 0.083333333333333329f;
-        const float forceTerm_12 = omega_shear * u_0 * xi_14 * 0.041666666666666664f + omega_shear * u_1 * xi_21 * -0.083333333333333329f + omega_shear * u_1 * xi_4 * 0.125f + omega_shear * u_2 * xi_21 * 0.125f + omega_shear * u_2 * xi_4 * -0.083333333333333329f + rr_0 * xi_21 * 0.041666666666666664f + rr_0 * xi_4 * -0.041666666666666664f + u_0 * xi_14 * -0.083333333333333329f + u_1 * xi_21 * 0.16666666666666666f + u_1 * xi_4 * -0.25f + u_2 * xi_21 * -0.25f + u_2 * xi_4 * 0.16666666666666666f + xi_21 * -0.083333333333333329f + xi_4 * 0.083333333333333329f;
-        const float forceTerm_13 = omega_shear * u_0 * xi_14 * -0.083333333333333329f + omega_shear * u_0 * xi_4 * 0.125f + omega_shear * u_1 * xi_21 * 0.041666666666666664f + omega_shear * u_2 * xi_14 * 0.125f + omega_shear * u_2 * xi_4 * -0.083333333333333329f + rr_0 * xi_14 * 0.041666666666666664f + rr_0 * xi_4 * -0.041666666666666664f + u_0 * xi_14 * 0.16666666666666666f + u_0 * xi_4 * -0.25f + u_1 * xi_21 * -0.083333333333333329f + u_2 * xi_14 * -0.25f + u_2 * xi_4 * 0.16666666666666666f + xi_14 * -0.083333333333333329f + xi_4 * 0.083333333333333329f;
-        const float forceTerm_14 = omega_shear * u_0 * xi_14 * -0.083333333333333329f + omega_shear * u_0 * xi_4 * -0.125f + omega_shear * u_1 * xi_21 * 0.041666666666666664f + omega_shear * u_2 * xi_14 * -0.125f + omega_shear * u_2 * xi_4 * -0.083333333333333329f + rr_0 * xi_14 * -0.041666666666666664f + rr_0 * xi_4 * -0.041666666666666664f + u_0 * xi_14 * 0.16666666666666666f + u_0 * xi_4 * 0.25f + u_1 * xi_21 * -0.083333333333333329f + u_2 * xi_14 * 0.25f + u_2 * xi_4 * 0.16666666666666666f + xi_14 * 0.083333333333333329f + xi_4 * 0.083333333333333329f;
-        const float forceTerm_15 = omega_shear * u_0 * xi_14 * 0.041666666666666664f + omega_shear * u_1 * xi_21 * -0.083333333333333329f + omega_shear * u_1 * xi_4 * 0.125f + omega_shear * u_2 * xi_21 * 0.125f + omega_shear * u_2 * xi_4 * -0.083333333333333329f + rr_0 * xi_21 * -0.041666666666666664f + rr_0 * xi_4 * 0.041666666666666664f + u_0 * xi_14 * -0.083333333333333329f + u_1 * xi_21 * 0.16666666666666666f + u_1 * xi_4 * -0.25f + u_2 * xi_21 * -0.25f + u_2 * xi_4 * 0.16666666666666666f + xi_21 * 0.083333333333333329f + xi_4 * -0.083333333333333329f;
-        const float forceTerm_16 = omega_shear * u_0 * xi_14 * 0.041666666666666664f + omega_shear * u_1 * xi_21 * -0.083333333333333329f + omega_shear * u_1 * xi_4 * -0.125f + omega_shear * u_2 * xi_21 * -0.125f + omega_shear * u_2 * xi_4 * -0.083333333333333329f + rr_0 * xi_21 * 0.041666666666666664f + rr_0 * xi_4 * 0.041666666666666664f + u_0 * xi_14 * -0.083333333333333329f + u_1 * xi_21 * 0.16666666666666666f + u_1 * xi_4 * 0.25f + u_2 * xi_21 * 0.25f + u_2 * xi_4 * 0.16666666666666666f + xi_21 * -0.083333333333333329f + xi_4 * -0.083333333333333329f;
-        const float forceTerm_17 = omega_shear * u_0 * xi_14 * -0.083333333333333329f + omega_shear * u_0 * xi_4 * -0.125f + omega_shear * u_1 * xi_21 * 0.041666666666666664f + omega_shear * u_2 * xi_14 * -0.125f + omega_shear * u_2 * xi_4 * -0.083333333333333329f + rr_0 * xi_14 * 0.041666666666666664f + rr_0 * xi_4 * 0.041666666666666664f + u_0 * xi_14 * 0.16666666666666666f + u_0 * xi_4 * 0.25f + u_1 * xi_21 * -0.083333333333333329f + u_2 * xi_14 * 0.25f + u_2 * xi_4 * 0.16666666666666666f + xi_14 * -0.083333333333333329f + xi_4 * -0.083333333333333329f;
-        const float forceTerm_18 = omega_shear * u_0 * xi_14 * -0.083333333333333329f + omega_shear * u_0 * xi_4 * 0.125f + omega_shear * u_1 * xi_21 * 0.041666666666666664f + omega_shear * u_2 * xi_14 * 0.125f + omega_shear * u_2 * xi_4 * -0.083333333333333329f + rr_0 * xi_14 * -0.041666666666666664f + rr_0 * xi_4 * 0.041666666666666664f + u_0 * xi_14 * 0.16666666666666666f + u_0 * xi_4 * -0.25f + u_1 * xi_21 * -0.083333333333333329f + u_2 * xi_14 * -0.25f + u_2 * xi_4 * 0.16666666666666666f + xi_14 * 0.083333333333333329f + xi_4 * -0.083333333333333329f;
-        const float u0Mu1 = u_0 + u_1 * -1.0f;
+        const float u_0 = xi_1 * xi_10 * 0.5f + xi_1 * (vel0Term - xi_13 - xi_15 - xi_3 - xi_6 - xi_7);
+        const float u_1 = xi_1 * xi_9 * 0.5f + xi_1 * (vel1Term + xi_12 - xi_13 - xi_19 - xi_23 - xi_24 - xi_4);
+        const float u_2 = xi_1 * xi_5 * 0.5f + xi_1 * (vel2Term + xi_11 - xi_16 - xi_18 - xi_20 + xi_21 - xi_3 - xi_4);
+        const float forceTerm_0 = omega_shear * u_0 * xi_10 * 0.5f + omega_shear * u_1 * xi_9 * 0.5f + omega_shear * u_2 * xi_5 * 0.5f - u_0 * xi_10 - u_1 * xi_9 - u_2 * xi_5;
+        const float forceTerm_1 = omega_shear * u_0 * xi_10 * 0.083333333333333329f + omega_shear * u_1 * xi_9 * -0.16666666666666666f + omega_shear * u_2 * xi_5 * 0.083333333333333329f + rr_0 * xi_9 * -0.083333333333333329f + u_0 * xi_10 * -0.16666666666666666f + u_1 * xi_9 * 0.33333333333333331f + u_2 * xi_5 * -0.16666666666666666f + xi_9 * 0.16666666666666666f;
+        const float forceTerm_2 = omega_shear * u_0 * xi_10 * 0.083333333333333329f + omega_shear * u_1 * xi_9 * -0.16666666666666666f + omega_shear * u_2 * xi_5 * 0.083333333333333329f + rr_0 * xi_9 * 0.083333333333333329f + u_0 * xi_10 * -0.16666666666666666f + u_1 * xi_9 * 0.33333333333333331f + u_2 * xi_5 * -0.16666666666666666f + xi_9 * -0.16666666666666666f;
+        const float forceTerm_3 = omega_shear * u_0 * xi_10 * -0.16666666666666666f + omega_shear * u_1 * xi_9 * 0.083333333333333329f + omega_shear * u_2 * xi_5 * 0.083333333333333329f + rr_0 * xi_10 * 0.083333333333333329f + u_0 * xi_10 * 0.33333333333333331f + u_1 * xi_9 * -0.16666666666666666f + u_2 * xi_5 * -0.16666666666666666f + xi_10 * -0.16666666666666666f;
+        const float forceTerm_4 = omega_shear * u_0 * xi_10 * -0.16666666666666666f + omega_shear * u_1 * xi_9 * 0.083333333333333329f + omega_shear * u_2 * xi_5 * 0.083333333333333329f + rr_0 * xi_10 * -0.083333333333333329f + u_0 * xi_10 * 0.33333333333333331f + u_1 * xi_9 * -0.16666666666666666f + u_2 * xi_5 * -0.16666666666666666f + xi_10 * 0.16666666666666666f;
+        const float forceTerm_5 = omega_shear * u_0 * xi_10 * 0.083333333333333329f + omega_shear * u_1 * xi_9 * 0.083333333333333329f + omega_shear * u_2 * xi_5 * -0.16666666666666666f + rr_0 * xi_5 * -0.083333333333333329f + u_0 * xi_10 * -0.16666666666666666f + u_1 * xi_9 * -0.16666666666666666f + u_2 * xi_5 * 0.33333333333333331f + xi_5 * 0.16666666666666666f;
+        const float forceTerm_6 = omega_shear * u_0 * xi_10 * 0.083333333333333329f + omega_shear * u_1 * xi_9 * 0.083333333333333329f + omega_shear * u_2 * xi_5 * -0.16666666666666666f + rr_0 * xi_5 * 0.083333333333333329f + u_0 * xi_10 * -0.16666666666666666f + u_1 * xi_9 * -0.16666666666666666f + u_2 * xi_5 * 0.33333333333333331f + xi_5 * -0.16666666666666666f;
+        const float forceTerm_7 = omega_shear * u_0 * xi_10 * -0.083333333333333329f + omega_shear * u_0 * xi_9 * 0.125f + omega_shear * u_1 * xi_10 * 0.125f + omega_shear * u_1 * xi_9 * -0.083333333333333329f + omega_shear * u_2 * xi_5 * 0.041666666666666664f + rr_0 * xi_10 * 0.041666666666666664f + rr_0 * xi_9 * -0.041666666666666664f + u_0 * xi_10 * 0.16666666666666666f + u_0 * xi_9 * -0.25f + u_1 * xi_10 * -0.25f + u_1 * xi_9 * 0.16666666666666666f + u_2 * xi_5 * -0.083333333333333329f + xi_10 * -0.083333333333333329f + xi_9 * 0.083333333333333329f;
+        const float forceTerm_8 = omega_shear * u_0 * xi_10 * -0.083333333333333329f + omega_shear * u_0 * xi_9 * -0.125f + omega_shear * u_1 * xi_10 * -0.125f + omega_shear * u_1 * xi_9 * -0.083333333333333329f + omega_shear * u_2 * xi_5 * 0.041666666666666664f + rr_0 * xi_10 * -0.041666666666666664f + rr_0 * xi_9 * -0.041666666666666664f + u_0 * xi_10 * 0.16666666666666666f + u_0 * xi_9 * 0.25f + u_1 * xi_10 * 0.25f + u_1 * xi_9 * 0.16666666666666666f + u_2 * xi_5 * -0.083333333333333329f + xi_10 * 0.083333333333333329f + xi_9 * 0.083333333333333329f;
+        const float forceTerm_9 = omega_shear * u_0 * xi_10 * -0.083333333333333329f + omega_shear * u_0 * xi_9 * -0.125f + omega_shear * u_1 * xi_10 * -0.125f + omega_shear * u_1 * xi_9 * -0.083333333333333329f + omega_shear * u_2 * xi_5 * 0.041666666666666664f + rr_0 * xi_10 * 0.041666666666666664f + rr_0 * xi_9 * 0.041666666666666664f + u_0 * xi_10 * 0.16666666666666666f + u_0 * xi_9 * 0.25f + u_1 * xi_10 * 0.25f + u_1 * xi_9 * 0.16666666666666666f + u_2 * xi_5 * -0.083333333333333329f + xi_10 * -0.083333333333333329f + xi_9 * -0.083333333333333329f;
+        const float forceTerm_10 = omega_shear * u_0 * xi_10 * -0.083333333333333329f + omega_shear * u_0 * xi_9 * 0.125f + omega_shear * u_1 * xi_10 * 0.125f + omega_shear * u_1 * xi_9 * -0.083333333333333329f + omega_shear * u_2 * xi_5 * 0.041666666666666664f + rr_0 * xi_10 * -0.041666666666666664f + rr_0 * xi_9 * 0.041666666666666664f + u_0 * xi_10 * 0.16666666666666666f + u_0 * xi_9 * -0.25f + u_1 * xi_10 * -0.25f + u_1 * xi_9 * 0.16666666666666666f + u_2 * xi_5 * -0.083333333333333329f + xi_10 * 0.083333333333333329f + xi_9 * -0.083333333333333329f;
+        const float forceTerm_11 = omega_shear * u_0 * xi_10 * 0.041666666666666664f + omega_shear * u_1 * xi_5 * -0.125f + omega_shear * u_1 * xi_9 * -0.083333333333333329f + omega_shear * u_2 * xi_5 * -0.083333333333333329f + omega_shear * u_2 * xi_9 * -0.125f + rr_0 * xi_5 * -0.041666666666666664f + rr_0 * xi_9 * -0.041666666666666664f + u_0 * xi_10 * -0.083333333333333329f + u_1 * xi_5 * 0.25f + u_1 * xi_9 * 0.16666666666666666f + u_2 * xi_5 * 0.16666666666666666f + u_2 * xi_9 * 0.25f + xi_5 * 0.083333333333333329f + xi_9 * 0.083333333333333329f;
+        const float forceTerm_12 = omega_shear * u_0 * xi_10 * 0.041666666666666664f + omega_shear * u_1 * xi_5 * 0.125f + omega_shear * u_1 * xi_9 * -0.083333333333333329f + omega_shear * u_2 * xi_5 * -0.083333333333333329f + omega_shear * u_2 * xi_9 * 0.125f + rr_0 * xi_5 * -0.041666666666666664f + rr_0 * xi_9 * 0.041666666666666664f + u_0 * xi_10 * -0.083333333333333329f + u_1 * xi_5 * -0.25f + u_1 * xi_9 * 0.16666666666666666f + u_2 * xi_5 * 0.16666666666666666f + u_2 * xi_9 * -0.25f + xi_5 * 0.083333333333333329f + xi_9 * -0.083333333333333329f;
+        const float forceTerm_13 = omega_shear * u_0 * xi_10 * -0.083333333333333329f + omega_shear * u_0 * xi_5 * 0.125f + omega_shear * u_1 * xi_9 * 0.041666666666666664f + omega_shear * u_2 * xi_10 * 0.125f + omega_shear * u_2 * xi_5 * -0.083333333333333329f + rr_0 * xi_10 * 0.041666666666666664f + rr_0 * xi_5 * -0.041666666666666664f + u_0 * xi_10 * 0.16666666666666666f + u_0 * xi_5 * -0.25f + u_1 * xi_9 * -0.083333333333333329f + u_2 * xi_10 * -0.25f + u_2 * xi_5 * 0.16666666666666666f + xi_10 * -0.083333333333333329f + xi_5 * 0.083333333333333329f;
+        const float forceTerm_14 = omega_shear * u_0 * xi_10 * -0.083333333333333329f + omega_shear * u_0 * xi_5 * -0.125f + omega_shear * u_1 * xi_9 * 0.041666666666666664f + omega_shear * u_2 * xi_10 * -0.125f + omega_shear * u_2 * xi_5 * -0.083333333333333329f + rr_0 * xi_10 * -0.041666666666666664f + rr_0 * xi_5 * -0.041666666666666664f + u_0 * xi_10 * 0.16666666666666666f + u_0 * xi_5 * 0.25f + u_1 * xi_9 * -0.083333333333333329f + u_2 * xi_10 * 0.25f + u_2 * xi_5 * 0.16666666666666666f + xi_10 * 0.083333333333333329f + xi_5 * 0.083333333333333329f;
+        const float forceTerm_15 = omega_shear * u_0 * xi_10 * 0.041666666666666664f + omega_shear * u_1 * xi_5 * 0.125f + omega_shear * u_1 * xi_9 * -0.083333333333333329f + omega_shear * u_2 * xi_5 * -0.083333333333333329f + omega_shear * u_2 * xi_9 * 0.125f + rr_0 * xi_5 * 0.041666666666666664f + rr_0 * xi_9 * -0.041666666666666664f + u_0 * xi_10 * -0.083333333333333329f + u_1 * xi_5 * -0.25f + u_1 * xi_9 * 0.16666666666666666f + u_2 * xi_5 * 0.16666666666666666f + u_2 * xi_9 * -0.25f + xi_5 * -0.083333333333333329f + xi_9 * 0.083333333333333329f;
+        const float forceTerm_16 = omega_shear * u_0 * xi_10 * 0.041666666666666664f + omega_shear * u_1 * xi_5 * -0.125f + omega_shear * u_1 * xi_9 * -0.083333333333333329f + omega_shear * u_2 * xi_5 * -0.083333333333333329f + omega_shear * u_2 * xi_9 * -0.125f + rr_0 * xi_5 * 0.041666666666666664f + rr_0 * xi_9 * 0.041666666666666664f + u_0 * xi_10 * -0.083333333333333329f + u_1 * xi_5 * 0.25f + u_1 * xi_9 * 0.16666666666666666f + u_2 * xi_5 * 0.16666666666666666f + u_2 * xi_9 * 0.25f + xi_5 * -0.083333333333333329f + xi_9 * -0.083333333333333329f;
+        const float forceTerm_17 = omega_shear * u_0 * xi_10 * -0.083333333333333329f + omega_shear * u_0 * xi_5 * -0.125f + omega_shear * u_1 * xi_9 * 0.041666666666666664f + omega_shear * u_2 * xi_10 * -0.125f + omega_shear * u_2 * xi_5 * -0.083333333333333329f + rr_0 * xi_10 * 0.041666666666666664f + rr_0 * xi_5 * 0.041666666666666664f + u_0 * xi_10 * 0.16666666666666666f + u_0 * xi_5 * 0.25f + u_1 * xi_9 * -0.083333333333333329f + u_2 * xi_10 * 0.25f + u_2 * xi_5 * 0.16666666666666666f + xi_10 * -0.083333333333333329f + xi_5 * -0.083333333333333329f;
+        const float forceTerm_18 = omega_shear * u_0 * xi_10 * -0.083333333333333329f + omega_shear * u_0 * xi_5 * 0.125f + omega_shear * u_1 * xi_9 * 0.041666666666666664f + omega_shear * u_2 * xi_10 * 0.125f + omega_shear * u_2 * xi_5 * -0.083333333333333329f + rr_0 * xi_10 * -0.041666666666666664f + rr_0 * xi_5 * 0.041666666666666664f + u_0 * xi_10 * 0.16666666666666666f + u_0 * xi_5 * -0.25f + u_1 * xi_9 * -0.083333333333333329f + u_2 * xi_10 * -0.25f + u_2 * xi_5 * 0.16666666666666666f + xi_10 * 0.083333333333333329f + xi_5 * -0.083333333333333329f;
+        const float u0Mu1 = u_0 - u_1;
         const float u0Pu1 = u_0 + u_1;
         const float u1Pu2 = u_1 + u_2;
-        const float u1Mu2 = u_1 + u_2 * -1.0f;
-        const float u0Mu2 = u_0 + u_2 * -1.0f;
+        const float u1Mu2 = u_1 - u_2;
+        const float u0Mu2 = u_0 - u_2;
         const float u0Pu2 = u_0 + u_2;
-        const float f_eq_common = rho * -1.0f * (u_0 * u_0) + rho * -1.0f * (u_1 * u_1) + rho * -1.0f * (u_2 * u_2) + rho;
-        _data_pdfs_20_30_10[_stride_pdfs_0 * ctr_0] = forceTerm_0 + omega_shear * (f_eq_common * 0.33333333333333331f + xi_10 * -1.0f) + xi_10;
-        _data_pdfs_20_31_10[_stride_pdfs_0 * ctr_0] = forceTerm_1 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * (u_1 * u_1)) + xi_5 * -0.5f + xi_6 * -0.5f) + rr_0 * (rho * u_1 * 0.16666666666666666f + xi_5 * -0.5f + xi_6 * 0.5f) + xi_5 + ((-1.0f <= grid_size * -1.0f + ((float)(ctr_1))) ? (rho * v_s * (u_0 * 2.0f + v_s) * 0.16666666666666666f) : (0.0f));
-        _data_pdfs_20_32_10[_stride_pdfs_0 * ctr_0] = forceTerm_2 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * (u_1 * u_1)) + xi_5 * -0.5f + xi_6 * -0.5f) + rr_0 * (rho * u_1 * -0.16666666666666666f + xi_5 * 0.5f + xi_6 * -0.5f) + xi_6 + ((0.0f >= ((float)(ctr_1))) ? (rho * v_s * (u_0 * -2.0f + v_s) * 0.16666666666666666f) : (0.0f));
-        _data_pdfs_20_33_10[_stride_pdfs_0 * ctr_0] = forceTerm_3 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * (u_0 * u_0)) + xi_11 * -0.5f + xi_12 * -0.5f) + rr_0 * (rho * u_0 * -0.16666666666666666f + xi_11 * 0.5f + xi_12 * -0.5f) + xi_12;
-        _data_pdfs_20_34_10[_stride_pdfs_0 * ctr_0] = forceTerm_4 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * (u_0 * u_0)) + xi_11 * -0.5f + xi_12 * -0.5f) + rr_0 * (rho * u_0 * 0.16666666666666666f + xi_11 * -0.5f + xi_12 * 0.5f) + xi_11;
-        _data_pdfs_20_35_10[_stride_pdfs_0 * ctr_0] = forceTerm_5 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * (u_2 * u_2)) + xi_15 * -0.5f + xi_3 * -0.5f) + rr_0 * (rho * u_2 * 0.16666666666666666f + xi_15 * -0.5f + xi_3 * 0.5f) + xi_15;
-        _data_pdfs_20_36_10[_stride_pdfs_0 * ctr_0] = forceTerm_6 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * (u_2 * u_2)) + xi_15 * -0.5f + xi_3 * -0.5f) + rr_0 * (rho * u_2 * -0.16666666666666666f + xi_15 * 0.5f + xi_3 * -0.5f) + xi_3;
-        _data_pdfs_20_37_10[_stride_pdfs_0 * ctr_0] = forceTerm_7 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_2 * u_2) + 0.125f * (u0Mu1 * u0Mu1)) + xi_22 * -0.5f + xi_24 * -0.5f) + rr_0 * (rho * u0Mu1 * -0.083333333333333329f + xi_22 * 0.5f + xi_24 * -0.5f) + xi_24 + ((-1.0f <= grid_size * -1.0f + ((float)(ctr_1))) ? (rho * v_s * (u_0 * -2.0f + u_1 * 3.0f + v_s * -1.0f + 1.0f) * 0.083333333333333329f) : (0.0f));
-        _data_pdfs_20_38_10[_stride_pdfs_0 * ctr_0] = forceTerm_8 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_2 * u_2) + 0.125f * (u0Pu1 * u0Pu1)) + xi_17 * -0.5f + xi_23 * -0.5f) + rr_0 * (rho * u0Pu1 * 0.083333333333333329f + xi_17 * -0.5f + xi_23 * 0.5f) + xi_17 + ((-1.0f <= grid_size * -1.0f + ((float)(ctr_1))) ? (rho * v_s * (u_0 * 2.0f + u_1 * 3.0f + v_s + 1.0f) * -0.083333333333333329f) : (0.0f));
-        _data_pdfs_20_39_10[_stride_pdfs_0 * ctr_0] = forceTerm_9 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_2 * u_2) + 0.125f * (u0Pu1 * u0Pu1)) + xi_17 * -0.5f + xi_23 * -0.5f) + rr_0 * (rho * u0Pu1 * -0.083333333333333329f + xi_17 * 0.5f + xi_23 * -0.5f) + xi_23 + ((0.0f >= ((float)(ctr_1))) ? (rho * v_s * (u_0 * 2.0f + u_1 * 3.0f + v_s * -1.0f - 1.0f) * 0.083333333333333329f) : (0.0f));
-        _data_pdfs_20_310_10[_stride_pdfs_0 * ctr_0] = forceTerm_10 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_2 * u_2) + 0.125f * (u0Mu1 * u0Mu1)) + xi_22 * -0.5f + xi_24 * -0.5f) + rr_0 * (rho * u0Mu1 * 0.083333333333333329f + xi_22 * -0.5f + xi_24 * 0.5f) + xi_22 + ((0.0f >= ((float)(ctr_1))) ? (rho * v_s * (u_0 * 2.0f + u_1 * -3.0f + v_s * -1.0f + 1.0f) * 0.083333333333333329f) : (0.0f));
-        _data_pdfs_20_311_10[_stride_pdfs_0 * ctr_0] = forceTerm_11 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_0 * u_0) + 0.125f * (u1Pu2 * u1Pu2)) + xi_19 * -0.5f + xi_7 * -0.5f) + rr_0 * (rho * u1Pu2 * 0.083333333333333329f + xi_19 * 0.5f + xi_7 * -0.5f) + xi_7;
-        _data_pdfs_20_312_10[_stride_pdfs_0 * ctr_0] = forceTerm_12 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_0 * u_0) + 0.125f * (u1Mu2 * u1Mu2)) + xi_18 * -0.5f + xi_20 * -0.5f) + rr_0 * (rho * u1Mu2 * -0.083333333333333329f + xi_18 * -0.5f + xi_20 * 0.5f) + xi_18;
-        _data_pdfs_20_313_10[_stride_pdfs_0 * ctr_0] = forceTerm_13 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_1 * u_1) + 0.125f * (u0Mu2 * u0Mu2)) + xi_8 * -0.5f + xi_9 * -0.5f) + rr_0 * (rho * u0Mu2 * -0.083333333333333329f + xi_8 * 0.5f + xi_9 * -0.5f) + xi_9;
-        _data_pdfs_20_314_10[_stride_pdfs_0 * ctr_0] = forceTerm_14 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_1 * u_1) + 0.125f * (u0Pu2 * u0Pu2)) + xi_13 * -0.5f + xi_16 * -0.5f) + rr_0 * (rho * u0Pu2 * 0.083333333333333329f + xi_13 * 0.5f + xi_16 * -0.5f) + xi_16;
-        _data_pdfs_20_315_10[_stride_pdfs_0 * ctr_0] = forceTerm_15 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_0 * u_0) + 0.125f * (u1Mu2 * u1Mu2)) + xi_18 * -0.5f + xi_20 * -0.5f) + rr_0 * (rho * u1Mu2 * 0.083333333333333329f + xi_18 * 0.5f + xi_20 * -0.5f) + xi_20;
-        _data_pdfs_20_316_10[_stride_pdfs_0 * ctr_0] = forceTerm_16 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_0 * u_0) + 0.125f * (u1Pu2 * u1Pu2)) + xi_19 * -0.5f + xi_7 * -0.5f) + rr_0 * (rho * u1Pu2 * -0.083333333333333329f + xi_19 * -0.5f + xi_7 * 0.5f) + xi_19;
-        _data_pdfs_20_317_10[_stride_pdfs_0 * ctr_0] = forceTerm_17 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_1 * u_1) + 0.125f * (u0Pu2 * u0Pu2)) + xi_13 * -0.5f + xi_16 * -0.5f) + rr_0 * (rho * u0Pu2 * -0.083333333333333329f + xi_13 * -0.5f + xi_16 * 0.5f) + xi_13;
-        _data_pdfs_20_318_10[_stride_pdfs_0 * ctr_0] = forceTerm_18 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_1 * u_1) + 0.125f * (u0Mu2 * u0Mu2)) + xi_8 * -0.5f + xi_9 * -0.5f) + rr_0 * (rho * u0Mu2 * 0.083333333333333329f + xi_8 * -0.5f + xi_9 * 0.5f) + xi_8;
+        const float f_eq_common = rho - rho * u_0 * u_0 - rho * u_1 * u_1 - rho * u_2 * u_2;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2] = forceTerm_0 + omega_shear * (f_eq_common * 0.33333333333333331f - xi_17) + xi_17;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3] = forceTerm_1 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * (u_1 * u_1)) + xi_19 * -0.5f + xi_8 * -0.5f) + rr_0 * (rho * u_1 * 0.16666666666666666f + xi_19 * 0.5f + xi_8 * -0.5f) + xi_8 + ((-1.0f <= -grid_size + ((float)(ctr_1))) ? (rho * v_s * (u_0 * 2.0f + v_s) * 0.16666666666666666f) : (0.0f));
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3] = forceTerm_2 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * (u_1 * u_1)) + xi_19 * -0.5f + xi_8 * -0.5f) + rr_0 * (rho * u_1 * -0.16666666666666666f + xi_19 * -0.5f + xi_8 * 0.5f) + xi_19 + ((0.0f >= ((float)(ctr_1))) ? (rho * v_s * (u_0 * -2.0f + v_s) * 0.16666666666666666f) : (0.0f));
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3] = forceTerm_3 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * (u_0 * u_0)) + xi_14 * -0.5f + xi_6 * -0.5f) + rr_0 * (rho * u_0 * -0.16666666666666666f + xi_14 * 0.5f + xi_6 * -0.5f) + xi_6;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3] = forceTerm_4 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * (u_0 * u_0)) + xi_14 * -0.5f + xi_6 * -0.5f) + rr_0 * (rho * u_0 * 0.16666666666666666f + xi_14 * -0.5f + xi_6 * 0.5f) + xi_14;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3] = forceTerm_5 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * (u_2 * u_2)) + xi_16 * -0.5f + xi_22 * -0.5f) + rr_0 * (rho * u_2 * 0.16666666666666666f + xi_16 * 0.5f + xi_22 * -0.5f) + xi_22;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3] = forceTerm_6 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * (u_2 * u_2)) + xi_16 * -0.5f + xi_22 * -0.5f) + rr_0 * (rho * u_2 * -0.16666666666666666f + xi_16 * -0.5f + xi_22 * 0.5f) + xi_16;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3] = forceTerm_7 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_2 * u_2) + 0.125f * (u0Mu1 * u0Mu1)) + xi_24 * -0.5f + xi_7 * -0.5f) + rr_0 * (rho * u0Mu1 * -0.083333333333333329f + xi_24 * 0.5f + xi_7 * -0.5f) + xi_7 + ((-1.0f <= -grid_size + ((float)(ctr_1))) ? (rho * v_s * (u_0 * -2.0f + u_1 * 3.0f - v_s + 1.0f) * 0.083333333333333329f) : (0.0f));
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3] = forceTerm_8 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_2 * u_2) + 0.125f * (u0Pu1 * u0Pu1)) + xi_12 * -0.5f + xi_13 * -0.5f) + rr_0 * (rho * u0Pu1 * 0.083333333333333329f + xi_12 * -0.5f + xi_13 * 0.5f) + xi_12 + ((-1.0f <= -grid_size + ((float)(ctr_1))) ? (rho * v_s * (u_0 * 2.0f + u_1 * 3.0f + v_s + 1.0f) * -0.083333333333333329f) : (0.0f));
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3] = forceTerm_9 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_2 * u_2) + 0.125f * (u0Pu1 * u0Pu1)) + xi_12 * -0.5f + xi_13 * -0.5f) + rr_0 * (rho * u0Pu1 * -0.083333333333333329f + xi_12 * 0.5f + xi_13 * -0.5f) + xi_13 + ((0.0f >= ((float)(ctr_1))) ? (rho * v_s * (u_0 * 2.0f + u_1 * 3.0f - v_s - 1.0f) * 0.083333333333333329f) : (0.0f));
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3] = forceTerm_10 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_2 * u_2) + 0.125f * (u0Mu1 * u0Mu1)) + xi_24 * -0.5f + xi_7 * -0.5f) + rr_0 * (rho * u0Mu1 * 0.083333333333333329f + xi_24 * -0.5f + xi_7 * 0.5f) + xi_24 + ((0.0f >= ((float)(ctr_1))) ? (rho * v_s * (u_0 * 2.0f + u_1 * -3.0f - v_s + 1.0f) * 0.083333333333333329f) : (0.0f));
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3] = forceTerm_11 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_0 * u_0) + 0.125f * (u1Pu2 * u1Pu2)) + xi_21 * -0.5f + xi_4 * -0.5f) + rr_0 * (rho * u1Pu2 * 0.083333333333333329f + xi_21 * -0.5f + xi_4 * 0.5f) + xi_21;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3] = forceTerm_12 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_0 * u_0) + 0.125f * (u1Mu2 * u1Mu2)) + xi_20 * -0.5f + xi_23 * -0.5f) + rr_0 * (rho * u1Mu2 * -0.083333333333333329f + xi_20 * 0.5f + xi_23 * -0.5f) + xi_23;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3] = forceTerm_13 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_1 * u_1) + 0.125f * (u0Mu2 * u0Mu2)) + xi_15 * -0.5f + xi_18 * -0.5f) + rr_0 * (rho * u0Mu2 * -0.083333333333333329f + xi_15 * -0.5f + xi_18 * 0.5f) + xi_15;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3] = forceTerm_14 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_1 * u_1) + 0.125f * (u0Pu2 * u0Pu2)) + xi_11 * -0.5f + xi_3 * -0.5f) + rr_0 * (rho * u0Pu2 * 0.083333333333333329f + xi_11 * -0.5f + xi_3 * 0.5f) + xi_11;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3] = forceTerm_15 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_0 * u_0) + 0.125f * (u1Mu2 * u1Mu2)) + xi_20 * -0.5f + xi_23 * -0.5f) + rr_0 * (rho * u1Mu2 * 0.083333333333333329f + xi_20 * -0.5f + xi_23 * 0.5f) + xi_20;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3] = forceTerm_16 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_0 * u_0) + 0.125f * (u1Pu2 * u1Pu2)) + xi_21 * -0.5f + xi_4 * -0.5f) + rr_0 * (rho * u1Pu2 * -0.083333333333333329f + xi_21 * 0.5f + xi_4 * -0.5f) + xi_4;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3] = forceTerm_17 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_1 * u_1) + 0.125f * (u0Pu2 * u0Pu2)) + xi_11 * -0.5f + xi_3 * -0.5f) + rr_0 * (rho * u0Pu2 * -0.083333333333333329f + xi_11 * 0.5f + xi_3 * -0.5f) + xi_3;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3] = forceTerm_18 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_1 * u_1) + 0.125f * (u0Mu2 * u0Mu2)) + xi_15 * -0.5f + xi_18 * -0.5f) + rr_0 * (rho * u0Mu2 * 0.083333333333333329f + xi_15 * 0.5f + xi_18 * -0.5f) + xi_18;
       }
     }
   }
@@ -201,27 +156,28 @@ static FUNC_PREFIX void collidesweepsingleprecisionleesedwards_collidesweepsingl
 } // namespace internal_ab1f3bc3368574afb482da84ccb58898
 
 void CollideSweepSinglePrecisionLeesEdwards::run(IBlock *block) {
+
   auto pdfs = block->getData<field::GhostLayerField<float, 19>>(pdfsID);
   auto force = block->getData<field::GhostLayerField<float, 3>>(forceID);
 
+  auto &grid_size = this->grid_size_;
   auto &omega_shear = this->omega_shear_;
   auto &v_s = this->v_s_;
-  auto &grid_size = this->grid_size_;
-  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()))
   float *RESTRICT const _data_force = force->dataAt(0, 0, 0, 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()))
   float *RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(force->xSize()) + 0));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(force->xSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(force->ySize()) + 0));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(force->ySize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(force->zSize()) + 0));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(force->zSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(force->xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(force->xSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(force->ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(force->ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(force->zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(force->zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
   const int64_t _stride_force_0 = int64_t(force->xStride());
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
@@ -234,6 +190,7 @@ void CollideSweepSinglePrecisionLeesEdwards::run(IBlock *block) {
 }
 
 void CollideSweepSinglePrecisionLeesEdwards::runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block) {
+
   CellInterval ci = globalCellInterval;
   CellInterval blockBB = blocks->getBlockCellBB(*block);
   blockBB.expand(ghostLayers);
@@ -245,28 +202,28 @@ void CollideSweepSinglePrecisionLeesEdwards::runOnCellInterval(const shared_ptr<
   auto pdfs = block->getData<field::GhostLayerField<float, 19>>(pdfsID);
   auto force = block->getData<field::GhostLayerField<float, 3>>(forceID);
 
+  auto &grid_size = this->grid_size_;
   auto &omega_shear = this->omega_shear_;
   auto &v_s = this->v_s_;
-  auto &grid_size = this->grid_size_;
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()))
   float *RESTRICT const _data_force = force->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()))
   float *RESTRICT _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(ci.xSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(ci.ySize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(ci.zSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(ci.xSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(ci.ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(ci.zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
   const int64_t _stride_force_0 = int64_t(force->xStride());
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
@@ -287,4 +244,4 @@ void CollideSweepSinglePrecisionLeesEdwards::runOnCellInterval(const shared_ptr<
 
 #if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
 #pragma warning pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwards.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwards.h
index d65e0a1b32f..58c30c1bf4e 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwards.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwards.h
@@ -17,12 +17,13 @@
 //! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2,
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
 // lbmpy_walberla/pystencils_walberla from waLBerla commit
-// 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #pragma once
 #include "core/DataTypes.h"
+#include "core/logging/Logging.h"
 
 #include "domain_decomposition/BlockDataID.h"
 #include "domain_decomposition/IBlock.h"
@@ -92,6 +93,9 @@ class CollideSweepSinglePrecisionLeesEdwards {
     };
   }
 
+  void configure(const shared_ptr<StructuredBlockStorage> &blocks,
+                 IBlock *block) {} // empty body: a no-op that ignores both arguments
+
   BlockDataID forceID;
   BlockDataID pdfsID;
   float grid_size_;
@@ -105,4 +109,4 @@ class CollideSweepSinglePrecisionLeesEdwards {
 #if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
     (defined WALBERLA_CXX_COMPILER_IS_CLANG)
 #pragma GCC diagnostic pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsAVX.cpp b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsAVX.cpp
index 7885aed8d9c..2e4c932a9a0 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsAVX.cpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsAVX.cpp
@@ -14,11 +14,10 @@
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
 //! \\file CollideSweepSinglePrecisionLeesEdwardsAVX.cpp
-//! \\ingroup lbm
-//! \\author lbmpy
+//! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2, lbmpy_walberla/pystencils_walberla from waLBerla commit 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #include <cmath>
 
@@ -53,124 +52,80 @@ static FUNC_PREFIX void collidesweepsingleprecisionleesedwardsavx_collidesweepsi
   const float xi_0 = ((1.0f) / (omega_shear * -0.25f + 2.0f));
   const float rr_0 = xi_0 * (omega_shear * -2.0f + 4.0f);
   for (int64_t ctr_2 = 0; ctr_2 < _size_force_2; ctr_2 += 1) {
-    float *RESTRICT _data_pdfs_20_36 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3;
-    float *RESTRICT _data_force_20_32 = _data_force + _stride_force_2 * ctr_2 + 2 * _stride_force_3;
-    float *RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_311 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_318 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_313 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_317 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3;
-    float *RESTRICT _data_force_20_30 = _data_force + _stride_force_2 * ctr_2;
-    float *RESTRICT _data_pdfs_20_35 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_314 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_312 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_316 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_315 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3;
-    float *RESTRICT _data_force_20_31 = _data_force + _stride_force_2 * ctr_2 + _stride_force_3;
-    float *RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2 * ctr_2;
-    float *RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3;
     for (int64_t ctr_1 = 0; ctr_1 < _size_force_1; ctr_1 += 1) {
-      float *RESTRICT _data_pdfs_20_36_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_36;
-      float *RESTRICT _data_force_20_32_10 = _stride_force_1 * ctr_1 + _data_force_20_32;
-      float *RESTRICT _data_pdfs_20_31_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_31;
-      float *RESTRICT _data_pdfs_20_32_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_32;
-      float *RESTRICT _data_pdfs_20_311_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_311;
-      float *RESTRICT _data_pdfs_20_318_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_318;
-      float *RESTRICT _data_pdfs_20_313_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_313;
-      float *RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_34;
-      float *RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_33;
-      float *RESTRICT _data_pdfs_20_317_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_317;
-      float *RESTRICT _data_force_20_30_10 = _stride_force_1 * ctr_1 + _data_force_20_30;
-      float *RESTRICT _data_pdfs_20_35_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_35;
-      float *RESTRICT _data_pdfs_20_314_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_314;
-      float *RESTRICT _data_pdfs_20_38_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_38;
-      float *RESTRICT _data_pdfs_20_312_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_312;
-      float *RESTRICT _data_pdfs_20_316_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_316;
-      float *RESTRICT _data_pdfs_20_39_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_39;
-      float *RESTRICT _data_pdfs_20_315_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_315;
-      float *RESTRICT _data_force_20_31_10 = _stride_force_1 * ctr_1 + _data_force_20_31;
-      float *RESTRICT _data_pdfs_20_310_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_310;
-      float *RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_30;
-      float *RESTRICT _data_pdfs_20_37_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_37;
       {
         for (int64_t ctr_0 = 0; ctr_0 < (int64_t)((_size_force_0) / (8)) * (8); ctr_0 += 8) {
-          const __m256 xi_25 = _mm256_load_ps(&_data_pdfs_20_36_10[ctr_0]);
-          const __m256 xi_26 = _mm256_load_ps(&_data_force_20_32_10[ctr_0]);
-          const __m256 xi_27 = _mm256_load_ps(&_data_pdfs_20_31_10[ctr_0]);
-          const __m256 xi_28 = _mm256_load_ps(&_data_pdfs_20_32_10[ctr_0]);
-          const __m256 xi_29 = _mm256_load_ps(&_data_pdfs_20_311_10[ctr_0]);
-          const __m256 xi_30 = _mm256_load_ps(&_data_pdfs_20_318_10[ctr_0]);
-          const __m256 xi_31 = _mm256_load_ps(&_data_pdfs_20_313_10[ctr_0]);
-          const __m256 xi_32 = _mm256_load_ps(&_data_pdfs_20_34_10[ctr_0]);
-          const __m256 xi_33 = _mm256_load_ps(&_data_pdfs_20_33_10[ctr_0]);
-          const __m256 xi_34 = _mm256_load_ps(&_data_pdfs_20_317_10[ctr_0]);
-          const __m256 xi_35 = _mm256_load_ps(&_data_force_20_30_10[ctr_0]);
-          const __m256 xi_36 = _mm256_load_ps(&_data_pdfs_20_35_10[ctr_0]);
-          const __m256 xi_37 = _mm256_load_ps(&_data_pdfs_20_314_10[ctr_0]);
-          const __m256 xi_38 = _mm256_load_ps(&_data_pdfs_20_38_10[ctr_0]);
-          const __m256 xi_39 = _mm256_load_ps(&_data_pdfs_20_312_10[ctr_0]);
-          const __m256 xi_40 = _mm256_load_ps(&_data_pdfs_20_316_10[ctr_0]);
-          const __m256 xi_41 = _mm256_load_ps(&_data_pdfs_20_39_10[ctr_0]);
-          const __m256 xi_42 = _mm256_load_ps(&_data_pdfs_20_315_10[ctr_0]);
-          const __m256 xi_43 = _mm256_load_ps(&_data_force_20_31_10[ctr_0]);
-          const __m256 xi_44 = _mm256_load_ps(&_data_pdfs_20_310_10[ctr_0]);
-          const __m256 xi_45 = _mm256_load_ps(&_data_pdfs_20_30_10[ctr_0]);
-          const __m256 xi_46 = _mm256_load_ps(&_data_pdfs_20_37_10[ctr_0]);
-          const __m256 xi_3 = xi_25;
-          const __m256 xi_4 = xi_26;
-          const __m256 xi_5 = xi_27;
-          const __m256 xi_6 = xi_28;
-          const __m256 xi_7 = xi_29;
-          const __m256 xi_8 = xi_30;
-          const __m256 xi_9 = xi_31;
-          const __m256 xi_10 = xi_45;
-          const __m256 xi_11 = xi_32;
-          const __m256 xi_12 = xi_33;
-          const __m256 xi_13 = xi_34;
-          const __m256 xi_14 = xi_35;
-          const __m256 xi_15 = xi_36;
-          const __m256 xi_16 = xi_37;
-          const __m256 xi_17 = xi_38;
-          const __m256 xi_18 = xi_39;
-          const __m256 xi_19 = xi_40;
-          const __m256 xi_20 = xi_42;
-          const __m256 xi_21 = xi_43;
+          const __m256 xi_25 = _mm256_loadu_ps(&_data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3 + ctr_0]);
+          const __m256 xi_26 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_27 = _mm256_load_ps(&_data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + ctr_0]);
+          const __m256 xi_28 = _mm256_load_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_29 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_30 = _mm256_load_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_31 = _mm256_load_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + ctr_0]);
+          const __m256 xi_32 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_33 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_34 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_35 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_36 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_37 = _mm256_loadu_ps(&_data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3 + ctr_0]);
+          const __m256 xi_38 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_39 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_40 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_41 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_42 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_43 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_44 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_45 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_46 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_3 = xi_26;
+          const __m256 xi_4 = xi_28;
+          const __m256 xi_5 = xi_37;
+          const __m256 xi_6 = xi_29;
+          const __m256 xi_7 = xi_36;
+          const __m256 xi_8 = xi_32;
+          const __m256 xi_9 = xi_25;
+          const __m256 xi_10 = xi_27;
+          const __m256 xi_11 = xi_43;
+          const __m256 xi_12 = xi_30;
+          const __m256 xi_13 = xi_42;
+          const __m256 xi_14 = xi_46;
+          const __m256 xi_15 = xi_40;
+          const __m256 xi_16 = xi_41;
+          const __m256 xi_17 = xi_31;
+          const __m256 xi_18 = xi_38;
+          const __m256 xi_19 = xi_33;
+          const __m256 xi_20 = xi_34;
+          const __m256 xi_21 = xi_45;
           const __m256 xi_22 = xi_44;
-          const __m256 xi_23 = xi_41;
-          const __m256 xi_24 = xi_46;
-          const __m256 vel0Term = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_11, xi_16), xi_17), xi_22), xi_8);
-          const __m256 vel1Term = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_20, xi_24), xi_5), xi_7);
-          const __m256 vel2Term = _mm256_add_ps(_mm256_add_ps(xi_15, xi_18), xi_9);
-          const __m256 rho = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(vel0Term, vel1Term), vel2Term), xi_10), xi_12), xi_13), xi_19), xi_23), xi_3), xi_6);
+          const __m256 xi_23 = xi_35;
+          const __m256 xi_24 = xi_39;
+          const __m256 vel0Term = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_11, xi_12), xi_14), xi_18), xi_24);
+          const __m256 vel1Term = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_20, xi_21), xi_7), xi_8);
+          const __m256 vel2Term = _mm256_add_ps(_mm256_add_ps(xi_15, xi_22), xi_23);
+          const __m256 rho = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(vel0Term, vel1Term), vel2Term), xi_13), xi_16), xi_17), xi_19), xi_3), xi_4), xi_6);
           const __m256 xi_1 = _mm256_div_ps(_mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f), rho);
-          const __m256 u_0 = _mm256_add_ps(_mm256_mul_ps(xi_1, _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_12, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_13, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_23, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_24, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_9, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), vel0Term)), _mm256_mul_ps(_mm256_mul_ps(xi_1, xi_14), _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)));
-          const __m256 u_1 = _mm256_add_ps(_mm256_mul_ps(xi_1, _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_18, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_19, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_22, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_23, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_6, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), vel1Term), xi_17)), _mm256_mul_ps(_mm256_mul_ps(xi_1, xi_21), _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)));
-          const __m256 u_2 = _mm256_add_ps(_mm256_mul_ps(xi_1, _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_13, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_19, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_20, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_3, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_8, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), vel2Term), xi_16), xi_7)), _mm256_mul_ps(_mm256_mul_ps(xi_1, xi_4), _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)));
-          const __m256 forceTerm_0 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256 forceTerm_1 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_21, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f)), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f))), _mm256_mul_ps(_mm256_mul_ps(xi_21, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 
0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256 forceTerm_2 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_21, _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f)), _mm256_mul_ps(_mm256_mul_ps(xi_21, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 
0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256 forceTerm_3 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_14, _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f)), _mm256_mul_ps(_mm256_mul_ps(xi_14, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 
0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256 forceTerm_4 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_14, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f)), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f))), _mm256_mul_ps(_mm256_mul_ps(xi_14, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 
0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256 forceTerm_5 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_4, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f)), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f))), _mm256_mul_ps(_mm256_mul_ps(xi_4, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 
0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256 forceTerm_6 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_4, _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f)), _mm256_mul_ps(_mm256_mul_ps(xi_4, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 
0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256 forceTerm_7 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_21, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_mul_ps(xi_14, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(xi_14, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_21), _mm256_set_ps(-0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_14), _mm256_set_ps(-0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f))), _mm256_mul_ps(_mm256_mul_ps(xi_21, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), 
_mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_21), _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_14), _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256 forceTerm_8 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_14, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_mul_ps(xi_21, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_21), _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_14), _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(xi_14, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(xi_21, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), 
_mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_21), _mm256_set_ps(-0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_14), _mm256_set_ps(-0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256 forceTerm_9 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_14, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_mul_ps(xi_21, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_21), _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_14), _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f))), _mm256_mul_ps(_mm256_mul_ps(xi_14, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(xi_21, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), 
_mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_21), _mm256_set_ps(-0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_14), _mm256_set_ps(-0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256 forceTerm_10 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_14, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_mul_ps(xi_21, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(xi_21, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_21), _mm256_set_ps(-0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_14), _mm256_set_ps(-0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f))), _mm256_mul_ps(_mm256_mul_ps(xi_14, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), 
_mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_21), _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_14), _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256 forceTerm_11 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_21, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_mul_ps(xi_4, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_4), _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_21), _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(xi_21, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(xi_4, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), 
_mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_4), _mm256_set_ps(-0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_21), _mm256_set_ps(-0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256 forceTerm_12 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_4, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_mul_ps(xi_21, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(xi_21, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_4), _mm256_set_ps(-0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_21), _mm256_set_ps(-0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f))), _mm256_mul_ps(_mm256_mul_ps(xi_4, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), 
_mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_4), _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_21), _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256 forceTerm_13 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_4, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_mul_ps(xi_14, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(xi_14, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_4), _mm256_set_ps(-0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_14), _mm256_set_ps(-0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f))), _mm256_mul_ps(_mm256_mul_ps(xi_4, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), 
_mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_4), _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_14), _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256 forceTerm_14 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_14, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_mul_ps(xi_4, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_4), _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_14), _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(xi_14, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(xi_4, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), 
_mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_4), _mm256_set_ps(-0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_14), _mm256_set_ps(-0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256 forceTerm_15 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_21, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_mul_ps(xi_4, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(xi_4, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_4), _mm256_set_ps(-0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_21), _mm256_set_ps(-0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f))), _mm256_mul_ps(_mm256_mul_ps(xi_21, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), 
_mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_4), _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_21), _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256 forceTerm_16 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_21, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_mul_ps(xi_4, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_4), _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_21), _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f))), _mm256_mul_ps(_mm256_mul_ps(xi_21, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(xi_4, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), 
_mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_4), _mm256_set_ps(-0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_21), _mm256_set_ps(-0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256 forceTerm_17 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_14, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_mul_ps(xi_4, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_4), _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_14), _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f))), _mm256_mul_ps(_mm256_mul_ps(xi_14, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(xi_4, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), 
_mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_4), _mm256_set_ps(-0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_14), _mm256_set_ps(-0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
-          const __m256 forceTerm_18 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_14, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_mul_ps(xi_4, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(xi_4, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_4), _mm256_set_ps(-0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_14), _mm256_set_ps(-0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f))), _mm256_mul_ps(_mm256_mul_ps(xi_14, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), 
_mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_4), _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_14), _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_21), _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_14), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_4), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256 u_0 = _mm256_add_ps(_mm256_mul_ps(xi_1, _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_13, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_15, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_3, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_6, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_7, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), vel0Term)), _mm256_mul_ps(_mm256_mul_ps(xi_1, xi_10), _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)));
+          const __m256 u_1 = _mm256_add_ps(_mm256_mul_ps(xi_1, _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_13, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_19, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_23, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_24, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_4, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), vel1Term), xi_12)), _mm256_mul_ps(_mm256_mul_ps(xi_1, xi_9), _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)));
+          const __m256 u_2 = _mm256_add_ps(_mm256_mul_ps(xi_1, _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_16, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_18, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_20, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_3, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_4, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), vel2Term), xi_11), xi_21)), _mm256_mul_ps(_mm256_mul_ps(xi_1, xi_5), _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)));
+          const __m256 forceTerm_0 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256 forceTerm_1 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_9, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f)), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f))), _mm256_mul_ps(_mm256_mul_ps(xi_9, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 
0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256 forceTerm_2 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_9, _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f)), _mm256_mul_ps(_mm256_mul_ps(xi_9, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 
0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256 forceTerm_3 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_10, _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f)), _mm256_mul_ps(_mm256_mul_ps(xi_10, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 
0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256 forceTerm_4 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_10, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f)), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f))), _mm256_mul_ps(_mm256_mul_ps(xi_10, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 
0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256 forceTerm_5 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_5, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f)), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f))), _mm256_mul_ps(_mm256_mul_ps(xi_5, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 
0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256 forceTerm_6 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_5, _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f)), _mm256_mul_ps(_mm256_mul_ps(xi_5, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 
0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256 forceTerm_7 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_9, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_mul_ps(xi_10, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(xi_10, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_9), _mm256_set_ps(-0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_10), _mm256_set_ps(-0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f))), _mm256_mul_ps(_mm256_mul_ps(xi_9, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), 
_mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_9), _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_10), _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256 forceTerm_8 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_10, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_mul_ps(xi_9, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_9), _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_10), _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(xi_10, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(xi_9, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), 
_mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_9), _mm256_set_ps(-0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_10), _mm256_set_ps(-0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256 forceTerm_9 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_10, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_mul_ps(xi_9, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_9), _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_10), _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f))), _mm256_mul_ps(_mm256_mul_ps(xi_10, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(xi_9, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), 
_mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_9), _mm256_set_ps(-0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_10), _mm256_set_ps(-0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256 forceTerm_10 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_10, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_mul_ps(xi_9, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(xi_9, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_9), _mm256_set_ps(-0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_10), _mm256_set_ps(-0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f))), _mm256_mul_ps(_mm256_mul_ps(xi_10, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), 
_mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_9), _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_10), _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256 forceTerm_11 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_5, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_mul_ps(xi_9, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_5), _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_9), _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(xi_5, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(xi_9, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), 
_mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_5), _mm256_set_ps(-0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_9), _mm256_set_ps(-0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256 forceTerm_12 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_5, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_mul_ps(xi_9, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(xi_9, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_5), _mm256_set_ps(-0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_9), _mm256_set_ps(-0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f))), _mm256_mul_ps(_mm256_mul_ps(xi_5, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), 
_mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_5), _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_9), _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256 forceTerm_13 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_5, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_mul_ps(xi_10, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(xi_10, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_5), _mm256_set_ps(-0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_10), _mm256_set_ps(-0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f))), _mm256_mul_ps(_mm256_mul_ps(xi_5, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), 
_mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_5), _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_10), _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256 forceTerm_14 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_10, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_mul_ps(xi_5, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_5), _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_10), _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(xi_10, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(xi_5, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), 
_mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_5), _mm256_set_ps(-0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_10), _mm256_set_ps(-0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256 forceTerm_15 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_9, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_mul_ps(xi_5, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(xi_5, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_5), _mm256_set_ps(-0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_9), _mm256_set_ps(-0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f))), _mm256_mul_ps(_mm256_mul_ps(xi_9, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), 
_mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_5), _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_9), _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256 forceTerm_16 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_5, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_mul_ps(xi_9, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_5), _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_9), _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f))), _mm256_mul_ps(_mm256_mul_ps(xi_5, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(xi_9, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), 
_mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_5), _mm256_set_ps(-0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_9), _mm256_set_ps(-0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256 forceTerm_17 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_10, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_mul_ps(xi_5, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_5), _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_10), _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f))), _mm256_mul_ps(_mm256_mul_ps(xi_10, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(xi_5, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), 
_mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_5), _mm256_set_ps(-0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_10), _mm256_set_ps(-0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f, -0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
+          const __m256 forceTerm_18 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_10, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_mul_ps(xi_5, _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(xi_5, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_mul_ps(_mm256_mul_ps(u_0, xi_5), _mm256_set_ps(-0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f))), _mm256_mul_ps(_mm256_mul_ps(u_2, xi_10), _mm256_set_ps(-0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f, -0.25f))), _mm256_mul_ps(_mm256_mul_ps(xi_10, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0))), 
_mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_5), _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_10), _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_1, xi_9), _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_0, xi_10), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(u_2, xi_5), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)));
           const __m256 u0Mu1 = _mm256_add_ps(_mm256_mul_ps(u_1, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), u_0);
           const __m256 u0Pu1 = _mm256_add_ps(u_0, u_1);
           const __m256 u1Pu2 = _mm256_add_ps(u_1, u_2);
@@ -178,124 +133,124 @@ static FUNC_PREFIX void collidesweepsingleprecisionleesedwardsavx_collidesweepsi
           const __m256 u0Mu2 = _mm256_add_ps(_mm256_mul_ps(u_2, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), u_0);
           const __m256 u0Pu2 = _mm256_add_ps(u_0, u_2);
           const __m256 f_eq_common = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(rho, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(u_0, u_0)), _mm256_mul_ps(_mm256_mul_ps(rho, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(u_1, u_1))), _mm256_mul_ps(_mm256_mul_ps(rho, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(u_2, u_2))), rho);
-          _mm256_store_ps(&_data_pdfs_20_30_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(xi_10, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(f_eq_common, _mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f))), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)), forceTerm_0), xi_10));
-          _mm256_store_ps(&_data_pdfs_20_31_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_6, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_5, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u_1), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f)), _mm256_mul_ps(xi_5, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_6, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f), _mm256_mul_ps(u_1, u_1)), _mm256_set_ps(-0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f)))))), _mm256_blendv_ps(_mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(rho, _mm256_add_ps(_mm256_mul_ps(u_0, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f)), _mm256_set_ps(v_s, v_s, v_s, v_s, v_s, v_s, v_s, v_s))), _mm256_set_ps(0.16666666666666666f, 
0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f)), _mm256_set_ps(v_s, v_s, v_s, v_s, v_s, v_s, v_s, v_s)), _mm256_cmp_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_set_ps(grid_size, grid_size, grid_size, grid_size, grid_size, grid_size, grid_size, grid_size)), _mm256_set_ps(((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)))), _CMP_LE_OQ))), forceTerm_1), xi_5));
-          _mm256_store_ps(&_data_pdfs_20_32_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_5, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_6, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u_1), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f)), _mm256_mul_ps(xi_5, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_6, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f), _mm256_mul_ps(u_1, u_1)), _mm256_set_ps(-0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f)))))), _mm256_blendv_ps(_mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(rho, _mm256_add_ps(_mm256_mul_ps(u_0, _mm256_set_ps(-2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f)), _mm256_set_ps(v_s, v_s, v_s, v_s, v_s, v_s, v_s, v_s))), _mm256_set_ps(0.16666666666666666f, 
0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f)), _mm256_set_ps(v_s, v_s, v_s, v_s, v_s, v_s, v_s, v_s)), _mm256_cmp_ps(_mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f), _mm256_set_ps(((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1))), _CMP_GE_OQ))), forceTerm_2), xi_6));
-          _mm256_store_ps(&_data_pdfs_20_33_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_11, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_12, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u_0), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f)), _mm256_mul_ps(xi_11, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_12, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f), _mm256_mul_ps(u_0, u_0)), _mm256_set_ps(-0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f)))))), forceTerm_3), xi_12));
-          _mm256_store_ps(&_data_pdfs_20_34_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_12, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_11, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u_0), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f)), _mm256_mul_ps(xi_11, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_12, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f), _mm256_mul_ps(u_0, u_0)), _mm256_set_ps(-0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f)))))), forceTerm_4), xi_11));
-          _mm256_store_ps(&_data_pdfs_20_35_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_3, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_15, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u_2), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f)), _mm256_mul_ps(xi_15, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_3, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f), _mm256_mul_ps(u_2, u_2)), _mm256_set_ps(-0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f)))))), forceTerm_5), xi_15));
-          _mm256_store_ps(&_data_pdfs_20_36_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_15, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_3, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u_2), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f)), _mm256_mul_ps(xi_15, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_3, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f), _mm256_mul_ps(u_2, u_2)), _mm256_set_ps(-0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f)))))), forceTerm_6), xi_3));
-          _mm256_store_ps(&_data_pdfs_20_37_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_22, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_24, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u0Mu1), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_22, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_24, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f), _mm256_mul_ps(u0Mu1, u0Mu1)), _mm256_mul_ps(_mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f), _mm256_mul_ps(u_2, u_2))), _mm256_set_ps(-0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f)))))), _mm256_blendv_ps(_mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(rho, 
_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_set_ps(v_s, v_s, v_s, v_s, v_s, v_s, v_s, v_s)), _mm256_mul_ps(u_1, _mm256_set_ps(3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f))), _mm256_mul_ps(u_0, _mm256_set_ps(-2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f))), _mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f))), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_set_ps(v_s, v_s, v_s, v_s, v_s, v_s, v_s, v_s)), _mm256_cmp_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_set_ps(grid_size, grid_size, grid_size, grid_size, grid_size, grid_size, grid_size, grid_size)), _mm256_set_ps(((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)))), _CMP_LE_OQ))), forceTerm_7), xi_24));
-          _mm256_store_ps(&_data_pdfs_20_38_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_23, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_17, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u0Pu1), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_17, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_23, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f), _mm256_mul_ps(u0Pu1, u0Pu1)), _mm256_mul_ps(_mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f), _mm256_mul_ps(u_2, u_2))), _mm256_set_ps(-0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f)))))), _mm256_blendv_ps(_mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(rho, 
_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(u_0, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f)), _mm256_mul_ps(u_1, _mm256_set_ps(3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f))), _mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f)), _mm256_set_ps(v_s, v_s, v_s, v_s, v_s, v_s, v_s, v_s))), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(v_s, v_s, v_s, v_s, v_s, v_s, v_s, v_s)), _mm256_cmp_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_set_ps(grid_size, grid_size, grid_size, grid_size, grid_size, grid_size, grid_size, grid_size)), _mm256_set_ps(((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)))), _CMP_LE_OQ))), forceTerm_8), xi_17));
-          _mm256_store_ps(&_data_pdfs_20_39_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_17, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_23, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u0Pu1), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_17, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_23, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f), _mm256_mul_ps(u0Pu1, u0Pu1)), _mm256_mul_ps(_mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f), _mm256_mul_ps(u_2, u_2))), _mm256_set_ps(-0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f)))))), _mm256_blendv_ps(_mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(rho, 
_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_set_ps(v_s, v_s, v_s, v_s, v_s, v_s, v_s, v_s)), _mm256_mul_ps(u_0, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f))), _mm256_mul_ps(u_1, _mm256_set_ps(3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f))), _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_set_ps(v_s, v_s, v_s, v_s, v_s, v_s, v_s, v_s)), _mm256_cmp_ps(_mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f), _mm256_set_ps(((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1))), _CMP_GE_OQ))), forceTerm_9), xi_23));
-          _mm256_store_ps(&_data_pdfs_20_310_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_24, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_22, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u0Mu1), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_22, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_24, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f), _mm256_mul_ps(u0Mu1, u0Mu1)), _mm256_mul_ps(_mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f), _mm256_mul_ps(u_2, u_2))), _mm256_set_ps(-0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f)))))), _mm256_blendv_ps(_mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(rho, 
_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_set_ps(v_s, v_s, v_s, v_s, v_s, v_s, v_s, v_s)), _mm256_mul_ps(u_0, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f))), _mm256_mul_ps(u_1, _mm256_set_ps(-3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f))), _mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f))), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_set_ps(v_s, v_s, v_s, v_s, v_s, v_s, v_s, v_s)), _mm256_cmp_ps(_mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f), _mm256_set_ps(((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1))), _CMP_GE_OQ))), forceTerm_10), xi_22));
-          _mm256_store_ps(&_data_pdfs_20_311_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_19, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_7, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u1Pu2), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_19, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_7, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f), _mm256_mul_ps(u1Pu2, u1Pu2)), _mm256_mul_ps(_mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f), _mm256_mul_ps(u_0, u_0))), _mm256_set_ps(-0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f)))))), forceTerm_11), xi_7));
-          _mm256_store_ps(&_data_pdfs_20_312_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_20, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_18, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u1Mu2), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_18, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_20, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f), _mm256_mul_ps(u1Mu2, u1Mu2)), _mm256_mul_ps(_mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f), _mm256_mul_ps(u_0, u_0))), _mm256_set_ps(-0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f)))))), forceTerm_12), xi_18));
-          _mm256_store_ps(&_data_pdfs_20_313_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_8, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_9, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u0Mu2), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_8, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_9, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f), _mm256_mul_ps(u0Mu2, u0Mu2)), _mm256_mul_ps(_mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f), _mm256_mul_ps(u_1, u_1))), _mm256_set_ps(-0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f)))))), forceTerm_13), xi_9));
-          _mm256_store_ps(&_data_pdfs_20_314_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_13, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_16, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u0Pu2), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_13, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_16, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f), _mm256_mul_ps(u0Pu2, u0Pu2)), _mm256_mul_ps(_mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f), _mm256_mul_ps(u_1, u_1))), _mm256_set_ps(-0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f)))))), forceTerm_14), xi_16));
-          _mm256_store_ps(&_data_pdfs_20_315_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_18, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_20, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u1Mu2), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_18, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_20, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f), _mm256_mul_ps(u1Mu2, u1Mu2)), _mm256_mul_ps(_mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f), _mm256_mul_ps(u_0, u_0))), _mm256_set_ps(-0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f)))))), forceTerm_15), xi_20));
-          _mm256_store_ps(&_data_pdfs_20_316_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_7, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_19, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u1Pu2), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_19, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_7, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f), _mm256_mul_ps(u1Pu2, u1Pu2)), _mm256_mul_ps(_mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f), _mm256_mul_ps(u_0, u_0))), _mm256_set_ps(-0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f)))))), forceTerm_16), xi_19));
-          _mm256_store_ps(&_data_pdfs_20_317_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_16, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_13, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u0Pu2), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_13, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_16, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f), _mm256_mul_ps(u0Pu2, u0Pu2)), _mm256_mul_ps(_mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f), _mm256_mul_ps(u_1, u_1))), _mm256_set_ps(-0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f)))))), forceTerm_17), xi_13));
-          _mm256_store_ps(&_data_pdfs_20_318_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_9, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_8, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u0Mu2), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_8, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_9, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f), _mm256_mul_ps(u0Mu2, u0Mu2)), _mm256_mul_ps(_mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f), _mm256_mul_ps(u_1, u_1))), _mm256_set_ps(-0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f)))))), forceTerm_18), xi_8));
+          _mm256_store_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f)), _mm256_mul_ps(xi_17, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)), forceTerm_0), xi_17));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_19, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_8, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u_1), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f)), _mm256_mul_ps(xi_19, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_8, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f), _mm256_mul_ps(u_1, u_1)), _mm256_set_ps(-0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f)))))), _mm256_blendv_ps(_mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(rho, _mm256_add_ps(_mm256_mul_ps(u_0, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f)), _mm256_set_ps(v_s, v_s, v_s, v_s, v_s, v_s, 
v_s, v_s))), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f)), _mm256_set_ps(v_s, v_s, v_s, v_s, v_s, v_s, v_s, v_s)), _mm256_cmp_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_set_ps(grid_size, grid_size, grid_size, grid_size, grid_size, grid_size, grid_size, grid_size)), _mm256_set_ps(((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)))), _CMP_LE_OQ))), forceTerm_1), xi_8));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_8, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_19, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u_1), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f)), _mm256_mul_ps(xi_19, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_8, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f), _mm256_mul_ps(u_1, u_1)), _mm256_set_ps(-0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f)))))), _mm256_blendv_ps(_mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(rho, _mm256_add_ps(_mm256_mul_ps(u_0, _mm256_set_ps(-2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f)), _mm256_set_ps(v_s, v_s, 
v_s, v_s, v_s, v_s, v_s, v_s))), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f)), _mm256_set_ps(v_s, v_s, v_s, v_s, v_s, v_s, v_s, v_s)), _mm256_cmp_ps(_mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f), _mm256_set_ps(((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1))), _CMP_GE_OQ))), forceTerm_2), xi_19));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_14, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_6, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u_0), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f)), _mm256_mul_ps(xi_14, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_6, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f), _mm256_mul_ps(u_0, u_0)), _mm256_set_ps(-0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f)))))), forceTerm_3), xi_6));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_6, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_14, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u_0), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f)), _mm256_mul_ps(xi_14, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_6, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f), _mm256_mul_ps(u_0, u_0)), _mm256_set_ps(-0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f)))))), forceTerm_4), xi_14));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_16, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_22, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u_2), _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f)), _mm256_mul_ps(xi_16, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_22, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f), _mm256_mul_ps(u_2, u_2)), _mm256_set_ps(-0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f)))))), forceTerm_5), xi_22));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_22, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_16, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u_2), _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f)), _mm256_mul_ps(xi_16, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_22, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f), _mm256_mul_ps(u_2, u_2)), _mm256_set_ps(-0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f, -0.1111111111111111f)))))), forceTerm_6), xi_16));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_24, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_7, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u0Mu1), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_24, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_7, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f), _mm256_mul_ps(u0Mu1, u0Mu1)), _mm256_mul_ps(_mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f), _mm256_mul_ps(u_2, u_2))), _mm256_set_ps(-0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f)))))), _mm256_blendv_ps(_mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f), 
_mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_set_ps(v_s, v_s, v_s, v_s, v_s, v_s, v_s, v_s)), _mm256_mul_ps(u_1, _mm256_set_ps(3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f))), _mm256_mul_ps(u_0, _mm256_set_ps(-2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f))), _mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f))), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_set_ps(v_s, v_s, v_s, v_s, v_s, v_s, v_s, v_s)), _mm256_cmp_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_set_ps(grid_size, grid_size, grid_size, grid_size, grid_size, grid_size, grid_size, grid_size)), _mm256_set_ps(((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)))), _CMP_LE_OQ))), forceTerm_7), xi_7));
+          _mm256_store_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_13, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_12, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u0Pu1), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_12, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_13, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f), _mm256_mul_ps(u0Pu1, u0Pu1)), _mm256_mul_ps(_mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f), _mm256_mul_ps(u_2, u_2))), _mm256_set_ps(-0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f)))))), _mm256_blendv_ps(_mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f), 
_mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(u_0, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f)), _mm256_mul_ps(u_1, _mm256_set_ps(3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f))), _mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f)), _mm256_set_ps(v_s, v_s, v_s, v_s, v_s, v_s, v_s, v_s))), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f)), _mm256_set_ps(v_s, v_s, v_s, v_s, v_s, v_s, v_s, v_s)), _mm256_cmp_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_set_ps(grid_size, grid_size, grid_size, grid_size, grid_size, grid_size, grid_size, grid_size)), _mm256_set_ps(((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)))), _CMP_LE_OQ))), forceTerm_8), xi_12));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_12, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_13, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u0Pu1), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_12, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_13, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f), _mm256_mul_ps(u0Pu1, u0Pu1)), _mm256_mul_ps(_mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f), _mm256_mul_ps(u_2, u_2))), _mm256_set_ps(-0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f)))))), _mm256_blendv_ps(_mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 
0.0f), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_set_ps(v_s, v_s, v_s, v_s, v_s, v_s, v_s, v_s)), _mm256_mul_ps(u_0, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f))), _mm256_mul_ps(u_1, _mm256_set_ps(3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f))), _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_set_ps(v_s, v_s, v_s, v_s, v_s, v_s, v_s, v_s)), _mm256_cmp_ps(_mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f), _mm256_set_ps(((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1))), _CMP_GE_OQ))), forceTerm_9), xi_13));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_7, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_24, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u0Mu1), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_24, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_7, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f), _mm256_mul_ps(u0Mu1, u0Mu1)), _mm256_mul_ps(_mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f), _mm256_mul_ps(u_2, u_2))), _mm256_set_ps(-0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f)))))), _mm256_blendv_ps(_mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f), 
_mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_set_ps(v_s, v_s, v_s, v_s, v_s, v_s, v_s, v_s)), _mm256_mul_ps(u_0, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f))), _mm256_mul_ps(u_1, _mm256_set_ps(-3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f))), _mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f))), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_set_ps(v_s, v_s, v_s, v_s, v_s, v_s, v_s, v_s)), _mm256_cmp_ps(_mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f), _mm256_set_ps(((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1)), ((float)(ctr_1))), _CMP_GE_OQ))), forceTerm_10), xi_24));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_4, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_21, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u1Pu2), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_21, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_4, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f), _mm256_mul_ps(u1Pu2, u1Pu2)), _mm256_mul_ps(_mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f), _mm256_mul_ps(u_0, u_0))), _mm256_set_ps(-0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f)))))), forceTerm_11), xi_21));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_20, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_23, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u1Mu2), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_20, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_23, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f), _mm256_mul_ps(u1Mu2, u1Mu2)), _mm256_mul_ps(_mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f), _mm256_mul_ps(u_0, u_0))), _mm256_set_ps(-0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f)))))), forceTerm_12), xi_23));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_18, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_15, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u0Mu2), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_15, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_18, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f), _mm256_mul_ps(u0Mu2, u0Mu2)), _mm256_mul_ps(_mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f), _mm256_mul_ps(u_1, u_1))), _mm256_set_ps(-0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f)))))), forceTerm_13), xi_15));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_3, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_11, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u0Pu2), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_11, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_3, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f), _mm256_mul_ps(u0Pu2, u0Pu2)), _mm256_mul_ps(_mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f), _mm256_mul_ps(u_1, u_1))), _mm256_set_ps(-0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f)))))), forceTerm_14), xi_11));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_23, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_20, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u1Mu2), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_20, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_23, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f), _mm256_mul_ps(u1Mu2, u1Mu2)), _mm256_mul_ps(_mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f), _mm256_mul_ps(u_0, u_0))), _mm256_set_ps(-0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f)))))), forceTerm_15), xi_20));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_21, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_4, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u1Pu2), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_21, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_4, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f), _mm256_mul_ps(u1Pu2, u1Pu2)), _mm256_mul_ps(_mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f), _mm256_mul_ps(u_0, u_0))), _mm256_set_ps(-0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f)))))), forceTerm_16), xi_4)); // unaligned store, consistent with the other _data_pdfs stores in this loop
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_11, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_3, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u0Pu2), _mm256_set_ps(-0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f, -0.083333333333333329f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_11, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_3, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f), _mm256_mul_ps(u0Pu2, u0Pu2)), _mm256_mul_ps(_mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f), _mm256_mul_ps(u_1, u_1))), _mm256_set_ps(-0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f)))))), forceTerm_17), xi_3));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_15, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f)), _mm256_mul_ps(xi_18, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(_mm256_mul_ps(rho, u0Mu2), _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f))), _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0)), _mm256_mul_ps(_mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear), _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(f_eq_common, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_15, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_18, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(rho, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f), _mm256_mul_ps(u0Mu2, u0Mu2)), _mm256_mul_ps(_mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f), _mm256_mul_ps(u_1, u_1))), _mm256_set_ps(-0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f, -0.013888888888888888f)))))), forceTerm_18), xi_18));
         }
         for (int64_t ctr_0 = (int64_t)((_size_force_0) / (8)) * (8); ctr_0 < _size_force_0; ctr_0 += 1) {
-          const float xi_25 = _data_pdfs_20_36_10[ctr_0];
-          const float xi_26 = _data_force_20_32_10[ctr_0];
-          const float xi_27 = _data_pdfs_20_31_10[ctr_0];
-          const float xi_28 = _data_pdfs_20_32_10[ctr_0];
-          const float xi_29 = _data_pdfs_20_311_10[ctr_0];
-          const float xi_30 = _data_pdfs_20_318_10[ctr_0];
-          const float xi_31 = _data_pdfs_20_313_10[ctr_0];
-          const float xi_32 = _data_pdfs_20_34_10[ctr_0];
-          const float xi_33 = _data_pdfs_20_33_10[ctr_0];
-          const float xi_34 = _data_pdfs_20_317_10[ctr_0];
-          const float xi_35 = _data_force_20_30_10[ctr_0];
-          const float xi_36 = _data_pdfs_20_35_10[ctr_0];
-          const float xi_37 = _data_pdfs_20_314_10[ctr_0];
-          const float xi_38 = _data_pdfs_20_38_10[ctr_0];
-          const float xi_39 = _data_pdfs_20_312_10[ctr_0];
-          const float xi_40 = _data_pdfs_20_316_10[ctr_0];
-          const float xi_41 = _data_pdfs_20_39_10[ctr_0];
-          const float xi_42 = _data_pdfs_20_315_10[ctr_0];
-          const float xi_43 = _data_force_20_31_10[ctr_0];
-          const float xi_44 = _data_pdfs_20_310_10[ctr_0];
-          const float xi_45 = _data_pdfs_20_30_10[ctr_0];
-          const float xi_46 = _data_pdfs_20_37_10[ctr_0];
-          const float xi_3 = xi_25;
-          const float xi_4 = xi_26;
-          const float xi_5 = xi_27;
-          const float xi_6 = xi_28;
-          const float xi_7 = xi_29;
-          const float xi_8 = xi_30;
-          const float xi_9 = xi_31;
-          const float xi_10 = xi_45;
-          const float xi_11 = xi_32;
-          const float xi_12 = xi_33;
-          const float xi_13 = xi_34;
-          const float xi_14 = xi_35;
-          const float xi_15 = xi_36;
-          const float xi_16 = xi_37;
-          const float xi_17 = xi_38;
-          const float xi_18 = xi_39;
-          const float xi_19 = xi_40;
-          const float xi_20 = xi_42;
-          const float xi_21 = xi_43;
+          const float xi_25 = _data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3 + ctr_0];
+          const float xi_26 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3 + ctr_0];
+          const float xi_27 = _data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + ctr_0];
+          const float xi_28 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3 + ctr_0];
+          const float xi_29 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3 + ctr_0];
+          const float xi_30 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3 + ctr_0];
+          const float xi_31 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + ctr_0];
+          const float xi_32 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3 + ctr_0];
+          const float xi_33 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3 + ctr_0];
+          const float xi_34 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3 + ctr_0];
+          const float xi_35 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3 + ctr_0];
+          const float xi_36 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3 + ctr_0];
+          const float xi_37 = _data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3 + ctr_0];
+          const float xi_38 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3 + ctr_0];
+          const float xi_39 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3 + ctr_0];
+          const float xi_40 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3 + ctr_0];
+          const float xi_41 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3 + ctr_0];
+          const float xi_42 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3 + ctr_0];
+          const float xi_43 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3 + ctr_0];
+          const float xi_44 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3 + ctr_0];
+          const float xi_45 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3 + ctr_0];
+          const float xi_46 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3 + ctr_0];
+          const float xi_3 = xi_26;
+          const float xi_4 = xi_28;
+          const float xi_5 = xi_37;
+          const float xi_6 = xi_29;
+          const float xi_7 = xi_36;
+          const float xi_8 = xi_32;
+          const float xi_9 = xi_25;
+          const float xi_10 = xi_27;
+          const float xi_11 = xi_43;
+          const float xi_12 = xi_30;
+          const float xi_13 = xi_42;
+          const float xi_14 = xi_46;
+          const float xi_15 = xi_40;
+          const float xi_16 = xi_41;
+          const float xi_17 = xi_31;
+          const float xi_18 = xi_38;
+          const float xi_19 = xi_33;
+          const float xi_20 = xi_34;
+          const float xi_21 = xi_45;
           const float xi_22 = xi_44;
-          const float xi_23 = xi_41;
-          const float xi_24 = xi_46;
-          const float vel0Term = xi_11 + xi_16 + xi_17 + xi_22 + xi_8;
-          const float vel1Term = xi_20 + xi_24 + xi_5 + xi_7;
-          const float vel2Term = xi_15 + xi_18 + xi_9;
-          const float rho = vel0Term + vel1Term + vel2Term + xi_10 + xi_12 + xi_13 + xi_19 + xi_23 + xi_3 + xi_6;
+          const float xi_23 = xi_35;
+          const float xi_24 = xi_39;
+          const float vel0Term = xi_11 + xi_12 + xi_14 + xi_18 + xi_24;
+          const float vel1Term = xi_20 + xi_21 + xi_7 + xi_8;
+          const float vel2Term = xi_15 + xi_22 + xi_23;
+          const float rho = vel0Term + vel1Term + vel2Term + xi_13 + xi_16 + xi_17 + xi_19 + xi_3 + xi_4 + xi_6;
           const float xi_1 = ((1.0f) / (rho));
-          const float u_0 = xi_1 * xi_14 * 0.5f + xi_1 * (vel0Term + xi_12 * -1.0f + xi_13 * -1.0f + xi_23 * -1.0f + xi_24 * -1.0f + xi_9 * -1.0f);
-          const float u_1 = xi_1 * xi_21 * 0.5f + xi_1 * (vel1Term + xi_17 + xi_18 * -1.0f + xi_19 * -1.0f + xi_22 * -1.0f + xi_23 * -1.0f + xi_6 * -1.0f);
-          const float u_2 = xi_1 * xi_4 * 0.5f + xi_1 * (vel2Term + xi_13 * -1.0f + xi_16 + xi_19 * -1.0f + xi_20 * -1.0f + xi_3 * -1.0f + xi_7 + xi_8 * -1.0f);
-          const float forceTerm_0 = omega_shear * u_0 * xi_14 * 0.5f + omega_shear * u_1 * xi_21 * 0.5f + omega_shear * u_2 * xi_4 * 0.5f + u_0 * xi_14 * -1.0f + u_1 * xi_21 * -1.0f + u_2 * xi_4 * -1.0f;
-          const float forceTerm_1 = omega_shear * u_0 * xi_14 * 0.083333333333333329f + omega_shear * u_1 * xi_21 * -0.16666666666666666f + omega_shear * u_2 * xi_4 * 0.083333333333333329f + rr_0 * xi_21 * -0.083333333333333329f + u_0 * xi_14 * -0.16666666666666666f + u_1 * xi_21 * 0.33333333333333331f + u_2 * xi_4 * -0.16666666666666666f + xi_21 * 0.16666666666666666f;
-          const float forceTerm_2 = omega_shear * u_0 * xi_14 * 0.083333333333333329f + omega_shear * u_1 * xi_21 * -0.16666666666666666f + omega_shear * u_2 * xi_4 * 0.083333333333333329f + rr_0 * xi_21 * 0.083333333333333329f + u_0 * xi_14 * -0.16666666666666666f + u_1 * xi_21 * 0.33333333333333331f + u_2 * xi_4 * -0.16666666666666666f + xi_21 * -0.16666666666666666f;
-          const float forceTerm_3 = omega_shear * u_0 * xi_14 * -0.16666666666666666f + omega_shear * u_1 * xi_21 * 0.083333333333333329f + omega_shear * u_2 * xi_4 * 0.083333333333333329f + rr_0 * xi_14 * 0.083333333333333329f + u_0 * xi_14 * 0.33333333333333331f + u_1 * xi_21 * -0.16666666666666666f + u_2 * xi_4 * -0.16666666666666666f + xi_14 * -0.16666666666666666f;
-          const float forceTerm_4 = omega_shear * u_0 * xi_14 * -0.16666666666666666f + omega_shear * u_1 * xi_21 * 0.083333333333333329f + omega_shear * u_2 * xi_4 * 0.083333333333333329f + rr_0 * xi_14 * -0.083333333333333329f + u_0 * xi_14 * 0.33333333333333331f + u_1 * xi_21 * -0.16666666666666666f + u_2 * xi_4 * -0.16666666666666666f + xi_14 * 0.16666666666666666f;
-          const float forceTerm_5 = omega_shear * u_0 * xi_14 * 0.083333333333333329f + omega_shear * u_1 * xi_21 * 0.083333333333333329f + omega_shear * u_2 * xi_4 * -0.16666666666666666f + rr_0 * xi_4 * -0.083333333333333329f + u_0 * xi_14 * -0.16666666666666666f + u_1 * xi_21 * -0.16666666666666666f + u_2 * xi_4 * 0.33333333333333331f + xi_4 * 0.16666666666666666f;
-          const float forceTerm_6 = omega_shear * u_0 * xi_14 * 0.083333333333333329f + omega_shear * u_1 * xi_21 * 0.083333333333333329f + omega_shear * u_2 * xi_4 * -0.16666666666666666f + rr_0 * xi_4 * 0.083333333333333329f + u_0 * xi_14 * -0.16666666666666666f + u_1 * xi_21 * -0.16666666666666666f + u_2 * xi_4 * 0.33333333333333331f + xi_4 * -0.16666666666666666f;
-          const float forceTerm_7 = omega_shear * u_0 * xi_14 * -0.083333333333333329f + omega_shear * u_0 * xi_21 * 0.125f + omega_shear * u_1 * xi_14 * 0.125f + omega_shear * u_1 * xi_21 * -0.083333333333333329f + omega_shear * u_2 * xi_4 * 0.041666666666666664f + rr_0 * xi_14 * 0.041666666666666664f + rr_0 * xi_21 * -0.041666666666666664f + u_0 * xi_14 * 0.16666666666666666f + u_0 * xi_21 * -0.25f + u_1 * xi_14 * -0.25f + u_1 * xi_21 * 0.16666666666666666f + u_2 * xi_4 * -0.083333333333333329f + xi_14 * -0.083333333333333329f + xi_21 * 0.083333333333333329f;
-          const float forceTerm_8 = omega_shear * u_0 * xi_14 * -0.083333333333333329f + omega_shear * u_0 * xi_21 * -0.125f + omega_shear * u_1 * xi_14 * -0.125f + omega_shear * u_1 * xi_21 * -0.083333333333333329f + omega_shear * u_2 * xi_4 * 0.041666666666666664f + rr_0 * xi_14 * -0.041666666666666664f + rr_0 * xi_21 * -0.041666666666666664f + u_0 * xi_14 * 0.16666666666666666f + u_0 * xi_21 * 0.25f + u_1 * xi_14 * 0.25f + u_1 * xi_21 * 0.16666666666666666f + u_2 * xi_4 * -0.083333333333333329f + xi_14 * 0.083333333333333329f + xi_21 * 0.083333333333333329f;
-          const float forceTerm_9 = omega_shear * u_0 * xi_14 * -0.083333333333333329f + omega_shear * u_0 * xi_21 * -0.125f + omega_shear * u_1 * xi_14 * -0.125f + omega_shear * u_1 * xi_21 * -0.083333333333333329f + omega_shear * u_2 * xi_4 * 0.041666666666666664f + rr_0 * xi_14 * 0.041666666666666664f + rr_0 * xi_21 * 0.041666666666666664f + u_0 * xi_14 * 0.16666666666666666f + u_0 * xi_21 * 0.25f + u_1 * xi_14 * 0.25f + u_1 * xi_21 * 0.16666666666666666f + u_2 * xi_4 * -0.083333333333333329f + xi_14 * -0.083333333333333329f + xi_21 * -0.083333333333333329f;
-          const float forceTerm_10 = omega_shear * u_0 * xi_14 * -0.083333333333333329f + omega_shear * u_0 * xi_21 * 0.125f + omega_shear * u_1 * xi_14 * 0.125f + omega_shear * u_1 * xi_21 * -0.083333333333333329f + omega_shear * u_2 * xi_4 * 0.041666666666666664f + rr_0 * xi_14 * -0.041666666666666664f + rr_0 * xi_21 * 0.041666666666666664f + u_0 * xi_14 * 0.16666666666666666f + u_0 * xi_21 * -0.25f + u_1 * xi_14 * -0.25f + u_1 * xi_21 * 0.16666666666666666f + u_2 * xi_4 * -0.083333333333333329f + xi_14 * 0.083333333333333329f + xi_21 * -0.083333333333333329f;
-          const float forceTerm_11 = omega_shear * u_0 * xi_14 * 0.041666666666666664f + omega_shear * u_1 * xi_21 * -0.083333333333333329f + omega_shear * u_1 * xi_4 * -0.125f + omega_shear * u_2 * xi_21 * -0.125f + omega_shear * u_2 * xi_4 * -0.083333333333333329f + rr_0 * xi_21 * -0.041666666666666664f + rr_0 * xi_4 * -0.041666666666666664f + u_0 * xi_14 * -0.083333333333333329f + u_1 * xi_21 * 0.16666666666666666f + u_1 * xi_4 * 0.25f + u_2 * xi_21 * 0.25f + u_2 * xi_4 * 0.16666666666666666f + xi_21 * 0.083333333333333329f + xi_4 * 0.083333333333333329f;
-          const float forceTerm_12 = omega_shear * u_0 * xi_14 * 0.041666666666666664f + omega_shear * u_1 * xi_21 * -0.083333333333333329f + omega_shear * u_1 * xi_4 * 0.125f + omega_shear * u_2 * xi_21 * 0.125f + omega_shear * u_2 * xi_4 * -0.083333333333333329f + rr_0 * xi_21 * 0.041666666666666664f + rr_0 * xi_4 * -0.041666666666666664f + u_0 * xi_14 * -0.083333333333333329f + u_1 * xi_21 * 0.16666666666666666f + u_1 * xi_4 * -0.25f + u_2 * xi_21 * -0.25f + u_2 * xi_4 * 0.16666666666666666f + xi_21 * -0.083333333333333329f + xi_4 * 0.083333333333333329f;
-          const float forceTerm_13 = omega_shear * u_0 * xi_14 * -0.083333333333333329f + omega_shear * u_0 * xi_4 * 0.125f + omega_shear * u_1 * xi_21 * 0.041666666666666664f + omega_shear * u_2 * xi_14 * 0.125f + omega_shear * u_2 * xi_4 * -0.083333333333333329f + rr_0 * xi_14 * 0.041666666666666664f + rr_0 * xi_4 * -0.041666666666666664f + u_0 * xi_14 * 0.16666666666666666f + u_0 * xi_4 * -0.25f + u_1 * xi_21 * -0.083333333333333329f + u_2 * xi_14 * -0.25f + u_2 * xi_4 * 0.16666666666666666f + xi_14 * -0.083333333333333329f + xi_4 * 0.083333333333333329f;
-          const float forceTerm_14 = omega_shear * u_0 * xi_14 * -0.083333333333333329f + omega_shear * u_0 * xi_4 * -0.125f + omega_shear * u_1 * xi_21 * 0.041666666666666664f + omega_shear * u_2 * xi_14 * -0.125f + omega_shear * u_2 * xi_4 * -0.083333333333333329f + rr_0 * xi_14 * -0.041666666666666664f + rr_0 * xi_4 * -0.041666666666666664f + u_0 * xi_14 * 0.16666666666666666f + u_0 * xi_4 * 0.25f + u_1 * xi_21 * -0.083333333333333329f + u_2 * xi_14 * 0.25f + u_2 * xi_4 * 0.16666666666666666f + xi_14 * 0.083333333333333329f + xi_4 * 0.083333333333333329f;
-          const float forceTerm_15 = omega_shear * u_0 * xi_14 * 0.041666666666666664f + omega_shear * u_1 * xi_21 * -0.083333333333333329f + omega_shear * u_1 * xi_4 * 0.125f + omega_shear * u_2 * xi_21 * 0.125f + omega_shear * u_2 * xi_4 * -0.083333333333333329f + rr_0 * xi_21 * -0.041666666666666664f + rr_0 * xi_4 * 0.041666666666666664f + u_0 * xi_14 * -0.083333333333333329f + u_1 * xi_21 * 0.16666666666666666f + u_1 * xi_4 * -0.25f + u_2 * xi_21 * -0.25f + u_2 * xi_4 * 0.16666666666666666f + xi_21 * 0.083333333333333329f + xi_4 * -0.083333333333333329f;
-          const float forceTerm_16 = omega_shear * u_0 * xi_14 * 0.041666666666666664f + omega_shear * u_1 * xi_21 * -0.083333333333333329f + omega_shear * u_1 * xi_4 * -0.125f + omega_shear * u_2 * xi_21 * -0.125f + omega_shear * u_2 * xi_4 * -0.083333333333333329f + rr_0 * xi_21 * 0.041666666666666664f + rr_0 * xi_4 * 0.041666666666666664f + u_0 * xi_14 * -0.083333333333333329f + u_1 * xi_21 * 0.16666666666666666f + u_1 * xi_4 * 0.25f + u_2 * xi_21 * 0.25f + u_2 * xi_4 * 0.16666666666666666f + xi_21 * -0.083333333333333329f + xi_4 * -0.083333333333333329f;
-          const float forceTerm_17 = omega_shear * u_0 * xi_14 * -0.083333333333333329f + omega_shear * u_0 * xi_4 * -0.125f + omega_shear * u_1 * xi_21 * 0.041666666666666664f + omega_shear * u_2 * xi_14 * -0.125f + omega_shear * u_2 * xi_4 * -0.083333333333333329f + rr_0 * xi_14 * 0.041666666666666664f + rr_0 * xi_4 * 0.041666666666666664f + u_0 * xi_14 * 0.16666666666666666f + u_0 * xi_4 * 0.25f + u_1 * xi_21 * -0.083333333333333329f + u_2 * xi_14 * 0.25f + u_2 * xi_4 * 0.16666666666666666f + xi_14 * -0.083333333333333329f + xi_4 * -0.083333333333333329f;
-          const float forceTerm_18 = omega_shear * u_0 * xi_14 * -0.083333333333333329f + omega_shear * u_0 * xi_4 * 0.125f + omega_shear * u_1 * xi_21 * 0.041666666666666664f + omega_shear * u_2 * xi_14 * 0.125f + omega_shear * u_2 * xi_4 * -0.083333333333333329f + rr_0 * xi_14 * -0.041666666666666664f + rr_0 * xi_4 * 0.041666666666666664f + u_0 * xi_14 * 0.16666666666666666f + u_0 * xi_4 * -0.25f + u_1 * xi_21 * -0.083333333333333329f + u_2 * xi_14 * -0.25f + u_2 * xi_4 * 0.16666666666666666f + xi_14 * 0.083333333333333329f + xi_4 * -0.083333333333333329f;
-          const float u0Mu1 = u_0 + u_1 * -1.0f;
+          const float u_0 = xi_1 * xi_10 * 0.5f + xi_1 * (vel0Term - xi_13 - xi_15 - xi_3 - xi_6 - xi_7);
+          const float u_1 = xi_1 * xi_9 * 0.5f + xi_1 * (vel1Term + xi_12 - xi_13 - xi_19 - xi_23 - xi_24 - xi_4);
+          const float u_2 = xi_1 * xi_5 * 0.5f + xi_1 * (vel2Term + xi_11 - xi_16 - xi_18 - xi_20 + xi_21 - xi_3 - xi_4);
+          const float forceTerm_0 = omega_shear * u_0 * xi_10 * 0.5f + omega_shear * u_1 * xi_9 * 0.5f + omega_shear * u_2 * xi_5 * 0.5f - u_0 * xi_10 - u_1 * xi_9 - u_2 * xi_5;
+          const float forceTerm_1 = omega_shear * u_0 * xi_10 * 0.083333333333333329f + omega_shear * u_1 * xi_9 * -0.16666666666666666f + omega_shear * u_2 * xi_5 * 0.083333333333333329f + rr_0 * xi_9 * -0.083333333333333329f + u_0 * xi_10 * -0.16666666666666666f + u_1 * xi_9 * 0.33333333333333331f + u_2 * xi_5 * -0.16666666666666666f + xi_9 * 0.16666666666666666f;
+          const float forceTerm_2 = omega_shear * u_0 * xi_10 * 0.083333333333333329f + omega_shear * u_1 * xi_9 * -0.16666666666666666f + omega_shear * u_2 * xi_5 * 0.083333333333333329f + rr_0 * xi_9 * 0.083333333333333329f + u_0 * xi_10 * -0.16666666666666666f + u_1 * xi_9 * 0.33333333333333331f + u_2 * xi_5 * -0.16666666666666666f + xi_9 * -0.16666666666666666f;
+          const float forceTerm_3 = omega_shear * u_0 * xi_10 * -0.16666666666666666f + omega_shear * u_1 * xi_9 * 0.083333333333333329f + omega_shear * u_2 * xi_5 * 0.083333333333333329f + rr_0 * xi_10 * 0.083333333333333329f + u_0 * xi_10 * 0.33333333333333331f + u_1 * xi_9 * -0.16666666666666666f + u_2 * xi_5 * -0.16666666666666666f + xi_10 * -0.16666666666666666f;
+          const float forceTerm_4 = omega_shear * u_0 * xi_10 * -0.16666666666666666f + omega_shear * u_1 * xi_9 * 0.083333333333333329f + omega_shear * u_2 * xi_5 * 0.083333333333333329f + rr_0 * xi_10 * -0.083333333333333329f + u_0 * xi_10 * 0.33333333333333331f + u_1 * xi_9 * -0.16666666666666666f + u_2 * xi_5 * -0.16666666666666666f + xi_10 * 0.16666666666666666f;
+          const float forceTerm_5 = omega_shear * u_0 * xi_10 * 0.083333333333333329f + omega_shear * u_1 * xi_9 * 0.083333333333333329f + omega_shear * u_2 * xi_5 * -0.16666666666666666f + rr_0 * xi_5 * -0.083333333333333329f + u_0 * xi_10 * -0.16666666666666666f + u_1 * xi_9 * -0.16666666666666666f + u_2 * xi_5 * 0.33333333333333331f + xi_5 * 0.16666666666666666f;
+          const float forceTerm_6 = omega_shear * u_0 * xi_10 * 0.083333333333333329f + omega_shear * u_1 * xi_9 * 0.083333333333333329f + omega_shear * u_2 * xi_5 * -0.16666666666666666f + rr_0 * xi_5 * 0.083333333333333329f + u_0 * xi_10 * -0.16666666666666666f + u_1 * xi_9 * -0.16666666666666666f + u_2 * xi_5 * 0.33333333333333331f + xi_5 * -0.16666666666666666f;
+          const float forceTerm_7 = omega_shear * u_0 * xi_10 * -0.083333333333333329f + omega_shear * u_0 * xi_9 * 0.125f + omega_shear * u_1 * xi_10 * 0.125f + omega_shear * u_1 * xi_9 * -0.083333333333333329f + omega_shear * u_2 * xi_5 * 0.041666666666666664f + rr_0 * xi_10 * 0.041666666666666664f + rr_0 * xi_9 * -0.041666666666666664f + u_0 * xi_10 * 0.16666666666666666f + u_0 * xi_9 * -0.25f + u_1 * xi_10 * -0.25f + u_1 * xi_9 * 0.16666666666666666f + u_2 * xi_5 * -0.083333333333333329f + xi_10 * -0.083333333333333329f + xi_9 * 0.083333333333333329f;
+          const float forceTerm_8 = omega_shear * u_0 * xi_10 * -0.083333333333333329f + omega_shear * u_0 * xi_9 * -0.125f + omega_shear * u_1 * xi_10 * -0.125f + omega_shear * u_1 * xi_9 * -0.083333333333333329f + omega_shear * u_2 * xi_5 * 0.041666666666666664f + rr_0 * xi_10 * -0.041666666666666664f + rr_0 * xi_9 * -0.041666666666666664f + u_0 * xi_10 * 0.16666666666666666f + u_0 * xi_9 * 0.25f + u_1 * xi_10 * 0.25f + u_1 * xi_9 * 0.16666666666666666f + u_2 * xi_5 * -0.083333333333333329f + xi_10 * 0.083333333333333329f + xi_9 * 0.083333333333333329f;
+          const float forceTerm_9 = omega_shear * u_0 * xi_10 * -0.083333333333333329f + omega_shear * u_0 * xi_9 * -0.125f + omega_shear * u_1 * xi_10 * -0.125f + omega_shear * u_1 * xi_9 * -0.083333333333333329f + omega_shear * u_2 * xi_5 * 0.041666666666666664f + rr_0 * xi_10 * 0.041666666666666664f + rr_0 * xi_9 * 0.041666666666666664f + u_0 * xi_10 * 0.16666666666666666f + u_0 * xi_9 * 0.25f + u_1 * xi_10 * 0.25f + u_1 * xi_9 * 0.16666666666666666f + u_2 * xi_5 * -0.083333333333333329f + xi_10 * -0.083333333333333329f + xi_9 * -0.083333333333333329f;
+          const float forceTerm_10 = omega_shear * u_0 * xi_10 * -0.083333333333333329f + omega_shear * u_0 * xi_9 * 0.125f + omega_shear * u_1 * xi_10 * 0.125f + omega_shear * u_1 * xi_9 * -0.083333333333333329f + omega_shear * u_2 * xi_5 * 0.041666666666666664f + rr_0 * xi_10 * -0.041666666666666664f + rr_0 * xi_9 * 0.041666666666666664f + u_0 * xi_10 * 0.16666666666666666f + u_0 * xi_9 * -0.25f + u_1 * xi_10 * -0.25f + u_1 * xi_9 * 0.16666666666666666f + u_2 * xi_5 * -0.083333333333333329f + xi_10 * 0.083333333333333329f + xi_9 * -0.083333333333333329f;
+          const float forceTerm_11 = omega_shear * u_0 * xi_10 * 0.041666666666666664f + omega_shear * u_1 * xi_5 * -0.125f + omega_shear * u_1 * xi_9 * -0.083333333333333329f + omega_shear * u_2 * xi_5 * -0.083333333333333329f + omega_shear * u_2 * xi_9 * -0.125f + rr_0 * xi_5 * -0.041666666666666664f + rr_0 * xi_9 * -0.041666666666666664f + u_0 * xi_10 * -0.083333333333333329f + u_1 * xi_5 * 0.25f + u_1 * xi_9 * 0.16666666666666666f + u_2 * xi_5 * 0.16666666666666666f + u_2 * xi_9 * 0.25f + xi_5 * 0.083333333333333329f + xi_9 * 0.083333333333333329f;
+          const float forceTerm_12 = omega_shear * u_0 * xi_10 * 0.041666666666666664f + omega_shear * u_1 * xi_5 * 0.125f + omega_shear * u_1 * xi_9 * -0.083333333333333329f + omega_shear * u_2 * xi_5 * -0.083333333333333329f + omega_shear * u_2 * xi_9 * 0.125f + rr_0 * xi_5 * -0.041666666666666664f + rr_0 * xi_9 * 0.041666666666666664f + u_0 * xi_10 * -0.083333333333333329f + u_1 * xi_5 * -0.25f + u_1 * xi_9 * 0.16666666666666666f + u_2 * xi_5 * 0.16666666666666666f + u_2 * xi_9 * -0.25f + xi_5 * 0.083333333333333329f + xi_9 * -0.083333333333333329f;
+          const float forceTerm_13 = omega_shear * u_0 * xi_10 * -0.083333333333333329f + omega_shear * u_0 * xi_5 * 0.125f + omega_shear * u_1 * xi_9 * 0.041666666666666664f + omega_shear * u_2 * xi_10 * 0.125f + omega_shear * u_2 * xi_5 * -0.083333333333333329f + rr_0 * xi_10 * 0.041666666666666664f + rr_0 * xi_5 * -0.041666666666666664f + u_0 * xi_10 * 0.16666666666666666f + u_0 * xi_5 * -0.25f + u_1 * xi_9 * -0.083333333333333329f + u_2 * xi_10 * -0.25f + u_2 * xi_5 * 0.16666666666666666f + xi_10 * -0.083333333333333329f + xi_5 * 0.083333333333333329f;
+          const float forceTerm_14 = omega_shear * u_0 * xi_10 * -0.083333333333333329f + omega_shear * u_0 * xi_5 * -0.125f + omega_shear * u_1 * xi_9 * 0.041666666666666664f + omega_shear * u_2 * xi_10 * -0.125f + omega_shear * u_2 * xi_5 * -0.083333333333333329f + rr_0 * xi_10 * -0.041666666666666664f + rr_0 * xi_5 * -0.041666666666666664f + u_0 * xi_10 * 0.16666666666666666f + u_0 * xi_5 * 0.25f + u_1 * xi_9 * -0.083333333333333329f + u_2 * xi_10 * 0.25f + u_2 * xi_5 * 0.16666666666666666f + xi_10 * 0.083333333333333329f + xi_5 * 0.083333333333333329f;
+          const float forceTerm_15 = omega_shear * u_0 * xi_10 * 0.041666666666666664f + omega_shear * u_1 * xi_5 * 0.125f + omega_shear * u_1 * xi_9 * -0.083333333333333329f + omega_shear * u_2 * xi_5 * -0.083333333333333329f + omega_shear * u_2 * xi_9 * 0.125f + rr_0 * xi_5 * 0.041666666666666664f + rr_0 * xi_9 * -0.041666666666666664f + u_0 * xi_10 * -0.083333333333333329f + u_1 * xi_5 * -0.25f + u_1 * xi_9 * 0.16666666666666666f + u_2 * xi_5 * 0.16666666666666666f + u_2 * xi_9 * -0.25f + xi_5 * -0.083333333333333329f + xi_9 * 0.083333333333333329f;
+          const float forceTerm_16 = omega_shear * u_0 * xi_10 * 0.041666666666666664f + omega_shear * u_1 * xi_5 * -0.125f + omega_shear * u_1 * xi_9 * -0.083333333333333329f + omega_shear * u_2 * xi_5 * -0.083333333333333329f + omega_shear * u_2 * xi_9 * -0.125f + rr_0 * xi_5 * 0.041666666666666664f + rr_0 * xi_9 * 0.041666666666666664f + u_0 * xi_10 * -0.083333333333333329f + u_1 * xi_5 * 0.25f + u_1 * xi_9 * 0.16666666666666666f + u_2 * xi_5 * 0.16666666666666666f + u_2 * xi_9 * 0.25f + xi_5 * -0.083333333333333329f + xi_9 * -0.083333333333333329f;
+          const float forceTerm_17 = omega_shear * u_0 * xi_10 * -0.083333333333333329f + omega_shear * u_0 * xi_5 * -0.125f + omega_shear * u_1 * xi_9 * 0.041666666666666664f + omega_shear * u_2 * xi_10 * -0.125f + omega_shear * u_2 * xi_5 * -0.083333333333333329f + rr_0 * xi_10 * 0.041666666666666664f + rr_0 * xi_5 * 0.041666666666666664f + u_0 * xi_10 * 0.16666666666666666f + u_0 * xi_5 * 0.25f + u_1 * xi_9 * -0.083333333333333329f + u_2 * xi_10 * 0.25f + u_2 * xi_5 * 0.16666666666666666f + xi_10 * -0.083333333333333329f + xi_5 * -0.083333333333333329f;
+          const float forceTerm_18 = omega_shear * u_0 * xi_10 * -0.083333333333333329f + omega_shear * u_0 * xi_5 * 0.125f + omega_shear * u_1 * xi_9 * 0.041666666666666664f + omega_shear * u_2 * xi_10 * 0.125f + omega_shear * u_2 * xi_5 * -0.083333333333333329f + rr_0 * xi_10 * -0.041666666666666664f + rr_0 * xi_5 * 0.041666666666666664f + u_0 * xi_10 * 0.16666666666666666f + u_0 * xi_5 * -0.25f + u_1 * xi_9 * -0.083333333333333329f + u_2 * xi_10 * -0.25f + u_2 * xi_5 * 0.16666666666666666f + xi_10 * 0.083333333333333329f + xi_5 * -0.083333333333333329f;
+          const float u0Mu1 = u_0 - u_1;
           const float u0Pu1 = u_0 + u_1;
           const float u1Pu2 = u_1 + u_2;
-          const float u1Mu2 = u_1 + u_2 * -1.0f;
-          const float u0Mu2 = u_0 + u_2 * -1.0f;
+          const float u1Mu2 = u_1 - u_2;
+          const float u0Mu2 = u_0 - u_2;
           const float u0Pu2 = u_0 + u_2;
-          const float f_eq_common = rho * -1.0f * u_0 * u_0 + rho * -1.0f * u_1 * u_1 + rho * -1.0f * u_2 * u_2 + rho;
-          _data_pdfs_20_30_10[ctr_0] = forceTerm_0 + omega_shear * (f_eq_common * 0.33333333333333331f + xi_10 * -1.0f) + xi_10;
-          _data_pdfs_20_31_10[ctr_0] = forceTerm_1 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * u_1 * u_1) + xi_5 * -0.5f + xi_6 * -0.5f) + rr_0 * (rho * u_1 * 0.16666666666666666f + xi_5 * -0.5f + xi_6 * 0.5f) + xi_5 + ((-1.0f <= grid_size * -1.0f + ((float)(ctr_1))) ? (rho * v_s * (u_0 * 2.0f + v_s) * 0.16666666666666666f) : (0.0f));
-          _data_pdfs_20_32_10[ctr_0] = forceTerm_2 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * u_1 * u_1) + xi_5 * -0.5f + xi_6 * -0.5f) + rr_0 * (rho * u_1 * -0.16666666666666666f + xi_5 * 0.5f + xi_6 * -0.5f) + xi_6 + ((0.0f >= ((float)(ctr_1))) ? (rho * v_s * (u_0 * -2.0f + v_s) * 0.16666666666666666f) : (0.0f));
-          _data_pdfs_20_33_10[ctr_0] = forceTerm_3 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * u_0 * u_0) + xi_11 * -0.5f + xi_12 * -0.5f) + rr_0 * (rho * u_0 * -0.16666666666666666f + xi_11 * 0.5f + xi_12 * -0.5f) + xi_12;
-          _data_pdfs_20_34_10[ctr_0] = forceTerm_4 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * u_0 * u_0) + xi_11 * -0.5f + xi_12 * -0.5f) + rr_0 * (rho * u_0 * 0.16666666666666666f + xi_11 * -0.5f + xi_12 * 0.5f) + xi_11;
-          _data_pdfs_20_35_10[ctr_0] = forceTerm_5 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * u_2 * u_2) + xi_15 * -0.5f + xi_3 * -0.5f) + rr_0 * (rho * u_2 * 0.16666666666666666f + xi_15 * -0.5f + xi_3 * 0.5f) + xi_15;
-          _data_pdfs_20_36_10[ctr_0] = forceTerm_6 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * u_2 * u_2) + xi_15 * -0.5f + xi_3 * -0.5f) + rr_0 * (rho * u_2 * -0.16666666666666666f + xi_15 * 0.5f + xi_3 * -0.5f) + xi_3;
-          _data_pdfs_20_37_10[ctr_0] = forceTerm_7 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * u_2 * u_2 + 0.125f * u0Mu1 * u0Mu1) + xi_22 * -0.5f + xi_24 * -0.5f) + rr_0 * (rho * u0Mu1 * -0.083333333333333329f + xi_22 * 0.5f + xi_24 * -0.5f) + xi_24 + ((-1.0f <= grid_size * -1.0f + ((float)(ctr_1))) ? (rho * v_s * (u_0 * -2.0f + u_1 * 3.0f + v_s * -1.0f + 1.0f) * 0.083333333333333329f) : (0.0f));
-          _data_pdfs_20_38_10[ctr_0] = forceTerm_8 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * u_2 * u_2 + 0.125f * u0Pu1 * u0Pu1) + xi_17 * -0.5f + xi_23 * -0.5f) + rr_0 * (rho * u0Pu1 * 0.083333333333333329f + xi_17 * -0.5f + xi_23 * 0.5f) + xi_17 + ((-1.0f <= grid_size * -1.0f + ((float)(ctr_1))) ? (rho * v_s * (u_0 * 2.0f + u_1 * 3.0f + v_s + 1.0f) * -0.083333333333333329f) : (0.0f));
-          _data_pdfs_20_39_10[ctr_0] = forceTerm_9 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * u_2 * u_2 + 0.125f * u0Pu1 * u0Pu1) + xi_17 * -0.5f + xi_23 * -0.5f) + rr_0 * (rho * u0Pu1 * -0.083333333333333329f + xi_17 * 0.5f + xi_23 * -0.5f) + xi_23 + ((0.0f >= ((float)(ctr_1))) ? (rho * v_s * (u_0 * 2.0f + u_1 * 3.0f + v_s * -1.0f - 1.0f) * 0.083333333333333329f) : (0.0f));
-          _data_pdfs_20_310_10[ctr_0] = forceTerm_10 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * u_2 * u_2 + 0.125f * u0Mu1 * u0Mu1) + xi_22 * -0.5f + xi_24 * -0.5f) + rr_0 * (rho * u0Mu1 * 0.083333333333333329f + xi_22 * -0.5f + xi_24 * 0.5f) + xi_22 + ((0.0f >= ((float)(ctr_1))) ? (rho * v_s * (u_0 * 2.0f + u_1 * -3.0f + v_s * -1.0f + 1.0f) * 0.083333333333333329f) : (0.0f));
-          _data_pdfs_20_311_10[ctr_0] = forceTerm_11 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * u_0 * u_0 + 0.125f * u1Pu2 * u1Pu2) + xi_19 * -0.5f + xi_7 * -0.5f) + rr_0 * (rho * u1Pu2 * 0.083333333333333329f + xi_19 * 0.5f + xi_7 * -0.5f) + xi_7;
-          _data_pdfs_20_312_10[ctr_0] = forceTerm_12 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * u_0 * u_0 + 0.125f * u1Mu2 * u1Mu2) + xi_18 * -0.5f + xi_20 * -0.5f) + rr_0 * (rho * u1Mu2 * -0.083333333333333329f + xi_18 * -0.5f + xi_20 * 0.5f) + xi_18;
-          _data_pdfs_20_313_10[ctr_0] = forceTerm_13 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * u_1 * u_1 + 0.125f * u0Mu2 * u0Mu2) + xi_8 * -0.5f + xi_9 * -0.5f) + rr_0 * (rho * u0Mu2 * -0.083333333333333329f + xi_8 * 0.5f + xi_9 * -0.5f) + xi_9;
-          _data_pdfs_20_314_10[ctr_0] = forceTerm_14 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * u_1 * u_1 + 0.125f * u0Pu2 * u0Pu2) + xi_13 * -0.5f + xi_16 * -0.5f) + rr_0 * (rho * u0Pu2 * 0.083333333333333329f + xi_13 * 0.5f + xi_16 * -0.5f) + xi_16;
-          _data_pdfs_20_315_10[ctr_0] = forceTerm_15 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * u_0 * u_0 + 0.125f * u1Mu2 * u1Mu2) + xi_18 * -0.5f + xi_20 * -0.5f) + rr_0 * (rho * u1Mu2 * 0.083333333333333329f + xi_18 * 0.5f + xi_20 * -0.5f) + xi_20;
-          _data_pdfs_20_316_10[ctr_0] = forceTerm_16 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * u_0 * u_0 + 0.125f * u1Pu2 * u1Pu2) + xi_19 * -0.5f + xi_7 * -0.5f) + rr_0 * (rho * u1Pu2 * -0.083333333333333329f + xi_19 * -0.5f + xi_7 * 0.5f) + xi_19;
-          _data_pdfs_20_317_10[ctr_0] = forceTerm_17 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * u_1 * u_1 + 0.125f * u0Pu2 * u0Pu2) + xi_13 * -0.5f + xi_16 * -0.5f) + rr_0 * (rho * u0Pu2 * -0.083333333333333329f + xi_13 * -0.5f + xi_16 * 0.5f) + xi_13;
-          _data_pdfs_20_318_10[ctr_0] = forceTerm_18 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * u_1 * u_1 + 0.125f * u0Mu2 * u0Mu2) + xi_8 * -0.5f + xi_9 * -0.5f) + rr_0 * (rho * u0Mu2 * 0.083333333333333329f + xi_8 * -0.5f + xi_9 * 0.5f) + xi_8;
+          const float f_eq_common = rho - rho * (u_0 * u_0) - rho * (u_1 * u_1) - rho * (u_2 * u_2);
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + ctr_0] = forceTerm_0 + omega_shear * (f_eq_common * 0.33333333333333331f - xi_17) + xi_17;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3 + ctr_0] = forceTerm_1 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * (u_1 * u_1)) + xi_19 * -0.5f + xi_8 * -0.5f) + rr_0 * (rho * u_1 * 0.16666666666666666f + xi_19 * 0.5f + xi_8 * -0.5f) + xi_8 + ((-1.0f <= -grid_size + ((float)(ctr_1))) ? (rho * v_s * (u_0 * 2.0f + v_s) * 0.16666666666666666f) : (0.0f));
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3 + ctr_0] = forceTerm_2 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * (u_1 * u_1)) + xi_19 * -0.5f + xi_8 * -0.5f) + rr_0 * (rho * u_1 * -0.16666666666666666f + xi_19 * -0.5f + xi_8 * 0.5f) + xi_19 + ((0.0f >= ((float)(ctr_1))) ? (rho * v_s * (u_0 * -2.0f + v_s) * 0.16666666666666666f) : (0.0f));
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3 + ctr_0] = forceTerm_3 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * (u_0 * u_0)) + xi_14 * -0.5f + xi_6 * -0.5f) + rr_0 * (rho * u_0 * -0.16666666666666666f + xi_14 * 0.5f + xi_6 * -0.5f) + xi_6;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3 + ctr_0] = forceTerm_4 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * (u_0 * u_0)) + xi_14 * -0.5f + xi_6 * -0.5f) + rr_0 * (rho * u_0 * 0.16666666666666666f + xi_14 * -0.5f + xi_6 * 0.5f) + xi_14;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3 + ctr_0] = forceTerm_5 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * (u_2 * u_2)) + xi_16 * -0.5f + xi_22 * -0.5f) + rr_0 * (rho * u_2 * 0.16666666666666666f + xi_16 * 0.5f + xi_22 * -0.5f) + xi_22;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3 + ctr_0] = forceTerm_6 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * (u_2 * u_2)) + xi_16 * -0.5f + xi_22 * -0.5f) + rr_0 * (rho * u_2 * -0.16666666666666666f + xi_16 * -0.5f + xi_22 * 0.5f) + xi_16;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3 + ctr_0] = forceTerm_7 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_2 * u_2) + 0.125f * (u0Mu1 * u0Mu1)) + xi_24 * -0.5f + xi_7 * -0.5f) + rr_0 * (rho * u0Mu1 * -0.083333333333333329f + xi_24 * 0.5f + xi_7 * -0.5f) + xi_7 + ((-1.0f <= -grid_size + ((float)(ctr_1))) ? (rho * v_s * (u_0 * -2.0f + u_1 * 3.0f - v_s + 1.0f) * 0.083333333333333329f) : (0.0f));
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3 + ctr_0] = forceTerm_8 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_2 * u_2) + 0.125f * (u0Pu1 * u0Pu1)) + xi_12 * -0.5f + xi_13 * -0.5f) + rr_0 * (rho * u0Pu1 * 0.083333333333333329f + xi_12 * -0.5f + xi_13 * 0.5f) + xi_12 + ((-1.0f <= -grid_size + ((float)(ctr_1))) ? (rho * v_s * (u_0 * 2.0f + u_1 * 3.0f + v_s + 1.0f) * -0.083333333333333329f) : (0.0f));
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3 + ctr_0] = forceTerm_9 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_2 * u_2) + 0.125f * (u0Pu1 * u0Pu1)) + xi_12 * -0.5f + xi_13 * -0.5f) + rr_0 * (rho * u0Pu1 * -0.083333333333333329f + xi_12 * 0.5f + xi_13 * -0.5f) + xi_13 + ((0.0f >= ((float)(ctr_1))) ? (rho * v_s * (u_0 * 2.0f + u_1 * 3.0f - v_s - 1.0f) * 0.083333333333333329f) : (0.0f));
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3 + ctr_0] = forceTerm_10 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_2 * u_2) + 0.125f * (u0Mu1 * u0Mu1)) + xi_24 * -0.5f + xi_7 * -0.5f) + rr_0 * (rho * u0Mu1 * 0.083333333333333329f + xi_24 * -0.5f + xi_7 * 0.5f) + xi_24 + ((0.0f >= ((float)(ctr_1))) ? (rho * v_s * (u_0 * 2.0f + u_1 * -3.0f - v_s + 1.0f) * 0.083333333333333329f) : (0.0f));
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3 + ctr_0] = forceTerm_11 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_0 * u_0) + 0.125f * (u1Pu2 * u1Pu2)) + xi_21 * -0.5f + xi_4 * -0.5f) + rr_0 * (rho * u1Pu2 * 0.083333333333333329f + xi_21 * -0.5f + xi_4 * 0.5f) + xi_21;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3 + ctr_0] = forceTerm_12 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_0 * u_0) + 0.125f * (u1Mu2 * u1Mu2)) + xi_20 * -0.5f + xi_23 * -0.5f) + rr_0 * (rho * u1Mu2 * -0.083333333333333329f + xi_20 * 0.5f + xi_23 * -0.5f) + xi_23;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3 + ctr_0] = forceTerm_13 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_1 * u_1) + 0.125f * (u0Mu2 * u0Mu2)) + xi_15 * -0.5f + xi_18 * -0.5f) + rr_0 * (rho * u0Mu2 * -0.083333333333333329f + xi_15 * -0.5f + xi_18 * 0.5f) + xi_15;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3 + ctr_0] = forceTerm_14 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_1 * u_1) + 0.125f * (u0Pu2 * u0Pu2)) + xi_11 * -0.5f + xi_3 * -0.5f) + rr_0 * (rho * u0Pu2 * 0.083333333333333329f + xi_11 * -0.5f + xi_3 * 0.5f) + xi_11;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3 + ctr_0] = forceTerm_15 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_0 * u_0) + 0.125f * (u1Mu2 * u1Mu2)) + xi_20 * -0.5f + xi_23 * -0.5f) + rr_0 * (rho * u1Mu2 * 0.083333333333333329f + xi_20 * -0.5f + xi_23 * 0.5f) + xi_20;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3 + ctr_0] = forceTerm_16 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_0 * u_0) + 0.125f * (u1Pu2 * u1Pu2)) + xi_21 * -0.5f + xi_4 * -0.5f) + rr_0 * (rho * u1Pu2 * -0.083333333333333329f + xi_21 * 0.5f + xi_4 * -0.5f) + xi_4;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3 + ctr_0] = forceTerm_17 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_1 * u_1) + 0.125f * (u0Pu2 * u0Pu2)) + xi_11 * -0.5f + xi_3 * -0.5f) + rr_0 * (rho * u0Pu2 * -0.083333333333333329f + xi_11 * 0.5f + xi_3 * -0.5f) + xi_3;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3 + ctr_0] = forceTerm_18 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_1 * u_1) + 0.125f * (u0Mu2 * u0Mu2)) + xi_15 * -0.5f + xi_18 * -0.5f) + rr_0 * (rho * u0Mu2 * 0.083333333333333329f + xi_15 * 0.5f + xi_18 * -0.5f) + xi_18;
         }
       }
     }
@@ -304,32 +259,33 @@ static FUNC_PREFIX void collidesweepsingleprecisionleesedwardsavx_collidesweepsi
 } // namespace internal_9a18f2f4073cdcc5365cdfddb752069e
 
 void CollideSweepSinglePrecisionLeesEdwardsAVX::run(IBlock *block) {
+
   auto pdfs = block->getData<field::GhostLayerField<float, 19>>(pdfsID);
   auto force = block->getData<field::GhostLayerField<float, 3>>(forceID);
 
+  auto &grid_size = this->grid_size_;
   auto &omega_shear = this->omega_shear_;
   auto &v_s = this->v_s_;
-  auto &grid_size = this->grid_size_;
-  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()))
   float *RESTRICT const _data_force = force->dataAt(0, 0, 0, 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()))
   float *RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(force->xSize()) + 0));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(force->xSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(force->ySize()) + 0));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(force->ySize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(force->zSize()) + 0));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(force->zSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(force->xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(force->xSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(force->ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(force->ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(force->zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(force->zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
   const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
@@ -340,6 +296,7 @@ void CollideSweepSinglePrecisionLeesEdwardsAVX::run(IBlock *block) {
 }
 
 void CollideSweepSinglePrecisionLeesEdwardsAVX::runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block) {
+
   CellInterval ci = globalCellInterval;
   CellInterval blockBB = blocks->getBlockCellBB(*block);
   blockBB.expand(ghostLayers);
@@ -351,33 +308,33 @@ void CollideSweepSinglePrecisionLeesEdwardsAVX::runOnCellInterval(const shared_p
   auto pdfs = block->getData<field::GhostLayerField<float, 19>>(pdfsID);
   auto force = block->getData<field::GhostLayerField<float, 3>>(forceID);
 
+  auto &grid_size = this->grid_size_;
   auto &omega_shear = this->omega_shear_;
   auto &v_s = this->v_s_;
-  auto &grid_size = this->grid_size_;
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()))
   float *RESTRICT const _data_force = force->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()))
   float *RESTRICT _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(ci.xSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(ci.ySize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(ci.zSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(ci.xSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(ci.ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(ci.zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
   const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
@@ -396,4 +353,4 @@ void CollideSweepSinglePrecisionLeesEdwardsAVX::runOnCellInterval(const shared_p
 
 #if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
 #pragma warning pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsAVX.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsAVX.h
index 1885949a8e9..2819ac83a73 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsAVX.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsAVX.h
@@ -17,12 +17,13 @@
 //! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2,
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
 // lbmpy_walberla/pystencils_walberla from waLBerla commit
-// 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #pragma once
 #include "core/DataTypes.h"
+#include "core/logging/Logging.h"
 
 #include "domain_decomposition/BlockDataID.h"
 #include "domain_decomposition/IBlock.h"
@@ -93,6 +94,9 @@ class CollideSweepSinglePrecisionLeesEdwardsAVX {
     };
   }
 
+  void configure(const shared_ptr<StructuredBlockStorage> &blocks,
+                 IBlock *block) {} // intentionally empty: neither argument is read. NOTE(review): presumably a hook kept for interface parity with other generated sweeps - confirm
+
   BlockDataID forceID;
   BlockDataID pdfsID;
   float grid_size_;
@@ -106,4 +110,4 @@ class CollideSweepSinglePrecisionLeesEdwardsAVX {
 #if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
     (defined WALBERLA_CXX_COMPILER_IS_CLANG)
 #pragma GCC diagnostic pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsCUDA.cu b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsCUDA.cu
new file mode 100644
index 00000000000..69f29a89ba6
--- /dev/null
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsCUDA.cu
@@ -0,0 +1,250 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file CollideSweepSinglePrecisionLeesEdwardsCUDA.cu
+//! \\author pystencils
+//======================================================================================================================
+
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
+
+#include <cmath>
+
+#include "CollideSweepSinglePrecisionLeesEdwardsCUDA.h"
+#include "core/DataTypes.h"
+#include "core/Macros.h"
+
+#define FUNC_PREFIX __global__
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) || (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wfloat-equal"
+#pragma GCC diagnostic ignored "-Wshadow"
+#pragma GCC diagnostic ignored "-Wconversion"
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#endif
+
+#if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
+#pragma warning push
+#pragma warning(disable : 1599)
+#endif
+
+using namespace std;
+
+namespace walberla {
+namespace pystencils {
+
+namespace internal_collidesweepsingleprecisionleesedwardscuda_collidesweepsingleprecisionleesedwardscuda { // holds the __global__ kernel launched by CollideSweepSinglePrecisionLeesEdwardsCUDA::run
+static FUNC_PREFIX __launch_bounds__(256) void collidesweepsingleprecisionleesedwardscuda_collidesweepsingleprecisionleesedwardscuda(float *RESTRICT const _data_force, float *RESTRICT _data_pdfs, int64_t const _size_force_0, int64_t const _size_force_1, int64_t const _size_force_2, int64_t const _stride_force_0, int64_t const _stride_force_1, int64_t const _stride_force_2, int64_t const _stride_force_3, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, float grid_size, float omega_shear, float v_s) { // each thread handles one cell: reads _data_force, updates _data_pdfs in place
+  if (blockDim.x * blockIdx.x + threadIdx.x < _size_force_0 && blockDim.y * blockIdx.y + threadIdx.y < _size_force_1 && blockDim.z * blockIdx.z + threadIdx.z < _size_force_2) { // bounds guard: threads outside the _size_force_* extents do nothing
+    const int64_t ctr_0 = blockDim.x * blockIdx.x + threadIdx.x; // x index of this thread's cell
+    const int64_t ctr_1 = blockDim.y * blockIdx.y + threadIdx.y; // y index of this thread's cell
+    const int64_t ctr_2 = blockDim.z * blockIdx.z + threadIdx.z; // z index of this thread's cell
+    const float xi_25 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2]; // xi_25..xi_46: load the 19 pdf components (f offsets 0..18) and the 3 force components of this cell
+    const float xi_26 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3];
+    const float xi_27 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3];
+    const float xi_28 = _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3];
+    const float xi_29 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3];
+    const float xi_30 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3];
+    const float xi_31 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3];
+    const float xi_32 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3];
+    const float xi_33 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3];
+    const float xi_34 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3];
+    const float xi_35 = _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3];
+    const float xi_36 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3];
+    const float xi_37 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3];
+    const float xi_38 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3];
+    const float xi_39 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3];
+    const float xi_40 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3];
+    const float xi_41 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3];
+    const float xi_42 = _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2];
+    const float xi_43 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3];
+    const float xi_44 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3];
+    const float xi_45 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3];
+    const float xi_46 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3];
+    const float xi_3 = xi_41; // xi_3..xi_24: plain copies of the values loaded above (xi_5, xi_9, xi_10 are force components 2, 1, 0; the rest are pdfs)
+    const float xi_4 = xi_36;
+    const float xi_5 = xi_28;
+    const float xi_6 = xi_45;
+    const float xi_7 = xi_46;
+    const float xi_8 = xi_40;
+    const float xi_9 = xi_35;
+    const float xi_10 = xi_42;
+    const float xi_11 = xi_33;
+    const float xi_12 = xi_34;
+    const float xi_13 = xi_32;
+    const float xi_14 = xi_38;
+    const float xi_15 = xi_39;
+    const float xi_16 = xi_44;
+    const float xi_17 = xi_25;
+    const float xi_18 = xi_30;
+    const float xi_19 = xi_31;
+    const float xi_20 = xi_26;
+    const float xi_21 = xi_27;
+    const float xi_22 = xi_29;
+    const float xi_23 = xi_37;
+    const float xi_24 = xi_43;
+    const float xi_0 = ((1.0f) / (omega_shear * -0.25f + 2.0f)); // xi_0 and rr_0 depend only on omega_shear
+    const float rr_0 = xi_0 * (omega_shear * -2.0f + 4.0f);
+    const float vel0Term = xi_11 + xi_12 + xi_14 + xi_18 + xi_24;
+    const float vel1Term = xi_20 + xi_21 + xi_7 + xi_8;
+    const float vel2Term = xi_15 + xi_22 + xi_23;
+    const float rho = vel0Term + vel1Term + vel2Term + xi_13 + xi_16 + xi_17 + xi_19 + xi_3 + xi_4 + xi_6; // sum of all 19 pdf values of the cell
+    const float xi_1 = ((1.0f) / (rho)); // NOTE(review): no guard against rho == 0 - confirm callers never pass an all-zero cell
+    const float u_0 = xi_1 * xi_10 * 0.5f + xi_1 * (vel0Term - xi_13 - xi_15 - xi_3 - xi_6 - xi_7); // u_0..u_2: pdf moment plus half of the matching force component, both scaled by 1/rho
+    const float u_1 = xi_1 * xi_9 * 0.5f + xi_1 * (vel1Term + xi_12 - xi_13 - xi_19 - xi_23 - xi_24 - xi_4);
+    const float u_2 = xi_1 * xi_5 * 0.5f + xi_1 * (vel2Term + xi_11 - xi_16 - xi_18 - xi_20 + xi_21 - xi_3 - xi_4);
+    const float forceTerm_0 = omega_shear * u_0 * xi_10 * 0.5f + omega_shear * u_1 * xi_9 * 0.5f + omega_shear * u_2 * xi_5 * 0.5f - u_0 * xi_10 - u_1 * xi_9 - u_2 * xi_5; // forceTerm_0..forceTerm_18: one force contribution per pdf component, added in the writes below
+    const float forceTerm_1 = omega_shear * u_0 * xi_10 * 0.083333333333333329f + omega_shear * u_1 * xi_9 * -0.16666666666666666f + omega_shear * u_2 * xi_5 * 0.083333333333333329f + rr_0 * xi_9 * -0.083333333333333329f + u_0 * xi_10 * -0.16666666666666666f + u_1 * xi_9 * 0.33333333333333331f + u_2 * xi_5 * -0.16666666666666666f + xi_9 * 0.16666666666666666f;
+    const float forceTerm_2 = omega_shear * u_0 * xi_10 * 0.083333333333333329f + omega_shear * u_1 * xi_9 * -0.16666666666666666f + omega_shear * u_2 * xi_5 * 0.083333333333333329f + rr_0 * xi_9 * 0.083333333333333329f + u_0 * xi_10 * -0.16666666666666666f + u_1 * xi_9 * 0.33333333333333331f + u_2 * xi_5 * -0.16666666666666666f + xi_9 * -0.16666666666666666f;
+    const float forceTerm_3 = omega_shear * u_0 * xi_10 * -0.16666666666666666f + omega_shear * u_1 * xi_9 * 0.083333333333333329f + omega_shear * u_2 * xi_5 * 0.083333333333333329f + rr_0 * xi_10 * 0.083333333333333329f + u_0 * xi_10 * 0.33333333333333331f + u_1 * xi_9 * -0.16666666666666666f + u_2 * xi_5 * -0.16666666666666666f + xi_10 * -0.16666666666666666f;
+    const float forceTerm_4 = omega_shear * u_0 * xi_10 * -0.16666666666666666f + omega_shear * u_1 * xi_9 * 0.083333333333333329f + omega_shear * u_2 * xi_5 * 0.083333333333333329f + rr_0 * xi_10 * -0.083333333333333329f + u_0 * xi_10 * 0.33333333333333331f + u_1 * xi_9 * -0.16666666666666666f + u_2 * xi_5 * -0.16666666666666666f + xi_10 * 0.16666666666666666f;
+    const float forceTerm_5 = omega_shear * u_0 * xi_10 * 0.083333333333333329f + omega_shear * u_1 * xi_9 * 0.083333333333333329f + omega_shear * u_2 * xi_5 * -0.16666666666666666f + rr_0 * xi_5 * -0.083333333333333329f + u_0 * xi_10 * -0.16666666666666666f + u_1 * xi_9 * -0.16666666666666666f + u_2 * xi_5 * 0.33333333333333331f + xi_5 * 0.16666666666666666f;
+    const float forceTerm_6 = omega_shear * u_0 * xi_10 * 0.083333333333333329f + omega_shear * u_1 * xi_9 * 0.083333333333333329f + omega_shear * u_2 * xi_5 * -0.16666666666666666f + rr_0 * xi_5 * 0.083333333333333329f + u_0 * xi_10 * -0.16666666666666666f + u_1 * xi_9 * -0.16666666666666666f + u_2 * xi_5 * 0.33333333333333331f + xi_5 * -0.16666666666666666f;
+    const float forceTerm_7 = omega_shear * u_0 * xi_10 * -0.083333333333333329f + omega_shear * u_0 * xi_9 * 0.125f + omega_shear * u_1 * xi_10 * 0.125f + omega_shear * u_1 * xi_9 * -0.083333333333333329f + omega_shear * u_2 * xi_5 * 0.041666666666666664f + rr_0 * xi_10 * 0.041666666666666664f + rr_0 * xi_9 * -0.041666666666666664f + u_0 * xi_10 * 0.16666666666666666f + u_0 * xi_9 * -0.25f + u_1 * xi_10 * -0.25f + u_1 * xi_9 * 0.16666666666666666f + u_2 * xi_5 * -0.083333333333333329f + xi_10 * -0.083333333333333329f + xi_9 * 0.083333333333333329f;
+    const float forceTerm_8 = omega_shear * u_0 * xi_10 * -0.083333333333333329f + omega_shear * u_0 * xi_9 * -0.125f + omega_shear * u_1 * xi_10 * -0.125f + omega_shear * u_1 * xi_9 * -0.083333333333333329f + omega_shear * u_2 * xi_5 * 0.041666666666666664f + rr_0 * xi_10 * -0.041666666666666664f + rr_0 * xi_9 * -0.041666666666666664f + u_0 * xi_10 * 0.16666666666666666f + u_0 * xi_9 * 0.25f + u_1 * xi_10 * 0.25f + u_1 * xi_9 * 0.16666666666666666f + u_2 * xi_5 * -0.083333333333333329f + xi_10 * 0.083333333333333329f + xi_9 * 0.083333333333333329f;
+    const float forceTerm_9 = omega_shear * u_0 * xi_10 * -0.083333333333333329f + omega_shear * u_0 * xi_9 * -0.125f + omega_shear * u_1 * xi_10 * -0.125f + omega_shear * u_1 * xi_9 * -0.083333333333333329f + omega_shear * u_2 * xi_5 * 0.041666666666666664f + rr_0 * xi_10 * 0.041666666666666664f + rr_0 * xi_9 * 0.041666666666666664f + u_0 * xi_10 * 0.16666666666666666f + u_0 * xi_9 * 0.25f + u_1 * xi_10 * 0.25f + u_1 * xi_9 * 0.16666666666666666f + u_2 * xi_5 * -0.083333333333333329f + xi_10 * -0.083333333333333329f + xi_9 * -0.083333333333333329f;
+    const float forceTerm_10 = omega_shear * u_0 * xi_10 * -0.083333333333333329f + omega_shear * u_0 * xi_9 * 0.125f + omega_shear * u_1 * xi_10 * 0.125f + omega_shear * u_1 * xi_9 * -0.083333333333333329f + omega_shear * u_2 * xi_5 * 0.041666666666666664f + rr_0 * xi_10 * -0.041666666666666664f + rr_0 * xi_9 * 0.041666666666666664f + u_0 * xi_10 * 0.16666666666666666f + u_0 * xi_9 * -0.25f + u_1 * xi_10 * -0.25f + u_1 * xi_9 * 0.16666666666666666f + u_2 * xi_5 * -0.083333333333333329f + xi_10 * 0.083333333333333329f + xi_9 * -0.083333333333333329f;
+    const float forceTerm_11 = omega_shear * u_0 * xi_10 * 0.041666666666666664f + omega_shear * u_1 * xi_5 * -0.125f + omega_shear * u_1 * xi_9 * -0.083333333333333329f + omega_shear * u_2 * xi_5 * -0.083333333333333329f + omega_shear * u_2 * xi_9 * -0.125f + rr_0 * xi_5 * -0.041666666666666664f + rr_0 * xi_9 * -0.041666666666666664f + u_0 * xi_10 * -0.083333333333333329f + u_1 * xi_5 * 0.25f + u_1 * xi_9 * 0.16666666666666666f + u_2 * xi_5 * 0.16666666666666666f + u_2 * xi_9 * 0.25f + xi_5 * 0.083333333333333329f + xi_9 * 0.083333333333333329f;
+    const float forceTerm_12 = omega_shear * u_0 * xi_10 * 0.041666666666666664f + omega_shear * u_1 * xi_5 * 0.125f + omega_shear * u_1 * xi_9 * -0.083333333333333329f + omega_shear * u_2 * xi_5 * -0.083333333333333329f + omega_shear * u_2 * xi_9 * 0.125f + rr_0 * xi_5 * -0.041666666666666664f + rr_0 * xi_9 * 0.041666666666666664f + u_0 * xi_10 * -0.083333333333333329f + u_1 * xi_5 * -0.25f + u_1 * xi_9 * 0.16666666666666666f + u_2 * xi_5 * 0.16666666666666666f + u_2 * xi_9 * -0.25f + xi_5 * 0.083333333333333329f + xi_9 * -0.083333333333333329f;
+    const float forceTerm_13 = omega_shear * u_0 * xi_10 * -0.083333333333333329f + omega_shear * u_0 * xi_5 * 0.125f + omega_shear * u_1 * xi_9 * 0.041666666666666664f + omega_shear * u_2 * xi_10 * 0.125f + omega_shear * u_2 * xi_5 * -0.083333333333333329f + rr_0 * xi_10 * 0.041666666666666664f + rr_0 * xi_5 * -0.041666666666666664f + u_0 * xi_10 * 0.16666666666666666f + u_0 * xi_5 * -0.25f + u_1 * xi_9 * -0.083333333333333329f + u_2 * xi_10 * -0.25f + u_2 * xi_5 * 0.16666666666666666f + xi_10 * -0.083333333333333329f + xi_5 * 0.083333333333333329f;
+    const float forceTerm_14 = omega_shear * u_0 * xi_10 * -0.083333333333333329f + omega_shear * u_0 * xi_5 * -0.125f + omega_shear * u_1 * xi_9 * 0.041666666666666664f + omega_shear * u_2 * xi_10 * -0.125f + omega_shear * u_2 * xi_5 * -0.083333333333333329f + rr_0 * xi_10 * -0.041666666666666664f + rr_0 * xi_5 * -0.041666666666666664f + u_0 * xi_10 * 0.16666666666666666f + u_0 * xi_5 * 0.25f + u_1 * xi_9 * -0.083333333333333329f + u_2 * xi_10 * 0.25f + u_2 * xi_5 * 0.16666666666666666f + xi_10 * 0.083333333333333329f + xi_5 * 0.083333333333333329f;
+    const float forceTerm_15 = omega_shear * u_0 * xi_10 * 0.041666666666666664f + omega_shear * u_1 * xi_5 * 0.125f + omega_shear * u_1 * xi_9 * -0.083333333333333329f + omega_shear * u_2 * xi_5 * -0.083333333333333329f + omega_shear * u_2 * xi_9 * 0.125f + rr_0 * xi_5 * 0.041666666666666664f + rr_0 * xi_9 * -0.041666666666666664f + u_0 * xi_10 * -0.083333333333333329f + u_1 * xi_5 * -0.25f + u_1 * xi_9 * 0.16666666666666666f + u_2 * xi_5 * 0.16666666666666666f + u_2 * xi_9 * -0.25f + xi_5 * -0.083333333333333329f + xi_9 * 0.083333333333333329f;
+    const float forceTerm_16 = omega_shear * u_0 * xi_10 * 0.041666666666666664f + omega_shear * u_1 * xi_5 * -0.125f + omega_shear * u_1 * xi_9 * -0.083333333333333329f + omega_shear * u_2 * xi_5 * -0.083333333333333329f + omega_shear * u_2 * xi_9 * -0.125f + rr_0 * xi_5 * 0.041666666666666664f + rr_0 * xi_9 * 0.041666666666666664f + u_0 * xi_10 * -0.083333333333333329f + u_1 * xi_5 * 0.25f + u_1 * xi_9 * 0.16666666666666666f + u_2 * xi_5 * 0.16666666666666666f + u_2 * xi_9 * 0.25f + xi_5 * -0.083333333333333329f + xi_9 * -0.083333333333333329f;
+    const float forceTerm_17 = omega_shear * u_0 * xi_10 * -0.083333333333333329f + omega_shear * u_0 * xi_5 * -0.125f + omega_shear * u_1 * xi_9 * 0.041666666666666664f + omega_shear * u_2 * xi_10 * -0.125f + omega_shear * u_2 * xi_5 * -0.083333333333333329f + rr_0 * xi_10 * 0.041666666666666664f + rr_0 * xi_5 * 0.041666666666666664f + u_0 * xi_10 * 0.16666666666666666f + u_0 * xi_5 * 0.25f + u_1 * xi_9 * -0.083333333333333329f + u_2 * xi_10 * 0.25f + u_2 * xi_5 * 0.16666666666666666f + xi_10 * -0.083333333333333329f + xi_5 * -0.083333333333333329f;
+    const float forceTerm_18 = omega_shear * u_0 * xi_10 * -0.083333333333333329f + omega_shear * u_0 * xi_5 * 0.125f + omega_shear * u_1 * xi_9 * 0.041666666666666664f + omega_shear * u_2 * xi_10 * 0.125f + omega_shear * u_2 * xi_5 * -0.083333333333333329f + rr_0 * xi_10 * -0.041666666666666664f + rr_0 * xi_5 * 0.041666666666666664f + u_0 * xi_10 * 0.16666666666666666f + u_0 * xi_5 * -0.25f + u_1 * xi_9 * -0.083333333333333329f + u_2 * xi_10 * -0.25f + u_2 * xi_5 * 0.16666666666666666f + xi_10 * 0.083333333333333329f + xi_5 * -0.083333333333333329f;
+    const float u0Mu1 = u_0 - u_1; // pairwise velocity sums/differences reused by components 7..18
+    const float u0Pu1 = u_0 + u_1;
+    const float u1Pu2 = u_1 + u_2;
+    const float u1Mu2 = u_1 - u_2;
+    const float u0Mu2 = u_0 - u_2;
+    const float u0Pu2 = u_0 + u_2;
+    const float f_eq_common = rho - rho * u_0 * u_0 - rho * u_1 * u_1 - rho * u_2 * u_2; // term shared by every component's omega_shear expression below
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2] = forceTerm_0 + omega_shear * (f_eq_common * 0.33333333333333331f - xi_17) + xi_17; // from here on: overwrite all 19 pdf components of this cell in place
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3] = forceTerm_1 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * (u_1 * u_1)) + xi_19 * -0.5f + xi_8 * -0.5f) + rr_0 * (rho * u_1 * 0.16666666666666666f + xi_19 * 0.5f + xi_8 * -0.5f) + xi_8 + ((-1.0f <= -grid_size + ((float)(ctr_1))) ? (rho * v_s * (u_0 * 2.0f + v_s) * 0.16666666666666666f) : (0.0f)); // v_s-dependent term added only where -1 <= ctr_1 - grid_size (same guard on components 7 and 8)
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3] = forceTerm_2 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * (u_1 * u_1)) + xi_19 * -0.5f + xi_8 * -0.5f) + rr_0 * (rho * u_1 * -0.16666666666666666f + xi_19 * -0.5f + xi_8 * 0.5f) + xi_19 + ((0.0f >= ((float)(ctr_1))) ? (rho * v_s * (u_0 * -2.0f + v_s) * 0.16666666666666666f) : (0.0f)); // v_s-dependent term added only where ctr_1 <= 0 (same guard on components 9 and 10)
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3] = forceTerm_3 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * (u_0 * u_0)) + xi_14 * -0.5f + xi_6 * -0.5f) + rr_0 * (rho * u_0 * -0.16666666666666666f + xi_14 * 0.5f + xi_6 * -0.5f) + xi_6;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3] = forceTerm_4 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * (u_0 * u_0)) + xi_14 * -0.5f + xi_6 * -0.5f) + rr_0 * (rho * u_0 * 0.16666666666666666f + xi_14 * -0.5f + xi_6 * 0.5f) + xi_14;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3] = forceTerm_5 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * (u_2 * u_2)) + xi_16 * -0.5f + xi_22 * -0.5f) + rr_0 * (rho * u_2 * 0.16666666666666666f + xi_16 * 0.5f + xi_22 * -0.5f) + xi_22;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3] = forceTerm_6 + omega_shear * (f_eq_common * 0.16666666666666666f + rho * (-0.1111111111111111f + 0.33333333333333331f * (u_2 * u_2)) + xi_16 * -0.5f + xi_22 * -0.5f) + rr_0 * (rho * u_2 * -0.16666666666666666f + xi_16 * -0.5f + xi_22 * 0.5f) + xi_16;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3] = forceTerm_7 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_2 * u_2) + 0.125f * (u0Mu1 * u0Mu1)) + xi_24 * -0.5f + xi_7 * -0.5f) + rr_0 * (rho * u0Mu1 * -0.083333333333333329f + xi_24 * 0.5f + xi_7 * -0.5f) + xi_7 + ((-1.0f <= -grid_size + ((float)(ctr_1))) ? (rho * v_s * (u_0 * -2.0f + u_1 * 3.0f - v_s + 1.0f) * 0.083333333333333329f) : (0.0f));
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3] = forceTerm_8 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_2 * u_2) + 0.125f * (u0Pu1 * u0Pu1)) + xi_12 * -0.5f + xi_13 * -0.5f) + rr_0 * (rho * u0Pu1 * 0.083333333333333329f + xi_12 * -0.5f + xi_13 * 0.5f) + xi_12 + ((-1.0f <= -grid_size + ((float)(ctr_1))) ? (rho * v_s * (u_0 * 2.0f + u_1 * 3.0f + v_s + 1.0f) * -0.083333333333333329f) : (0.0f));
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3] = forceTerm_9 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_2 * u_2) + 0.125f * (u0Pu1 * u0Pu1)) + xi_12 * -0.5f + xi_13 * -0.5f) + rr_0 * (rho * u0Pu1 * -0.083333333333333329f + xi_12 * 0.5f + xi_13 * -0.5f) + xi_13 + ((0.0f >= ((float)(ctr_1))) ? (rho * v_s * (u_0 * 2.0f + u_1 * 3.0f - v_s - 1.0f) * 0.083333333333333329f) : (0.0f));
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3] = forceTerm_10 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_2 * u_2) + 0.125f * (u0Mu1 * u0Mu1)) + xi_24 * -0.5f + xi_7 * -0.5f) + rr_0 * (rho * u0Mu1 * 0.083333333333333329f + xi_24 * -0.5f + xi_7 * 0.5f) + xi_24 + ((0.0f >= ((float)(ctr_1))) ? (rho * v_s * (u_0 * 2.0f + u_1 * -3.0f - v_s + 1.0f) * 0.083333333333333329f) : (0.0f));
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3] = forceTerm_11 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_0 * u_0) + 0.125f * (u1Pu2 * u1Pu2)) + xi_21 * -0.5f + xi_4 * -0.5f) + rr_0 * (rho * u1Pu2 * 0.083333333333333329f + xi_21 * -0.5f + xi_4 * 0.5f) + xi_21;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3] = forceTerm_12 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_0 * u_0) + 0.125f * (u1Mu2 * u1Mu2)) + xi_20 * -0.5f + xi_23 * -0.5f) + rr_0 * (rho * u1Mu2 * -0.083333333333333329f + xi_20 * 0.5f + xi_23 * -0.5f) + xi_23;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3] = forceTerm_13 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_1 * u_1) + 0.125f * (u0Mu2 * u0Mu2)) + xi_15 * -0.5f + xi_18 * -0.5f) + rr_0 * (rho * u0Mu2 * -0.083333333333333329f + xi_15 * -0.5f + xi_18 * 0.5f) + xi_15;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3] = forceTerm_14 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_1 * u_1) + 0.125f * (u0Pu2 * u0Pu2)) + xi_11 * -0.5f + xi_3 * -0.5f) + rr_0 * (rho * u0Pu2 * 0.083333333333333329f + xi_11 * -0.5f + xi_3 * 0.5f) + xi_11;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3] = forceTerm_15 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_0 * u_0) + 0.125f * (u1Mu2 * u1Mu2)) + xi_20 * -0.5f + xi_23 * -0.5f) + rr_0 * (rho * u1Mu2 * 0.083333333333333329f + xi_20 * -0.5f + xi_23 * 0.5f) + xi_20;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3] = forceTerm_16 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_0 * u_0) + 0.125f * (u1Pu2 * u1Pu2)) + xi_21 * -0.5f + xi_4 * -0.5f) + rr_0 * (rho * u1Pu2 * -0.083333333333333329f + xi_21 * 0.5f + xi_4 * -0.5f) + xi_4;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3] = forceTerm_17 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_1 * u_1) + 0.125f * (u0Pu2 * u0Pu2)) + xi_11 * -0.5f + xi_3 * -0.5f) + rr_0 * (rho * u0Pu2 * -0.083333333333333329f + xi_11 * 0.5f + xi_3 * -0.5f) + xi_3;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3] = forceTerm_18 + omega_shear * (f_eq_common * 0.041666666666666664f + rho * (-0.013888888888888888f + 0.041666666666666664f * (u_1 * u_1) + 0.125f * (u0Mu2 * u0Mu2)) + xi_15 * -0.5f + xi_18 * -0.5f) + rr_0 * (rho * u0Mu2 * 0.083333333333333329f + xi_15 * 0.5f + xi_18 * -0.5f) + xi_18;
+  }
+}
+} // namespace internal_collidesweepsingleprecisionleesedwardscuda_collidesweepsingleprecisionleesedwardscuda
+
+void CollideSweepSinglePrecisionLeesEdwardsCUDA::run(IBlock *block, gpuStream_t stream) { // launches the collide kernel once over the full xSize/ySize/zSize extent of this block's force field, on the given stream
+
+  auto pdfs = block->getData<gpu::GPUField<float>>(pdfsID); // GPU fields are looked up on the block by their stored IDs
+  auto force = block->getData<gpu::GPUField<float>>(forceID);
+
+  auto &v_s = this->v_s_; // scalar kernel arguments come from the sweep's members
+  auto &omega_shear = this->omega_shear_;
+  auto &grid_size = this->grid_size_;
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()))
+  float *RESTRICT const _data_force = force->dataAt(0, 0, 0, 0); // base pointer at cell (0, 0, 0), component 0
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()))
+  float *RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0); // base pointer at cell (0, 0, 0), component 0
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(force->xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(force->xSize()) + 0); // iteration extents are taken from the force field only; NOTE(review): pdfs is presumably the same size - confirm
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(force->ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(force->ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(force->zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(force->zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  const int64_t _stride_force_0 = int64_t(force->xStride());
+  const int64_t _stride_force_1 = int64_t(force->yStride());
+  const int64_t _stride_force_2 = int64_t(force->zStride());
+  const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
+  const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
+  const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
+  const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
+  const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
+  dim3 _block(uint32_c(((128 < _size_force_0) ? 128 : _size_force_0)), uint32_c(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))), uint32_c(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))))); // thread-block shape from the field sizes: x capped at 128, y at 1024, z at 64; the _grid statement that follows is the per-axis ceiling division of the sizes by this shape. NOTE(review): divides by _size_force_0 when it is below 128, so a zero-sized field would divide by zero - confirm callers never pass one
+  dim3 _grid(uint32_c(((_size_force_0) % (((128 < _size_force_0) ? 128 : _size_force_0)) == 0 ? (int64_t)(_size_force_0) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)) : ((int64_t)(_size_force_0) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))) + 1)), uint32_c(((_size_force_1) % (((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))) == 0 ? (int64_t)(_size_force_1) / (int64_t)(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))) : ((int64_t)(_size_force_1) / (int64_t)(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) + 1)), uint32_c(((_size_force_2) % (((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? 
_size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))))) == 0 ? (int64_t)(_size_force_2) / (int64_t)(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 
128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))))) : ((int64_t)(_size_force_2) / (int64_t)(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))))) + 1)));
+  internal_collidesweepsingleprecisionleesedwardscuda_collidesweepsingleprecisionleesedwardscuda::collidesweepsingleprecisionleesedwardscuda_collidesweepsingleprecisionleesedwardscuda<<<_grid, _block, 0, stream>>>(_data_force, _data_pdfs, _size_force_0, _size_force_1, _size_force_2, _stride_force_0, _stride_force_1, _stride_force_2, _stride_force_3, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, grid_size, omega_shear, v_s); // launch on the caller's stream with no dynamic shared memory; nothing here waits for completion
+}
+
+/**
+ * Launch the generated kernel on a sub-region of one block.
+ *
+ * The requested interval is intersected with the block's cell bounding box
+ * (expanded by @p ghostLayers) and converted to block-local coordinates.
+ * Nothing is launched when the resulting interval is empty.
+ *
+ * @param blocks             block storage used for the bounding-box lookup and
+ *                           the global-to-local coordinate transformation
+ * @param globalCellInterval cell interval in global coordinates
+ * @param ghostLayers        amount by which the block bounding box is expanded
+ *                           before intersecting
+ * @param block              block whose force and pdfs GPU fields are used
+ * @param stream             stream handed to the kernel launch
+ */
+void CollideSweepSinglePrecisionLeesEdwardsCUDA::runOnCellInterval(
+    const shared_ptr<StructuredBlockStorage> &blocks,
+    const CellInterval &globalCellInterval, cell_idx_t ghostLayers,
+    IBlock *block, gpuStream_t stream) {
+
+  // Restrict the request to this block (including the requested ghost layers)
+  // and switch to block-local cell coordinates.
+  CellInterval ci = globalCellInterval;
+  CellInterval expandedBlockBB = blocks->getBlockCellBB(*block);
+  expandedBlockBB.expand(ghostLayers);
+  ci.intersect(expandedBlockBB);
+  blocks->transformGlobalToBlockLocalCellInterval(ci, *block);
+  if (ci.empty()) {
+    return;
+  }
+
+  auto pdfs = block->getData<gpu::GPUField<float>>(pdfsID);
+  auto force = block->getData<gpu::GPUField<float>>(forceID);
+
+  // Scalar kernel arguments, taken from the members set in the constructor.
+  auto &v_s = v_s_;
+  auto &omega_shear = omega_shear_;
+  auto &grid_size = grid_size_;
+
+  // Base pointers at the lower corner of the interval, component 0.
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()))
+  float *RESTRICT const _data_force = force->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()))
+  float *RESTRICT _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+
+  // Iteration extents: the size of the clipped interval along x, y and z.
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(ci.xSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(ci.ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(ci.zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+
+  // Strides of both fields along x, y, z and the component index f.
+  const int64_t _stride_force_0 = int64_t(force->xStride());
+  const int64_t _stride_force_1 = int64_t(force->yStride());
+  const int64_t _stride_force_2 = int64_t(force->zStride());
+  const int64_t _stride_force_3 = int64_t(force->fStride());
+  const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
+  const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
+  const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
+  const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride());
+
+  // Thread-block extents.
+  //   x: min(128, size_x)
+  //   y: min(1024, min(size_y, 2 * (128 / x)))
+  //   z: min(64, min(size_z, 256 / (x * min(size_y, 2 * (128 / x)))))
+  // All divisions are integer divisions on int64_t. The interval is non-empty
+  // at this point, so every divisor below is at least 1.
+  const int64_t threadsX = (128 < _size_force_0) ? 128 : _size_force_0;
+  const int64_t yLimit = 2 * ((int64_t)(128) / threadsX);
+  const int64_t yFit = (_size_force_1 < yLimit) ? _size_force_1 : yLimit;
+  const int64_t threadsY = (1024 < yFit) ? 1024 : yFit;
+  const int64_t zLimit = (int64_t)(256) / (threadsX * yFit);
+  const int64_t zFit = (_size_force_2 < zLimit) ? _size_force_2 : zLimit;
+  const int64_t threadsZ = (64 < zFit) ? 64 : zFit;
+
+  // Number of thread blocks per direction: extent divided by the thread-block
+  // extent, rounded up.
+  const int64_t blocksX = (_size_force_0 % threadsX == 0)
+                              ? _size_force_0 / threadsX
+                              : _size_force_0 / threadsX + 1;
+  const int64_t blocksY = (_size_force_1 % threadsY == 0)
+                              ? _size_force_1 / threadsY
+                              : _size_force_1 / threadsY + 1;
+  const int64_t blocksZ = (_size_force_2 % threadsZ == 0)
+                              ? _size_force_2 / threadsZ
+                              : _size_force_2 / threadsZ + 1;
+
+  dim3 _block(uint32_c(threadsX), uint32_c(threadsY), uint32_c(threadsZ));
+  dim3 _grid(uint32_c(blocksX), uint32_c(blocksY), uint32_c(blocksZ));
+
+  internal_collidesweepsingleprecisionleesedwardscuda_collidesweepsingleprecisionleesedwardscuda::collidesweepsingleprecisionleesedwardscuda_collidesweepsingleprecisionleesedwardscuda<<<_grid, _block, 0, stream>>>(
+      _data_force, _data_pdfs,
+      _size_force_0, _size_force_1, _size_force_2,
+      _stride_force_0, _stride_force_1, _stride_force_2, _stride_force_3,
+      _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3,
+      grid_size, omega_shear, v_s);
+}
+
+} // namespace pystencils
+} // namespace walberla
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) || (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic pop
+#endif
+
+#if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
+#pragma warning pop
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsCUDA.h
new file mode 100644
index 00000000000..bf9a807a67f
--- /dev/null
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionLeesEdwardsCUDA.h
@@ -0,0 +1,122 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file CollideSweepSinglePrecisionLeesEdwardsCUDA.h
+//! \\author pystencils
+//======================================================================================================================
+
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
+// lbmpy_walberla/pystencils_walberla from waLBerla commit
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
+
+#pragma once
+#include "core/DataTypes.h"
+#include "core/logging/Logging.h"
+
+#include "gpu/GPUField.h"
+#include "gpu/GPUWrapper.h"
+
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+#include "domain_decomposition/StructuredBlockStorage.h"
+#include "field/SwapableCompare.h"
+#include <set>
+
+#ifdef __GNUC__
+#define RESTRICT __restrict__
+#elif _MSC_VER
+#define RESTRICT __restrict
+#else
+#define RESTRICT
+#endif
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
+    (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wreorder"
+#endif
+
+namespace walberla {
+namespace pystencils {
+
+/**
+ * Host-side wrapper around the generated GPU kernel of the same name.
+ *
+ * An instance stores the block-data IDs of the force and pdfs fields plus
+ * three scalar parameters. run() and runOnCellInterval() are only declared
+ * here; their definitions live in the accompanying source file.
+ */
+class CollideSweepSinglePrecisionLeesEdwardsCUDA {
+public:
+  /**
+   * @param forceID_    block-data ID of the force field
+   * @param pdfsID_     block-data ID of the pdfs field
+   * @param grid_size   stored in grid_size_
+   * @param omega_shear stored in omega_shear_
+   * @param v_s         stored in v_s_
+   */
+  CollideSweepSinglePrecisionLeesEdwardsCUDA(BlockDataID forceID_,
+                                             BlockDataID pdfsID_,
+                                             float grid_size, float omega_shear,
+                                             float v_s)
+      : forceID(forceID_),
+        pdfsID(pdfsID_),
+        grid_size_(grid_size),
+        omega_shear_(omega_shear),
+        v_s_(v_s) {}
+
+  /// Run on @p block using @p stream (defined out of line).
+  void run(IBlock *block, gpuStream_t stream = nullptr);
+
+  /// Run on a cell interval of @p block using @p stream (defined out of line).
+  void runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks,
+                         const CellInterval &globalCellInterval,
+                         cell_idx_t ghostLayers, IBlock *block,
+                         gpuStream_t stream = nullptr);
+
+  /// Function-call syntax; forwards to run().
+  void operator()(IBlock *block, gpuStream_t stream = nullptr) {
+    run(block, stream);
+  }
+
+  /// Wrap a shared kernel object into a callable that invokes run() with the
+  /// default stream argument. The callable holds a copy of the shared pointer.
+  static std::function<void(IBlock *)> getSweep(
+      const shared_ptr<CollideSweepSinglePrecisionLeesEdwardsCUDA> &kernel) {
+    return [kernel](IBlock *blockPtr) { kernel->run(blockPtr); };
+  }
+
+  /// Wrap a shared kernel object into a callable that invokes
+  /// runOnCellInterval() with the captured arguments; the stream is supplied
+  /// by the caller of the returned function.
+  static std::function<void(IBlock *, gpuStream_t)> getSweepOnCellInterval(
+      const shared_ptr<CollideSweepSinglePrecisionLeesEdwardsCUDA> &kernel,
+      const shared_ptr<StructuredBlockStorage> &blocks,
+      const CellInterval &globalCellInterval, cell_idx_t ghostLayers = 1) {
+    return [kernel, blocks, globalCellInterval,
+            ghostLayers](IBlock *blockPtr, gpuStream_t gpuStream = nullptr) {
+      kernel->runOnCellInterval(blocks, globalCellInterval, ghostLayers,
+                                blockPtr, gpuStream);
+    };
+  }
+
+  /// Callable that invokes run() on this object with a fixed @p stream.
+  /// It captures `this`, so it must not be called after this object is gone.
+  std::function<void(IBlock *)> getSweep(gpuStream_t stream = nullptr) {
+    return [this, stream](IBlock *blockPtr) { run(blockPtr, stream); };
+  }
+
+  /// Callable that invokes runOnCellInterval() on this object with the
+  /// captured arguments and a fixed @p stream. It captures `this`, so it must
+  /// not be called after this object is gone.
+  std::function<void(IBlock *)>
+  getSweepOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks,
+                         const CellInterval &globalCellInterval,
+                         cell_idx_t ghostLayers = 1,
+                         gpuStream_t stream = nullptr) {
+    return [this, blocks, globalCellInterval, ghostLayers,
+            stream](IBlock *blockPtr) {
+      runOnCellInterval(blocks, globalCellInterval, ghostLayers, blockPtr,
+                        stream);
+    };
+  }
+
+  /// Intentionally empty for this kernel; both arguments are ignored.
+  void configure(const shared_ptr<StructuredBlockStorage> &blocks,
+                 IBlock *block) {}
+
+  // Block-data IDs of the fields passed to the constructor.
+  BlockDataID forceID;
+  BlockDataID pdfsID;
+  // Scalar parameters passed to the constructor.
+  float grid_size_;
+  float omega_shear_;
+  float v_s_;
+};
+
+} // namespace pystencils
+} // namespace walberla
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
+    (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic pop
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalized.cpp b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalized.cpp
index bcdd45ddad4..f0d187f855a 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalized.cpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalized.cpp
@@ -14,11 +14,10 @@
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
 //! \\file CollideSweepSinglePrecisionThermalized.cpp
-//! \\ingroup lbm
-//! \\author lbmpy
+//! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2, lbmpy_walberla/pystencils_walberla from waLBerla commit 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #include <cmath>
 
@@ -63,74 +62,30 @@ static FUNC_PREFIX void collidesweepsingleprecisionthermalized_collidesweepsingl
   const float rr_0 = 0.0f;
   const float xi_53 = rr_0 * 0.041666666666666664f;
   for (int64_t ctr_2 = 0; ctr_2 < _size_force_2; ctr_2 += 1) {
-    float *RESTRICT _data_pdfs_20_36 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3;
-    float *RESTRICT _data_force_20_32 = _data_force + _stride_force_2 * ctr_2 + 2 * _stride_force_3;
-    float *RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_311 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_318 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_313 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2 * ctr_2;
-    float *RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_317 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3;
-    float *RESTRICT _data_force_20_30 = _data_force + _stride_force_2 * ctr_2;
-    float *RESTRICT _data_pdfs_20_35 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_314 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_312 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_316 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_315 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3;
-    float *RESTRICT _data_force_20_31 = _data_force + _stride_force_2 * ctr_2 + _stride_force_3;
-    float *RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3;
     for (int64_t ctr_1 = 0; ctr_1 < _size_force_1; ctr_1 += 1) {
-      float *RESTRICT _data_pdfs_20_36_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_36;
-      float *RESTRICT _data_force_20_32_10 = _stride_force_1 * ctr_1 + _data_force_20_32;
-      float *RESTRICT _data_pdfs_20_31_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_31;
-      float *RESTRICT _data_pdfs_20_32_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_32;
-      float *RESTRICT _data_pdfs_20_311_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_311;
-      float *RESTRICT _data_pdfs_20_318_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_318;
-      float *RESTRICT _data_pdfs_20_313_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_313;
-      float *RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_30;
-      float *RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_34;
-      float *RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_33;
-      float *RESTRICT _data_pdfs_20_317_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_317;
-      float *RESTRICT _data_force_20_30_10 = _stride_force_1 * ctr_1 + _data_force_20_30;
-      float *RESTRICT _data_pdfs_20_35_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_35;
-      float *RESTRICT _data_pdfs_20_314_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_314;
-      float *RESTRICT _data_pdfs_20_312_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_312;
-      float *RESTRICT _data_pdfs_20_316_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_316;
-      float *RESTRICT _data_pdfs_20_38_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_38;
-      float *RESTRICT _data_pdfs_20_315_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_315;
-      float *RESTRICT _data_force_20_31_10 = _stride_force_1 * ctr_1 + _data_force_20_31;
-      float *RESTRICT _data_pdfs_20_310_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_310;
-      float *RESTRICT _data_pdfs_20_39_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_39;
-      float *RESTRICT _data_pdfs_20_37_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_37;
       for (int64_t ctr_0 = 0; ctr_0 < _size_force_0; ctr_0 += 1) {
-        const float xi_244 = _data_pdfs_20_36_10[_stride_pdfs_0 * ctr_0];
-        const float xi_245 = _data_force_20_32_10[_stride_force_0 * ctr_0];
-        const float xi_246 = _data_pdfs_20_31_10[_stride_pdfs_0 * ctr_0];
-        const float xi_247 = _data_pdfs_20_32_10[_stride_pdfs_0 * ctr_0];
-        const float xi_248 = _data_pdfs_20_311_10[_stride_pdfs_0 * ctr_0];
-        const float xi_249 = _data_pdfs_20_318_10[_stride_pdfs_0 * ctr_0];
-        const float xi_250 = _data_pdfs_20_313_10[_stride_pdfs_0 * ctr_0];
-        const float xi_251 = _data_pdfs_20_30_10[_stride_pdfs_0 * ctr_0];
-        const float xi_252 = _data_pdfs_20_34_10[_stride_pdfs_0 * ctr_0];
-        const float xi_253 = _data_pdfs_20_33_10[_stride_pdfs_0 * ctr_0];
-        const float xi_254 = _data_pdfs_20_317_10[_stride_pdfs_0 * ctr_0];
-        const float xi_255 = _data_force_20_30_10[_stride_force_0 * ctr_0];
-        const float xi_256 = _data_pdfs_20_35_10[_stride_pdfs_0 * ctr_0];
-        const float xi_257 = _data_pdfs_20_314_10[_stride_pdfs_0 * ctr_0];
-        const float xi_258 = _data_pdfs_20_312_10[_stride_pdfs_0 * ctr_0];
-        const float xi_259 = _data_pdfs_20_316_10[_stride_pdfs_0 * ctr_0];
-        const float xi_260 = _data_pdfs_20_38_10[_stride_pdfs_0 * ctr_0];
-        const float xi_261 = _data_pdfs_20_315_10[_stride_pdfs_0 * ctr_0];
-        const float xi_262 = _data_force_20_31_10[_stride_force_0 * ctr_0];
-        const float xi_263 = _data_pdfs_20_310_10[_stride_pdfs_0 * ctr_0];
-        const float xi_264 = _data_pdfs_20_39_10[_stride_pdfs_0 * ctr_0];
-        const float xi_265 = _data_pdfs_20_37_10[_stride_pdfs_0 * ctr_0];
+        const float xi_244 = _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3];
+        const float xi_245 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3];
+        const float xi_246 = _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2];
+        const float xi_247 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3];
+        const float xi_248 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3];
+        const float xi_249 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3];
+        const float xi_250 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2];
+        const float xi_251 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3];
+        const float xi_252 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3];
+        const float xi_253 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3];
+        const float xi_254 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3];
+        const float xi_255 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3];
+        const float xi_256 = _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3];
+        const float xi_257 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3];
+        const float xi_258 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3];
+        const float xi_259 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3];
+        const float xi_260 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3];
+        const float xi_261 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3];
+        const float xi_262 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3];
+        const float xi_263 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3];
+        const float xi_264 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3];
+        const float xi_265 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3];
 
         float random_3_0{};
         float random_3_1{};
@@ -163,69 +118,69 @@ static FUNC_PREFIX void collidesweepsingleprecisionthermalized_collidesweepsingl
         if (kT > 0.) {
           philox_float4(time_step, block_offset_0 + ctr_0, block_offset_1 + ctr_1, block_offset_2 + ctr_2, 0, seed, random_0_0, random_0_1, random_0_2, random_0_3);
         }
-        const float xi_2 = xi_249 + xi_257;
-        const float xi_3 = xi_2 + xi_252;
-        const float xi_4 = xi_246 + xi_248 + xi_261;
-        const float xi_5 = xi_256 + xi_258;
-        const float xi_6 = xi_244 + xi_254;
-        const float xi_8 = xi_264 * -1.0f;
-        const float xi_9 = xi_265 * -1.0f;
-        const float xi_10 = xi_254 * -1.0f;
-        const float xi_11 = xi_250 * -1.0f;
-        const float xi_12 = xi_253 * -1.0f;
+        const float xi_2 = xi_257 + xi_262;
+        const float xi_3 = xi_2 + xi_265;
+        const float xi_4 = xi_251 + xi_254 + xi_264;
+        const float xi_5 = xi_253 + xi_263;
+        const float xi_6 = xi_245 + xi_260;
+        const float xi_8 = -xi_261;
+        const float xi_9 = -xi_255;
+        const float xi_10 = -xi_245;
+        const float xi_11 = -xi_259;
+        const float xi_12 = -xi_248;
         const float xi_13 = xi_10 + xi_11 + xi_12;
-        const float xi_14 = xi_247 * -1.0f;
-        const float xi_15 = xi_263 * -1.0f;
+        const float xi_14 = -xi_252;
+        const float xi_15 = -xi_258;
         const float xi_16 = xi_14 + xi_15;
-        const float xi_17 = xi_259 * -1.0f;
-        const float xi_18 = xi_258 * -1.0f;
+        const float xi_17 = -xi_247;
+        const float xi_18 = -xi_253;
         const float xi_19 = xi_17 + xi_18;
-        const float xi_20 = xi_249 * -1.0f;
+        const float xi_20 = -xi_257;
         const float xi_21 = xi_10 + xi_20;
-        const float xi_22 = xi_261 * -1.0f;
-        const float xi_23 = xi_244 * -1.0f;
-        const float xi_24 = xi_17 + xi_22 + xi_23 + xi_248;
-        const float xi_29 = xi_262 * 0.16666666666666666f;
-        const float xi_30 = xi_262 * 0.083333333333333329f;
-        const float xi_42 = xi_255 * 0.16666666666666666f;
-        const float xi_43 = xi_255 * 0.083333333333333329f;
-        const float xi_49 = xi_245 * 0.16666666666666666f;
-        const float xi_50 = xi_245 * 0.083333333333333329f;
-        const float xi_67 = xi_262 * 0.25f;
-        const float xi_72 = xi_262 * xi_71;
-        const float xi_114 = xi_251 * -1.0f;
-        const float xi_118 = xi_248 * -1.0f;
+        const float xi_22 = -xi_254;
+        const float xi_23 = -xi_260;
+        const float xi_24 = xi_17 + xi_22 + xi_23 + xi_264;
+        const float xi_29 = xi_244 * 0.16666666666666666f;
+        const float xi_30 = xi_244 * 0.083333333333333329f;
+        const float xi_42 = xi_246 * 0.16666666666666666f;
+        const float xi_43 = xi_246 * 0.083333333333333329f;
+        const float xi_49 = xi_256 * 0.16666666666666666f;
+        const float xi_50 = xi_256 * 0.083333333333333329f;
+        const float xi_67 = xi_244 * 0.25f;
+        const float xi_72 = xi_244 * xi_71;
+        const float xi_114 = -xi_250;
+        const float xi_118 = -xi_264;
         const float xi_119 = xi_118 + xi_18;
-        const float xi_120 = xi_260 * -1.0f + xi_8;
-        const float xi_122 = xi_257 * -1.0f;
+        const float xi_120 = -xi_249 + xi_8;
+        const float xi_122 = -xi_262;
         const float xi_123 = xi_11 + xi_122 + xi_15 + xi_21;
-        const float xi_125 = xi_248 * 2.0f + xi_258 * 2.0f + xi_259 * 2.0f + xi_261 * 2.0f;
-        const float xi_126 = xi_125 + xi_252 * 5.0f + xi_253 * 5.0f;
-        const float xi_128 = xi_250 * 2.0f;
-        const float xi_129 = xi_257 * 2.0f;
-        const float xi_130 = xi_249 * 2.0f + xi_254 * 2.0f;
-        const float xi_132 = xi_118 + xi_258;
-        const float xi_133 = xi_132 + xi_14 + xi_22 + xi_246 + xi_259;
+        const float xi_125 = xi_247 * 2.0f + xi_253 * 2.0f + xi_254 * 2.0f + xi_264 * 2.0f;
+        const float xi_126 = xi_125 + xi_248 * 5.0f + xi_265 * 5.0f;
+        const float xi_128 = xi_259 * 2.0f;
+        const float xi_129 = xi_262 * 2.0f;
+        const float xi_130 = xi_245 * 2.0f + xi_257 * 2.0f;
+        const float xi_132 = xi_118 + xi_253;
+        const float xi_133 = xi_132 + xi_14 + xi_22 + xi_247 + xi_251;
         const float xi_135 = xi_133 * xi_134;
         const float xi_136 = random_2_3 - 0.5f;
-        const float xi_141 = xi_265 * 2.0f;
-        const float xi_142 = xi_263 * 2.0f;
-        const float xi_143 = xi_260 * -2.0f + xi_264 * 2.0f;
-        const float xi_144 = xi_14 + xi_141 * -1.0f + xi_142 + xi_143 + xi_19 + xi_4;
+        const float xi_141 = xi_255 * 2.0f;
+        const float xi_142 = xi_258 * 2.0f;
+        const float xi_143 = xi_249 * -2.0f + xi_261 * 2.0f;
+        const float xi_144 = xi_14 - xi_141 + xi_142 + xi_143 + xi_19 + xi_4;
         const float xi_146 = xi_144 * xi_145;
         const float xi_147 = random_1_2 - 0.5f;
         const float xi_152 = random_0_1 - 0.5f;
-        const float xi_166 = xi_122 + xi_250;
-        const float xi_167 = xi_12 + xi_166 + xi_20 + xi_252 + xi_254;
+        const float xi_166 = xi_122 + xi_259;
+        const float xi_167 = xi_12 + xi_166 + xi_20 + xi_245 + xi_265;
         const float xi_168 = xi_134 * xi_167;
         const float xi_169 = random_2_1 - 0.5f;
-        const float xi_171 = xi_13 + xi_141 + xi_142 * -1.0f + xi_143 + xi_3;
+        const float xi_171 = xi_13 + xi_141 - xi_142 + xi_143 + xi_3;
         const float xi_172 = xi_145 * xi_171;
         const float xi_173 = random_2_0 - 0.5f;
-        const float xi_178 = xi_119 + xi_23 + xi_256 + xi_259 + xi_261;
+        const float xi_178 = xi_119 + xi_23 + xi_247 + xi_254 + xi_263;
         const float xi_179 = xi_134 * xi_178;
         const float xi_180 = random_2_2 - 0.5f;
-        const float xi_182 = xi_128 * -1.0f + xi_129 * -1.0f + xi_130 + xi_24 + xi_5;
+        const float xi_182 = -xi_128 - xi_129 + xi_130 + xi_24 + xi_5;
         const float xi_183 = xi_145 * xi_182;
         const float xi_184 = random_1_3 - 0.5f;
         const float xi_212 = xi_182 * xi_211;
@@ -237,28 +192,28 @@ static FUNC_PREFIX void collidesweepsingleprecisionthermalized_collidesweepsingl
         const float xi_31 = rr_0 * xi_30;
         const float xi_44 = rr_0 * xi_43;
         const float xi_51 = rr_0 * xi_50;
-        const float xi_54 = xi_255 * xi_53;
-        const float xi_59 = xi_262 * xi_53;
-        const float xi_81 = xi_245 * xi_53;
-        const float vel0Term = xi_260 + xi_263 + xi_3;
-        const float vel1Term = xi_265 + xi_4;
-        const float vel2Term = xi_250 + xi_5;
-        const float rho = vel0Term + vel1Term + vel2Term + xi_247 + xi_251 + xi_253 + xi_259 + xi_264 + xi_6;
+        const float xi_54 = xi_246 * xi_53;
+        const float xi_59 = xi_244 * xi_53;
+        const float xi_81 = xi_256 * xi_53;
+        const float vel0Term = xi_249 + xi_258 + xi_3;
+        const float vel1Term = xi_255 + xi_4;
+        const float vel2Term = xi_259 + xi_5;
+        const float rho = vel0Term + vel1Term + vel2Term + xi_247 + xi_248 + xi_250 + xi_252 + xi_261 + xi_6;
         const float xi_105 = kT * rho;
-        const float xi_106 = powf(xi_105 * (-1.0f * ((omega_even * -1.0f + 1.0f) * (omega_even * -1.0f + 1.0f)) + 1.0f), 0.5f);
+        const float xi_106 = powf(xi_105 * (1.0f - (-omega_even + 1.0f) * (-omega_even + 1.0f)), 0.5f);
         const float xi_107 = xi_106 * (random_3_0 - 0.5f) * 3.7416573867739413f;
         const float xi_108 = xi_106 * (random_3_2 - 0.5f) * 5.4772255750516612f;
-        const float xi_110 = xi_109 * (random_1_1 - 0.5f) * powf(xi_105 * (-1.0f * ((omega_bulk * -1.0f + 1.0f) * (omega_bulk * -1.0f + 1.0f)) + 1.0f), 0.5f);
+        const float xi_110 = xi_109 * (random_1_1 - 0.5f) * powf(xi_105 * (1.0f - (-omega_bulk + 1.0f) * (-omega_bulk + 1.0f)), 0.5f);
         const float xi_111 = xi_106 * (random_3_1 - 0.5f) * 8.3666002653407556f;
-        const float xi_137 = powf(xi_105 * (-1.0f * ((omega_odd * -1.0f + 1.0f) * (omega_odd * -1.0f + 1.0f)) + 1.0f), 0.5f);
+        const float xi_137 = powf(xi_105 * (1.0f - (-omega_odd + 1.0f) * (-omega_odd + 1.0f)), 0.5f);
         const float xi_138 = xi_137 * 1.4142135623730951f;
         const float xi_139 = xi_138 * 0.5f;
         const float xi_140 = xi_136 * xi_139;
         const float xi_148 = xi_109 * xi_137;
         const float xi_149 = xi_148 * 0.16666666666666666f;
         const float xi_150 = xi_147 * xi_149;
-        const float xi_151 = xi_146 * -1.0f + xi_150 * -1.0f;
-        const float xi_153 = powf(xi_105 * (-1.0f * ((omega_shear * -1.0f + 1.0f) * (omega_shear * -1.0f + 1.0f)) + 1.0f), 0.5f);
+        const float xi_151 = -xi_146 - xi_150;
+        const float xi_153 = powf(xi_105 * (1.0f - (-omega_shear + 1.0f) * (-omega_shear + 1.0f)), 0.5f);
         const float xi_154 = xi_153 * 0.5f;
         const float xi_155 = xi_152 * xi_154;
         const float xi_161 = xi_153 * (random_0_0 - 0.5f) * 1.7320508075688772f;
@@ -266,10 +221,10 @@ static FUNC_PREFIX void collidesweepsingleprecisionthermalized_collidesweepsingl
         const float xi_170 = xi_139 * xi_169;
         const float xi_174 = xi_149 * xi_173;
         const float xi_175 = xi_172 + xi_174;
-        const float xi_177 = xi_172 * -1.0f + xi_174 * -1.0f;
+        const float xi_177 = -xi_172 - xi_174;
         const float xi_181 = xi_139 * xi_180;
         const float xi_185 = xi_149 * xi_184;
-        const float xi_186 = xi_183 * -1.0f + xi_185 * -1.0f;
+        const float xi_186 = -xi_183 - xi_185;
         const float xi_188 = xi_183 + xi_185;
         const float xi_189 = xi_152 * xi_153 * 0.25f;
         const float xi_192 = xi_107 * 0.083333333333333329f;
@@ -281,108 +236,108 @@ static FUNC_PREFIX void collidesweepsingleprecisionthermalized_collidesweepsingl
         const float xi_216 = xi_184 * xi_215;
         const float xi_217 = xi_138 * 0.25f;
         const float xi_218 = xi_180 * xi_217;
-        const float xi_219 = xi_212 * -1.0f + xi_214 + xi_216 * -1.0f + xi_218;
+        const float xi_219 = -xi_212 + xi_214 - xi_216 + xi_218;
         const float xi_222 = xi_147 * xi_215;
         const float xi_223 = xi_136 * xi_217;
-        const float xi_224 = xi_220 * -1.0f + xi_221 + xi_222 * -1.0f + xi_223;
-        const float xi_225 = xi_220 + xi_221 * -1.0f + xi_222 + xi_223 * -1.0f;
-        const float xi_227 = xi_189 * -1.0f;
+        const float xi_224 = -xi_220 + xi_221 - xi_222 + xi_223;
+        const float xi_225 = xi_220 - xi_221 + xi_222 - xi_223;
+        const float xi_227 = -xi_189;
         const float xi_230 = xi_111 * 0.035714285714285712f;
         const float xi_232 = xi_154 * (random_0_3 - 0.5f);
         const float xi_237 = xi_169 * xi_217;
         const float xi_238 = xi_173 * xi_215;
-        const float xi_239 = xi_235 * -1.0f + xi_236 + xi_237 * -1.0f + xi_238;
-        const float xi_241 = xi_235 + xi_236 * -1.0f + xi_237 + xi_238 * -1.0f;
-        const float xi_242 = xi_212 + xi_214 * -1.0f + xi_216 + xi_218 * -1.0f;
+        const float xi_239 = -xi_235 + xi_236 - xi_237 + xi_238;
+        const float xi_241 = xi_235 - xi_236 + xi_237 - xi_238;
+        const float xi_242 = xi_212 - xi_214 + xi_216 - xi_218;
         const float xi_0 = ((1.0f) / (rho));
         const float xi_7 = xi_0 * 0.5f;
-        const float u_0 = xi_0 * (vel0Term + xi_13 + xi_8 + xi_9) + xi_255 * xi_7;
-        const float xi_25 = u_0 * xi_255;
+        const float u_0 = xi_0 * (vel0Term + xi_13 + xi_8 + xi_9) + xi_246 * xi_7;
+        const float xi_25 = u_0 * xi_246;
         const float xi_37 = xi_25 * 0.16666666666666666f;
         const float xi_38 = xi_25 * 0.083333333333333329f;
         const float xi_39 = omega_shear * xi_38;
-        const float xi_40 = xi_37 * -1.0f + xi_39;
-        const float xi_56 = xi_25 * xi_55 * -1.0f + xi_37;
-        const float xi_57 = xi_43 * -1.0f + xi_54 + xi_56;
-        const float xi_61 = xi_25 * xi_60 * -1.0f;
+        const float xi_40 = -xi_37 + xi_39;
+        const float xi_56 = -xi_25 * xi_55 + xi_37;
+        const float xi_57 = -xi_43 + xi_54 + xi_56;
+        const float xi_61 = -xi_25 * xi_60;
         const float xi_68 = u_0 * xi_67;
         const float xi_73 = u_0 * xi_72;
-        const float xi_77 = xi_43 + xi_54 * -1.0f + xi_56;
-        const float xi_84 = xi_38 * -1.0f;
-        const float xi_95 = u_0 * xi_245;
+        const float xi_77 = xi_43 - xi_54 + xi_56;
+        const float xi_84 = -xi_38;
+        const float xi_95 = u_0 * xi_256;
         const float xi_96 = xi_95 * 0.25f;
         const float xi_99 = xi_71 * xi_95;
         const float xi_113 = rho * (u_0 * u_0);
-        const float u_1 = xi_0 * (vel1Term + xi_16 + xi_19 + xi_260 + xi_8) + xi_262 * xi_7;
-        const float xi_26 = u_1 * xi_262;
+        const float u_1 = xi_0 * (vel1Term + xi_16 + xi_19 + xi_249 + xi_8) + xi_244 * xi_7;
+        const float xi_26 = u_1 * xi_244;
         const float xi_32 = xi_26 * 0.16666666666666666f;
         const float xi_45 = xi_26 * 0.083333333333333329f;
         const float xi_46 = omega_shear * xi_45;
-        const float xi_47 = xi_32 * -1.0f + xi_46;
-        const float xi_62 = xi_26 * xi_60 * -1.0f;
+        const float xi_47 = -xi_32 + xi_46;
+        const float xi_62 = -xi_26 * xi_60;
         const float xi_69 = u_1 * 0.25f;
-        const float xi_70 = xi_255 * xi_69;
+        const float xi_70 = xi_246 * xi_69;
         const float xi_74 = u_1 * xi_71;
-        const float xi_75 = xi_255 * xi_74;
-        const float xi_76 = xi_68 * -1.0f + xi_70 * -1.0f + xi_73 + xi_75;
-        const float xi_78 = xi_68 + xi_70 + xi_73 * -1.0f + xi_75 * -1.0f;
-        const float xi_86 = xi_245 * xi_69;
-        const float xi_88 = xi_245 * xi_74;
-        const float xi_93 = xi_45 * -1.0f;
+        const float xi_75 = xi_246 * xi_74;
+        const float xi_76 = -xi_68 - xi_70 + xi_73 + xi_75;
+        const float xi_78 = xi_68 + xi_70 - xi_73 - xi_75;
+        const float xi_86 = xi_256 * xi_69;
+        const float xi_88 = xi_256 * xi_74;
+        const float xi_93 = -xi_45;
         const float xi_112 = rho * (u_1 * u_1);
         const float xi_121 = xi_112 + xi_120 + xi_9;
         const float xi_197 = rho * u_1;
-        const float xi_199 = xi_198 * (u_0 * xi_197 + xi_120 + xi_263 + xi_265);
-        const float xi_200 = xi_196 * -1.0f + xi_199 * -1.0f;
+        const float xi_199 = xi_198 * (u_0 * xi_197 + xi_120 + xi_255 + xi_258);
+        const float xi_200 = -xi_196 - xi_199;
         const float xi_201 = xi_196 + xi_199;
-        const float u_2 = xi_0 * (vel2Term + xi_21 + xi_24 + xi_257) + xi_245 * xi_7;
-        const float xi_27 = u_2 * xi_245;
+        const float u_2 = xi_0 * (vel2Term + xi_21 + xi_24 + xi_262) + xi_256 * xi_7;
+        const float xi_27 = u_2 * xi_256;
         const float xi_33 = xi_27 * 0.16666666666666666f;
         const float xi_34 = xi_27 * 0.083333333333333329f;
         const float xi_35 = omega_shear * xi_34;
-        const float xi_36 = xi_33 * -1.0f + xi_35;
-        const float xi_41 = omega_shear * xi_32 * -1.0f + xi_26 * 0.33333333333333331f + xi_36 + xi_40;
-        const float xi_48 = omega_shear * xi_37 * -1.0f + xi_25 * 0.33333333333333331f + xi_36 + xi_47;
-        const float xi_52 = omega_shear * xi_33 * -1.0f + xi_27 * 0.33333333333333331f + xi_40 + xi_47;
-        const float xi_58 = xi_34 * -1.0f;
-        const float xi_63 = xi_27 * xi_60 * -1.0f;
-        const float xi_64 = xi_26 * xi_55 * -1.0f + xi_32 + xi_61 + xi_62 + xi_63;
-        const float xi_65 = xi_30 + xi_59 * -1.0f + xi_64;
+        const float xi_36 = -xi_33 + xi_35;
+        const float xi_41 = -omega_shear * xi_32 + xi_26 * 0.33333333333333331f + xi_36 + xi_40;
+        const float xi_48 = -omega_shear * xi_37 + xi_25 * 0.33333333333333331f + xi_36 + xi_47;
+        const float xi_52 = -omega_shear * xi_33 + xi_27 * 0.33333333333333331f + xi_40 + xi_47;
+        const float xi_58 = -xi_34;
+        const float xi_63 = -xi_27 * xi_60;
+        const float xi_64 = -xi_26 * xi_55 + xi_32 + xi_61 + xi_62 + xi_63;
+        const float xi_65 = xi_30 - xi_59 + xi_64;
         const float xi_66 = xi_35 + xi_58 + xi_65;
-        const float xi_79 = xi_30 * -1.0f + xi_59 + xi_64;
+        const float xi_79 = -xi_30 + xi_59 + xi_64;
         const float xi_80 = xi_35 + xi_58 + xi_79;
-        const float xi_82 = xi_27 * xi_55 * -1.0f + xi_33;
-        const float xi_83 = xi_50 + xi_81 * -1.0f + xi_82;
+        const float xi_82 = -xi_27 * xi_55 + xi_33;
+        const float xi_83 = xi_50 - xi_81 + xi_82;
         const float xi_85 = xi_39 + xi_65 + xi_84;
         const float xi_87 = u_2 * xi_67;
         const float xi_89 = u_2 * xi_72;
-        const float xi_90 = xi_86 + xi_87 + xi_88 * -1.0f + xi_89 * -1.0f;
+        const float xi_90 = xi_86 + xi_87 - xi_88 - xi_89;
         const float xi_91 = xi_39 + xi_79 + xi_84;
-        const float xi_92 = xi_86 * -1.0f + xi_87 * -1.0f + xi_88 + xi_89;
+        const float xi_92 = -xi_86 - xi_87 + xi_88 + xi_89;
         const float xi_94 = xi_46 + xi_61 + xi_62 + xi_63 + xi_83 + xi_93;
-        const float xi_97 = u_2 * xi_255;
+        const float xi_97 = u_2 * xi_246;
         const float xi_98 = xi_97 * 0.25f;
         const float xi_100 = xi_71 * xi_97;
-        const float xi_101 = xi_100 + xi_96 * -1.0f + xi_98 * -1.0f + xi_99;
-        const float xi_102 = xi_100 * -1.0f + xi_96 + xi_98 + xi_99 * -1.0f;
-        const float xi_103 = xi_50 * -1.0f + xi_81 + xi_82;
+        const float xi_101 = xi_100 - xi_96 - xi_98 + xi_99;
+        const float xi_102 = -xi_100 + xi_96 + xi_98 - xi_99;
+        const float xi_103 = -xi_50 + xi_81 + xi_82;
         const float xi_104 = xi_103 + xi_46 + xi_61 + xi_62 + xi_63 + xi_93;
         const float xi_115 = rho * (u_2 * u_2);
-        const float xi_116 = xi_114 + xi_115 * 0.66666666666666663f + xi_244 * 3.0f + xi_256 * 3.0f;
-        const float xi_117 = omega_even * (xi_112 * 0.66666666666666663f + xi_113 * 1.6666666666666667f + xi_116 + xi_246 * 3.0f + xi_247 * 3.0f + xi_248 * -3.0f + xi_258 * -3.0f + xi_259 * -3.0f + xi_261 * -3.0f);
-        const float xi_124 = omega_bulk * (xi_113 + xi_115 + xi_119 + xi_121 + xi_123 + xi_17 + xi_22 + xi_251);
-        const float xi_127 = omega_even * (xi_112 * 2.3333333333333335f + xi_116 + xi_126 + xi_246 * -2.0f + xi_247 * -2.0f + xi_249 * -5.0f + xi_250 * -5.0f + xi_254 * -5.0f + xi_257 * -5.0f);
-        const float xi_131 = omega_even * (xi_114 + xi_115 * 3.0f + xi_126 + xi_128 + xi_129 + xi_130 + xi_244 * -4.0f + xi_246 * 5.0f + xi_247 * 5.0f + xi_256 * -4.0f + xi_260 * -7.0f + xi_263 * -7.0f + xi_264 * -7.0f + xi_265 * -7.0f);
-        const float xi_156 = xi_115 * -1.0f + xi_256;
-        const float xi_157 = omega_shear * (xi_121 + xi_156 + xi_16 + xi_2 + xi_246 * -1.0f + xi_250 + xi_6);
+        const float xi_116 = xi_114 + xi_115 * 0.66666666666666663f + xi_260 * 3.0f + xi_263 * 3.0f;
+        const float xi_117 = omega_even * (xi_112 * 0.66666666666666663f + xi_113 * 1.6666666666666667f + xi_116 + xi_247 * -3.0f + xi_251 * 3.0f + xi_252 * 3.0f + xi_253 * -3.0f + xi_254 * -3.0f + xi_264 * -3.0f);
+        const float xi_124 = omega_bulk * (xi_113 + xi_115 + xi_119 + xi_121 + xi_123 + xi_17 + xi_22 + xi_250);
+        const float xi_127 = omega_even * (xi_112 * 2.3333333333333335f + xi_116 + xi_126 + xi_245 * -5.0f + xi_251 * -2.0f + xi_252 * -2.0f + xi_257 * -5.0f + xi_259 * -5.0f + xi_262 * -5.0f);
+        const float xi_131 = omega_even * (xi_114 + xi_115 * 3.0f + xi_126 + xi_128 + xi_129 + xi_130 + xi_249 * -7.0f + xi_251 * 5.0f + xi_252 * 5.0f + xi_255 * -7.0f + xi_258 * -7.0f + xi_260 * -4.0f + xi_261 * -7.0f + xi_263 * -4.0f);
+        const float xi_156 = -xi_115 + xi_263;
+        const float xi_157 = omega_shear * (xi_121 + xi_156 + xi_16 + xi_2 - xi_251 + xi_259 + xi_6);
         const float xi_158 = xi_157 * 0.125f;
         const float xi_159 = xi_107 * -0.11904761904761904f + xi_131 * -0.01984126984126984f;
-        const float xi_160 = omega_shear * (xi_112 * -1.0f + xi_113 * 2.0f + xi_120 + xi_123 + xi_125 + xi_156 + xi_244 + xi_246 + xi_247 + xi_252 * -2.0f + xi_253 * -2.0f + xi_9);
+        const float xi_160 = omega_shear * (-xi_112 + xi_113 * 2.0f + xi_120 + xi_123 + xi_125 + xi_156 + xi_248 * -2.0f + xi_251 + xi_252 + xi_260 + xi_265 * -2.0f + xi_9);
         const float xi_162 = xi_160 * -0.041666666666666664f + xi_161 * -0.16666666666666666f;
         const float xi_163 = xi_108 * -0.10000000000000001f + xi_117 * -0.050000000000000003f + xi_162;
         const float xi_164 = xi_111 * 0.028571428571428571f + xi_127 * 0.014285714285714285f + xi_155 + xi_158 + xi_159 + xi_163;
         const float xi_176 = xi_111 * -0.071428571428571425f + xi_127 * -0.035714285714285712f + xi_159 + xi_160 * 0.083333333333333329f + xi_161 * 0.33333333333333331f;
-        const float xi_187 = xi_107 * 0.095238095238095233f + xi_111 * -0.042857142857142858f + xi_127 * -0.021428571428571429f + xi_131 * 0.015873015873015872f + xi_155 * -1.0f + xi_158 * -1.0f + xi_163;
+        const float xi_187 = xi_107 * 0.095238095238095233f + xi_111 * -0.042857142857142858f + xi_127 * -0.021428571428571429f + xi_131 * 0.015873015873015872f - xi_155 - xi_158 + xi_163;
         const float xi_190 = xi_157 * 0.0625f;
         const float xi_191 = xi_131 * 0.013888888888888888f;
         const float xi_193 = xi_110 * 0.083333333333333329f + xi_124 * 0.041666666666666664f;
@@ -390,25 +345,25 @@ static FUNC_PREFIX void collidesweepsingleprecisionthermalized_collidesweepsingl
         const float xi_195 = xi_165 + xi_189 + xi_190 + xi_191 + xi_192 + xi_194;
         const float xi_202 = xi_151 + xi_189 + xi_190 + xi_191 + xi_192 + xi_194;
         const float xi_204 = xi_127 * -0.0071428571428571426f;
-        const float xi_205 = xi_198 * (u_2 * xi_197 + xi_132 + xi_17 + xi_261);
+        const float xi_205 = xi_198 * (u_2 * xi_197 + xi_132 + xi_17 + xi_254);
         const float xi_206 = xi_117 * 0.025000000000000001f;
         const float xi_209 = xi_107 * -0.023809523809523808f + xi_131 * -0.003968253968253968f;
         const float xi_210 = xi_162 + xi_193 + xi_203 + xi_204 + xi_205 + xi_206 + xi_207 + xi_208 + xi_209;
-        const float xi_226 = xi_162 + xi_193 + xi_203 * -1.0f + xi_204 + xi_205 * -1.0f + xi_206 + xi_207 + xi_208 + xi_209;
-        const float xi_228 = xi_190 * -1.0f;
+        const float xi_226 = xi_162 + xi_193 - xi_203 + xi_204 - xi_205 + xi_206 + xi_207 + xi_208 + xi_209;
+        const float xi_228 = -xi_190;
         const float xi_229 = xi_127 * 0.017857142857142856f;
         const float xi_231 = xi_188 + xi_194 + xi_209 + xi_227 + xi_228 + xi_229 + xi_230;
-        const float xi_233 = xi_198 * (rho * u_0 * u_2 + xi_10 + xi_166 + xi_249);
-        const float xi_234 = xi_232 * -1.0f + xi_233 * -1.0f;
+        const float xi_233 = xi_198 * (rho * u_0 * u_2 + xi_10 + xi_166 + xi_257);
+        const float xi_234 = -xi_232 - xi_233;
         const float xi_240 = xi_232 + xi_233;
         const float xi_243 = xi_186 + xi_194 + xi_209 + xi_227 + xi_228 + xi_229 + xi_230;
-        const float forceTerm_0 = xi_25 * xi_28 + xi_25 * -1.0f + xi_26 * xi_28 + xi_26 * -1.0f + xi_27 * xi_28 + xi_27 * -1.0f;
-        const float forceTerm_1 = xi_29 + xi_31 * -1.0f + xi_41;
-        const float forceTerm_2 = xi_29 * -1.0f + xi_31 + xi_41;
-        const float forceTerm_3 = xi_42 * -1.0f + xi_44 + xi_48;
-        const float forceTerm_4 = xi_42 + xi_44 * -1.0f + xi_48;
-        const float forceTerm_5 = xi_49 + xi_51 * -1.0f + xi_52;
-        const float forceTerm_6 = xi_49 * -1.0f + xi_51 + xi_52;
+        const float forceTerm_0 = xi_25 * xi_28 - xi_25 + xi_26 * xi_28 - xi_26 + xi_27 * xi_28 - xi_27;
+        const float forceTerm_1 = xi_29 - xi_31 + xi_41;
+        const float forceTerm_2 = -xi_29 + xi_31 + xi_41;
+        const float forceTerm_3 = -xi_42 + xi_44 + xi_48;
+        const float forceTerm_4 = xi_42 - xi_44 + xi_48;
+        const float forceTerm_5 = xi_49 - xi_51 + xi_52;
+        const float forceTerm_6 = -xi_49 + xi_51 + xi_52;
         const float forceTerm_7 = xi_57 + xi_66 + xi_76;
         const float forceTerm_8 = xi_66 + xi_77 + xi_78;
         const float forceTerm_9 = xi_57 + xi_78 + xi_80;
@@ -421,25 +376,25 @@ static FUNC_PREFIX void collidesweepsingleprecisionthermalized_collidesweepsingl
         const float forceTerm_16 = xi_103 + xi_90 + xi_91;
         const float forceTerm_17 = xi_102 + xi_104 + xi_57;
         const float forceTerm_18 = xi_101 + xi_104 + xi_77;
-        _data_pdfs_20_30_10[_stride_pdfs_0 * ctr_0] = forceTerm_0 + xi_107 * 0.14285714285714285f + xi_108 * 0.20000000000000001f + xi_110 * -1.0f + xi_111 * 0.085714285714285715f + xi_117 * 0.10000000000000001f + xi_124 * -0.5f + xi_127 * 0.042857142857142858f + xi_131 * 0.023809523809523808f + xi_251;
-        _data_pdfs_20_31_10[_stride_pdfs_0 * ctr_0] = forceTerm_1 + xi_135 * -1.0f + xi_140 * -1.0f + xi_151 + xi_164 + xi_246;
-        _data_pdfs_20_32_10[_stride_pdfs_0 * ctr_0] = forceTerm_2 + xi_135 + xi_140 + xi_164 + xi_165 + xi_247;
-        _data_pdfs_20_33_10[_stride_pdfs_0 * ctr_0] = forceTerm_3 + xi_168 + xi_170 + xi_175 + xi_176 + xi_253;
-        _data_pdfs_20_34_10[_stride_pdfs_0 * ctr_0] = forceTerm_4 + xi_168 * -1.0f + xi_170 * -1.0f + xi_176 + xi_177 + xi_252;
-        _data_pdfs_20_35_10[_stride_pdfs_0 * ctr_0] = forceTerm_5 + xi_179 * -1.0f + xi_181 * -1.0f + xi_186 + xi_187 + xi_256;
-        _data_pdfs_20_36_10[_stride_pdfs_0 * ctr_0] = forceTerm_6 + xi_179 + xi_181 + xi_187 + xi_188 + xi_244;
-        _data_pdfs_20_37_10[_stride_pdfs_0 * ctr_0] = forceTerm_7 + xi_177 + xi_195 + xi_200 + xi_265;
-        _data_pdfs_20_38_10[_stride_pdfs_0 * ctr_0] = forceTerm_8 + xi_175 + xi_195 + xi_201 + xi_260;
-        _data_pdfs_20_39_10[_stride_pdfs_0 * ctr_0] = forceTerm_9 + xi_177 + xi_201 + xi_202 + xi_264;
-        _data_pdfs_20_310_10[_stride_pdfs_0 * ctr_0] = forceTerm_10 + xi_175 + xi_200 + xi_202 + xi_263;
-        _data_pdfs_20_311_10[_stride_pdfs_0 * ctr_0] = forceTerm_11 + xi_210 + xi_219 + xi_224 + xi_248;
-        _data_pdfs_20_312_10[_stride_pdfs_0 * ctr_0] = forceTerm_12 + xi_219 + xi_225 + xi_226 + xi_258;
-        _data_pdfs_20_313_10[_stride_pdfs_0 * ctr_0] = forceTerm_13 + xi_231 + xi_234 + xi_239 + xi_250;
-        _data_pdfs_20_314_10[_stride_pdfs_0 * ctr_0] = forceTerm_14 + xi_231 + xi_240 + xi_241 + xi_257;
-        _data_pdfs_20_315_10[_stride_pdfs_0 * ctr_0] = forceTerm_15 + xi_224 + xi_226 + xi_242 + xi_261;
-        _data_pdfs_20_316_10[_stride_pdfs_0 * ctr_0] = forceTerm_16 + xi_210 + xi_225 + xi_242 + xi_259;
-        _data_pdfs_20_317_10[_stride_pdfs_0 * ctr_0] = forceTerm_17 + xi_239 + xi_240 + xi_243 + xi_254;
-        _data_pdfs_20_318_10[_stride_pdfs_0 * ctr_0] = forceTerm_18 + xi_234 + xi_241 + xi_243 + xi_249;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2] = forceTerm_0 + xi_107 * 0.14285714285714285f + xi_108 * 0.20000000000000001f - xi_110 + xi_111 * 0.085714285714285715f + xi_117 * 0.10000000000000001f + xi_124 * -0.5f + xi_127 * 0.042857142857142858f + xi_131 * 0.023809523809523808f + xi_250;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3] = forceTerm_1 - xi_135 - xi_140 + xi_151 + xi_164 + xi_251;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3] = forceTerm_2 + xi_135 + xi_140 + xi_164 + xi_165 + xi_252;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3] = forceTerm_3 + xi_168 + xi_170 + xi_175 + xi_176 + xi_248;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3] = forceTerm_4 - xi_168 - xi_170 + xi_176 + xi_177 + xi_265;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3] = forceTerm_5 - xi_179 - xi_181 + xi_186 + xi_187 + xi_263;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3] = forceTerm_6 + xi_179 + xi_181 + xi_187 + xi_188 + xi_260;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3] = forceTerm_7 + xi_177 + xi_195 + xi_200 + xi_255;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3] = forceTerm_8 + xi_175 + xi_195 + xi_201 + xi_249;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3] = forceTerm_9 + xi_177 + xi_201 + xi_202 + xi_261;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3] = forceTerm_10 + xi_175 + xi_200 + xi_202 + xi_258;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3] = forceTerm_11 + xi_210 + xi_219 + xi_224 + xi_264;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3] = forceTerm_12 + xi_219 + xi_225 + xi_226 + xi_253;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3] = forceTerm_13 + xi_231 + xi_234 + xi_239 + xi_259;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3] = forceTerm_14 + xi_231 + xi_240 + xi_241 + xi_262;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3] = forceTerm_15 + xi_224 + xi_226 + xi_242 + xi_254;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3] = forceTerm_16 + xi_210 + xi_225 + xi_242 + xi_247;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3] = forceTerm_17 + xi_239 + xi_240 + xi_243 + xi_245;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3] = forceTerm_18 + xi_234 + xi_241 + xi_243 + xi_257;
       }
     }
   }
@@ -447,35 +402,37 @@ static FUNC_PREFIX void collidesweepsingleprecisionthermalized_collidesweepsingl
 } // namespace internal_69764eed2d0964e29e3b97d1054b4693
 
 void CollideSweepSinglePrecisionThermalized::run(IBlock *block) {
+  if (!this->configured_)
+    WALBERLA_ABORT("This Sweep contains a configure function that needs to be called manually")
+
   auto pdfs = block->getData<field::GhostLayerField<float, 19>>(pdfsID);
   auto force = block->getData<field::GhostLayerField<float, 3>>(forceID);
 
-  auto &time_step = this->time_step_;
+  auto &omega_shear = this->omega_shear_;
+  auto &omega_even = this->omega_even_;
+  auto &block_offset_1 = this->block_offset_1_;
   auto &kT = this->kT_;
-  auto &omega_odd = this->omega_odd_;
   auto &seed = this->seed_;
+  auto &time_step = this->time_step_;
+  auto &block_offset_2 = this->block_offset_2_;
+  auto &omega_odd = this->omega_odd_;
+  auto &block_offset_0 = this->block_offset_0_;
   auto &omega_bulk = this->omega_bulk_;
-  auto block_offset_0 = this->block_offset_0_;
-  auto &omega_shear = this->omega_shear_;
-  auto &omega_even = this->omega_even_;
-  auto block_offset_2 = this->block_offset_2_;
-  auto block_offset_1 = this->block_offset_1_;
-  block_offset_generator(block, block_offset_0, block_offset_1, block_offset_2);
-  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()))
   float *RESTRICT const _data_force = force->dataAt(0, 0, 0, 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()))
   float *RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(force->xSize()) + 0));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(force->xSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(force->ySize()) + 0));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(force->ySize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(force->zSize()) + 0));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(force->zSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(force->xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(force->xSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(force->ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(force->ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(force->zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(force->zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
   const int64_t _stride_force_0 = int64_t(force->xStride());
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
@@ -488,6 +445,9 @@ void CollideSweepSinglePrecisionThermalized::run(IBlock *block) {
 }
 
 void CollideSweepSinglePrecisionThermalized::runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block) {
+  if (!this->configured_)
+    WALBERLA_ABORT("This Sweep contains a configure function that needs to be called manually")
+
   CellInterval ci = globalCellInterval;
   CellInterval blockBB = blocks->getBlockCellBB(*block);
   blockBB.expand(ghostLayers);
@@ -499,36 +459,35 @@ void CollideSweepSinglePrecisionThermalized::runOnCellInterval(const shared_ptr<
   auto pdfs = block->getData<field::GhostLayerField<float, 19>>(pdfsID);
   auto force = block->getData<field::GhostLayerField<float, 3>>(forceID);
 
-  auto &time_step = this->time_step_;
+  auto &omega_shear = this->omega_shear_;
+  auto &omega_even = this->omega_even_;
+  auto &block_offset_1 = this->block_offset_1_;
   auto &kT = this->kT_;
-  auto &omega_odd = this->omega_odd_;
   auto &seed = this->seed_;
+  auto &time_step = this->time_step_;
+  auto &block_offset_2 = this->block_offset_2_;
+  auto &omega_odd = this->omega_odd_;
+  auto &block_offset_0 = this->block_offset_0_;
   auto &omega_bulk = this->omega_bulk_;
-  auto block_offset_0 = this->block_offset_0_;
-  auto &omega_shear = this->omega_shear_;
-  auto &omega_even = this->omega_even_;
-  auto block_offset_2 = this->block_offset_2_;
-  auto block_offset_1 = this->block_offset_1_;
-  block_offset_generator(block, block_offset_0, block_offset_1, block_offset_2);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()))
   float *RESTRICT const _data_force = force->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()))
   float *RESTRICT _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(ci.xSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(ci.ySize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(ci.zSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(ci.xSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(ci.ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(ci.zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
   const int64_t _stride_force_0 = int64_t(force->xStride());
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
@@ -549,4 +508,4 @@ void CollideSweepSinglePrecisionThermalized::runOnCellInterval(const shared_ptr<
 
 #if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
 #pragma warning pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalized.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalized.h
index 91353151cdf..5a9aa9eb3fc 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalized.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalized.h
@@ -17,12 +17,13 @@
 //! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2,
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
 // lbmpy_walberla/pystencils_walberla from waLBerla commit
-// 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #pragma once
 #include "core/DataTypes.h"
+#include "core/logging/Logging.h"
 
 #include "domain_decomposition/BlockDataID.h"
 #include "domain_decomposition/IBlock.h"
@@ -51,16 +52,15 @@ namespace pystencils {
 
 class CollideSweepSinglePrecisionThermalized {
 public:
-  CollideSweepSinglePrecisionThermalized(
-      BlockDataID forceID_, BlockDataID pdfsID_, uint32_t block_offset_0,
-      uint32_t block_offset_1, uint32_t block_offset_2, float kT,
-      float omega_bulk, float omega_even, float omega_odd, float omega_shear,
-      uint32_t seed, uint32_t time_step)
-      : forceID(forceID_), pdfsID(pdfsID_), block_offset_0_(block_offset_0),
-        block_offset_1_(block_offset_1), block_offset_2_(block_offset_2),
-        kT_(kT), omega_bulk_(omega_bulk), omega_even_(omega_even),
-        omega_odd_(omega_odd), omega_shear_(omega_shear), seed_(seed),
-        time_step_(time_step){};
+  CollideSweepSinglePrecisionThermalized(BlockDataID forceID_,
+                                         BlockDataID pdfsID_, float kT,
+                                         float omega_bulk, float omega_even,
+                                         float omega_odd, float omega_shear,
+                                         uint32_t seed, uint32_t time_step)
+      : forceID(forceID_), pdfsID(pdfsID_), kT_(kT), omega_bulk_(omega_bulk),
+        omega_even_(omega_even), omega_odd_(omega_odd),
+        omega_shear_(omega_shear), seed_(seed), time_step_(time_step),
+        configured_(false){};
 
   void run(IBlock *block);
 
@@ -97,6 +97,15 @@ class CollideSweepSinglePrecisionThermalized {
     };
   }
 
+  void configure(const shared_ptr<StructuredBlockStorage> &blocks,
+                 IBlock *block) {
+    Cell BlockCellBB = blocks->getBlockCellBB(*block).min();
+    block_offset_0_ = uint32_t(BlockCellBB[0]);
+    block_offset_1_ = uint32_t(BlockCellBB[1]);
+    block_offset_2_ = uint32_t(BlockCellBB[2]);
+    configured_ = true;
+  }
+
   BlockDataID forceID;
   BlockDataID pdfsID;
   uint32_t block_offset_0_;
@@ -109,9 +118,7 @@ class CollideSweepSinglePrecisionThermalized {
   float omega_shear_;
   uint32_t seed_;
   uint32_t time_step_;
-  std::function<void(IBlock *, uint32_t &, uint32_t &, uint32_t &)>
-      block_offset_generator =
-          [](IBlock *const, uint32_t &, uint32_t &, uint32_t &) {};
+  bool configured_;
 };
 
 } // namespace pystencils
@@ -120,4 +127,4 @@ class CollideSweepSinglePrecisionThermalized {
 #if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
     (defined WALBERLA_CXX_COMPILER_IS_CLANG)
 #pragma GCC diagnostic pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedAVX.cpp b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedAVX.cpp
index 0455514ffaa..0a650a8e75c 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedAVX.cpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedAVX.cpp
@@ -14,11 +14,10 @@
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
 //! \\file CollideSweepSinglePrecisionThermalizedAVX.cpp
-//! \\ingroup lbm
-//! \\author lbmpy
+//! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2, lbmpy_walberla/pystencils_walberla from waLBerla commit 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #include <cmath>
 
@@ -65,75 +64,31 @@ static FUNC_PREFIX void collidesweepsingleprecisionthermalizedavx_collidesweepsi
   const float rr_0 = 0.0f;
   const float xi_53 = rr_0 * 0.041666666666666664f;
   for (int64_t ctr_2 = 0; ctr_2 < _size_force_2; ctr_2 += 1) {
-    float *RESTRICT _data_pdfs_20_36 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3;
-    float *RESTRICT _data_force_20_32 = _data_force + _stride_force_2 * ctr_2 + 2 * _stride_force_3;
-    float *RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_311 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_318 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_313 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2 * ctr_2;
-    float *RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_317 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3;
-    float *RESTRICT _data_force_20_30 = _data_force + _stride_force_2 * ctr_2;
-    float *RESTRICT _data_pdfs_20_35 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_314 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_312 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_316 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_315 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3;
-    float *RESTRICT _data_force_20_31 = _data_force + _stride_force_2 * ctr_2 + _stride_force_3;
-    float *RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3;
     for (int64_t ctr_1 = 0; ctr_1 < _size_force_1; ctr_1 += 1) {
-      float *RESTRICT _data_pdfs_20_36_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_36;
-      float *RESTRICT _data_force_20_32_10 = _stride_force_1 * ctr_1 + _data_force_20_32;
-      float *RESTRICT _data_pdfs_20_31_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_31;
-      float *RESTRICT _data_pdfs_20_32_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_32;
-      float *RESTRICT _data_pdfs_20_311_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_311;
-      float *RESTRICT _data_pdfs_20_318_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_318;
-      float *RESTRICT _data_pdfs_20_313_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_313;
-      float *RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_30;
-      float *RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_34;
-      float *RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_33;
-      float *RESTRICT _data_pdfs_20_317_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_317;
-      float *RESTRICT _data_force_20_30_10 = _stride_force_1 * ctr_1 + _data_force_20_30;
-      float *RESTRICT _data_pdfs_20_35_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_35;
-      float *RESTRICT _data_pdfs_20_314_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_314;
-      float *RESTRICT _data_pdfs_20_312_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_312;
-      float *RESTRICT _data_pdfs_20_316_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_316;
-      float *RESTRICT _data_pdfs_20_38_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_38;
-      float *RESTRICT _data_pdfs_20_315_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_315;
-      float *RESTRICT _data_force_20_31_10 = _stride_force_1 * ctr_1 + _data_force_20_31;
-      float *RESTRICT _data_pdfs_20_310_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_310;
-      float *RESTRICT _data_pdfs_20_39_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_39;
-      float *RESTRICT _data_pdfs_20_37_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_37;
       {
         for (int64_t ctr_0 = 0; ctr_0 < (int64_t)((_size_force_0) / (8)) * (8); ctr_0 += 8) {
-          const __m256 xi_244 = _mm256_load_ps(&_data_pdfs_20_36_10[ctr_0]);
-          const __m256 xi_245 = _mm256_load_ps(&_data_force_20_32_10[ctr_0]);
-          const __m256 xi_246 = _mm256_load_ps(&_data_pdfs_20_31_10[ctr_0]);
-          const __m256 xi_247 = _mm256_load_ps(&_data_pdfs_20_32_10[ctr_0]);
-          const __m256 xi_248 = _mm256_load_ps(&_data_pdfs_20_311_10[ctr_0]);
-          const __m256 xi_249 = _mm256_load_ps(&_data_pdfs_20_318_10[ctr_0]);
-          const __m256 xi_250 = _mm256_load_ps(&_data_pdfs_20_313_10[ctr_0]);
-          const __m256 xi_251 = _mm256_load_ps(&_data_pdfs_20_30_10[ctr_0]);
-          const __m256 xi_252 = _mm256_load_ps(&_data_pdfs_20_34_10[ctr_0]);
-          const __m256 xi_253 = _mm256_load_ps(&_data_pdfs_20_33_10[ctr_0]);
-          const __m256 xi_254 = _mm256_load_ps(&_data_pdfs_20_317_10[ctr_0]);
-          const __m256 xi_255 = _mm256_load_ps(&_data_force_20_30_10[ctr_0]);
-          const __m256 xi_256 = _mm256_load_ps(&_data_pdfs_20_35_10[ctr_0]);
-          const __m256 xi_257 = _mm256_load_ps(&_data_pdfs_20_314_10[ctr_0]);
-          const __m256 xi_258 = _mm256_load_ps(&_data_pdfs_20_312_10[ctr_0]);
-          const __m256 xi_259 = _mm256_load_ps(&_data_pdfs_20_316_10[ctr_0]);
-          const __m256 xi_260 = _mm256_load_ps(&_data_pdfs_20_38_10[ctr_0]);
-          const __m256 xi_261 = _mm256_load_ps(&_data_pdfs_20_315_10[ctr_0]);
-          const __m256 xi_262 = _mm256_load_ps(&_data_force_20_31_10[ctr_0]);
-          const __m256 xi_263 = _mm256_load_ps(&_data_pdfs_20_310_10[ctr_0]);
-          const __m256 xi_264 = _mm256_load_ps(&_data_pdfs_20_39_10[ctr_0]);
-          const __m256 xi_265 = _mm256_load_ps(&_data_pdfs_20_37_10[ctr_0]);
+          const __m256 xi_244 = _mm256_loadu_ps(&_data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3 + ctr_0]);
+          const __m256 xi_245 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_246 = _mm256_load_ps(&_data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + ctr_0]);
+          const __m256 xi_247 = _mm256_load_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_248 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_249 = _mm256_load_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_250 = _mm256_load_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + ctr_0]);
+          const __m256 xi_251 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_252 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_253 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_254 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_255 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_256 = _mm256_loadu_ps(&_data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3 + ctr_0]);
+          const __m256 xi_257 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_258 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_259 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_260 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_261 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_262 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_263 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_264 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3 + ctr_0]);
+          const __m256 xi_265 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3 + ctr_0]);
 
           __m256 random_3_0{};
           __m256 random_3_1{};
@@ -166,66 +121,66 @@ static FUNC_PREFIX void collidesweepsingleprecisionthermalizedavx_collidesweepsi
           if (kT > 0.) {
             philox_float4(time_step, _mm256_add_epi32(_mm256_add_epi32(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), _mm256_set_epi32(ctr_0, ctr_0, ctr_0, ctr_0, ctr_0, ctr_0, ctr_0, ctr_0)), _mm256_set_epi32(((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)))), block_offset_1 + ctr_1, block_offset_2 + ctr_2, 0, seed, random_0_0, random_0_1, random_0_2, random_0_3);
           }
-          const __m256 xi_2 = _mm256_add_ps(xi_249, xi_257);
-          const __m256 xi_3 = _mm256_add_ps(xi_2, xi_252);
-          const __m256 xi_4 = _mm256_add_ps(_mm256_add_ps(xi_246, xi_248), xi_261);
-          const __m256 xi_5 = _mm256_add_ps(xi_256, xi_258);
-          const __m256 xi_6 = _mm256_add_ps(xi_244, xi_254);
-          const __m256 xi_8 = _mm256_mul_ps(xi_264, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
-          const __m256 xi_9 = _mm256_mul_ps(xi_265, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
-          const __m256 xi_10 = _mm256_mul_ps(xi_254, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
-          const __m256 xi_11 = _mm256_mul_ps(xi_250, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
-          const __m256 xi_12 = _mm256_mul_ps(xi_253, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
+          const __m256 xi_2 = _mm256_add_ps(xi_257, xi_262);
+          const __m256 xi_3 = _mm256_add_ps(xi_2, xi_265);
+          const __m256 xi_4 = _mm256_add_ps(_mm256_add_ps(xi_251, xi_254), xi_264);
+          const __m256 xi_5 = _mm256_add_ps(xi_253, xi_263);
+          const __m256 xi_6 = _mm256_add_ps(xi_245, xi_260);
+          const __m256 xi_8 = _mm256_mul_ps(xi_261, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
+          const __m256 xi_9 = _mm256_mul_ps(xi_255, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
+          const __m256 xi_10 = _mm256_mul_ps(xi_245, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
+          const __m256 xi_11 = _mm256_mul_ps(xi_259, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
+          const __m256 xi_12 = _mm256_mul_ps(xi_248, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
           const __m256 xi_13 = _mm256_add_ps(_mm256_add_ps(xi_10, xi_11), xi_12);
-          const __m256 xi_14 = _mm256_mul_ps(xi_247, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
-          const __m256 xi_15 = _mm256_mul_ps(xi_263, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
+          const __m256 xi_14 = _mm256_mul_ps(xi_252, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
+          const __m256 xi_15 = _mm256_mul_ps(xi_258, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
           const __m256 xi_16 = _mm256_add_ps(xi_14, xi_15);
-          const __m256 xi_17 = _mm256_mul_ps(xi_259, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
-          const __m256 xi_18 = _mm256_mul_ps(xi_258, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
+          const __m256 xi_17 = _mm256_mul_ps(xi_247, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
+          const __m256 xi_18 = _mm256_mul_ps(xi_253, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
           const __m256 xi_19 = _mm256_add_ps(xi_17, xi_18);
-          const __m256 xi_20 = _mm256_mul_ps(xi_249, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
+          const __m256 xi_20 = _mm256_mul_ps(xi_257, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
           const __m256 xi_21 = _mm256_add_ps(xi_10, xi_20);
-          const __m256 xi_22 = _mm256_mul_ps(xi_261, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
-          const __m256 xi_23 = _mm256_mul_ps(xi_244, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
-          const __m256 xi_24 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_17, xi_22), xi_23), xi_248);
-          const __m256 xi_29 = _mm256_mul_ps(xi_262, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f));
-          const __m256 xi_30 = _mm256_mul_ps(xi_262, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f));
-          const __m256 xi_42 = _mm256_mul_ps(xi_255, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f));
-          const __m256 xi_43 = _mm256_mul_ps(xi_255, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f));
-          const __m256 xi_49 = _mm256_mul_ps(xi_245, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f));
-          const __m256 xi_50 = _mm256_mul_ps(xi_245, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f));
-          const __m256 xi_67 = _mm256_mul_ps(xi_262, _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f));
-          const __m256 xi_72 = _mm256_mul_ps(xi_262, _mm256_set_ps(xi_71, xi_71, xi_71, xi_71, xi_71, xi_71, xi_71, xi_71));
-          const __m256 xi_114 = _mm256_mul_ps(xi_251, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
-          const __m256 xi_118 = _mm256_mul_ps(xi_248, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
+          const __m256 xi_22 = _mm256_mul_ps(xi_254, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
+          const __m256 xi_23 = _mm256_mul_ps(xi_260, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
+          const __m256 xi_24 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_17, xi_22), xi_23), xi_264);
+          const __m256 xi_29 = _mm256_mul_ps(xi_244, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f));
+          const __m256 xi_30 = _mm256_mul_ps(xi_244, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f));
+          const __m256 xi_42 = _mm256_mul_ps(xi_246, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f));
+          const __m256 xi_43 = _mm256_mul_ps(xi_246, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f));
+          const __m256 xi_49 = _mm256_mul_ps(xi_256, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f));
+          const __m256 xi_50 = _mm256_mul_ps(xi_256, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f));
+          const __m256 xi_67 = _mm256_mul_ps(xi_244, _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f));
+          const __m256 xi_72 = _mm256_mul_ps(xi_244, _mm256_set_ps(xi_71, xi_71, xi_71, xi_71, xi_71, xi_71, xi_71, xi_71));
+          const __m256 xi_114 = _mm256_mul_ps(xi_250, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
+          const __m256 xi_118 = _mm256_mul_ps(xi_264, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
           const __m256 xi_119 = _mm256_add_ps(xi_118, xi_18);
-          const __m256 xi_120 = _mm256_add_ps(_mm256_mul_ps(xi_260, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_8);
-          const __m256 xi_122 = _mm256_mul_ps(xi_257, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
+          const __m256 xi_120 = _mm256_add_ps(_mm256_mul_ps(xi_249, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_8);
+          const __m256 xi_122 = _mm256_mul_ps(xi_262, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
           const __m256 xi_123 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_11, xi_122), xi_15), xi_21);
-          const __m256 xi_125 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_248, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f)), _mm256_mul_ps(xi_258, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f))), _mm256_mul_ps(xi_259, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f))), _mm256_mul_ps(xi_261, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f)));
-          const __m256 xi_126 = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_252, _mm256_set_ps(5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f)), _mm256_mul_ps(xi_253, _mm256_set_ps(5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f))), xi_125);
-          const __m256 xi_128 = _mm256_mul_ps(xi_250, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f));
-          const __m256 xi_129 = _mm256_mul_ps(xi_257, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f));
-          const __m256 xi_130 = _mm256_add_ps(_mm256_mul_ps(xi_249, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f)), _mm256_mul_ps(xi_254, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f)));
-          const __m256 xi_132 = _mm256_add_ps(xi_118, xi_258);
-          const __m256 xi_133 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_132, xi_14), xi_22), xi_246), xi_259);
+          const __m256 xi_125 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_247, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f)), _mm256_mul_ps(xi_253, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f))), _mm256_mul_ps(xi_254, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f))), _mm256_mul_ps(xi_264, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f)));
+          const __m256 xi_126 = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_248, _mm256_set_ps(5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f)), _mm256_mul_ps(xi_265, _mm256_set_ps(5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f))), xi_125);
+          const __m256 xi_128 = _mm256_mul_ps(xi_259, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f));
+          const __m256 xi_129 = _mm256_mul_ps(xi_262, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f));
+          const __m256 xi_130 = _mm256_add_ps(_mm256_mul_ps(xi_245, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f)), _mm256_mul_ps(xi_257, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f)));
+          const __m256 xi_132 = _mm256_add_ps(xi_118, xi_253);
+          const __m256 xi_133 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_132, xi_14), xi_22), xi_247), xi_251);
           const __m256 xi_135 = _mm256_mul_ps(xi_133, _mm256_set_ps(xi_134, xi_134, xi_134, xi_134, xi_134, xi_134, xi_134, xi_134));
           const __m256 xi_136 = _mm256_add_ps(_mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f), random_2_3);
-          const __m256 xi_141 = _mm256_mul_ps(xi_265, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f));
-          const __m256 xi_142 = _mm256_mul_ps(xi_263, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f));
-          const __m256 xi_143 = _mm256_add_ps(_mm256_mul_ps(xi_264, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f)), _mm256_mul_ps(xi_260, _mm256_set_ps(-2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f)));
+          const __m256 xi_141 = _mm256_mul_ps(xi_255, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f));
+          const __m256 xi_142 = _mm256_mul_ps(xi_258, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f));
+          const __m256 xi_143 = _mm256_add_ps(_mm256_mul_ps(xi_261, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f)), _mm256_mul_ps(xi_249, _mm256_set_ps(-2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f)));
           const __m256 xi_144 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_141, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_14), xi_142), xi_143), xi_19), xi_4);
           const __m256 xi_146 = _mm256_mul_ps(xi_144, _mm256_set_ps(xi_145, xi_145, xi_145, xi_145, xi_145, xi_145, xi_145, xi_145));
           const __m256 xi_147 = _mm256_add_ps(_mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f), random_1_2);
           const __m256 xi_152 = _mm256_add_ps(_mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f), random_0_1);
-          const __m256 xi_166 = _mm256_add_ps(xi_122, xi_250);
-          const __m256 xi_167 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_12, xi_166), xi_20), xi_252), xi_254);
+          const __m256 xi_166 = _mm256_add_ps(xi_122, xi_259);
+          const __m256 xi_167 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_12, xi_166), xi_20), xi_245), xi_265);
           const __m256 xi_168 = _mm256_mul_ps(xi_167, _mm256_set_ps(xi_134, xi_134, xi_134, xi_134, xi_134, xi_134, xi_134, xi_134));
           const __m256 xi_169 = _mm256_add_ps(_mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f), random_2_1);
           const __m256 xi_171 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_142, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_13), xi_141), xi_143), xi_3);
           const __m256 xi_172 = _mm256_mul_ps(xi_171, _mm256_set_ps(xi_145, xi_145, xi_145, xi_145, xi_145, xi_145, xi_145, xi_145));
           const __m256 xi_173 = _mm256_add_ps(_mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f), random_2_0);
-          const __m256 xi_178 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_119, xi_23), xi_256), xi_259), xi_261);
+          const __m256 xi_178 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_119, xi_23), xi_247), xi_254), xi_263);
           const __m256 xi_179 = _mm256_mul_ps(xi_178, _mm256_set_ps(xi_134, xi_134, xi_134, xi_134, xi_134, xi_134, xi_134, xi_134));
           const __m256 xi_180 = _mm256_add_ps(_mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f), random_2_2);
           const __m256 xi_182 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_128, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_129, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), xi_130), xi_24), xi_5);
@@ -240,13 +195,13 @@ static FUNC_PREFIX void collidesweepsingleprecisionthermalizedavx_collidesweepsi
           const __m256 xi_31 = _mm256_mul_ps(xi_30, _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0));
           const __m256 xi_44 = _mm256_mul_ps(xi_43, _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0));
           const __m256 xi_51 = _mm256_mul_ps(xi_50, _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0));
-          const __m256 xi_54 = _mm256_mul_ps(xi_255, _mm256_set_ps(xi_53, xi_53, xi_53, xi_53, xi_53, xi_53, xi_53, xi_53));
-          const __m256 xi_59 = _mm256_mul_ps(xi_262, _mm256_set_ps(xi_53, xi_53, xi_53, xi_53, xi_53, xi_53, xi_53, xi_53));
-          const __m256 xi_81 = _mm256_mul_ps(xi_245, _mm256_set_ps(xi_53, xi_53, xi_53, xi_53, xi_53, xi_53, xi_53, xi_53));
-          const __m256 vel0Term = _mm256_add_ps(_mm256_add_ps(xi_260, xi_263), xi_3);
-          const __m256 vel1Term = _mm256_add_ps(xi_265, xi_4);
-          const __m256 vel2Term = _mm256_add_ps(xi_250, xi_5);
-          const __m256 rho = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(vel0Term, vel1Term), vel2Term), xi_247), xi_251), xi_253), xi_259), xi_264), xi_6);
+          const __m256 xi_54 = _mm256_mul_ps(xi_246, _mm256_set_ps(xi_53, xi_53, xi_53, xi_53, xi_53, xi_53, xi_53, xi_53));
+          const __m256 xi_59 = _mm256_mul_ps(xi_244, _mm256_set_ps(xi_53, xi_53, xi_53, xi_53, xi_53, xi_53, xi_53, xi_53));
+          const __m256 xi_81 = _mm256_mul_ps(xi_256, _mm256_set_ps(xi_53, xi_53, xi_53, xi_53, xi_53, xi_53, xi_53, xi_53));
+          const __m256 vel0Term = _mm256_add_ps(_mm256_add_ps(xi_249, xi_258), xi_3);
+          const __m256 vel1Term = _mm256_add_ps(xi_255, xi_4);
+          const __m256 vel2Term = _mm256_add_ps(xi_259, xi_5);
+          const __m256 rho = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(vel0Term, vel1Term), vel2Term), xi_247), xi_248), xi_250), xi_252), xi_261), xi_6);
           const __m256 xi_105 = _mm256_mul_ps(rho, _mm256_set_ps(kT, kT, kT, kT, kT, kT, kT, kT));
           const __m256 xi_106 = _mm256_sqrt_ps(_mm256_mul_ps(xi_105, _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_set_ps(omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even)), _mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f)), _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_set_ps(omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even)), _mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f)))), _mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f))));
           const __m256 xi_107 = _mm256_mul_ps(_mm256_mul_ps(xi_106, _mm256_add_ps(_mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f), random_3_0)), _mm256_set_ps(3.7416573867739413f, 3.7416573867739413f, 3.7416573867739413f, 3.7416573867739413f, 3.7416573867739413f, 3.7416573867739413f, 3.7416573867739413f, 3.7416573867739413f));
@@ -299,8 +254,8 @@ static FUNC_PREFIX void collidesweepsingleprecisionthermalizedavx_collidesweepsi
           const __m256 xi_242 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_214, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_218, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), xi_212), xi_216);
           const __m256 xi_0 = _mm256_div_ps(_mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f), rho);
           const __m256 xi_7 = _mm256_mul_ps(xi_0, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f));
-          const __m256 u_0 = _mm256_add_ps(_mm256_mul_ps(xi_0, _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(vel0Term, xi_13), xi_8), xi_9)), _mm256_mul_ps(xi_255, xi_7));
-          const __m256 xi_25 = _mm256_mul_ps(u_0, xi_255);
+          const __m256 u_0 = _mm256_add_ps(_mm256_mul_ps(xi_0, _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(vel0Term, xi_13), xi_8), xi_9)), _mm256_mul_ps(xi_246, xi_7));
+          const __m256 xi_25 = _mm256_mul_ps(u_0, xi_246);
           const __m256 xi_37 = _mm256_mul_ps(xi_25, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f));
           const __m256 xi_38 = _mm256_mul_ps(xi_25, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f));
           const __m256 xi_39 = _mm256_mul_ps(xi_38, _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear));
@@ -312,34 +267,34 @@ static FUNC_PREFIX void collidesweepsingleprecisionthermalizedavx_collidesweepsi
           const __m256 xi_73 = _mm256_mul_ps(u_0, xi_72);
           const __m256 xi_77 = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_54, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_43), xi_56);
           const __m256 xi_84 = _mm256_mul_ps(xi_38, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
-          const __m256 xi_95 = _mm256_mul_ps(u_0, xi_245);
+          const __m256 xi_95 = _mm256_mul_ps(u_0, xi_256);
           const __m256 xi_96 = _mm256_mul_ps(xi_95, _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f));
           const __m256 xi_99 = _mm256_mul_ps(xi_95, _mm256_set_ps(xi_71, xi_71, xi_71, xi_71, xi_71, xi_71, xi_71, xi_71));
           const __m256 xi_113 = _mm256_mul_ps(rho, _mm256_mul_ps(u_0, u_0));
-          const __m256 u_1 = _mm256_add_ps(_mm256_mul_ps(xi_0, _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(vel1Term, xi_16), xi_19), xi_260), xi_8)), _mm256_mul_ps(xi_262, xi_7));
-          const __m256 xi_26 = _mm256_mul_ps(u_1, xi_262);
+          const __m256 u_1 = _mm256_add_ps(_mm256_mul_ps(xi_0, _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(vel1Term, xi_16), xi_19), xi_249), xi_8)), _mm256_mul_ps(xi_244, xi_7));
+          const __m256 xi_26 = _mm256_mul_ps(u_1, xi_244);
           const __m256 xi_32 = _mm256_mul_ps(xi_26, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f));
           const __m256 xi_45 = _mm256_mul_ps(xi_26, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f));
           const __m256 xi_46 = _mm256_mul_ps(xi_45, _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear));
           const __m256 xi_47 = _mm256_add_ps(_mm256_mul_ps(xi_32, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_46);
           const __m256 xi_62 = _mm256_mul_ps(_mm256_mul_ps(xi_26, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_set_ps(xi_60, xi_60, xi_60, xi_60, xi_60, xi_60, xi_60, xi_60));
           const __m256 xi_69 = _mm256_mul_ps(u_1, _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f));
-          const __m256 xi_70 = _mm256_mul_ps(xi_255, xi_69);
+          const __m256 xi_70 = _mm256_mul_ps(xi_246, xi_69);
           const __m256 xi_74 = _mm256_mul_ps(u_1, _mm256_set_ps(xi_71, xi_71, xi_71, xi_71, xi_71, xi_71, xi_71, xi_71));
-          const __m256 xi_75 = _mm256_mul_ps(xi_255, xi_74);
+          const __m256 xi_75 = _mm256_mul_ps(xi_246, xi_74);
           const __m256 xi_76 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_68, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_70, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), xi_73), xi_75);
           const __m256 xi_78 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_73, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_75, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), xi_68), xi_70);
-          const __m256 xi_86 = _mm256_mul_ps(xi_245, xi_69);
-          const __m256 xi_88 = _mm256_mul_ps(xi_245, xi_74);
+          const __m256 xi_86 = _mm256_mul_ps(xi_256, xi_69);
+          const __m256 xi_88 = _mm256_mul_ps(xi_256, xi_74);
           const __m256 xi_93 = _mm256_mul_ps(xi_45, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
           const __m256 xi_112 = _mm256_mul_ps(rho, _mm256_mul_ps(u_1, u_1));
           const __m256 xi_121 = _mm256_add_ps(_mm256_add_ps(xi_112, xi_120), xi_9);
           const __m256 xi_197 = _mm256_mul_ps(rho, u_1);
-          const __m256 xi_199 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(u_0, xi_197), xi_120), xi_263), xi_265), _mm256_set_ps(xi_198, xi_198, xi_198, xi_198, xi_198, xi_198, xi_198, xi_198));
+          const __m256 xi_199 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(u_0, xi_197), xi_120), xi_255), xi_258), _mm256_set_ps(xi_198, xi_198, xi_198, xi_198, xi_198, xi_198, xi_198, xi_198));
           const __m256 xi_200 = _mm256_add_ps(_mm256_mul_ps(xi_196, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_199, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)));
           const __m256 xi_201 = _mm256_add_ps(xi_196, xi_199);
-          const __m256 u_2 = _mm256_add_ps(_mm256_mul_ps(xi_0, _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(vel2Term, xi_21), xi_24), xi_257)), _mm256_mul_ps(xi_245, xi_7));
-          const __m256 xi_27 = _mm256_mul_ps(u_2, xi_245);
+          const __m256 u_2 = _mm256_add_ps(_mm256_mul_ps(xi_0, _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(vel2Term, xi_21), xi_24), xi_262)), _mm256_mul_ps(xi_256, xi_7));
+          const __m256 xi_27 = _mm256_mul_ps(u_2, xi_256);
           const __m256 xi_33 = _mm256_mul_ps(xi_27, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f));
           const __m256 xi_34 = _mm256_mul_ps(xi_27, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f));
           const __m256 xi_35 = _mm256_mul_ps(xi_34, _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear));
@@ -363,7 +318,7 @@ static FUNC_PREFIX void collidesweepsingleprecisionthermalizedavx_collidesweepsi
           const __m256 xi_91 = _mm256_add_ps(_mm256_add_ps(xi_39, xi_79), xi_84);
           const __m256 xi_92 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_86, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_87, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), xi_88), xi_89);
           const __m256 xi_94 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_46, xi_61), xi_62), xi_63), xi_83), xi_93);
-          const __m256 xi_97 = _mm256_mul_ps(u_2, xi_255);
+          const __m256 xi_97 = _mm256_mul_ps(u_2, xi_246);
           const __m256 xi_98 = _mm256_mul_ps(xi_97, _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f));
           const __m256 xi_100 = _mm256_mul_ps(xi_97, _mm256_set_ps(xi_71, xi_71, xi_71, xi_71, xi_71, xi_71, xi_71, xi_71));
           const __m256 xi_101 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_96, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_98, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), xi_100), xi_99);
@@ -371,21 +326,21 @@ static FUNC_PREFIX void collidesweepsingleprecisionthermalizedavx_collidesweepsi
           const __m256 xi_103 = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_50, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_81), xi_82);
           const __m256 xi_104 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_103, xi_46), xi_61), xi_62), xi_63), xi_93);
           const __m256 xi_115 = _mm256_mul_ps(rho, _mm256_mul_ps(u_2, u_2));
-          const __m256 xi_116 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_244, _mm256_set_ps(3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f)), _mm256_mul_ps(xi_256, _mm256_set_ps(3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f))), _mm256_mul_ps(xi_115, _mm256_set_ps(0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f))), xi_114);
-          const __m256 xi_117 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_246, _mm256_set_ps(3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f)), _mm256_mul_ps(xi_247, _mm256_set_ps(3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f))), _mm256_mul_ps(xi_112, _mm256_set_ps(0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f))), _mm256_mul_ps(xi_113, _mm256_set_ps(1.6666666666666667f, 1.6666666666666667f, 1.6666666666666667f, 1.6666666666666667f, 1.6666666666666667f, 1.6666666666666667f, 1.6666666666666667f, 1.6666666666666667f))), _mm256_mul_ps(xi_248, _mm256_set_ps(-3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f))), _mm256_mul_ps(xi_258, _mm256_set_ps(-3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f))), _mm256_mul_ps(xi_259, _mm256_set_ps(-3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f))), _mm256_mul_ps(xi_261, _mm256_set_ps(-3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f))), xi_116), _mm256_set_ps(omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even));
-          const __m256 xi_124 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_113, xi_115), xi_119), xi_121), xi_123), xi_17), xi_22), xi_251), _mm256_set_ps(omega_bulk, omega_bulk, omega_bulk, omega_bulk, omega_bulk, omega_bulk, omega_bulk, omega_bulk));
-          const __m256 xi_127 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_112, _mm256_set_ps(2.3333333333333335f, 2.3333333333333335f, 2.3333333333333335f, 2.3333333333333335f, 2.3333333333333335f, 2.3333333333333335f, 2.3333333333333335f, 2.3333333333333335f)), _mm256_mul_ps(xi_246, _mm256_set_ps(-2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f))), _mm256_mul_ps(xi_247, _mm256_set_ps(-2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f))), _mm256_mul_ps(xi_249, _mm256_set_ps(-5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f))), _mm256_mul_ps(xi_250, _mm256_set_ps(-5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f))), _mm256_mul_ps(xi_254, _mm256_set_ps(-5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f))), _mm256_mul_ps(xi_257, _mm256_set_ps(-5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f))), xi_116), xi_126), _mm256_set_ps(omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even));
-          const __m256 xi_131 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_115, _mm256_set_ps(3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f)), _mm256_mul_ps(xi_246, _mm256_set_ps(5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f))), _mm256_mul_ps(xi_247, _mm256_set_ps(5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f))), _mm256_mul_ps(xi_244, _mm256_set_ps(-4.0f, -4.0f, -4.0f, -4.0f, -4.0f, -4.0f, -4.0f, -4.0f))), _mm256_mul_ps(xi_256, _mm256_set_ps(-4.0f, -4.0f, -4.0f, -4.0f, -4.0f, -4.0f, -4.0f, -4.0f))), _mm256_mul_ps(xi_260, _mm256_set_ps(-7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f))), _mm256_mul_ps(xi_263, _mm256_set_ps(-7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f))), _mm256_mul_ps(xi_264, _mm256_set_ps(-7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f))), _mm256_mul_ps(xi_265, _mm256_set_ps(-7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f))), xi_114), xi_126), xi_128), xi_129), xi_130), _mm256_set_ps(omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even));
-          const __m256 xi_156 = _mm256_add_ps(_mm256_mul_ps(xi_115, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_256);
-          const __m256 xi_157 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_246, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_121), xi_156), xi_16), xi_2), xi_250), xi_6), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear));
+          const __m256 xi_116 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_260, _mm256_set_ps(3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f)), _mm256_mul_ps(xi_263, _mm256_set_ps(3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f))), _mm256_mul_ps(xi_115, _mm256_set_ps(0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f))), xi_114);
+          const __m256 xi_117 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_251, _mm256_set_ps(3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f)), _mm256_mul_ps(xi_252, _mm256_set_ps(3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f))), _mm256_mul_ps(xi_112, _mm256_set_ps(0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f))), _mm256_mul_ps(xi_113, _mm256_set_ps(1.6666666666666667f, 1.6666666666666667f, 1.6666666666666667f, 1.6666666666666667f, 1.6666666666666667f, 1.6666666666666667f, 1.6666666666666667f, 1.6666666666666667f))), _mm256_mul_ps(xi_247, _mm256_set_ps(-3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f))), _mm256_mul_ps(xi_253, _mm256_set_ps(-3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f))), _mm256_mul_ps(xi_254, _mm256_set_ps(-3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f))), _mm256_mul_ps(xi_264, _mm256_set_ps(-3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f))), xi_116), _mm256_set_ps(omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even));
+          const __m256 xi_124 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_113, xi_115), xi_119), xi_121), xi_123), xi_17), xi_22), xi_250), _mm256_set_ps(omega_bulk, omega_bulk, omega_bulk, omega_bulk, omega_bulk, omega_bulk, omega_bulk, omega_bulk));
+          const __m256 xi_127 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_112, _mm256_set_ps(2.3333333333333335f, 2.3333333333333335f, 2.3333333333333335f, 2.3333333333333335f, 2.3333333333333335f, 2.3333333333333335f, 2.3333333333333335f, 2.3333333333333335f)), _mm256_mul_ps(xi_251, _mm256_set_ps(-2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f))), _mm256_mul_ps(xi_252, _mm256_set_ps(-2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f))), _mm256_mul_ps(xi_245, _mm256_set_ps(-5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f))), _mm256_mul_ps(xi_257, _mm256_set_ps(-5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f))), _mm256_mul_ps(xi_259, _mm256_set_ps(-5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f))), _mm256_mul_ps(xi_262, _mm256_set_ps(-5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f))), xi_116), xi_126), _mm256_set_ps(omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even));
+          const __m256 xi_131 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_115, _mm256_set_ps(3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f)), _mm256_mul_ps(xi_251, _mm256_set_ps(5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f))), _mm256_mul_ps(xi_252, _mm256_set_ps(5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f))), _mm256_mul_ps(xi_260, _mm256_set_ps(-4.0f, -4.0f, -4.0f, -4.0f, -4.0f, -4.0f, -4.0f, -4.0f))), _mm256_mul_ps(xi_263, _mm256_set_ps(-4.0f, -4.0f, -4.0f, -4.0f, -4.0f, -4.0f, -4.0f, -4.0f))), _mm256_mul_ps(xi_249, _mm256_set_ps(-7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f))), _mm256_mul_ps(xi_255, _mm256_set_ps(-7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f))), _mm256_mul_ps(xi_258, _mm256_set_ps(-7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f))), _mm256_mul_ps(xi_261, _mm256_set_ps(-7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f))), xi_114), xi_126), xi_128), xi_129), xi_130), _mm256_set_ps(omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even));
+          const __m256 xi_156 = _mm256_add_ps(_mm256_mul_ps(xi_115, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_263);
+          const __m256 xi_157 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_251, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_121), xi_156), xi_16), xi_2), xi_259), xi_6), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear));
           const __m256 xi_158 = _mm256_mul_ps(xi_157, _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f));
           const __m256 xi_159 = _mm256_add_ps(_mm256_mul_ps(xi_131, _mm256_set_ps(-0.01984126984126984f, -0.01984126984126984f, -0.01984126984126984f, -0.01984126984126984f, -0.01984126984126984f, -0.01984126984126984f, -0.01984126984126984f, -0.01984126984126984f)), _mm256_mul_ps(xi_107, _mm256_set_ps(-0.11904761904761904f, -0.11904761904761904f, -0.11904761904761904f, -0.11904761904761904f, -0.11904761904761904f, -0.11904761904761904f, -0.11904761904761904f, -0.11904761904761904f)));
-          const __m256 xi_160 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_112, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_113, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f))), _mm256_mul_ps(xi_252, _mm256_set_ps(-2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f))), _mm256_mul_ps(xi_253, _mm256_set_ps(-2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f))), xi_120), xi_123), xi_125), xi_156), xi_244), xi_246), xi_247), xi_9), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear));
+          const __m256 xi_160 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_113, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f)), _mm256_mul_ps(xi_112, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_248, _mm256_set_ps(-2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f))), _mm256_mul_ps(xi_265, _mm256_set_ps(-2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f))), xi_120), xi_123), xi_125), xi_156), xi_251), xi_252), xi_260), xi_9), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear));
           const __m256 xi_162 = _mm256_add_ps(_mm256_mul_ps(xi_160, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_mul_ps(xi_161, _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f)));
           const __m256 xi_163 = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_117, _mm256_set_ps(-0.050000000000000003f, -0.050000000000000003f, -0.050000000000000003f, -0.050000000000000003f, -0.050000000000000003f, -0.050000000000000003f, -0.050000000000000003f, -0.050000000000000003f)), _mm256_mul_ps(xi_108, _mm256_set_ps(-0.10000000000000001f, -0.10000000000000001f, -0.10000000000000001f, -0.10000000000000001f, -0.10000000000000001f, -0.10000000000000001f, -0.10000000000000001f, -0.10000000000000001f))), xi_162);
           const __m256 xi_164 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_127, _mm256_set_ps(0.014285714285714285f, 0.014285714285714285f, 0.014285714285714285f, 0.014285714285714285f, 0.014285714285714285f, 0.014285714285714285f, 0.014285714285714285f, 0.014285714285714285f)), _mm256_mul_ps(xi_111, _mm256_set_ps(0.028571428571428571f, 0.028571428571428571f, 0.028571428571428571f, 0.028571428571428571f, 0.028571428571428571f, 0.028571428571428571f, 0.028571428571428571f, 0.028571428571428571f))), xi_155), xi_158), xi_159), xi_163);
           const __m256 xi_176 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_160, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_mul_ps(xi_161, _mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f))), _mm256_mul_ps(xi_127, _mm256_set_ps(-0.035714285714285712f, -0.035714285714285712f, -0.035714285714285712f, -0.035714285714285712f, -0.035714285714285712f, -0.035714285714285712f, -0.035714285714285712f, -0.035714285714285712f))), _mm256_mul_ps(xi_111, _mm256_set_ps(-0.071428571428571425f, -0.071428571428571425f, -0.071428571428571425f, -0.071428571428571425f, -0.071428571428571425f, -0.071428571428571425f, -0.071428571428571425f, -0.071428571428571425f))), xi_159);
-          const __m256 xi_187 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_155, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_158, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_131, _mm256_set_ps(0.015873015873015872f, 0.015873015873015872f, 0.015873015873015872f, 0.015873015873015872f, 0.015873015873015872f, 0.015873015873015872f, 0.015873015873015872f, 0.015873015873015872f))), _mm256_mul_ps(xi_107, _mm256_set_ps(0.095238095238095233f, 0.095238095238095233f, 0.095238095238095233f, 0.095238095238095233f, 0.095238095238095233f, 0.095238095238095233f, 0.095238095238095233f, 0.095238095238095233f))), _mm256_mul_ps(xi_127, _mm256_set_ps(-0.021428571428571429f, -0.021428571428571429f, -0.021428571428571429f, -0.021428571428571429f, -0.021428571428571429f, -0.021428571428571429f, -0.021428571428571429f, -0.021428571428571429f))), _mm256_mul_ps(xi_111, _mm256_set_ps(-0.042857142857142858f, -0.042857142857142858f, -0.042857142857142858f, -0.042857142857142858f, -0.042857142857142858f, -0.042857142857142858f, -0.042857142857142858f, -0.042857142857142858f))), xi_163);
+          const __m256 xi_187 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_131, _mm256_set_ps(0.015873015873015872f, 0.015873015873015872f, 0.015873015873015872f, 0.015873015873015872f, 0.015873015873015872f, 0.015873015873015872f, 0.015873015873015872f, 0.015873015873015872f)), _mm256_mul_ps(xi_107, _mm256_set_ps(0.095238095238095233f, 0.095238095238095233f, 0.095238095238095233f, 0.095238095238095233f, 0.095238095238095233f, 0.095238095238095233f, 0.095238095238095233f, 0.095238095238095233f))), _mm256_mul_ps(xi_155, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_158, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_127, _mm256_set_ps(-0.021428571428571429f, -0.021428571428571429f, -0.021428571428571429f, -0.021428571428571429f, -0.021428571428571429f, -0.021428571428571429f, -0.021428571428571429f, -0.021428571428571429f))), _mm256_mul_ps(xi_111, _mm256_set_ps(-0.042857142857142858f, -0.042857142857142858f, -0.042857142857142858f, -0.042857142857142858f, -0.042857142857142858f, -0.042857142857142858f, -0.042857142857142858f, -0.042857142857142858f))), xi_163);
           const __m256 xi_190 = _mm256_mul_ps(xi_157, _mm256_set_ps(0.0625f, 0.0625f, 0.0625f, 0.0625f, 0.0625f, 0.0625f, 0.0625f, 0.0625f));
           const __m256 xi_191 = _mm256_mul_ps(xi_131, _mm256_set_ps(0.013888888888888888f, 0.013888888888888888f, 0.013888888888888888f, 0.013888888888888888f, 0.013888888888888888f, 0.013888888888888888f, 0.013888888888888888f, 0.013888888888888888f));
           const __m256 xi_193 = _mm256_add_ps(_mm256_mul_ps(xi_124, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_110, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)));
@@ -393,7 +348,7 @@ static FUNC_PREFIX void collidesweepsingleprecisionthermalizedavx_collidesweepsi
           const __m256 xi_195 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_165, xi_189), xi_190), xi_191), xi_192), xi_194);
           const __m256 xi_202 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_151, xi_189), xi_190), xi_191), xi_192), xi_194);
           const __m256 xi_204 = _mm256_mul_ps(xi_127, _mm256_set_ps(-0.0071428571428571426f, -0.0071428571428571426f, -0.0071428571428571426f, -0.0071428571428571426f, -0.0071428571428571426f, -0.0071428571428571426f, -0.0071428571428571426f, -0.0071428571428571426f));
-          const __m256 xi_205 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(u_2, xi_197), xi_132), xi_17), xi_261), _mm256_set_ps(xi_198, xi_198, xi_198, xi_198, xi_198, xi_198, xi_198, xi_198));
+          const __m256 xi_205 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(u_2, xi_197), xi_132), xi_17), xi_254), _mm256_set_ps(xi_198, xi_198, xi_198, xi_198, xi_198, xi_198, xi_198, xi_198));
           const __m256 xi_206 = _mm256_mul_ps(xi_117, _mm256_set_ps(0.025000000000000001f, 0.025000000000000001f, 0.025000000000000001f, 0.025000000000000001f, 0.025000000000000001f, 0.025000000000000001f, 0.025000000000000001f, 0.025000000000000001f));
           const __m256 xi_209 = _mm256_add_ps(_mm256_mul_ps(xi_131, _mm256_set_ps(-0.003968253968253968f, -0.003968253968253968f, -0.003968253968253968f, -0.003968253968253968f, -0.003968253968253968f, -0.003968253968253968f, -0.003968253968253968f, -0.003968253968253968f)), _mm256_mul_ps(xi_107, _mm256_set_ps(-0.023809523809523808f, -0.023809523809523808f, -0.023809523809523808f, -0.023809523809523808f, -0.023809523809523808f, -0.023809523809523808f, -0.023809523809523808f, -0.023809523809523808f)));
           const __m256 xi_210 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_162, xi_193), xi_203), xi_204), xi_205), xi_206), xi_207), xi_208), xi_209);
@@ -401,7 +356,7 @@ static FUNC_PREFIX void collidesweepsingleprecisionthermalizedavx_collidesweepsi
           const __m256 xi_228 = _mm256_mul_ps(xi_190, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
           const __m256 xi_229 = _mm256_mul_ps(xi_127, _mm256_set_ps(0.017857142857142856f, 0.017857142857142856f, 0.017857142857142856f, 0.017857142857142856f, 0.017857142857142856f, 0.017857142857142856f, 0.017857142857142856f, 0.017857142857142856f));
           const __m256 xi_231 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_188, xi_194), xi_209), xi_227), xi_228), xi_229), xi_230);
-          const __m256 xi_233 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(rho, u_0), u_2), xi_10), xi_166), xi_249), _mm256_set_ps(xi_198, xi_198, xi_198, xi_198, xi_198, xi_198, xi_198, xi_198));
+          const __m256 xi_233 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(rho, u_0), u_2), xi_10), xi_166), xi_257), _mm256_set_ps(xi_198, xi_198, xi_198, xi_198, xi_198, xi_198, xi_198, xi_198));
           const __m256 xi_234 = _mm256_add_ps(_mm256_mul_ps(xi_232, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_233, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)));
           const __m256 xi_240 = _mm256_add_ps(xi_232, xi_233);
           const __m256 xi_243 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_186, xi_194), xi_209), xi_227), xi_228), xi_229), xi_230);
@@ -424,49 +379,49 @@ static FUNC_PREFIX void collidesweepsingleprecisionthermalizedavx_collidesweepsi
           const __m256 forceTerm_16 = _mm256_add_ps(_mm256_add_ps(xi_103, xi_90), xi_91);
           const __m256 forceTerm_17 = _mm256_add_ps(_mm256_add_ps(xi_102, xi_104), xi_57);
           const __m256 forceTerm_18 = _mm256_add_ps(_mm256_add_ps(xi_101, xi_104), xi_77);
-          _mm256_store_ps(&_data_pdfs_20_30_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_110, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_131, _mm256_set_ps(0.023809523809523808f, 0.023809523809523808f, 0.023809523809523808f, 0.023809523809523808f, 0.023809523809523808f, 0.023809523809523808f, 0.023809523809523808f, 0.023809523809523808f))), _mm256_mul_ps(xi_107, _mm256_set_ps(0.14285714285714285f, 0.14285714285714285f, 0.14285714285714285f, 0.14285714285714285f, 0.14285714285714285f, 0.14285714285714285f, 0.14285714285714285f, 0.14285714285714285f))), _mm256_mul_ps(xi_127, _mm256_set_ps(0.042857142857142858f, 0.042857142857142858f, 0.042857142857142858f, 0.042857142857142858f, 0.042857142857142858f, 0.042857142857142858f, 0.042857142857142858f, 0.042857142857142858f))), _mm256_mul_ps(xi_111, _mm256_set_ps(0.085714285714285715f, 0.085714285714285715f, 0.085714285714285715f, 0.085714285714285715f, 0.085714285714285715f, 0.085714285714285715f, 0.085714285714285715f, 0.085714285714285715f))), _mm256_mul_ps(xi_117, _mm256_set_ps(0.10000000000000001f, 0.10000000000000001f, 0.10000000000000001f, 0.10000000000000001f, 0.10000000000000001f, 0.10000000000000001f, 0.10000000000000001f, 0.10000000000000001f))), _mm256_mul_ps(xi_108, _mm256_set_ps(0.20000000000000001f, 0.20000000000000001f, 0.20000000000000001f, 0.20000000000000001f, 0.20000000000000001f, 0.20000000000000001f, 0.20000000000000001f, 0.20000000000000001f))), _mm256_mul_ps(xi_124, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), forceTerm_0), xi_251));
-          _mm256_store_ps(&_data_pdfs_20_31_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_135, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_140, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), forceTerm_1), xi_151), xi_164), xi_246));
-          _mm256_store_ps(&_data_pdfs_20_32_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_2, xi_135), xi_140), xi_164), xi_165), xi_247));
-          _mm256_store_ps(&_data_pdfs_20_33_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_3, xi_168), xi_170), xi_175), xi_176), xi_253));
-          _mm256_store_ps(&_data_pdfs_20_34_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_168, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_170, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), forceTerm_4), xi_176), xi_177), xi_252));
-          _mm256_store_ps(&_data_pdfs_20_35_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_179, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_181, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), forceTerm_5), xi_186), xi_187), xi_256));
-          _mm256_store_ps(&_data_pdfs_20_36_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_6, xi_179), xi_181), xi_187), xi_188), xi_244));
-          _mm256_store_ps(&_data_pdfs_20_37_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_7, xi_177), xi_195), xi_200), xi_265));
-          _mm256_store_ps(&_data_pdfs_20_38_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_8, xi_175), xi_195), xi_201), xi_260));
-          _mm256_store_ps(&_data_pdfs_20_39_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_9, xi_177), xi_201), xi_202), xi_264));
-          _mm256_store_ps(&_data_pdfs_20_310_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_10, xi_175), xi_200), xi_202), xi_263));
-          _mm256_store_ps(&_data_pdfs_20_311_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_11, xi_210), xi_219), xi_224), xi_248));
-          _mm256_store_ps(&_data_pdfs_20_312_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_12, xi_219), xi_225), xi_226), xi_258));
-          _mm256_store_ps(&_data_pdfs_20_313_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_13, xi_231), xi_234), xi_239), xi_250));
-          _mm256_store_ps(&_data_pdfs_20_314_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_14, xi_231), xi_240), xi_241), xi_257));
-          _mm256_store_ps(&_data_pdfs_20_315_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_15, xi_224), xi_226), xi_242), xi_261));
-          _mm256_store_ps(&_data_pdfs_20_316_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_16, xi_210), xi_225), xi_242), xi_259));
-          _mm256_store_ps(&_data_pdfs_20_317_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_17, xi_239), xi_240), xi_243), xi_254));
-          _mm256_store_ps(&_data_pdfs_20_318_10[ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_18, xi_234), xi_241), xi_243), xi_249));
+          _mm256_store_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_131, _mm256_set_ps(0.023809523809523808f, 0.023809523809523808f, 0.023809523809523808f, 0.023809523809523808f, 0.023809523809523808f, 0.023809523809523808f, 0.023809523809523808f, 0.023809523809523808f)), _mm256_mul_ps(xi_107, _mm256_set_ps(0.14285714285714285f, 0.14285714285714285f, 0.14285714285714285f, 0.14285714285714285f, 0.14285714285714285f, 0.14285714285714285f, 0.14285714285714285f, 0.14285714285714285f))), _mm256_mul_ps(xi_127, _mm256_set_ps(0.042857142857142858f, 0.042857142857142858f, 0.042857142857142858f, 0.042857142857142858f, 0.042857142857142858f, 0.042857142857142858f, 0.042857142857142858f, 0.042857142857142858f))), _mm256_mul_ps(xi_111, _mm256_set_ps(0.085714285714285715f, 0.085714285714285715f, 0.085714285714285715f, 0.085714285714285715f, 0.085714285714285715f, 0.085714285714285715f, 0.085714285714285715f, 0.085714285714285715f))), _mm256_mul_ps(xi_117, _mm256_set_ps(0.10000000000000001f, 0.10000000000000001f, 0.10000000000000001f, 0.10000000000000001f, 0.10000000000000001f, 0.10000000000000001f, 0.10000000000000001f, 0.10000000000000001f))), _mm256_mul_ps(xi_108, _mm256_set_ps(0.20000000000000001f, 0.20000000000000001f, 0.20000000000000001f, 0.20000000000000001f, 0.20000000000000001f, 0.20000000000000001f, 0.20000000000000001f, 0.20000000000000001f))), _mm256_mul_ps(xi_124, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_110, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), forceTerm_0), xi_250));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_135, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_140, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), forceTerm_1), xi_151), xi_164), xi_251));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_2, xi_135), xi_140), xi_164), xi_165), xi_252));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_3, xi_168), xi_170), xi_175), xi_176), xi_248));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_168, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_170, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), forceTerm_4), xi_176), xi_177), xi_265));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_179, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_181, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), forceTerm_5), xi_186), xi_187), xi_263));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_6, xi_179), xi_181), xi_187), xi_188), xi_260));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_7, xi_177), xi_195), xi_200), xi_255));
+          _mm256_store_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_8, xi_175), xi_195), xi_201), xi_249));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_9, xi_177), xi_201), xi_202), xi_261));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_10, xi_175), xi_200), xi_202), xi_258));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_11, xi_210), xi_219), xi_224), xi_264));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_12, xi_219), xi_225), xi_226), xi_253));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_13, xi_231), xi_234), xi_239), xi_259));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_14, xi_231), xi_240), xi_241), xi_262));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_15, xi_224), xi_226), xi_242), xi_254));
+          _mm256_store_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_16, xi_210), xi_225), xi_242), xi_247));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_17, xi_239), xi_240), xi_243), xi_245));
+          _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_18, xi_234), xi_241), xi_243), xi_257));
         }
         for (int64_t ctr_0 = (int64_t)((_size_force_0) / (8)) * (8); ctr_0 < _size_force_0; ctr_0 += 1) {
-          const float xi_244 = _data_pdfs_20_36_10[ctr_0];
-          const float xi_245 = _data_force_20_32_10[ctr_0];
-          const float xi_246 = _data_pdfs_20_31_10[ctr_0];
-          const float xi_247 = _data_pdfs_20_32_10[ctr_0];
-          const float xi_248 = _data_pdfs_20_311_10[ctr_0];
-          const float xi_249 = _data_pdfs_20_318_10[ctr_0];
-          const float xi_250 = _data_pdfs_20_313_10[ctr_0];
-          const float xi_251 = _data_pdfs_20_30_10[ctr_0];
-          const float xi_252 = _data_pdfs_20_34_10[ctr_0];
-          const float xi_253 = _data_pdfs_20_33_10[ctr_0];
-          const float xi_254 = _data_pdfs_20_317_10[ctr_0];
-          const float xi_255 = _data_force_20_30_10[ctr_0];
-          const float xi_256 = _data_pdfs_20_35_10[ctr_0];
-          const float xi_257 = _data_pdfs_20_314_10[ctr_0];
-          const float xi_258 = _data_pdfs_20_312_10[ctr_0];
-          const float xi_259 = _data_pdfs_20_316_10[ctr_0];
-          const float xi_260 = _data_pdfs_20_38_10[ctr_0];
-          const float xi_261 = _data_pdfs_20_315_10[ctr_0];
-          const float xi_262 = _data_force_20_31_10[ctr_0];
-          const float xi_263 = _data_pdfs_20_310_10[ctr_0];
-          const float xi_264 = _data_pdfs_20_39_10[ctr_0];
-          const float xi_265 = _data_pdfs_20_37_10[ctr_0];
+          const float xi_244 = _data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3 + ctr_0];
+          const float xi_245 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3 + ctr_0];
+          const float xi_246 = _data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + ctr_0];
+          const float xi_247 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3 + ctr_0];
+          const float xi_248 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3 + ctr_0];
+          const float xi_249 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3 + ctr_0];
+          const float xi_250 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + ctr_0];
+          const float xi_251 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3 + ctr_0];
+          const float xi_252 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3 + ctr_0];
+          const float xi_253 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3 + ctr_0];
+          const float xi_254 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3 + ctr_0];
+          const float xi_255 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3 + ctr_0];
+          const float xi_256 = _data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3 + ctr_0];
+          const float xi_257 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3 + ctr_0];
+          const float xi_258 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3 + ctr_0];
+          const float xi_259 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3 + ctr_0];
+          const float xi_260 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3 + ctr_0];
+          const float xi_261 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3 + ctr_0];
+          const float xi_262 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3 + ctr_0];
+          const float xi_263 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3 + ctr_0];
+          const float xi_264 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3 + ctr_0];
+          const float xi_265 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3 + ctr_0];
 
           float random_3_0{};
           float random_3_1{};
@@ -499,69 +454,69 @@ static FUNC_PREFIX void collidesweepsingleprecisionthermalizedavx_collidesweepsi
           if (kT > 0.) {
             philox_float4(time_step, block_offset_0 + ctr_0, block_offset_1 + ctr_1, block_offset_2 + ctr_2, 0, seed, random_0_0, random_0_1, random_0_2, random_0_3);
           }
-          const float xi_2 = xi_249 + xi_257;
-          const float xi_3 = xi_2 + xi_252;
-          const float xi_4 = xi_246 + xi_248 + xi_261;
-          const float xi_5 = xi_256 + xi_258;
-          const float xi_6 = xi_244 + xi_254;
-          const float xi_8 = xi_264 * -1.0f;
-          const float xi_9 = xi_265 * -1.0f;
-          const float xi_10 = xi_254 * -1.0f;
-          const float xi_11 = xi_250 * -1.0f;
-          const float xi_12 = xi_253 * -1.0f;
+          const float xi_2 = xi_257 + xi_262;
+          const float xi_3 = xi_2 + xi_265;
+          const float xi_4 = xi_251 + xi_254 + xi_264;
+          const float xi_5 = xi_253 + xi_263;
+          const float xi_6 = xi_245 + xi_260;
+          const float xi_8 = -xi_261;
+          const float xi_9 = -xi_255;
+          const float xi_10 = -xi_245;
+          const float xi_11 = -xi_259;
+          const float xi_12 = -xi_248;
           const float xi_13 = xi_10 + xi_11 + xi_12;
-          const float xi_14 = xi_247 * -1.0f;
-          const float xi_15 = xi_263 * -1.0f;
+          const float xi_14 = -xi_252;
+          const float xi_15 = -xi_258;
           const float xi_16 = xi_14 + xi_15;
-          const float xi_17 = xi_259 * -1.0f;
-          const float xi_18 = xi_258 * -1.0f;
+          const float xi_17 = -xi_247;
+          const float xi_18 = -xi_253;
           const float xi_19 = xi_17 + xi_18;
-          const float xi_20 = xi_249 * -1.0f;
+          const float xi_20 = -xi_257;
           const float xi_21 = xi_10 + xi_20;
-          const float xi_22 = xi_261 * -1.0f;
-          const float xi_23 = xi_244 * -1.0f;
-          const float xi_24 = xi_17 + xi_22 + xi_23 + xi_248;
-          const float xi_29 = xi_262 * 0.16666666666666666f;
-          const float xi_30 = xi_262 * 0.083333333333333329f;
-          const float xi_42 = xi_255 * 0.16666666666666666f;
-          const float xi_43 = xi_255 * 0.083333333333333329f;
-          const float xi_49 = xi_245 * 0.16666666666666666f;
-          const float xi_50 = xi_245 * 0.083333333333333329f;
-          const float xi_67 = xi_262 * 0.25f;
-          const float xi_72 = xi_262 * xi_71;
-          const float xi_114 = xi_251 * -1.0f;
-          const float xi_118 = xi_248 * -1.0f;
+          const float xi_22 = -xi_254;
+          const float xi_23 = -xi_260;
+          const float xi_24 = xi_17 + xi_22 + xi_23 + xi_264;
+          const float xi_29 = xi_244 * 0.16666666666666666f;
+          const float xi_30 = xi_244 * 0.083333333333333329f;
+          const float xi_42 = xi_246 * 0.16666666666666666f;
+          const float xi_43 = xi_246 * 0.083333333333333329f;
+          const float xi_49 = xi_256 * 0.16666666666666666f;
+          const float xi_50 = xi_256 * 0.083333333333333329f;
+          const float xi_67 = xi_244 * 0.25f;
+          const float xi_72 = xi_244 * xi_71;
+          const float xi_114 = -xi_250;
+          const float xi_118 = -xi_264;
           const float xi_119 = xi_118 + xi_18;
-          const float xi_120 = xi_260 * -1.0f + xi_8;
-          const float xi_122 = xi_257 * -1.0f;
+          const float xi_120 = -xi_249 + xi_8;
+          const float xi_122 = -xi_262;
           const float xi_123 = xi_11 + xi_122 + xi_15 + xi_21;
-          const float xi_125 = xi_248 * 2.0f + xi_258 * 2.0f + xi_259 * 2.0f + xi_261 * 2.0f;
-          const float xi_126 = xi_125 + xi_252 * 5.0f + xi_253 * 5.0f;
-          const float xi_128 = xi_250 * 2.0f;
-          const float xi_129 = xi_257 * 2.0f;
-          const float xi_130 = xi_249 * 2.0f + xi_254 * 2.0f;
-          const float xi_132 = xi_118 + xi_258;
-          const float xi_133 = xi_132 + xi_14 + xi_22 + xi_246 + xi_259;
+          const float xi_125 = xi_247 * 2.0f + xi_253 * 2.0f + xi_254 * 2.0f + xi_264 * 2.0f;
+          const float xi_126 = xi_125 + xi_248 * 5.0f + xi_265 * 5.0f;
+          const float xi_128 = xi_259 * 2.0f;
+          const float xi_129 = xi_262 * 2.0f;
+          const float xi_130 = xi_245 * 2.0f + xi_257 * 2.0f;
+          const float xi_132 = xi_118 + xi_253;
+          const float xi_133 = xi_132 + xi_14 + xi_22 + xi_247 + xi_251;
           const float xi_135 = xi_133 * xi_134;
           const float xi_136 = random_2_3 - 0.5f;
-          const float xi_141 = xi_265 * 2.0f;
-          const float xi_142 = xi_263 * 2.0f;
-          const float xi_143 = xi_260 * -2.0f + xi_264 * 2.0f;
-          const float xi_144 = xi_14 + xi_141 * -1.0f + xi_142 + xi_143 + xi_19 + xi_4;
+          const float xi_141 = xi_255 * 2.0f;
+          const float xi_142 = xi_258 * 2.0f;
+          const float xi_143 = xi_249 * -2.0f + xi_261 * 2.0f;
+          const float xi_144 = xi_14 - xi_141 + xi_142 + xi_143 + xi_19 + xi_4;
           const float xi_146 = xi_144 * xi_145;
           const float xi_147 = random_1_2 - 0.5f;
           const float xi_152 = random_0_1 - 0.5f;
-          const float xi_166 = xi_122 + xi_250;
-          const float xi_167 = xi_12 + xi_166 + xi_20 + xi_252 + xi_254;
+          const float xi_166 = xi_122 + xi_259;
+          const float xi_167 = xi_12 + xi_166 + xi_20 + xi_245 + xi_265;
           const float xi_168 = xi_134 * xi_167;
           const float xi_169 = random_2_1 - 0.5f;
-          const float xi_171 = xi_13 + xi_141 + xi_142 * -1.0f + xi_143 + xi_3;
+          const float xi_171 = xi_13 + xi_141 - xi_142 + xi_143 + xi_3;
           const float xi_172 = xi_145 * xi_171;
           const float xi_173 = random_2_0 - 0.5f;
-          const float xi_178 = xi_119 + xi_23 + xi_256 + xi_259 + xi_261;
+          const float xi_178 = xi_119 + xi_23 + xi_247 + xi_254 + xi_263;
           const float xi_179 = xi_134 * xi_178;
           const float xi_180 = random_2_2 - 0.5f;
-          const float xi_182 = xi_128 * -1.0f + xi_129 * -1.0f + xi_130 + xi_24 + xi_5;
+          const float xi_182 = -xi_128 - xi_129 + xi_130 + xi_24 + xi_5;
           const float xi_183 = xi_145 * xi_182;
           const float xi_184 = random_1_3 - 0.5f;
           const float xi_212 = xi_182 * xi_211;
@@ -573,28 +528,28 @@ static FUNC_PREFIX void collidesweepsingleprecisionthermalizedavx_collidesweepsi
           const float xi_31 = rr_0 * xi_30;
           const float xi_44 = rr_0 * xi_43;
           const float xi_51 = rr_0 * xi_50;
-          const float xi_54 = xi_255 * xi_53;
-          const float xi_59 = xi_262 * xi_53;
-          const float xi_81 = xi_245 * xi_53;
-          const float vel0Term = xi_260 + xi_263 + xi_3;
-          const float vel1Term = xi_265 + xi_4;
-          const float vel2Term = xi_250 + xi_5;
-          const float rho = vel0Term + vel1Term + vel2Term + xi_247 + xi_251 + xi_253 + xi_259 + xi_264 + xi_6;
+          const float xi_54 = xi_246 * xi_53;
+          const float xi_59 = xi_244 * xi_53;
+          const float xi_81 = xi_256 * xi_53;
+          const float vel0Term = xi_249 + xi_258 + xi_3;
+          const float vel1Term = xi_255 + xi_4;
+          const float vel2Term = xi_259 + xi_5;
+          const float rho = vel0Term + vel1Term + vel2Term + xi_247 + xi_248 + xi_250 + xi_252 + xi_261 + xi_6;
           const float xi_105 = kT * rho;
-          const float xi_106 = powf(xi_105 * (-1.0f * (omega_even * -1.0f + 1.0f) * (omega_even * -1.0f + 1.0f) + 1.0f), 0.5f);
+          const float xi_106 = powf(xi_105 * (1.0f - ((-omega_even + 1.0f) * (-omega_even + 1.0f))), 0.5f);
           const float xi_107 = xi_106 * (random_3_0 - 0.5f) * 3.7416573867739413f;
           const float xi_108 = xi_106 * (random_3_2 - 0.5f) * 5.4772255750516612f;
-          const float xi_110 = xi_109 * (random_1_1 - 0.5f) * powf(xi_105 * (-1.0f * (omega_bulk * -1.0f + 1.0f) * (omega_bulk * -1.0f + 1.0f) + 1.0f), 0.5f);
+          const float xi_110 = xi_109 * (random_1_1 - 0.5f) * powf(xi_105 * (1.0f - ((-omega_bulk + 1.0f) * (-omega_bulk + 1.0f))), 0.5f);
           const float xi_111 = xi_106 * (random_3_1 - 0.5f) * 8.3666002653407556f;
-          const float xi_137 = powf(xi_105 * (-1.0f * (omega_odd * -1.0f + 1.0f) * (omega_odd * -1.0f + 1.0f) + 1.0f), 0.5f);
+          const float xi_137 = powf(xi_105 * (1.0f - ((-omega_odd + 1.0f) * (-omega_odd + 1.0f))), 0.5f);
           const float xi_138 = xi_137 * 1.4142135623730951f;
           const float xi_139 = xi_138 * 0.5f;
           const float xi_140 = xi_136 * xi_139;
           const float xi_148 = xi_109 * xi_137;
           const float xi_149 = xi_148 * 0.16666666666666666f;
           const float xi_150 = xi_147 * xi_149;
-          const float xi_151 = xi_146 * -1.0f + xi_150 * -1.0f;
-          const float xi_153 = powf(xi_105 * (-1.0f * (omega_shear * -1.0f + 1.0f) * (omega_shear * -1.0f + 1.0f) + 1.0f), 0.5f);
+          const float xi_151 = -xi_146 - xi_150;
+          const float xi_153 = powf(xi_105 * (1.0f - ((-omega_shear + 1.0f) * (-omega_shear + 1.0f))), 0.5f);
           const float xi_154 = xi_153 * 0.5f;
           const float xi_155 = xi_152 * xi_154;
           const float xi_161 = xi_153 * (random_0_0 - 0.5f) * 1.7320508075688772f;
@@ -602,10 +557,10 @@ static FUNC_PREFIX void collidesweepsingleprecisionthermalizedavx_collidesweepsi
           const float xi_170 = xi_139 * xi_169;
           const float xi_174 = xi_149 * xi_173;
           const float xi_175 = xi_172 + xi_174;
-          const float xi_177 = xi_172 * -1.0f + xi_174 * -1.0f;
+          const float xi_177 = -xi_172 - xi_174;
           const float xi_181 = xi_139 * xi_180;
           const float xi_185 = xi_149 * xi_184;
-          const float xi_186 = xi_183 * -1.0f + xi_185 * -1.0f;
+          const float xi_186 = -xi_183 - xi_185;
           const float xi_188 = xi_183 + xi_185;
           const float xi_189 = xi_152 * xi_153 * 0.25f;
           const float xi_192 = xi_107 * 0.083333333333333329f;
@@ -617,108 +572,108 @@ static FUNC_PREFIX void collidesweepsingleprecisionthermalizedavx_collidesweepsi
           const float xi_216 = xi_184 * xi_215;
           const float xi_217 = xi_138 * 0.25f;
           const float xi_218 = xi_180 * xi_217;
-          const float xi_219 = xi_212 * -1.0f + xi_214 + xi_216 * -1.0f + xi_218;
+          const float xi_219 = -xi_212 + xi_214 - xi_216 + xi_218;
           const float xi_222 = xi_147 * xi_215;
           const float xi_223 = xi_136 * xi_217;
-          const float xi_224 = xi_220 * -1.0f + xi_221 + xi_222 * -1.0f + xi_223;
-          const float xi_225 = xi_220 + xi_221 * -1.0f + xi_222 + xi_223 * -1.0f;
-          const float xi_227 = xi_189 * -1.0f;
+          const float xi_224 = -xi_220 + xi_221 - xi_222 + xi_223;
+          const float xi_225 = xi_220 - xi_221 + xi_222 - xi_223;
+          const float xi_227 = -xi_189;
           const float xi_230 = xi_111 * 0.035714285714285712f;
           const float xi_232 = xi_154 * (random_0_3 - 0.5f);
           const float xi_237 = xi_169 * xi_217;
           const float xi_238 = xi_173 * xi_215;
-          const float xi_239 = xi_235 * -1.0f + xi_236 + xi_237 * -1.0f + xi_238;
-          const float xi_241 = xi_235 + xi_236 * -1.0f + xi_237 + xi_238 * -1.0f;
-          const float xi_242 = xi_212 + xi_214 * -1.0f + xi_216 + xi_218 * -1.0f;
+          const float xi_239 = -xi_235 + xi_236 - xi_237 + xi_238;
+          const float xi_241 = xi_235 - xi_236 + xi_237 - xi_238;
+          const float xi_242 = xi_212 - xi_214 + xi_216 - xi_218;
           const float xi_0 = ((1.0f) / (rho));
           const float xi_7 = xi_0 * 0.5f;
-          const float u_0 = xi_0 * (vel0Term + xi_13 + xi_8 + xi_9) + xi_255 * xi_7;
-          const float xi_25 = u_0 * xi_255;
+          const float u_0 = xi_0 * (vel0Term + xi_13 + xi_8 + xi_9) + xi_246 * xi_7;
+          const float xi_25 = u_0 * xi_246;
           const float xi_37 = xi_25 * 0.16666666666666666f;
           const float xi_38 = xi_25 * 0.083333333333333329f;
           const float xi_39 = omega_shear * xi_38;
-          const float xi_40 = xi_37 * -1.0f + xi_39;
-          const float xi_56 = xi_25 * xi_55 * -1.0f + xi_37;
-          const float xi_57 = xi_43 * -1.0f + xi_54 + xi_56;
-          const float xi_61 = xi_25 * xi_60 * -1.0f;
+          const float xi_40 = -xi_37 + xi_39;
+          const float xi_56 = -xi_25 * xi_55 + xi_37;
+          const float xi_57 = -xi_43 + xi_54 + xi_56;
+          const float xi_61 = -xi_25 * xi_60;
           const float xi_68 = u_0 * xi_67;
           const float xi_73 = u_0 * xi_72;
-          const float xi_77 = xi_43 + xi_54 * -1.0f + xi_56;
-          const float xi_84 = xi_38 * -1.0f;
-          const float xi_95 = u_0 * xi_245;
+          const float xi_77 = xi_43 - xi_54 + xi_56;
+          const float xi_84 = -xi_38;
+          const float xi_95 = u_0 * xi_256;
           const float xi_96 = xi_95 * 0.25f;
           const float xi_99 = xi_71 * xi_95;
-          const float xi_113 = rho * u_0 * u_0;
-          const float u_1 = xi_0 * (vel1Term + xi_16 + xi_19 + xi_260 + xi_8) + xi_262 * xi_7;
-          const float xi_26 = u_1 * xi_262;
+          const float xi_113 = rho * (u_0 * u_0);
+          const float u_1 = xi_0 * (vel1Term + xi_16 + xi_19 + xi_249 + xi_8) + xi_244 * xi_7;
+          const float xi_26 = u_1 * xi_244;
           const float xi_32 = xi_26 * 0.16666666666666666f;
           const float xi_45 = xi_26 * 0.083333333333333329f;
           const float xi_46 = omega_shear * xi_45;
-          const float xi_47 = xi_32 * -1.0f + xi_46;
-          const float xi_62 = xi_26 * xi_60 * -1.0f;
+          const float xi_47 = -xi_32 + xi_46;
+          const float xi_62 = -xi_26 * xi_60;
           const float xi_69 = u_1 * 0.25f;
-          const float xi_70 = xi_255 * xi_69;
+          const float xi_70 = xi_246 * xi_69;
           const float xi_74 = u_1 * xi_71;
-          const float xi_75 = xi_255 * xi_74;
-          const float xi_76 = xi_68 * -1.0f + xi_70 * -1.0f + xi_73 + xi_75;
-          const float xi_78 = xi_68 + xi_70 + xi_73 * -1.0f + xi_75 * -1.0f;
-          const float xi_86 = xi_245 * xi_69;
-          const float xi_88 = xi_245 * xi_74;
-          const float xi_93 = xi_45 * -1.0f;
-          const float xi_112 = rho * u_1 * u_1;
+          const float xi_75 = xi_246 * xi_74;
+          const float xi_76 = -xi_68 - xi_70 + xi_73 + xi_75;
+          const float xi_78 = xi_68 + xi_70 - xi_73 - xi_75;
+          const float xi_86 = xi_256 * xi_69;
+          const float xi_88 = xi_256 * xi_74;
+          const float xi_93 = -xi_45;
+          const float xi_112 = rho * (u_1 * u_1);
           const float xi_121 = xi_112 + xi_120 + xi_9;
           const float xi_197 = rho * u_1;
-          const float xi_199 = xi_198 * (u_0 * xi_197 + xi_120 + xi_263 + xi_265);
-          const float xi_200 = xi_196 * -1.0f + xi_199 * -1.0f;
+          const float xi_199 = xi_198 * (u_0 * xi_197 + xi_120 + xi_255 + xi_258);
+          const float xi_200 = -xi_196 - xi_199;
           const float xi_201 = xi_196 + xi_199;
-          const float u_2 = xi_0 * (vel2Term + xi_21 + xi_24 + xi_257) + xi_245 * xi_7;
-          const float xi_27 = u_2 * xi_245;
+          const float u_2 = xi_0 * (vel2Term + xi_21 + xi_24 + xi_262) + xi_256 * xi_7;
+          const float xi_27 = u_2 * xi_256;
           const float xi_33 = xi_27 * 0.16666666666666666f;
           const float xi_34 = xi_27 * 0.083333333333333329f;
           const float xi_35 = omega_shear * xi_34;
-          const float xi_36 = xi_33 * -1.0f + xi_35;
-          const float xi_41 = omega_shear * xi_32 * -1.0f + xi_26 * 0.33333333333333331f + xi_36 + xi_40;
-          const float xi_48 = omega_shear * xi_37 * -1.0f + xi_25 * 0.33333333333333331f + xi_36 + xi_47;
-          const float xi_52 = omega_shear * xi_33 * -1.0f + xi_27 * 0.33333333333333331f + xi_40 + xi_47;
-          const float xi_58 = xi_34 * -1.0f;
-          const float xi_63 = xi_27 * xi_60 * -1.0f;
-          const float xi_64 = xi_26 * xi_55 * -1.0f + xi_32 + xi_61 + xi_62 + xi_63;
-          const float xi_65 = xi_30 + xi_59 * -1.0f + xi_64;
+          const float xi_36 = -xi_33 + xi_35;
+          const float xi_41 = -omega_shear * xi_32 + xi_26 * 0.33333333333333331f + xi_36 + xi_40;
+          const float xi_48 = -omega_shear * xi_37 + xi_25 * 0.33333333333333331f + xi_36 + xi_47;
+          const float xi_52 = -omega_shear * xi_33 + xi_27 * 0.33333333333333331f + xi_40 + xi_47;
+          const float xi_58 = -xi_34;
+          const float xi_63 = -xi_27 * xi_60;
+          const float xi_64 = -xi_26 * xi_55 + xi_32 + xi_61 + xi_62 + xi_63;
+          const float xi_65 = xi_30 - xi_59 + xi_64;
           const float xi_66 = xi_35 + xi_58 + xi_65;
-          const float xi_79 = xi_30 * -1.0f + xi_59 + xi_64;
+          const float xi_79 = -xi_30 + xi_59 + xi_64;
           const float xi_80 = xi_35 + xi_58 + xi_79;
-          const float xi_82 = xi_27 * xi_55 * -1.0f + xi_33;
-          const float xi_83 = xi_50 + xi_81 * -1.0f + xi_82;
+          const float xi_82 = -xi_27 * xi_55 + xi_33;
+          const float xi_83 = xi_50 - xi_81 + xi_82;
           const float xi_85 = xi_39 + xi_65 + xi_84;
           const float xi_87 = u_2 * xi_67;
           const float xi_89 = u_2 * xi_72;
-          const float xi_90 = xi_86 + xi_87 + xi_88 * -1.0f + xi_89 * -1.0f;
+          const float xi_90 = xi_86 + xi_87 - xi_88 - xi_89;
           const float xi_91 = xi_39 + xi_79 + xi_84;
-          const float xi_92 = xi_86 * -1.0f + xi_87 * -1.0f + xi_88 + xi_89;
+          const float xi_92 = -xi_86 - xi_87 + xi_88 + xi_89;
           const float xi_94 = xi_46 + xi_61 + xi_62 + xi_63 + xi_83 + xi_93;
-          const float xi_97 = u_2 * xi_255;
+          const float xi_97 = u_2 * xi_246;
           const float xi_98 = xi_97 * 0.25f;
           const float xi_100 = xi_71 * xi_97;
-          const float xi_101 = xi_100 + xi_96 * -1.0f + xi_98 * -1.0f + xi_99;
-          const float xi_102 = xi_100 * -1.0f + xi_96 + xi_98 + xi_99 * -1.0f;
-          const float xi_103 = xi_50 * -1.0f + xi_81 + xi_82;
+          const float xi_101 = xi_100 - xi_96 - xi_98 + xi_99;
+          const float xi_102 = -xi_100 + xi_96 + xi_98 - xi_99;
+          const float xi_103 = -xi_50 + xi_81 + xi_82;
           const float xi_104 = xi_103 + xi_46 + xi_61 + xi_62 + xi_63 + xi_93;
-          const float xi_115 = rho * u_2 * u_2;
-          const float xi_116 = xi_114 + xi_115 * 0.66666666666666663f + xi_244 * 3.0f + xi_256 * 3.0f;
-          const float xi_117 = omega_even * (xi_112 * 0.66666666666666663f + xi_113 * 1.6666666666666667f + xi_116 + xi_246 * 3.0f + xi_247 * 3.0f + xi_248 * -3.0f + xi_258 * -3.0f + xi_259 * -3.0f + xi_261 * -3.0f);
-          const float xi_124 = omega_bulk * (xi_113 + xi_115 + xi_119 + xi_121 + xi_123 + xi_17 + xi_22 + xi_251);
-          const float xi_127 = omega_even * (xi_112 * 2.3333333333333335f + xi_116 + xi_126 + xi_246 * -2.0f + xi_247 * -2.0f + xi_249 * -5.0f + xi_250 * -5.0f + xi_254 * -5.0f + xi_257 * -5.0f);
-          const float xi_131 = omega_even * (xi_114 + xi_115 * 3.0f + xi_126 + xi_128 + xi_129 + xi_130 + xi_244 * -4.0f + xi_246 * 5.0f + xi_247 * 5.0f + xi_256 * -4.0f + xi_260 * -7.0f + xi_263 * -7.0f + xi_264 * -7.0f + xi_265 * -7.0f);
-          const float xi_156 = xi_115 * -1.0f + xi_256;
-          const float xi_157 = omega_shear * (xi_121 + xi_156 + xi_16 + xi_2 + xi_246 * -1.0f + xi_250 + xi_6);
+          const float xi_115 = rho * (u_2 * u_2);
+          const float xi_116 = xi_114 + xi_115 * 0.66666666666666663f + xi_260 * 3.0f + xi_263 * 3.0f;
+          const float xi_117 = omega_even * (xi_112 * 0.66666666666666663f + xi_113 * 1.6666666666666667f + xi_116 + xi_247 * -3.0f + xi_251 * 3.0f + xi_252 * 3.0f + xi_253 * -3.0f + xi_254 * -3.0f + xi_264 * -3.0f);
+          const float xi_124 = omega_bulk * (xi_113 + xi_115 + xi_119 + xi_121 + xi_123 + xi_17 + xi_22 + xi_250);
+          const float xi_127 = omega_even * (xi_112 * 2.3333333333333335f + xi_116 + xi_126 + xi_245 * -5.0f + xi_251 * -2.0f + xi_252 * -2.0f + xi_257 * -5.0f + xi_259 * -5.0f + xi_262 * -5.0f);
+          const float xi_131 = omega_even * (xi_114 + xi_115 * 3.0f + xi_126 + xi_128 + xi_129 + xi_130 + xi_249 * -7.0f + xi_251 * 5.0f + xi_252 * 5.0f + xi_255 * -7.0f + xi_258 * -7.0f + xi_260 * -4.0f + xi_261 * -7.0f + xi_263 * -4.0f);
+          const float xi_156 = -xi_115 + xi_263;
+          const float xi_157 = omega_shear * (xi_121 + xi_156 + xi_16 + xi_2 - xi_251 + xi_259 + xi_6);
           const float xi_158 = xi_157 * 0.125f;
           const float xi_159 = xi_107 * -0.11904761904761904f + xi_131 * -0.01984126984126984f;
-          const float xi_160 = omega_shear * (xi_112 * -1.0f + xi_113 * 2.0f + xi_120 + xi_123 + xi_125 + xi_156 + xi_244 + xi_246 + xi_247 + xi_252 * -2.0f + xi_253 * -2.0f + xi_9);
+          const float xi_160 = omega_shear * (-xi_112 + xi_113 * 2.0f + xi_120 + xi_123 + xi_125 + xi_156 + xi_248 * -2.0f + xi_251 + xi_252 + xi_260 + xi_265 * -2.0f + xi_9);
           const float xi_162 = xi_160 * -0.041666666666666664f + xi_161 * -0.16666666666666666f;
           const float xi_163 = xi_108 * -0.10000000000000001f + xi_117 * -0.050000000000000003f + xi_162;
           const float xi_164 = xi_111 * 0.028571428571428571f + xi_127 * 0.014285714285714285f + xi_155 + xi_158 + xi_159 + xi_163;
           const float xi_176 = xi_111 * -0.071428571428571425f + xi_127 * -0.035714285714285712f + xi_159 + xi_160 * 0.083333333333333329f + xi_161 * 0.33333333333333331f;
-          const float xi_187 = xi_107 * 0.095238095238095233f + xi_111 * -0.042857142857142858f + xi_127 * -0.021428571428571429f + xi_131 * 0.015873015873015872f + xi_155 * -1.0f + xi_158 * -1.0f + xi_163;
+          const float xi_187 = xi_107 * 0.095238095238095233f + xi_111 * -0.042857142857142858f + xi_127 * -0.021428571428571429f + xi_131 * 0.015873015873015872f - xi_155 - xi_158 + xi_163;
           const float xi_190 = xi_157 * 0.0625f;
           const float xi_191 = xi_131 * 0.013888888888888888f;
           const float xi_193 = xi_110 * 0.083333333333333329f + xi_124 * 0.041666666666666664f;
@@ -726,25 +681,25 @@ static FUNC_PREFIX void collidesweepsingleprecisionthermalizedavx_collidesweepsi
           const float xi_195 = xi_165 + xi_189 + xi_190 + xi_191 + xi_192 + xi_194;
           const float xi_202 = xi_151 + xi_189 + xi_190 + xi_191 + xi_192 + xi_194;
           const float xi_204 = xi_127 * -0.0071428571428571426f;
-          const float xi_205 = xi_198 * (u_2 * xi_197 + xi_132 + xi_17 + xi_261);
+          const float xi_205 = xi_198 * (u_2 * xi_197 + xi_132 + xi_17 + xi_254);
           const float xi_206 = xi_117 * 0.025000000000000001f;
           const float xi_209 = xi_107 * -0.023809523809523808f + xi_131 * -0.003968253968253968f;
           const float xi_210 = xi_162 + xi_193 + xi_203 + xi_204 + xi_205 + xi_206 + xi_207 + xi_208 + xi_209;
-          const float xi_226 = xi_162 + xi_193 + xi_203 * -1.0f + xi_204 + xi_205 * -1.0f + xi_206 + xi_207 + xi_208 + xi_209;
-          const float xi_228 = xi_190 * -1.0f;
+          const float xi_226 = xi_162 + xi_193 - xi_203 + xi_204 - xi_205 + xi_206 + xi_207 + xi_208 + xi_209;
+          const float xi_228 = -xi_190;
           const float xi_229 = xi_127 * 0.017857142857142856f;
           const float xi_231 = xi_188 + xi_194 + xi_209 + xi_227 + xi_228 + xi_229 + xi_230;
-          const float xi_233 = xi_198 * (rho * u_0 * u_2 + xi_10 + xi_166 + xi_249);
-          const float xi_234 = xi_232 * -1.0f + xi_233 * -1.0f;
+          const float xi_233 = xi_198 * (rho * u_0 * u_2 + xi_10 + xi_166 + xi_257);
+          const float xi_234 = -xi_232 - xi_233;
           const float xi_240 = xi_232 + xi_233;
           const float xi_243 = xi_186 + xi_194 + xi_209 + xi_227 + xi_228 + xi_229 + xi_230;
-          const float forceTerm_0 = xi_25 * xi_28 + xi_25 * -1.0f + xi_26 * xi_28 + xi_26 * -1.0f + xi_27 * xi_28 + xi_27 * -1.0f;
-          const float forceTerm_1 = xi_29 + xi_31 * -1.0f + xi_41;
-          const float forceTerm_2 = xi_29 * -1.0f + xi_31 + xi_41;
-          const float forceTerm_3 = xi_42 * -1.0f + xi_44 + xi_48;
-          const float forceTerm_4 = xi_42 + xi_44 * -1.0f + xi_48;
-          const float forceTerm_5 = xi_49 + xi_51 * -1.0f + xi_52;
-          const float forceTerm_6 = xi_49 * -1.0f + xi_51 + xi_52;
+          const float forceTerm_0 = xi_25 * xi_28 - xi_25 + xi_26 * xi_28 - xi_26 + xi_27 * xi_28 - xi_27;
+          const float forceTerm_1 = xi_29 - xi_31 + xi_41;
+          const float forceTerm_2 = -xi_29 + xi_31 + xi_41;
+          const float forceTerm_3 = -xi_42 + xi_44 + xi_48;
+          const float forceTerm_4 = xi_42 - xi_44 + xi_48;
+          const float forceTerm_5 = xi_49 - xi_51 + xi_52;
+          const float forceTerm_6 = -xi_49 + xi_51 + xi_52;
           const float forceTerm_7 = xi_57 + xi_66 + xi_76;
           const float forceTerm_8 = xi_66 + xi_77 + xi_78;
           const float forceTerm_9 = xi_57 + xi_78 + xi_80;
@@ -757,25 +712,25 @@ static FUNC_PREFIX void collidesweepsingleprecisionthermalizedavx_collidesweepsi
           const float forceTerm_16 = xi_103 + xi_90 + xi_91;
           const float forceTerm_17 = xi_102 + xi_104 + xi_57;
           const float forceTerm_18 = xi_101 + xi_104 + xi_77;
-          _data_pdfs_20_30_10[ctr_0] = forceTerm_0 + xi_107 * 0.14285714285714285f + xi_108 * 0.20000000000000001f + xi_110 * -1.0f + xi_111 * 0.085714285714285715f + xi_117 * 0.10000000000000001f + xi_124 * -0.5f + xi_127 * 0.042857142857142858f + xi_131 * 0.023809523809523808f + xi_251;
-          _data_pdfs_20_31_10[ctr_0] = forceTerm_1 + xi_135 * -1.0f + xi_140 * -1.0f + xi_151 + xi_164 + xi_246;
-          _data_pdfs_20_32_10[ctr_0] = forceTerm_2 + xi_135 + xi_140 + xi_164 + xi_165 + xi_247;
-          _data_pdfs_20_33_10[ctr_0] = forceTerm_3 + xi_168 + xi_170 + xi_175 + xi_176 + xi_253;
-          _data_pdfs_20_34_10[ctr_0] = forceTerm_4 + xi_168 * -1.0f + xi_170 * -1.0f + xi_176 + xi_177 + xi_252;
-          _data_pdfs_20_35_10[ctr_0] = forceTerm_5 + xi_179 * -1.0f + xi_181 * -1.0f + xi_186 + xi_187 + xi_256;
-          _data_pdfs_20_36_10[ctr_0] = forceTerm_6 + xi_179 + xi_181 + xi_187 + xi_188 + xi_244;
-          _data_pdfs_20_37_10[ctr_0] = forceTerm_7 + xi_177 + xi_195 + xi_200 + xi_265;
-          _data_pdfs_20_38_10[ctr_0] = forceTerm_8 + xi_175 + xi_195 + xi_201 + xi_260;
-          _data_pdfs_20_39_10[ctr_0] = forceTerm_9 + xi_177 + xi_201 + xi_202 + xi_264;
-          _data_pdfs_20_310_10[ctr_0] = forceTerm_10 + xi_175 + xi_200 + xi_202 + xi_263;
-          _data_pdfs_20_311_10[ctr_0] = forceTerm_11 + xi_210 + xi_219 + xi_224 + xi_248;
-          _data_pdfs_20_312_10[ctr_0] = forceTerm_12 + xi_219 + xi_225 + xi_226 + xi_258;
-          _data_pdfs_20_313_10[ctr_0] = forceTerm_13 + xi_231 + xi_234 + xi_239 + xi_250;
-          _data_pdfs_20_314_10[ctr_0] = forceTerm_14 + xi_231 + xi_240 + xi_241 + xi_257;
-          _data_pdfs_20_315_10[ctr_0] = forceTerm_15 + xi_224 + xi_226 + xi_242 + xi_261;
-          _data_pdfs_20_316_10[ctr_0] = forceTerm_16 + xi_210 + xi_225 + xi_242 + xi_259;
-          _data_pdfs_20_317_10[ctr_0] = forceTerm_17 + xi_239 + xi_240 + xi_243 + xi_254;
-          _data_pdfs_20_318_10[ctr_0] = forceTerm_18 + xi_234 + xi_241 + xi_243 + xi_249;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + ctr_0] = forceTerm_0 + xi_107 * 0.14285714285714285f + xi_108 * 0.20000000000000001f - xi_110 + xi_111 * 0.085714285714285715f + xi_117 * 0.10000000000000001f + xi_124 * -0.5f + xi_127 * 0.042857142857142858f + xi_131 * 0.023809523809523808f + xi_250;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3 + ctr_0] = forceTerm_1 - xi_135 - xi_140 + xi_151 + xi_164 + xi_251;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3 + ctr_0] = forceTerm_2 + xi_135 + xi_140 + xi_164 + xi_165 + xi_252;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3 + ctr_0] = forceTerm_3 + xi_168 + xi_170 + xi_175 + xi_176 + xi_248;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3 + ctr_0] = forceTerm_4 - xi_168 - xi_170 + xi_176 + xi_177 + xi_265;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3 + ctr_0] = forceTerm_5 - xi_179 - xi_181 + xi_186 + xi_187 + xi_263;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3 + ctr_0] = forceTerm_6 + xi_179 + xi_181 + xi_187 + xi_188 + xi_260;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3 + ctr_0] = forceTerm_7 + xi_177 + xi_195 + xi_200 + xi_255;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3 + ctr_0] = forceTerm_8 + xi_175 + xi_195 + xi_201 + xi_249;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3 + ctr_0] = forceTerm_9 + xi_177 + xi_201 + xi_202 + xi_261;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3 + ctr_0] = forceTerm_10 + xi_175 + xi_200 + xi_202 + xi_258;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3 + ctr_0] = forceTerm_11 + xi_210 + xi_219 + xi_224 + xi_264;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3 + ctr_0] = forceTerm_12 + xi_219 + xi_225 + xi_226 + xi_253;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3 + ctr_0] = forceTerm_13 + xi_231 + xi_234 + xi_239 + xi_259;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3 + ctr_0] = forceTerm_14 + xi_231 + xi_240 + xi_241 + xi_262;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3 + ctr_0] = forceTerm_15 + xi_224 + xi_226 + xi_242 + xi_254;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3 + ctr_0] = forceTerm_16 + xi_210 + xi_225 + xi_242 + xi_247;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3 + ctr_0] = forceTerm_17 + xi_239 + xi_240 + xi_243 + xi_245;
+          _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3 + ctr_0] = forceTerm_18 + xi_234 + xi_241 + xi_243 + xi_257;
         }
       }
     }
@@ -784,40 +739,42 @@ static FUNC_PREFIX void collidesweepsingleprecisionthermalizedavx_collidesweepsi
 } // namespace internal_48c9ee502281a70505dce0378c55abd5
 
 void CollideSweepSinglePrecisionThermalizedAVX::run(IBlock *block) {
+  if (!this->configured_)
+    WALBERLA_ABORT("This Sweep contains a configure function that needs to be called manually")
+
   auto pdfs = block->getData<field::GhostLayerField<float, 19>>(pdfsID);
   auto force = block->getData<field::GhostLayerField<float, 3>>(forceID);
 
-  auto &time_step = this->time_step_;
+  auto &omega_shear = this->omega_shear_;
+  auto &omega_even = this->omega_even_;
+  auto &block_offset_1 = this->block_offset_1_;
   auto &kT = this->kT_;
-  auto &omega_odd = this->omega_odd_;
   auto &seed = this->seed_;
+  auto &time_step = this->time_step_;
+  auto &block_offset_2 = this->block_offset_2_;
+  auto &omega_odd = this->omega_odd_;
+  auto &block_offset_0 = this->block_offset_0_;
   auto &omega_bulk = this->omega_bulk_;
-  auto block_offset_0 = this->block_offset_0_;
-  auto &omega_shear = this->omega_shear_;
-  auto &omega_even = this->omega_even_;
-  auto block_offset_2 = this->block_offset_2_;
-  auto block_offset_1 = this->block_offset_1_;
-  block_offset_generator(block, block_offset_0, block_offset_1, block_offset_2);
-  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()))
   float *RESTRICT const _data_force = force->dataAt(0, 0, 0, 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()))
   float *RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(force->xSize()) + 0));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(force->xSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(force->ySize()) + 0));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(force->ySize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(force->zSize()) + 0));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(force->zSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(force->xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(force->xSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(force->ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(force->ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(force->zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(force->zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
   const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
@@ -828,6 +785,9 @@ void CollideSweepSinglePrecisionThermalizedAVX::run(IBlock *block) {
 }
 
 void CollideSweepSinglePrecisionThermalizedAVX::runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block) {
+  if (!this->configured_)
+    WALBERLA_ABORT("This Sweep contains a configure function that needs to be called manually")
+
   CellInterval ci = globalCellInterval;
   CellInterval blockBB = blocks->getBlockCellBB(*block);
   blockBB.expand(ghostLayers);
@@ -839,41 +799,40 @@ void CollideSweepSinglePrecisionThermalizedAVX::runOnCellInterval(const shared_p
   auto pdfs = block->getData<field::GhostLayerField<float, 19>>(pdfsID);
   auto force = block->getData<field::GhostLayerField<float, 3>>(forceID);
 
-  auto &time_step = this->time_step_;
+  auto &omega_shear = this->omega_shear_;
+  auto &omega_even = this->omega_even_;
+  auto &block_offset_1 = this->block_offset_1_;
   auto &kT = this->kT_;
-  auto &omega_odd = this->omega_odd_;
   auto &seed = this->seed_;
+  auto &time_step = this->time_step_;
+  auto &block_offset_2 = this->block_offset_2_;
+  auto &omega_odd = this->omega_odd_;
+  auto &block_offset_0 = this->block_offset_0_;
   auto &omega_bulk = this->omega_bulk_;
-  auto block_offset_0 = this->block_offset_0_;
-  auto &omega_shear = this->omega_shear_;
-  auto &omega_even = this->omega_even_;
-  auto block_offset_2 = this->block_offset_2_;
-  auto block_offset_1 = this->block_offset_1_;
-  block_offset_generator(block, block_offset_0, block_offset_1, block_offset_2);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()))
   float *RESTRICT const _data_force = force->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()))
   float *RESTRICT _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(ci.xSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(ci.ySize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(ci.zSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(ci.xSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(ci.ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(ci.zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
   const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
@@ -892,4 +851,4 @@ void CollideSweepSinglePrecisionThermalizedAVX::runOnCellInterval(const shared_p
 
 #if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
 #pragma warning pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedAVX.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedAVX.h
index 36af5562973..b5955f045db 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedAVX.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedAVX.h
@@ -17,12 +17,13 @@
 //! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2,
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
 // lbmpy_walberla/pystencils_walberla from waLBerla commit
-// 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #pragma once
 #include "core/DataTypes.h"
+#include "core/logging/Logging.h"
 
 #include "domain_decomposition/BlockDataID.h"
 #include "domain_decomposition/IBlock.h"
@@ -51,16 +52,15 @@ namespace pystencils {
 
 class CollideSweepSinglePrecisionThermalizedAVX {
 public:
-  CollideSweepSinglePrecisionThermalizedAVX(
-      BlockDataID forceID_, BlockDataID pdfsID_, uint32_t block_offset_0,
-      uint32_t block_offset_1, uint32_t block_offset_2, float kT,
-      float omega_bulk, float omega_even, float omega_odd, float omega_shear,
-      uint32_t seed, uint32_t time_step)
-      : forceID(forceID_), pdfsID(pdfsID_), block_offset_0_(block_offset_0),
-        block_offset_1_(block_offset_1), block_offset_2_(block_offset_2),
-        kT_(kT), omega_bulk_(omega_bulk), omega_even_(omega_even),
-        omega_odd_(omega_odd), omega_shear_(omega_shear), seed_(seed),
-        time_step_(time_step){};
+  CollideSweepSinglePrecisionThermalizedAVX(BlockDataID forceID_,
+                                            BlockDataID pdfsID_, float kT,
+                                            float omega_bulk, float omega_even,
+                                            float omega_odd, float omega_shear,
+                                            uint32_t seed, uint32_t time_step)
+      : forceID(forceID_), pdfsID(pdfsID_), kT_(kT), omega_bulk_(omega_bulk),
+        omega_even_(omega_even), omega_odd_(omega_odd),
+        omega_shear_(omega_shear), seed_(seed), time_step_(time_step),
+        configured_(false){};
 
   void run(IBlock *block);
 
@@ -97,6 +97,15 @@ class CollideSweepSinglePrecisionThermalizedAVX {
     };
   }
 
+  void configure(const shared_ptr<StructuredBlockStorage> &blocks,
+                 IBlock *block) {
+    Cell BlockCellBB = blocks->getBlockCellBB(*block).min();
+    block_offset_0_ = uint32_t(BlockCellBB[0]);
+    block_offset_1_ = uint32_t(BlockCellBB[1]);
+    block_offset_2_ = uint32_t(BlockCellBB[2]);
+    configured_ = true;
+  }
+
   BlockDataID forceID;
   BlockDataID pdfsID;
   uint32_t block_offset_0_;
@@ -109,9 +118,7 @@ class CollideSweepSinglePrecisionThermalizedAVX {
   float omega_shear_;
   uint32_t seed_;
   uint32_t time_step_;
-  std::function<void(IBlock *, uint32_t &, uint32_t &, uint32_t &)>
-      block_offset_generator =
-          [](IBlock *const, uint32_t &, uint32_t &, uint32_t &) {};
+  bool configured_;
 };
 
 } // namespace pystencils
@@ -120,4 +127,4 @@ class CollideSweepSinglePrecisionThermalizedAVX {
 #if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
     (defined WALBERLA_CXX_COMPILER_IS_CLANG)
 #pragma GCC diagnostic pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedCUDA.cu b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedCUDA.cu
new file mode 100644
index 00000000000..3369e5cf3be
--- /dev/null
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedCUDA.cu
@@ -0,0 +1,514 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file CollideSweepSinglePrecisionThermalizedCUDA.cu
+//! \author pystencils
+//======================================================================================================================
+
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
+
+#include <cmath>
+
+#include "CollideSweepSinglePrecisionThermalizedCUDA.h"
+#include "core/DataTypes.h"
+#include "core/Macros.h"
+
+#include "philox_rand.h"
+
+#define FUNC_PREFIX __global__
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) || (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wfloat-equal"
+#pragma GCC diagnostic ignored "-Wshadow"
+#pragma GCC diagnostic ignored "-Wconversion"
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#endif
+
+#if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
+#pragma warning push
+#pragma warning(disable : 1599)
+#endif
+
+using namespace std;
+
+namespace walberla {
+namespace pystencils {
+
+namespace internal_collidesweepsingleprecisionthermalizedcuda_collidesweepsingleprecisionthermalizedcuda { // holds the generated kernel launched by CollideSweepSinglePrecisionThermalizedCUDA::run()
+static FUNC_PREFIX __launch_bounds__(256) void collidesweepsingleprecisionthermalizedcuda_collidesweepsingleprecisionthermalizedcuda(float *RESTRICT const _data_force, float *RESTRICT _data_pdfs, int64_t const _size_force_0, int64_t const _size_force_1, int64_t const _size_force_2, int64_t const _stride_force_0, int64_t const _stride_force_1, int64_t const _stride_force_2, int64_t const _stride_force_3, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, uint32_t block_offset_0, uint32_t block_offset_1, uint32_t block_offset_2, float kT, float omega_bulk, float omega_even, float omega_odd, float omega_shear, uint32_t seed, uint32_t time_step) { // each thread handles one (ctr_0, ctr_1, ctr_2) index: reads 19 pdf values and 3 force values, then overwrites the 19 pdf values in place
+  if (blockDim.x * blockIdx.x + threadIdx.x < _size_force_0 && blockDim.y * blockIdx.y + threadIdx.y < _size_force_1 && blockDim.z * blockIdx.z + threadIdx.z < _size_force_2) { // bounds guard: threads whose index lies outside the _size_force_* extents do nothing
+    const int64_t ctr_0 = blockDim.x * blockIdx.x + threadIdx.x; // global thread index along axis 0
+    const int64_t ctr_1 = blockDim.y * blockIdx.y + threadIdx.y; // global thread index along axis 1
+    const int64_t ctr_2 = blockDim.z * blockIdx.z + threadIdx.z; // global thread index along axis 2
+    const float xi_244 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2]; // load phase: all inputs are read into xi_244..xi_265 before anything is written back
+    const float xi_245 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3];
+    const float xi_246 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3];
+    const float xi_247 = _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3]; // force component 2
+    const float xi_248 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3];
+    const float xi_249 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3];
+    const float xi_250 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3];
+    const float xi_251 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3];
+    const float xi_252 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3];
+    const float xi_253 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3];
+    const float xi_254 = _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3]; // force component 1
+    const float xi_255 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3];
+    const float xi_256 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3];
+    const float xi_257 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3];
+    const float xi_258 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3];
+    const float xi_259 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3];
+    const float xi_260 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3];
+    const float xi_261 = _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2]; // force component 0
+    const float xi_262 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3];
+    const float xi_263 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3];
+    const float xi_264 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3];
+    const float xi_265 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3];
+
+    float random_3_0{}; // the 16 random_* floats are value-initialised to 0 and only filled when kT > 0
+    float random_3_1{};
+    float random_3_2{};
+    float random_3_3{};
+    if (kT > 0.) { // the literal 0. is a double, so kT is promoted to double for this comparison
+      philox_float4(time_step, block_offset_0 + ctr_0, block_offset_1 + ctr_1, block_offset_2 + ctr_2, 3, seed, random_3_0, random_3_1, random_3_2, random_3_3); // inputs: time step, block offset + thread index per axis, the constant 3, seed; semantics of philox_float4 live in philox_rand.h (not shown)
+    }
+
+    float random_2_0{};
+    float random_2_1{};
+    float random_2_2{};
+    float random_2_3{};
+    if (kT > 0.) {
+      philox_float4(time_step, block_offset_0 + ctr_0, block_offset_1 + ctr_1, block_offset_2 + ctr_2, 2, seed, random_2_0, random_2_1, random_2_2, random_2_3); // same inputs as above except for the constant 2
+    }
+
+    float random_1_0{};
+    float random_1_1{};
+    float random_1_2{};
+    float random_1_3{};
+    if (kT > 0.) {
+      philox_float4(time_step, block_offset_0 + ctr_0, block_offset_1 + ctr_1, block_offset_2 + ctr_2, 1, seed, random_1_0, random_1_1, random_1_2, random_1_3); // same inputs as above except for the constant 1
+    }
+
+    float random_0_0{};
+    float random_0_1{};
+    float random_0_2{};
+    float random_0_3{};
+    if (kT > 0.) {
+      philox_float4(time_step, block_offset_0 + ctr_0, block_offset_1 + ctr_1, block_offset_2 + ctr_2, 0, seed, random_0_0, random_0_1, random_0_2, random_0_3); // same inputs as above except for the constant 0
+    }
+    const float xi_2 = xi_249 + xi_252; // from here on: generated common subexpressions, evaluated in dependency order
+    const float xi_3 = xi_2 + xi_257;
+    const float xi_4 = xi_245 + xi_246 + xi_259;
+    const float xi_5 = xi_248 + xi_256;
+    const float xi_6 = xi_260 + xi_263;
+    const float xi_8 = -xi_251;
+    const float xi_9 = -xi_265;
+    const float xi_10 = -xi_260;
+    const float xi_11 = -xi_258;
+    const float xi_12 = -xi_264;
+    const float xi_13 = xi_10 + xi_11 + xi_12;
+    const float xi_14 = -xi_250;
+    const float xi_15 = -xi_262;
+    const float xi_16 = xi_14 + xi_15;
+    const float xi_17 = -xi_255;
+    const float xi_18 = -xi_256;
+    const float xi_19 = xi_17 + xi_18;
+    const float xi_20 = -xi_249;
+    const float xi_21 = xi_10 + xi_20;
+    const float xi_22 = -xi_245;
+    const float xi_23 = -xi_263;
+    const float xi_24 = xi_17 + xi_22 + xi_23 + xi_246;
+    const float xi_28 = omega_bulk * 0.5f;
+    const float xi_29 = xi_254 * 0.16666666666666666f;
+    const float xi_30 = xi_254 * 0.083333333333333329f;
+    const float xi_42 = xi_261 * 0.16666666666666666f;
+    const float xi_43 = xi_261 * 0.083333333333333329f;
+    const float xi_49 = xi_247 * 0.16666666666666666f;
+    const float xi_50 = xi_247 * 0.083333333333333329f;
+    const float xi_55 = omega_shear * 0.041666666666666664f;
+    const float xi_60 = omega_bulk * 0.041666666666666664f;
+    const float xi_67 = xi_254 * 0.25f;
+    const float xi_71 = omega_shear * 0.125f;
+    const float xi_72 = xi_254 * xi_71;
+    const float xi_109 = 2.4494897427831779f;
+    const float xi_114 = -xi_244;
+    const float xi_118 = -xi_246;
+    const float xi_119 = xi_118 + xi_18;
+    const float xi_120 = -xi_253 + xi_8;
+    const float xi_122 = -xi_252;
+    const float xi_123 = xi_11 + xi_122 + xi_15 + xi_21;
+    const float xi_125 = xi_245 * 2.0f + xi_246 * 2.0f + xi_255 * 2.0f + xi_256 * 2.0f;
+    const float xi_126 = xi_125 + xi_257 * 5.0f + xi_264 * 5.0f;
+    const float xi_128 = xi_258 * 2.0f;
+    const float xi_129 = xi_252 * 2.0f;
+    const float xi_130 = xi_249 * 2.0f + xi_260 * 2.0f;
+    const float xi_132 = xi_118 + xi_256;
+    const float xi_133 = xi_132 + xi_14 + xi_22 + xi_255 + xi_259;
+    const float xi_134 = omega_odd * 0.25f;
+    const float xi_135 = xi_133 * xi_134;
+    const float xi_136 = random_2_3 - 0.5f; // random values are shifted by -0.5 wherever they are used; presumably to centre a [0, 1) sample around zero — confirm range in philox_rand.h
+    const float xi_141 = xi_265 * 2.0f;
+    const float xi_142 = xi_262 * 2.0f;
+    const float xi_143 = xi_251 * 2.0f + xi_253 * -2.0f;
+    const float xi_144 = xi_14 - xi_141 + xi_142 + xi_143 + xi_19 + xi_4;
+    const float xi_145 = omega_odd * 0.083333333333333329f;
+    const float xi_146 = xi_144 * xi_145;
+    const float xi_147 = random_1_2 - 0.5f;
+    const float xi_152 = random_0_1 - 0.5f;
+    const float xi_166 = xi_122 + xi_258;
+    const float xi_167 = xi_12 + xi_166 + xi_20 + xi_257 + xi_260;
+    const float xi_168 = xi_134 * xi_167;
+    const float xi_169 = random_2_1 - 0.5f;
+    const float xi_171 = xi_13 + xi_141 - xi_142 + xi_143 + xi_3;
+    const float xi_172 = xi_145 * xi_171;
+    const float xi_173 = random_2_0 - 0.5f;
+    const float xi_178 = xi_119 + xi_23 + xi_245 + xi_248 + xi_255;
+    const float xi_179 = xi_134 * xi_178;
+    const float xi_180 = random_2_2 - 0.5f;
+    const float xi_182 = -xi_128 - xi_129 + xi_130 + xi_24 + xi_5;
+    const float xi_183 = xi_145 * xi_182;
+    const float xi_184 = random_1_3 - 0.5f;
+    const float xi_198 = omega_shear * 0.25f;
+    const float xi_211 = omega_odd * 0.041666666666666664f;
+    const float xi_212 = xi_182 * xi_211;
+    const float xi_213 = omega_odd * 0.125f;
+    const float xi_214 = xi_178 * xi_213;
+    const float xi_220 = xi_144 * xi_211;
+    const float xi_221 = xi_133 * xi_213;
+    const float xi_235 = xi_167 * xi_213;
+    const float xi_236 = xi_171 * xi_211;
+    const float rr_0 = 0.0f; // fixed at zero, so every product with rr_0 below is zero for finite operands
+    const float xi_31 = rr_0 * xi_30;
+    const float xi_44 = rr_0 * xi_43;
+    const float xi_51 = rr_0 * xi_50;
+    const float xi_53 = rr_0 * 0.041666666666666664f;
+    const float xi_54 = xi_261 * xi_53;
+    const float xi_59 = xi_254 * xi_53;
+    const float xi_81 = xi_247 * xi_53;
+    const float vel0Term = xi_253 + xi_262 + xi_3;
+    const float vel1Term = xi_265 + xi_4;
+    const float vel2Term = xi_258 + xi_5;
+    const float rho = vel0Term + vel1Term + vel2Term + xi_244 + xi_250 + xi_251 + xi_255 + xi_264 + xi_6; // expands to the sum of all 19 loaded pdf values
+    const float xi_105 = kT * rho;
+    const float xi_106 = powf(xi_105 * (1.0f - (-omega_even + 1.0f) * (-omega_even + 1.0f)), 0.5f); // square root written as powf(x, 0.5f); same pattern repeats for omega_bulk, omega_odd and omega_shear
+    const float xi_107 = xi_106 * (random_3_0 - 0.5f) * 3.7416573867739413f;
+    const float xi_108 = xi_106 * (random_3_2 - 0.5f) * 5.4772255750516612f;
+    const float xi_110 = xi_109 * (random_1_1 - 0.5f) * powf(xi_105 * (1.0f - (-omega_bulk + 1.0f) * (-omega_bulk + 1.0f)), 0.5f);
+    const float xi_111 = xi_106 * (random_3_1 - 0.5f) * 8.3666002653407556f;
+    const float xi_137 = powf(xi_105 * (1.0f - (-omega_odd + 1.0f) * (-omega_odd + 1.0f)), 0.5f);
+    const float xi_138 = xi_137 * 1.4142135623730951f;
+    const float xi_139 = xi_138 * 0.5f;
+    const float xi_140 = xi_136 * xi_139;
+    const float xi_148 = xi_109 * xi_137;
+    const float xi_149 = xi_148 * 0.16666666666666666f;
+    const float xi_150 = xi_147 * xi_149;
+    const float xi_151 = -xi_146 - xi_150;
+    const float xi_153 = powf(xi_105 * (1.0f - (-omega_shear + 1.0f) * (-omega_shear + 1.0f)), 0.5f);
+    const float xi_154 = xi_153 * 0.5f;
+    const float xi_155 = xi_152 * xi_154;
+    const float xi_161 = xi_153 * (random_0_0 - 0.5f) * 1.7320508075688772f;
+    const float xi_165 = xi_146 + xi_150;
+    const float xi_170 = xi_139 * xi_169;
+    const float xi_174 = xi_149 * xi_173;
+    const float xi_175 = xi_172 + xi_174;
+    const float xi_177 = -xi_172 - xi_174;
+    const float xi_181 = xi_139 * xi_180;
+    const float xi_185 = xi_149 * xi_184;
+    const float xi_186 = -xi_183 - xi_185;
+    const float xi_188 = xi_183 + xi_185;
+    const float xi_189 = xi_152 * xi_153 * 0.25f;
+    const float xi_192 = xi_107 * 0.083333333333333329f;
+    const float xi_196 = xi_154 * (random_0_2 - 0.5f);
+    const float xi_203 = xi_154 * (random_1_0 - 0.5f);
+    const float xi_207 = xi_111 * -0.014285714285714285f;
+    const float xi_208 = xi_108 * 0.050000000000000003f;
+    const float xi_215 = xi_148 * 0.083333333333333329f;
+    const float xi_216 = xi_184 * xi_215;
+    const float xi_217 = xi_138 * 0.25f;
+    const float xi_218 = xi_180 * xi_217;
+    const float xi_219 = -xi_212 + xi_214 - xi_216 + xi_218;
+    const float xi_222 = xi_147 * xi_215;
+    const float xi_223 = xi_136 * xi_217;
+    const float xi_224 = -xi_220 + xi_221 - xi_222 + xi_223;
+    const float xi_225 = xi_220 - xi_221 + xi_222 - xi_223;
+    const float xi_227 = -xi_189;
+    const float xi_230 = xi_111 * 0.035714285714285712f;
+    const float xi_232 = xi_154 * (random_0_3 - 0.5f);
+    const float xi_237 = xi_169 * xi_217;
+    const float xi_238 = xi_173 * xi_215;
+    const float xi_239 = -xi_235 + xi_236 - xi_237 + xi_238;
+    const float xi_241 = xi_235 - xi_236 + xi_237 - xi_238;
+    const float xi_242 = xi_212 - xi_214 + xi_216 - xi_218;
+    const float xi_0 = ((1.0f) / (rho)); // reciprocal of rho; there is no guard against rho == 0
+    const float xi_7 = xi_0 * 0.5f;
+    const float u_0 = xi_0 * (vel0Term + xi_13 + xi_8 + xi_9) + xi_261 * xi_7; // second summand is force component 0 times 0.5 / rho
+    const float xi_25 = u_0 * xi_261;
+    const float xi_37 = xi_25 * 0.16666666666666666f;
+    const float xi_38 = xi_25 * 0.083333333333333329f;
+    const float xi_39 = omega_shear * xi_38;
+    const float xi_40 = -xi_37 + xi_39;
+    const float xi_56 = -xi_25 * xi_55 + xi_37;
+    const float xi_57 = -xi_43 + xi_54 + xi_56;
+    const float xi_61 = -xi_25 * xi_60;
+    const float xi_68 = u_0 * xi_67;
+    const float xi_73 = u_0 * xi_72;
+    const float xi_77 = xi_43 - xi_54 + xi_56;
+    const float xi_84 = -xi_38;
+    const float xi_95 = u_0 * xi_247;
+    const float xi_96 = xi_95 * 0.25f;
+    const float xi_99 = xi_71 * xi_95;
+    const float xi_113 = rho * (u_0 * u_0);
+    const float u_1 = xi_0 * (vel1Term + xi_16 + xi_19 + xi_253 + xi_8) + xi_254 * xi_7; // second summand is force component 1 times 0.5 / rho
+    const float xi_26 = u_1 * xi_254;
+    const float xi_32 = xi_26 * 0.16666666666666666f;
+    const float xi_45 = xi_26 * 0.083333333333333329f;
+    const float xi_46 = omega_shear * xi_45;
+    const float xi_47 = -xi_32 + xi_46;
+    const float xi_62 = -xi_26 * xi_60;
+    const float xi_69 = u_1 * 0.25f;
+    const float xi_70 = xi_261 * xi_69;
+    const float xi_74 = u_1 * xi_71;
+    const float xi_75 = xi_261 * xi_74;
+    const float xi_76 = -xi_68 - xi_70 + xi_73 + xi_75;
+    const float xi_78 = xi_68 + xi_70 - xi_73 - xi_75;
+    const float xi_86 = xi_247 * xi_69;
+    const float xi_88 = xi_247 * xi_74;
+    const float xi_93 = -xi_45;
+    const float xi_112 = rho * (u_1 * u_1);
+    const float xi_121 = xi_112 + xi_120 + xi_9;
+    const float xi_197 = rho * u_1;
+    const float xi_199 = xi_198 * (u_0 * xi_197 + xi_120 + xi_262 + xi_265);
+    const float xi_200 = -xi_196 - xi_199;
+    const float xi_201 = xi_196 + xi_199;
+    const float u_2 = xi_0 * (vel2Term + xi_21 + xi_24 + xi_252) + xi_247 * xi_7; // second summand is force component 2 times 0.5 / rho
+    const float xi_27 = u_2 * xi_247;
+    const float xi_33 = xi_27 * 0.16666666666666666f;
+    const float xi_34 = xi_27 * 0.083333333333333329f;
+    const float xi_35 = omega_shear * xi_34;
+    const float xi_36 = -xi_33 + xi_35;
+    const float xi_41 = -omega_shear * xi_32 + xi_26 * 0.33333333333333331f + xi_36 + xi_40;
+    const float xi_48 = -omega_shear * xi_37 + xi_25 * 0.33333333333333331f + xi_36 + xi_47;
+    const float xi_52 = -omega_shear * xi_33 + xi_27 * 0.33333333333333331f + xi_40 + xi_47;
+    const float xi_58 = -xi_34;
+    const float xi_63 = -xi_27 * xi_60;
+    const float xi_64 = -xi_26 * xi_55 + xi_32 + xi_61 + xi_62 + xi_63;
+    const float xi_65 = xi_30 - xi_59 + xi_64;
+    const float xi_66 = xi_35 + xi_58 + xi_65;
+    const float xi_79 = -xi_30 + xi_59 + xi_64;
+    const float xi_80 = xi_35 + xi_58 + xi_79;
+    const float xi_82 = -xi_27 * xi_55 + xi_33;
+    const float xi_83 = xi_50 - xi_81 + xi_82;
+    const float xi_85 = xi_39 + xi_65 + xi_84;
+    const float xi_87 = u_2 * xi_67;
+    const float xi_89 = u_2 * xi_72;
+    const float xi_90 = xi_86 + xi_87 - xi_88 - xi_89;
+    const float xi_91 = xi_39 + xi_79 + xi_84;
+    const float xi_92 = -xi_86 - xi_87 + xi_88 + xi_89;
+    const float xi_94 = xi_46 + xi_61 + xi_62 + xi_63 + xi_83 + xi_93;
+    const float xi_97 = u_2 * xi_261;
+    const float xi_98 = xi_97 * 0.25f;
+    const float xi_100 = xi_71 * xi_97;
+    const float xi_101 = xi_100 - xi_96 - xi_98 + xi_99;
+    const float xi_102 = -xi_100 + xi_96 + xi_98 - xi_99;
+    const float xi_103 = -xi_50 + xi_81 + xi_82;
+    const float xi_104 = xi_103 + xi_46 + xi_61 + xi_62 + xi_63 + xi_93;
+    const float xi_115 = rho * (u_2 * u_2);
+    const float xi_116 = xi_114 + xi_115 * 0.66666666666666663f + xi_248 * 3.0f + xi_263 * 3.0f;
+    const float xi_117 = omega_even * (xi_112 * 0.66666666666666663f + xi_113 * 1.6666666666666667f + xi_116 + xi_245 * -3.0f + xi_246 * -3.0f + xi_250 * 3.0f + xi_255 * -3.0f + xi_256 * -3.0f + xi_259 * 3.0f);
+    const float xi_124 = omega_bulk * (xi_113 + xi_115 + xi_119 + xi_121 + xi_123 + xi_17 + xi_22 + xi_244);
+    const float xi_127 = omega_even * (xi_112 * 2.3333333333333335f + xi_116 + xi_126 + xi_249 * -5.0f + xi_250 * -2.0f + xi_252 * -5.0f + xi_258 * -5.0f + xi_259 * -2.0f + xi_260 * -5.0f);
+    const float xi_131 = omega_even * (xi_114 + xi_115 * 3.0f + xi_126 + xi_128 + xi_129 + xi_130 + xi_248 * -4.0f + xi_250 * 5.0f + xi_251 * -7.0f + xi_253 * -7.0f + xi_259 * 5.0f + xi_262 * -7.0f + xi_263 * -4.0f + xi_265 * -7.0f);
+    const float xi_156 = -xi_115 + xi_248;
+    const float xi_157 = omega_shear * (xi_121 + xi_156 + xi_16 + xi_2 + xi_258 - xi_259 + xi_6);
+    const float xi_158 = xi_157 * 0.125f;
+    const float xi_159 = xi_107 * -0.11904761904761904f + xi_131 * -0.01984126984126984f;
+    const float xi_160 = omega_shear * (-xi_112 + xi_113 * 2.0f + xi_120 + xi_123 + xi_125 + xi_156 + xi_250 + xi_257 * -2.0f + xi_259 + xi_263 + xi_264 * -2.0f + xi_9);
+    const float xi_162 = xi_160 * -0.041666666666666664f + xi_161 * -0.16666666666666666f;
+    const float xi_163 = xi_108 * -0.10000000000000001f + xi_117 * -0.050000000000000003f + xi_162;
+    const float xi_164 = xi_111 * 0.028571428571428571f + xi_127 * 0.014285714285714285f + xi_155 + xi_158 + xi_159 + xi_163;
+    const float xi_176 = xi_111 * -0.071428571428571425f + xi_127 * -0.035714285714285712f + xi_159 + xi_160 * 0.083333333333333329f + xi_161 * 0.33333333333333331f;
+    const float xi_187 = xi_107 * 0.095238095238095233f + xi_111 * -0.042857142857142858f + xi_127 * -0.021428571428571429f + xi_131 * 0.015873015873015872f - xi_155 - xi_158 + xi_163;
+    const float xi_190 = xi_157 * 0.0625f;
+    const float xi_191 = xi_131 * 0.013888888888888888f;
+    const float xi_193 = xi_110 * 0.083333333333333329f + xi_124 * 0.041666666666666664f;
+    const float xi_194 = xi_160 * 0.020833333333333332f + xi_161 * 0.083333333333333329f + xi_193;
+    const float xi_195 = xi_165 + xi_189 + xi_190 + xi_191 + xi_192 + xi_194;
+    const float xi_202 = xi_151 + xi_189 + xi_190 + xi_191 + xi_192 + xi_194;
+    const float xi_204 = xi_127 * -0.0071428571428571426f;
+    const float xi_205 = xi_198 * (u_2 * xi_197 + xi_132 + xi_17 + xi_245);
+    const float xi_206 = xi_117 * 0.025000000000000001f;
+    const float xi_209 = xi_107 * -0.023809523809523808f + xi_131 * -0.003968253968253968f;
+    const float xi_210 = xi_162 + xi_193 + xi_203 + xi_204 + xi_205 + xi_206 + xi_207 + xi_208 + xi_209;
+    const float xi_226 = xi_162 + xi_193 - xi_203 + xi_204 - xi_205 + xi_206 + xi_207 + xi_208 + xi_209;
+    const float xi_228 = -xi_190;
+    const float xi_229 = xi_127 * 0.017857142857142856f;
+    const float xi_231 = xi_188 + xi_194 + xi_209 + xi_227 + xi_228 + xi_229 + xi_230;
+    const float xi_233 = xi_198 * (rho * u_0 * u_2 + xi_10 + xi_166 + xi_249);
+    const float xi_234 = -xi_232 - xi_233;
+    const float xi_240 = xi_232 + xi_233;
+    const float xi_243 = xi_186 + xi_194 + xi_209 + xi_227 + xi_228 + xi_229 + xi_230;
+    const float forceTerm_0 = xi_25 * xi_28 - xi_25 + xi_26 * xi_28 - xi_26 + xi_27 * xi_28 - xi_27; // forceTerm_0..18: one term per pdf slot, added in the stores below
+    const float forceTerm_1 = xi_29 - xi_31 + xi_41;
+    const float forceTerm_2 = -xi_29 + xi_31 + xi_41;
+    const float forceTerm_3 = -xi_42 + xi_44 + xi_48;
+    const float forceTerm_4 = xi_42 - xi_44 + xi_48;
+    const float forceTerm_5 = xi_49 - xi_51 + xi_52;
+    const float forceTerm_6 = -xi_49 + xi_51 + xi_52;
+    const float forceTerm_7 = xi_57 + xi_66 + xi_76;
+    const float forceTerm_8 = xi_66 + xi_77 + xi_78;
+    const float forceTerm_9 = xi_57 + xi_78 + xi_80;
+    const float forceTerm_10 = xi_76 + xi_77 + xi_80;
+    const float forceTerm_11 = xi_83 + xi_85 + xi_90;
+    const float forceTerm_12 = xi_83 + xi_91 + xi_92;
+    const float forceTerm_13 = xi_101 + xi_57 + xi_94;
+    const float forceTerm_14 = xi_102 + xi_77 + xi_94;
+    const float forceTerm_15 = xi_103 + xi_85 + xi_92;
+    const float forceTerm_16 = xi_103 + xi_90 + xi_91;
+    const float forceTerm_17 = xi_102 + xi_104 + xi_57;
+    const float forceTerm_18 = xi_101 + xi_104 + xi_77;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2] = forceTerm_0 + xi_107 * 0.14285714285714285f + xi_108 * 0.20000000000000001f - xi_110 + xi_111 * 0.085714285714285715f + xi_117 * 0.10000000000000001f + xi_124 * -0.5f + xi_127 * 0.042857142857142858f + xi_131 * 0.023809523809523808f + xi_244; // store phase: each slot is overwritten with its previously loaded value plus the terms computed above
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3] = forceTerm_1 - xi_135 - xi_140 + xi_151 + xi_164 + xi_259;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3] = forceTerm_2 + xi_135 + xi_140 + xi_164 + xi_165 + xi_250;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3] = forceTerm_3 + xi_168 + xi_170 + xi_175 + xi_176 + xi_264;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3] = forceTerm_4 - xi_168 - xi_170 + xi_176 + xi_177 + xi_257;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3] = forceTerm_5 - xi_179 - xi_181 + xi_186 + xi_187 + xi_248;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3] = forceTerm_6 + xi_179 + xi_181 + xi_187 + xi_188 + xi_263;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3] = forceTerm_7 + xi_177 + xi_195 + xi_200 + xi_265;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3] = forceTerm_8 + xi_175 + xi_195 + xi_201 + xi_253;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3] = forceTerm_9 + xi_177 + xi_201 + xi_202 + xi_251;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3] = forceTerm_10 + xi_175 + xi_200 + xi_202 + xi_262;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3] = forceTerm_11 + xi_210 + xi_219 + xi_224 + xi_246;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3] = forceTerm_12 + xi_219 + xi_225 + xi_226 + xi_256;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3] = forceTerm_13 + xi_231 + xi_234 + xi_239 + xi_258;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3] = forceTerm_14 + xi_231 + xi_240 + xi_241 + xi_252;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3] = forceTerm_15 + xi_224 + xi_226 + xi_242 + xi_245;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3] = forceTerm_16 + xi_210 + xi_225 + xi_242 + xi_255;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3] = forceTerm_17 + xi_239 + xi_240 + xi_243 + xi_260;
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3] = forceTerm_18 + xi_234 + xi_241 + xi_243 + xi_249;
+  }
+}
+} // namespace internal_collidesweepsingleprecisionthermalizedcuda_collidesweepsingleprecisionthermalizedcuda
+
+void CollideSweepSinglePrecisionThermalizedCUDA::run(IBlock *block, gpuStream_t stream) {
+  if (!this->configured_)
+    WALBERLA_ABORT("This Sweep contains a configure function that needs to be called manually")
+
+  auto pdfs = block->getData<gpu::GPUField<float>>(pdfsID);
+  auto force = block->getData<gpu::GPUField<float>>(forceID);
+
+  auto &kT = this->kT_;
+  auto &block_offset_1 = this->block_offset_1_;
+  auto &omega_bulk = this->omega_bulk_;
+  auto &omega_odd = this->omega_odd_;
+  auto &omega_even = this->omega_even_;
+  auto &block_offset_0 = this->block_offset_0_;
+  auto &block_offset_2 = this->block_offset_2_;
+  auto &omega_shear = this->omega_shear_;
+  auto &seed = this->seed_;
+  auto &time_step = this->time_step_;
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()))
+  float *RESTRICT const _data_force = force->dataAt(0, 0, 0, 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()))
+  float *RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(force->xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(force->xSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(force->ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(force->ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(force->zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(force->zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  const int64_t _stride_force_0 = int64_t(force->xStride());
+  const int64_t _stride_force_1 = int64_t(force->yStride());
+  const int64_t _stride_force_2 = int64_t(force->zStride());
+  const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
+  const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
+  const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
+  const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
+  const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
+  dim3 _block(uint32_c(((128 < _size_force_0) ? 128 : _size_force_0)), uint32_c(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))), uint32_c(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))))));
+  dim3 _grid(uint32_c(((_size_force_0) % (((128 < _size_force_0) ? 128 : _size_force_0)) == 0 ? (int64_t)(_size_force_0) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)) : ((int64_t)(_size_force_0) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))) + 1)), uint32_c(((_size_force_1) % (((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))) == 0 ? (int64_t)(_size_force_1) / (int64_t)(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))) : ((int64_t)(_size_force_1) / (int64_t)(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) + 1)), uint32_c(((_size_force_2) % (((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? 
_size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))))) == 0 ? (int64_t)(_size_force_2) / (int64_t)(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 
128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))))) : ((int64_t)(_size_force_2) / (int64_t)(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))))) + 1)));
+  internal_collidesweepsingleprecisionthermalizedcuda_collidesweepsingleprecisionthermalizedcuda::collidesweepsingleprecisionthermalizedcuda_collidesweepsingleprecisionthermalizedcuda<<<_grid, _block, 0, stream>>>(_data_force, _data_pdfs, _size_force_0, _size_force_1, _size_force_2, _stride_force_0, _stride_force_1, _stride_force_2, _stride_force_3, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, block_offset_0, block_offset_1, block_offset_2, kT, omega_bulk, omega_even, omega_odd, omega_shear, seed, time_step);
+}
+
+// Runs the collision kernel on the cells of globalCellInterval that lie inside the bounding box of
+// block widened by ghostLayers; aborts while configured_ is false, returns early if no cell is left.
+void CollideSweepSinglePrecisionThermalizedCUDA::runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block, gpuStream_t stream) {
+  if (!this->configured_)
+    WALBERLA_ABORT("This Sweep contains a configure function that needs to be called manually")
+
+  // clip the request to this block, then express it in block-local cell coordinates
+  CellInterval blockBB = blocks->getBlockCellBB(*block);
+  blockBB.expand(ghostLayers);
+  CellInterval ci = globalCellInterval;
+  ci.intersect(blockBB);
+  blocks->transformGlobalToBlockLocalCellInterval(ci, *block);
+  if (ci.empty()) return;
+
+  auto pdfs = block->getData<gpu::GPUField<float>>(pdfsID);
+  auto force = block->getData<gpu::GPUField<float>>(forceID);
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()))
+  float *RESTRICT const _data_force = force->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()))
+  float *RESTRICT _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
+  const int64_t _size_force_0 = int64_c(ci.xSize());
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
+  const int64_t _size_force_1 = int64_c(ci.ySize());
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
+  const int64_t _size_force_2 = int64_c(ci.zSize());
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  const int64_t _stride_force_0 = int64_t(force->xStride());
+  const int64_t _stride_force_1 = int64_t(force->yStride());
+  const int64_t _stride_force_2 = int64_t(force->zStride());
+  const int64_t _stride_force_3 = int64_t(force->fStride());
+  const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
+  const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
+  const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
+  const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride());
+  // threads per block: x is capped at 128, y at 2 * (128 / x) and 1024, z at 256 / (x * y) and 64
+  const int64_t threads_x = (128 < _size_force_0) ? 128 : _size_force_0;
+  const int64_t quota_y = 2 * (int64_t(128) / threads_x);
+  const int64_t fit_y = (_size_force_1 < quota_y) ? _size_force_1 : quota_y;
+  const int64_t threads_y = (1024 < fit_y) ? 1024 : fit_y;
+  const int64_t quota_z = int64_t(256) / (threads_x * fit_y);
+  const int64_t fit_z = (_size_force_2 < quota_z) ? _size_force_2 : quota_z;
+  const int64_t threads_z = (64 < fit_z) ? 64 : fit_z;
+  dim3 _block(uint32_c(threads_x), uint32_c(threads_y), uint32_c(threads_z));
+  auto const blocks_for = [](int64_t cells, int64_t threads) { return (cells % threads == 0) ? cells / threads : cells / threads + 1; };
+  dim3 _grid(uint32_c(blocks_for(_size_force_0, threads_x)), uint32_c(blocks_for(_size_force_1, threads_y)), uint32_c(blocks_for(_size_force_2, threads_z)));
+  internal_collidesweepsingleprecisionthermalizedcuda_collidesweepsingleprecisionthermalizedcuda::collidesweepsingleprecisionthermalizedcuda_collidesweepsingleprecisionthermalizedcuda<<<_grid, _block, 0, stream>>>(_data_force, _data_pdfs, _size_force_0, _size_force_1, _size_force_2, _stride_force_0, _stride_force_1, _stride_force_2, _stride_force_3, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, block_offset_0_, block_offset_1_, block_offset_2_, kT_, omega_bulk_, omega_even_, omega_odd_, omega_shear_, seed_, time_step_);
+}
+
+} // namespace pystencils
+} // namespace walberla
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) || (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic pop
+#endif
+
+#if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
+#pragma warning pop
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedCUDA.h
new file mode 100644
index 00000000000..cf55997b2b5
--- /dev/null
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/CollideSweepSinglePrecisionThermalizedCUDA.h
@@ -0,0 +1,139 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file CollideSweepSinglePrecisionThermalizedCUDA.h
+//! \\author pystencils
+//======================================================================================================================
+
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
+// lbmpy_walberla/pystencils_walberla from waLBerla commit
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
+
+#pragma once
+#include "core/DataTypes.h"
+#include "core/logging/Logging.h"
+
+#include "gpu/GPUField.h"
+#include "gpu/GPUWrapper.h"
+
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+#include "domain_decomposition/StructuredBlockStorage.h"
+#include "field/SwapableCompare.h"
+#include <set>
+
+#ifdef __GNUC__
+#define RESTRICT __restrict__
+#elif _MSC_VER
+#define RESTRICT __restrict
+#else
+#define RESTRICT
+#endif
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
+    (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wreorder"
+#endif
+
+namespace walberla {
+namespace pystencils {
+
+// Host-side launcher for the generated thermalized collision kernel (CUDA).
+class CollideSweepSinglePrecisionThermalizedCUDA {
+public:
+  CollideSweepSinglePrecisionThermalizedCUDA(BlockDataID forceID_,
+                                             BlockDataID pdfsID_, float kT,
+                                             float omega_bulk, float omega_even,
+                                             float omega_odd, float omega_shear,
+                                             uint32_t seed, uint32_t time_step)
+      : forceID(forceID_), pdfsID(pdfsID_), kT_(kT), omega_bulk_(omega_bulk),
+        omega_even_(omega_even), omega_odd_(omega_odd),
+        omega_shear_(omega_shear), seed_(seed), time_step_(time_step) {}
+
+  void run(IBlock *block, gpuStream_t stream = nullptr);
+  void runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks,
+                         const CellInterval &globalCellInterval,
+                         cell_idx_t ghostLayers, IBlock *block,
+                         gpuStream_t stream = nullptr);
+  void operator()(IBlock *block, gpuStream_t stream = nullptr) {
+    run(block, stream);
+  }
+
+  // Sweep factories; member overloads capture `this`, which must stay alive.
+  static std::function<void(IBlock *)> getSweep(
+      const shared_ptr<CollideSweepSinglePrecisionThermalizedCUDA> &kernel) {
+    return [kernel](IBlock *b) { kernel->run(b); };
+  }
+
+  static std::function<void(IBlock *, gpuStream_t)> getSweepOnCellInterval(
+      const shared_ptr<CollideSweepSinglePrecisionThermalizedCUDA> &kernel,
+      const shared_ptr<StructuredBlockStorage> &blocks,
+      const CellInterval &globalCellInterval, cell_idx_t ghostLayers = 1) {
+    return [kernel, blocks, globalCellInterval,
+            ghostLayers](IBlock *b, gpuStream_t stream) {
+      kernel->runOnCellInterval(blocks, globalCellInterval, ghostLayers, b,
+                                stream);
+    };
+  }
+
+  std::function<void(IBlock *)> getSweep(gpuStream_t stream = nullptr) {
+    return [this, stream](IBlock *b) { this->run(b, stream); };
+  }
+
+  std::function<void(IBlock *)>
+  getSweepOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks,
+                         const CellInterval &globalCellInterval,
+                         cell_idx_t ghostLayers = 1,
+                         gpuStream_t stream = nullptr) {
+    return [this, blocks, globalCellInterval, ghostLayers, stream](IBlock *b) {
+      this->runOnCellInterval(blocks, globalCellInterval, ghostLayers, b,
+                              stream);
+    };
+  }
+
+  // Caches the min corner of the block's cell bounding box as uint32 offsets.
+  void configure(const shared_ptr<StructuredBlockStorage> &blocks,
+                 IBlock *block) {
+    Cell BlockCellBB = blocks->getBlockCellBB(*block).min();
+    block_offset_0_ = uint32_t(BlockCellBB[0]);
+    block_offset_1_ = uint32_t(BlockCellBB[1]);
+    block_offset_2_ = uint32_t(BlockCellBB[2]);
+    configured_ = true;
+  }
+
+  BlockDataID forceID;
+  BlockDataID pdfsID;
+  uint32_t block_offset_0_{0u}; // offsets stay 0 until configure() is called
+  uint32_t block_offset_1_{0u};
+  uint32_t block_offset_2_{0u};
+  float kT_;
+  float omega_bulk_;
+  float omega_even_;
+  float omega_odd_;
+  float omega_shear_;
+  uint32_t seed_;
+  uint32_t time_step_;
+  bool configured_{false}; // set by configure()
+};
+
+} // namespace pystencils
+} // namespace walberla
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
+    (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic pop
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_double_precision.cpp b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_double_precision.cpp
index 1ab063d94be..ef39da01ce7 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_double_precision.cpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_double_precision.cpp
@@ -17,9 +17,7 @@
 //! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2, lbmpy_walberla/pystencils_walberla from waLBerla commit 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
-
-#include <cmath>
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #include "Dynamic_UBB_double_precision.h"
 #include "core/DataTypes.h"
@@ -47,7 +45,7 @@ namespace lbm {
 #pragma diag_suppress 177
 #endif
 #endif
-
+// NOLINTBEGIN(readability-non-const-parameter*)
 namespace internal_451fd042b8d7665063ea81b98853365b {
 static FUNC_PREFIX void dynamic_ubb_double_precision_boundary_Dynamic_UBB_double_precision(uint8_t *RESTRICT const _data_indexVector, double *RESTRICT _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int32_t indexVectorSize) {
 
@@ -68,11 +66,12 @@ static FUNC_PREFIX void dynamic_ubb_double_precision_boundary_Dynamic_UBB_double
     const double vel1Term = _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_1 * y + _stride_pdfs_1 + _stride_pdfs_2 * z + _stride_pdfs_2 + 11 * _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_1 * y + _stride_pdfs_1 + _stride_pdfs_2 * z + _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_1 * y + _stride_pdfs_1 + _stride_pdfs_2 * z - _stride_pdfs_2 + 15 * _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0 * x - _stride_pdfs_0 + _stride_pdfs_1 * y + _stride_pdfs_1 + _stride_pdfs_2 * z + 7 * _stride_pdfs_3];
     const double vel2Term = _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_1 * y + _stride_pdfs_2 * z + _stride_pdfs_2 + 5 * _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_1 * y - _stride_pdfs_1 + _stride_pdfs_2 * z + _stride_pdfs_2 + 12 * _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0 * x - _stride_pdfs_0 + _stride_pdfs_1 * y + _stride_pdfs_2 * z + _stride_pdfs_2 + 13 * _stride_pdfs_3];
     const double rho = vel0Term + vel1Term + vel2Term + _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_1 * y + _stride_pdfs_2 * z - _stride_pdfs_2 + 6 * _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_1 * y + _stride_pdfs_2 * z] + _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_1 * y - _stride_pdfs_1 + _stride_pdfs_2 * z + 2 * _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_1 * y - _stride_pdfs_1 + _stride_pdfs_2 * z - _stride_pdfs_2 + 16 * _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0 * x - _stride_pdfs_0 + _stride_pdfs_1 * y + _stride_pdfs_2 * z + 3 * _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0 * x - _stride_pdfs_0 + _stride_pdfs_1 * y + _stride_pdfs_2 * z - _stride_pdfs_2 + 17 * _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0 * x - _stride_pdfs_0 + _stride_pdfs_1 * y - _stride_pdfs_1 + _stride_pdfs_2 * z + 9 * _stride_pdfs_3];
-    _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_0 * neighbour_offset_x[dir] + _stride_pdfs_1 * y + _stride_pdfs_1 * neighbour_offset_y[dir] + _stride_pdfs_2 * z + _stride_pdfs_2 * neighbour_offset_z[dir] + _stride_pdfs_3 * f_in_inv_dir_idx[dir]] = rho * (6.0 * ((double)(neighbour_offset_x[dir])) * *((double *)(&_data_indexVector[40 * ctr_0 + 16])) + 6.0 * ((double)(neighbour_offset_y[dir])) * *((double *)(&_data_indexVector[40 * ctr_0 + 24])) + 6.0 * ((double)(neighbour_offset_z[dir])) * *((double *)(&_data_indexVector[40 * ctr_0 + 32]))) * -1.0 * weights[dir] + _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_1 * y + _stride_pdfs_2 * z + _stride_pdfs_3 * dir];
+    _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_0 * neighbour_offset_x[dir] + _stride_pdfs_1 * y + _stride_pdfs_1 * neighbour_offset_y[dir] + _stride_pdfs_2 * z + _stride_pdfs_2 * neighbour_offset_z[dir] + _stride_pdfs_3 * f_in_inv_dir_idx[dir]] = -rho * (6.0 * ((double)(neighbour_offset_x[dir])) * *((double *)(&_data_indexVector[40 * ctr_0 + 16])) + 6.0 * ((double)(neighbour_offset_y[dir])) * *((double *)(&_data_indexVector[40 * ctr_0 + 24])) + 6.0 * ((double)(neighbour_offset_z[dir])) * *((double *)(&_data_indexVector[40 * ctr_0 + 32]))) * weights[dir] + _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_1 * y + _stride_pdfs_2 * z + _stride_pdfs_3 * dir];
   }
 }
 } // namespace internal_451fd042b8d7665063ea81b98853365b
 
+// NOLINTEND(readability-non-const-parameter*)
 #ifdef __GNUC__
 #pragma GCC diagnostic pop
 #endif
@@ -93,7 +92,7 @@ void Dynamic_UBB_double_precision::run_impl(IBlock *block, IndexVectors::Type ty
 
   auto pdfs = block->getData<field::GhostLayerField<double, 19>>(pdfsID);
 
-  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()))
   double *RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
   const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
   const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_double_precision.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_double_precision.h
index 7c9a644ac83..7e6642037f1 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_double_precision.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_double_precision.h
@@ -17,12 +17,13 @@
 //! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2,
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
 // lbmpy_walberla/pystencils_walberla from waLBerla commit
-// 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #pragma once
 #include "core/DataTypes.h"
+#include "core/logging/Logging.h"
 
 #include "blockforest/StructuredBlockForest.h"
 #include "core/debug/Debug.h"
@@ -31,9 +32,7 @@
 #include "field/FlagField.h"
 #include "field/GhostLayerField.h"
 
-#include <cassert>
-#include <functional>
-#include <memory>
+#include <set>
 #include <vector>
 
 #ifdef __GNUC__
@@ -44,6 +43,10 @@
 #define RESTRICT
 #endif
 
+#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
+using walberla::half;
+#endif
+
 namespace walberla {
 namespace lbm {
 
@@ -87,11 +90,11 @@ class Dynamic_UBB_double_precision {
   };
 
   Dynamic_UBB_double_precision(
-      const std::shared_ptr<StructuredBlockForest> &blocks, BlockDataID pdfsID_,
-      std::function<Vector3<double>(
-          const Cell &, const std::shared_ptr<StructuredBlockForest> &,
-          IBlock &)> &velocityCallback)
-      : elementInitaliser(velocityCallback), pdfsID(pdfsID_) {
+      const shared_ptr<StructuredBlockForest> &blocks, BlockDataID pdfsID_,
+      std::function<Vector3<double>(const Cell &,
+                                    const shared_ptr<StructuredBlockForest> &,
+                                    IBlock &)> &velocityCallback)
+      : elementInitialiser(velocityCallback), pdfsID(pdfsID_) {
     auto createIdxVector = [](IBlock *const, StructuredBlockStorage *const) {
       return new IndexVectors();
     };
@@ -120,7 +123,7 @@ class Dynamic_UBB_double_precision {
   }
 
   template <typename FlagField_T>
-  void fillFromFlagField(const std::shared_ptr<StructuredBlockForest> &blocks,
+  void fillFromFlagField(const shared_ptr<StructuredBlockForest> &blocks,
                          ConstBlockDataID flagFieldID, FlagUID boundaryFlagUID,
                          FlagUID domainFlagUID) {
     for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
@@ -129,7 +132,7 @@ class Dynamic_UBB_double_precision {
   }
 
   template <typename FlagField_T>
-  void fillFromFlagField(const std::shared_ptr<StructuredBlockForest> &blocks,
+  void fillFromFlagField(const shared_ptr<StructuredBlockForest> &blocks,
                          IBlock *block, ConstBlockDataID flagFieldID,
                          FlagUID boundaryFlagUID, FlagUID domainFlagUID) {
     auto *indexVectors = block->getData<IndexVectors>(indexVectorID);
@@ -139,8 +142,9 @@ class Dynamic_UBB_double_precision {
 
     auto *flagField = block->getData<FlagField_T>(flagFieldID);
 
-    assert(flagField->flagExists(boundaryFlagUID) and
-           flagField->flagExists(domainFlagUID));
+    if (!(flagField->flagExists(boundaryFlagUID) &&
+          flagField->flagExists(domainFlagUID)))
+      return;
 
     auto boundaryFlag = flagField->getFlag(boundaryFlagUID);
     auto domainFlag = flagField->getFlag(domainFlagUID);
@@ -160,11 +164,11 @@ class Dynamic_UBB_double_precision {
 
       if (isFlagSet(it.neighbor(0, 0, 0, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 0);
-        Vector3<double> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 0, it.y() + 0, it.z() + 0), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -181,11 +185,11 @@ class Dynamic_UBB_double_precision {
 
       if (isFlagSet(it.neighbor(0, 1, 0, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 1);
-        Vector3<double> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 0, it.y() + 1, it.z() + 0), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -202,11 +206,11 @@ class Dynamic_UBB_double_precision {
 
       if (isFlagSet(it.neighbor(0, -1, 0, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 2);
-        Vector3<double> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 0, it.y() - 1, it.z() + 0), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -223,11 +227,11 @@ class Dynamic_UBB_double_precision {
 
       if (isFlagSet(it.neighbor(-1, 0, 0, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 3);
-        Vector3<double> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() - 1, it.y() + 0, it.z() + 0), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -244,11 +248,11 @@ class Dynamic_UBB_double_precision {
 
       if (isFlagSet(it.neighbor(1, 0, 0, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 4);
-        Vector3<double> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 1, it.y() + 0, it.z() + 0), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -265,11 +269,11 @@ class Dynamic_UBB_double_precision {
 
       if (isFlagSet(it.neighbor(0, 0, 1, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 5);
-        Vector3<double> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 0, it.y() + 0, it.z() + 1), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -286,11 +290,11 @@ class Dynamic_UBB_double_precision {
 
       if (isFlagSet(it.neighbor(0, 0, -1, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 6);
-        Vector3<double> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 0, it.y() + 0, it.z() - 1), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -307,11 +311,11 @@ class Dynamic_UBB_double_precision {
 
       if (isFlagSet(it.neighbor(-1, 1, 0, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 7);
-        Vector3<double> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() - 1, it.y() + 1, it.z() + 0), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -328,11 +332,11 @@ class Dynamic_UBB_double_precision {
 
       if (isFlagSet(it.neighbor(1, 1, 0, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 8);
-        Vector3<double> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 1, it.y() + 1, it.z() + 0), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -349,11 +353,11 @@ class Dynamic_UBB_double_precision {
 
       if (isFlagSet(it.neighbor(-1, -1, 0, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 9);
-        Vector3<double> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() - 1, it.y() - 1, it.z() + 0), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -370,11 +374,11 @@ class Dynamic_UBB_double_precision {
 
       if (isFlagSet(it.neighbor(1, -1, 0, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 10);
-        Vector3<double> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 1, it.y() - 1, it.z() + 0), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -391,11 +395,11 @@ class Dynamic_UBB_double_precision {
 
       if (isFlagSet(it.neighbor(0, 1, 1, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 11);
-        Vector3<double> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 0, it.y() + 1, it.z() + 1), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -412,11 +416,11 @@ class Dynamic_UBB_double_precision {
 
       if (isFlagSet(it.neighbor(0, -1, 1, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 12);
-        Vector3<double> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 0, it.y() - 1, it.z() + 1), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -433,11 +437,11 @@ class Dynamic_UBB_double_precision {
 
       if (isFlagSet(it.neighbor(-1, 0, 1, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 13);
-        Vector3<double> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() - 1, it.y() + 0, it.z() + 1), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -454,11 +458,11 @@ class Dynamic_UBB_double_precision {
 
       if (isFlagSet(it.neighbor(1, 0, 1, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 14);
-        Vector3<double> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 1, it.y() + 0, it.z() + 1), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -475,11 +479,11 @@ class Dynamic_UBB_double_precision {
 
       if (isFlagSet(it.neighbor(0, 1, -1, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 15);
-        Vector3<double> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 0, it.y() + 1, it.z() - 1), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -496,11 +500,11 @@ class Dynamic_UBB_double_precision {
 
       if (isFlagSet(it.neighbor(0, -1, -1, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 16);
-        Vector3<double> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 0, it.y() - 1, it.z() - 1), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -517,11 +521,11 @@ class Dynamic_UBB_double_precision {
 
       if (isFlagSet(it.neighbor(-1, 0, -1, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 17);
-        Vector3<double> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() - 1, it.y() + 0, it.z() - 1), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -538,11 +542,11 @@ class Dynamic_UBB_double_precision {
 
       if (isFlagSet(it.neighbor(1, 0, -1, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 18);
-        Vector3<double> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 1, it.y() + 0, it.z() - 1), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -559,12 +563,12 @@ class Dynamic_UBB_double_precision {
 
   BlockDataID indexVectorID;
   std::function<Vector3<double>(
-      const Cell &, const std::shared_ptr<StructuredBlockForest> &, IBlock &)>
-      elementInitaliser;
+      const Cell &, const shared_ptr<StructuredBlockForest> &, IBlock &)>
+      elementInitialiser;
 
 public:
   BlockDataID pdfsID;
 };
 
 } // namespace lbm
-} // namespace walberla
\ No newline at end of file
+} // namespace walberla
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_double_precisionCUDA.cu b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_double_precisionCUDA.cu
new file mode 100644
index 00000000000..8d35b5d929f
--- /dev/null
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_double_precisionCUDA.cu
@@ -0,0 +1,187 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file Dynamic_UBB_double_precisionCUDA.cu
+//! \author pystencils
+//======================================================================================================================
+
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
+
+#include "Dynamic_UBB_double_precisionCUDA.h"
+#include "core/DataTypes.h"
+#include "core/Macros.h"
+#include "gpu/ErrorChecking.h"
+
+#define FUNC_PREFIX __global__
+
+using namespace std;
+
+namespace walberla {
+namespace lbm {
+
+#if defined(__NVCC__)
+#define RESTRICT __restrict__
+#if defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress 177 // unused variable
+#else
+#pragma push
+#pragma diag_suppress 177 // unused variable
+#endif
+#elif defined(__clang__)
+#if defined(__CUDA__)
+#if defined(__CUDA_ARCH__)
+// clang compiling CUDA code in device mode
+#define RESTRICT __restrict__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wstrict-aliasing"
+#pragma clang diagnostic ignored "-Wunused-variable"
+#pragma clang diagnostic ignored "-Wconversion"
+#pragma clang diagnostic ignored "-Wsign-compare"
+#else
+// clang compiling CUDA code in host mode
+#define RESTRICT __restrict__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wstrict-aliasing"
+#pragma clang diagnostic ignored "-Wunused-variable"
+#pragma clang diagnostic ignored "-Wconversion"
+#pragma clang diagnostic ignored "-Wsign-compare"
+#endif
+#endif
+#elif defined(__GNUC__) or defined(__GNUG__)
+#define RESTRICT __restrict__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#pragma GCC diagnostic ignored "-Wconversion"
+#elif defined(_MSC_VER)
+#define RESTRICT __restrict
+#else
+#define RESTRICT
+#endif
+
+// NOLINTBEGIN(readability-non-const-parameter*)
+namespace internal_dynamic_ubb_double_precisioncuda_boundary_Dynamic_UBB_double_precisionCUDA {
+static FUNC_PREFIX __launch_bounds__(256) void dynamic_ubb_double_precisioncuda_boundary_Dynamic_UBB_double_precisionCUDA(uint8_t *RESTRICT const _data_indexVector, double *RESTRICT _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int32_t indexVectorSize) {
+  // Boundary kernel: each thread processes one 40-byte record of _data_indexVector (int32 x, y, z, dir at byte offsets 0, 4, 8, 12; doubles at byte offsets 16, 24, 32).
+  const int32_t f_in_inv_dir_idx[] = {0, 2, 1, 4, 3, 6, 5, 10, 9, 8, 7, 16, 15, 18, 17, 12, 11, 14, 13};
+  // f_in_inv_dir_idx[d] (above) is the index of the direction whose neighbour offset is the negation of d's; weights[d] (below) is the factor applied to direction d in the update.
+  const double weights[] = {0.33333333333333333, 0.055555555555555556, 0.055555555555555556, 0.055555555555555556, 0.055555555555555556, 0.055555555555555556, 0.055555555555555556, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778};
+  // Integer cell offset (x, y, z component) of each of the 19 direction indices; index 0 is the zero offset.
+  const int32_t neighbour_offset_x[] = {0, 0, 0, -1, 1, 0, 0, -1, 1, -1, 1, 0, 0, -1, 1, 0, 0, -1, 1};
+  const int32_t neighbour_offset_y[] = {0, 1, -1, 0, 0, 0, 0, 1, 1, -1, -1, 1, -1, 0, 0, 1, -1, 0, 0};
+  const int32_t neighbour_offset_z[] = {0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, -1};
+  // Threads past indexVectorSize do nothing. The others sum the 19 entries pdfs(cell + offset_q, q) into rho and write pdfs(cell + offset_dir, inv(dir)) = pdfs(cell, dir) - rho * 6 * (offset_dir . vel) * weights[dir], where vel is the record's three doubles.
+  if (blockDim.x * blockIdx.x + threadIdx.x < indexVectorSize) {
+    uint8_t *RESTRICT _data_indexVector_10 = _data_indexVector;
+    const int32_t x = *((int32_t *)(&_data_indexVector_10[40 * blockDim.x * blockIdx.x + 40 * threadIdx.x]));
+    uint8_t *RESTRICT _data_indexVector_14 = _data_indexVector + 4;
+    const int32_t y = *((int32_t *)(&_data_indexVector_14[40 * blockDim.x * blockIdx.x + 40 * threadIdx.x]));
+    uint8_t *RESTRICT _data_indexVector_18 = _data_indexVector + 8;
+    const int32_t z = *((int32_t *)(&_data_indexVector_18[40 * blockDim.x * blockIdx.x + 40 * threadIdx.x]));
+    uint8_t *RESTRICT _data_indexVector_112 = _data_indexVector + 12;
+    const int32_t dir = *((int32_t *)(&_data_indexVector_112[40 * blockDim.x * blockIdx.x + 40 * threadIdx.x]));
+    double *RESTRICT _data_pdfs_10_2m1_318 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z - _stride_pdfs_2 + 18 * _stride_pdfs_3;
+    double *RESTRICT _data_pdfs_10_20_34 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + 4 * _stride_pdfs_3;
+    double *RESTRICT _data_pdfs_11_20_38 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_1 + _stride_pdfs_2 * z + 8 * _stride_pdfs_3;
+    double *RESTRICT _data_pdfs_1m1_20_310 = _data_pdfs + _stride_pdfs_1 * y - _stride_pdfs_1 + _stride_pdfs_2 * z + 10 * _stride_pdfs_3;
+    double *RESTRICT _data_pdfs_10_21_314 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + _stride_pdfs_2 + 14 * _stride_pdfs_3;
+    const double vel0Term = _data_pdfs_10_20_34[_stride_pdfs_0 * x + _stride_pdfs_0] + _data_pdfs_10_21_314[_stride_pdfs_0 * x + _stride_pdfs_0] + _data_pdfs_10_2m1_318[_stride_pdfs_0 * x + _stride_pdfs_0] + _data_pdfs_11_20_38[_stride_pdfs_0 * x + _stride_pdfs_0] + _data_pdfs_1m1_20_310[_stride_pdfs_0 * x + _stride_pdfs_0];
+    double *RESTRICT _data_pdfs_11_2m1_315 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_1 + _stride_pdfs_2 * z - _stride_pdfs_2 + 15 * _stride_pdfs_3;
+    double *RESTRICT _data_pdfs_11_20_37 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_1 + _stride_pdfs_2 * z + 7 * _stride_pdfs_3;
+    double *RESTRICT _data_pdfs_11_20_31 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_1 + _stride_pdfs_2 * z + _stride_pdfs_3;
+    double *RESTRICT _data_pdfs_11_21_311 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_1 + _stride_pdfs_2 * z + _stride_pdfs_2 + 11 * _stride_pdfs_3;
+    const double vel1Term = _data_pdfs_11_20_31[_stride_pdfs_0 * x] + _data_pdfs_11_20_37[_stride_pdfs_0 * x - _stride_pdfs_0] + _data_pdfs_11_21_311[_stride_pdfs_0 * x] + _data_pdfs_11_2m1_315[_stride_pdfs_0 * x];
+    double *RESTRICT _data_pdfs_1m1_21_312 = _data_pdfs + _stride_pdfs_1 * y - _stride_pdfs_1 + _stride_pdfs_2 * z + _stride_pdfs_2 + 12 * _stride_pdfs_3;
+    double *RESTRICT _data_pdfs_10_21_313 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + _stride_pdfs_2 + 13 * _stride_pdfs_3;
+    double *RESTRICT _data_pdfs_10_21_35 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + _stride_pdfs_2 + 5 * _stride_pdfs_3;
+    const double vel2Term = _data_pdfs_10_21_313[_stride_pdfs_0 * x - _stride_pdfs_0] + _data_pdfs_10_21_35[_stride_pdfs_0 * x] + _data_pdfs_1m1_21_312[_stride_pdfs_0 * x];
+    double *RESTRICT _data_pdfs_1m1_2m1_316 = _data_pdfs + _stride_pdfs_1 * y - _stride_pdfs_1 + _stride_pdfs_2 * z - _stride_pdfs_2 + 16 * _stride_pdfs_3;
+    double *RESTRICT _data_pdfs_10_2m1_317 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z - _stride_pdfs_2 + 17 * _stride_pdfs_3;
+    double *RESTRICT _data_pdfs_10_2m1_36 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z - _stride_pdfs_2 + 6 * _stride_pdfs_3;
+    double *RESTRICT _data_pdfs_10_20_30 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z;
+    double *RESTRICT _data_pdfs_1m1_20_39 = _data_pdfs + _stride_pdfs_1 * y - _stride_pdfs_1 + _stride_pdfs_2 * z + 9 * _stride_pdfs_3;
+    double *RESTRICT _data_pdfs_1m1_20_32 = _data_pdfs + _stride_pdfs_1 * y - _stride_pdfs_1 + _stride_pdfs_2 * z + 2 * _stride_pdfs_3;
+    double *RESTRICT _data_pdfs_10_20_33 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + 3 * _stride_pdfs_3;
+    const double rho = vel0Term + vel1Term + vel2Term + _data_pdfs_10_20_30[_stride_pdfs_0 * x] + _data_pdfs_10_20_33[_stride_pdfs_0 * x - _stride_pdfs_0] + _data_pdfs_10_2m1_317[_stride_pdfs_0 * x - _stride_pdfs_0] + _data_pdfs_10_2m1_36[_stride_pdfs_0 * x] + _data_pdfs_1m1_20_32[_stride_pdfs_0 * x] + _data_pdfs_1m1_20_39[_stride_pdfs_0 * x - _stride_pdfs_0] + _data_pdfs_1m1_2m1_316[_stride_pdfs_0 * x];
+    double *RESTRICT _data_pdfs760dce667daab9ae = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_1 * neighbour_offset_y[dir] + _stride_pdfs_2 * z + _stride_pdfs_2 * neighbour_offset_z[dir] + _stride_pdfs_3 * f_in_inv_dir_idx[dir];
+    uint8_t *RESTRICT _data_indexVector_116 = _data_indexVector + 16;
+    uint8_t *RESTRICT _data_indexVector_124 = _data_indexVector + 24;
+    uint8_t *RESTRICT _data_indexVector_132 = _data_indexVector + 32;
+    double *RESTRICT _data_pdfs_10_200a5bfb2297cee9db = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + _stride_pdfs_3 * dir;
+    _data_pdfs760dce667daab9ae[_stride_pdfs_0 * x + _stride_pdfs_0 * neighbour_offset_x[dir]] = -rho * (6.0 * ((double)(neighbour_offset_x[dir])) * *((double *)(&_data_indexVector_116[40 * blockDim.x * blockIdx.x + 40 * threadIdx.x])) + 6.0 * ((double)(neighbour_offset_y[dir])) * *((double *)(&_data_indexVector_124[40 * blockDim.x * blockIdx.x + 40 * threadIdx.x])) + 6.0 * ((double)(neighbour_offset_z[dir])) * *((double *)(&_data_indexVector_132[40 * blockDim.x * blockIdx.x + 40 * threadIdx.x]))) * weights[dir] + _data_pdfs_10_200a5bfb2297cee9db[_stride_pdfs_0 * x];
+  }
+}
+} // namespace internal_dynamic_ubb_double_precisioncuda_boundary_Dynamic_UBB_double_precisionCUDA
+
+// NOLINTEND(readability-non-const-parameter*)
+
+// Restore the diagnostic state saved before the kernel. The branches are
+// tested in the same order as the push sequence above (__NVCC__ first):
+// an nvcc build with a GCC host also sees __GNUC__, so testing __GNUC__
+// before the nvcc case would issue a GCC pop that has no matching push and
+// would never reach the nvcc pop.
+#if defined(__NVCC__)
+#if defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
+#pragma nv_diagnostic pop
+#else
+#pragma pop
+#endif
+#elif defined(__clang__)
+#if defined(__CUDA__)
+// clang compiling CUDA code (device or host mode)
+#pragma clang diagnostic pop
+#endif
+#elif defined(__GNUC__) or defined(__GNUG__)
+#pragma GCC diagnostic pop
+#endif
+
+void Dynamic_UBB_double_precisionCUDA::run_impl(IBlock *block, IndexVectors::Type type, gpuStream_t stream) {
+  // Launches the boundary kernel on `stream` for the index list selected by `type`.
+  auto *const idxVecs = block->getData<IndexVectors>(indexVectorID);
+  int32_t const indexVectorSize = int32_c(idxVecs->indexVector(type).size());
+  if (indexVectorSize == 0) // empty list: no kernel launch
+    return;
+  // The kernel addresses the IndexInfo records byte-wise, hence the uint8_t view.
+  auto *const _data_indexVector = reinterpret_cast<uint8_t *>(idxVecs->pointerGpu(type));
+  auto *const pdfs = block->getData<gpu::GPUField<double>>(pdfsID);
+
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()))
+  double *RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
+  int64_t const _stride_pdfs_0 = int64_t(pdfs->xStride());
+  int64_t const _stride_pdfs_1 = int64_t(pdfs->yStride());
+  int64_t const _stride_pdfs_2 = int64_t(pdfs->zStride());
+  int64_t const _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
+  int32_t const threads = (256 < indexVectorSize) ? 256 : indexVectorSize; // at most 256 threads per block
+  int64_t const fullBlocks = (int64_t)(indexVectorSize) / (int64_t)(threads);
+  dim3 _block(uint32_c(threads), uint32_c(1), uint32_c(1));
+  dim3 _grid(uint32_c((indexVectorSize % threads == 0) ? fullBlocks : fullBlocks + 1), uint32_c(1), uint32_c(1)); // block count rounded up
+  internal_dynamic_ubb_double_precisioncuda_boundary_Dynamic_UBB_double_precisionCUDA::dynamic_ubb_double_precisioncuda_boundary_Dynamic_UBB_double_precisionCUDA<<<_grid, _block, 0, stream>>>(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize);
+}
+
+// Public sweeps. Each forwards to run_impl() with one of the index lists
+// and the GPU stream on which the kernel is to be launched.
+
+// ALL: every boundary link recorded for the block.
+void Dynamic_UBB_double_precisionCUDA::run(IBlock *block, gpuStream_t stream) { this->run_impl(block, IndexVectors::ALL, stream); }
+
+// INNER: the links that fillFromFlagField() stored in the INNER list.
+void Dynamic_UBB_double_precisionCUDA::inner(IBlock *block, gpuStream_t stream) { this->run_impl(block, IndexVectors::INNER, stream); }
+
+// OUTER: the links that fillFromFlagField() stored in the OUTER list.
+void Dynamic_UBB_double_precisionCUDA::outer(IBlock *block, gpuStream_t stream) { this->run_impl(block, IndexVectors::OUTER, stream); }
+
+} // namespace lbm
+} // namespace walberla
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_double_precisionCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_double_precisionCUDA.h
new file mode 100644
index 00000000000..2b71cc44d1f
--- /dev/null
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_double_precisionCUDA.h
@@ -0,0 +1,602 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file Dynamic_UBB_double_precisionCUDA.h
+//! \author pystencils
+//======================================================================================================================
+
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
+// lbmpy_walberla/pystencils_walberla from waLBerla commit
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
+
+#pragma once
+#include "core/DataTypes.h"
+#include "core/logging/Logging.h"
+
+#include "blockforest/StructuredBlockForest.h"
+#include "core/debug/Debug.h"
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+#include "field/FlagField.h"
+#include "gpu/FieldCopy.h"
+#include "gpu/GPUField.h"
+#include "gpu/GPUWrapper.h"
+
+#include <set>
+#include <vector>
+
+#ifdef __GNUC__
+#define RESTRICT __restrict__
+#elif defined(_MSC_VER) // test with defined(): the macro is absent elsewhere
+#define RESTRICT __restrict
+#else // unknown compiler: RESTRICT expands to nothing
+#define RESTRICT
+#endif
+
+#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
+using walberla::half;
+#endif
+
+namespace walberla {
+namespace lbm {
+
+class Dynamic_UBB_double_precisionCUDA {
+public:
+  struct IndexInfo {
+    // One boundary link: cell coordinates, direction index and the three
+    // velocity components. Member order and types define the 40-byte record
+    // that the CUDA kernel reads byte-wise, so they must not be reordered.
+    int32_t x, y, z, dir;
+    double vel_0, vel_1, vel_2;
+    IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_)
+        : x{x_}, y{y_}, z{z_}, dir{dir_}, vel_0{0.0}, vel_1{0.0}, vel_2{0.0} {}
+    // Integers must match exactly; velocities are compared with floatIsEqual.
+    bool operator==(const IndexInfo &o) const {
+      if (x != o.x || y != o.y || z != o.z || dir != o.dir)
+        return false;
+      return floatIsEqual(vel_0, o.vel_0) && floatIsEqual(vel_1, o.vel_1) &&
+             floatIsEqual(vel_2, o.vel_2);
+    }
+  };
+
+  class IndexVectors {
+  public:
+    using CpuIndexVector = std::vector<IndexInfo>;
+    // Host-side index lists plus raw device copies created by syncGPU().
+    enum Type { ALL = 0, INNER = 1, OUTER = 2, NUM_TYPES = 3 };
+
+    IndexVectors() = default;
+    bool operator==(IndexVectors const &other) const {
+      return other.cpuVectors_ == cpuVectors_;
+    }
+    // NOTE(review): copying would free the device buffers twice - confirm unused.
+    ~IndexVectors() {
+      for (auto &gpuVec : gpuVectors_)
+        WALBERLA_GPU_CHECK(gpuFree(gpuVec));
+    }
+    CpuIndexVector &indexVector(Type t) { return cpuVectors_[t]; }
+    IndexInfo *pointerCpu(Type t) { return cpuVectors_[t].data(); }
+    // Valid only after syncGPU(); gpuVectors_ is empty before the first call.
+    IndexInfo *pointerGpu(Type t) { return gpuVectors_[t]; }
+    void syncGPU() { // re-upload all host lists to new device buffers
+      for (auto &gpuVec : gpuVectors_)
+        WALBERLA_GPU_CHECK(gpuFree(gpuVec));
+      gpuVectors_.resize(cpuVectors_.size());
+      // data() rather than &cpuVec[0]: indexing an empty list is undefined.
+      WALBERLA_ASSERT_EQUAL(cpuVectors_.size(), NUM_TYPES);
+      for (size_t i = 0; i < cpuVectors_.size(); ++i) {
+        auto &gpuVec = gpuVectors_[i];
+        auto &cpuVec = cpuVectors_[i];
+        WALBERLA_GPU_CHECK(
+            gpuMalloc(&gpuVec, sizeof(IndexInfo) * cpuVec.size()));
+        WALBERLA_GPU_CHECK(gpuMemcpy(gpuVec, cpuVec.data(),
+                                     sizeof(IndexInfo) * cpuVec.size(),
+                                     gpuMemcpyHostToDevice));
+      }
+    }
+
+  private:
+    std::vector<CpuIndexVector> cpuVectors_{NUM_TYPES};
+
+    using GpuIndexVector = IndexInfo *;
+    std::vector<GpuIndexVector> gpuVectors_;
+  };
+
+  // Keeps the callback and field id; registers an IndexVectors per block.
+  Dynamic_UBB_double_precisionCUDA(
+      const shared_ptr<StructuredBlockForest> &blocks, BlockDataID pdfsID_,
+      std::function<Vector3<double>(const Cell &,
+                                    const shared_ptr<StructuredBlockForest> &,
+                                    IBlock &)> &velocityCallback)
+      : elementInitialiser(velocityCallback), pdfsID(pdfsID_) {
+    indexVectorID = blocks->addStructuredBlockData<IndexVectors>(
+        [](IBlock *const, StructuredBlockStorage *const) {
+          return new IndexVectors();
+        }, "IndexField_Dynamic_UBB_double_precisionCUDA");
+  }
+
+  // Apply the boundary using the ALL index list of the block.
+  void run(IBlock *block, gpuStream_t stream = nullptr);
+
+  void operator()(IBlock *block, gpuStream_t stream = nullptr) {
+    this->run(block, stream); // call syntax for run()
+  }
+  // Variants of run() that use the INNER and the OUTER index list.
+  void inner(IBlock *block, gpuStream_t stream = nullptr);
+  void outer(IBlock *block, gpuStream_t stream = nullptr);
+
+  // Sweep getters: each returns a std::function that captures `this` and
+  // `stream` and invokes the matching member on the block it is given.
+  std::function<void(IBlock *)> getSweep(gpuStream_t stream = nullptr) {
+    return [this, stream](IBlock *blk) { run(blk, stream); };
+  }
+  std::function<void(IBlock *)> getInnerSweep(gpuStream_t stream = nullptr) {
+    return [this, stream](IBlock *blk) { inner(blk, stream); };
+  }
+  std::function<void(IBlock *)> getOuterSweep(gpuStream_t stream = nullptr) {
+    return [this, stream](IBlock *blk) { outer(blk, stream); };
+  }
+
+  template <typename FlagField_T>
+  void fillFromFlagField(const shared_ptr<StructuredBlockForest> &blocks,
+                         ConstBlockDataID flagFieldID, FlagUID boundaryFlagUID,
+                         FlagUID domainFlagUID) {
+    for (auto &blk : *blocks) // hand every block to the per-block overload
+      fillFromFlagField<FlagField_T>(blocks, &blk, flagFieldID,
+                                     boundaryFlagUID, domainFlagUID);
+  }
+
+  template <typename FlagField_T> // FlagField_T: concrete flag-field type stored under flagFieldID
+  void fillFromFlagField(const shared_ptr<StructuredBlockForest> &blocks, // Per-block overload: rebuilds this block's ALL/INNER/OUTER index lists from its flag field.
+                         IBlock *block, ConstBlockDataID flagFieldID,
+                         FlagUID boundaryFlagUID, FlagUID domainFlagUID) {
+    auto *indexVectors = block->getData<IndexVectors>(indexVectorID); // per-block storage registered in the constructor
+    auto &indexVectorAll = indexVectors->indexVector(IndexVectors::ALL);
+    auto &indexVectorInner = indexVectors->indexVector(IndexVectors::INNER);
+    auto &indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER);
+
+    auto *flagField = block->getData<FlagField_T>(flagFieldID);
+
+    if (!(flagField->flagExists(boundaryFlagUID) && // if either flag is not registered on this field ...
+          flagField->flagExists(domainFlagUID)))
+      return; // ... return silently: the lists are neither cleared nor synced
+
+    auto boundaryFlag = flagField->getFlag(boundaryFlagUID);
+    auto domainFlag = flagField->getFlag(domainFlagUID);
+
+    auto inner = flagField->xyzSize();
+    inner.expand(cell_idx_t(-1)); // NOTE(review): presumably shrinks the interval by one cell per side - confirm against CellInterval::expand
+
+    indexVectorAll.clear(); // start from empty lists so repeated calls do not accumulate entries
+    indexVectorInner.clear();
+    indexVectorOuter.clear();
+
+    for (auto it = flagField->beginWithGhostLayerXYZ( // loop 1 of 19: neighbour offset (0, 0, 0), stored with index 0
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag)) // only cells carrying the domain flag can produce entries
+        continue;
+
+      if (isFlagSet(it.neighbor(0, 0, 0, 0), boundaryFlag)) { // entry is created when the cell at this offset carries the boundary flag
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 0); // coordinates of the domain cell plus this loop's index
+        auto const InitialisationAdditionalData = elementInitialiser( // user callback, evaluated at the offset (boundary) cell
+            Cell(it.x() + 0, it.y() + 0, it.z() + 0), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0]; // the three callback components become the entry's vel_0..vel_2
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element); // every entry goes into ALL ...
+        if (inner.contains(it.x(), it.y(), it.z())) // ... and into exactly one of INNER / OUTER, chosen by the domain cell's position
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ( // the remaining 18 loops repeat the pattern above, one neighbour offset per loop, with indices 1..18
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(0, 1, 0, 0), boundaryFlag)) {
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 1);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 0, it.y() + 1, it.z() + 0), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(0, -1, 0, 0), boundaryFlag)) {
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 2);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 0, it.y() - 1, it.z() + 0), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(-1, 0, 0, 0), boundaryFlag)) {
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 3);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() - 1, it.y() + 0, it.z() + 0), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(1, 0, 0, 0), boundaryFlag)) {
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 4);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 1, it.y() + 0, it.z() + 0), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(0, 0, 1, 0), boundaryFlag)) {
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 5);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 0, it.y() + 0, it.z() + 1), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(0, 0, -1, 0), boundaryFlag)) {
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 6);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 0, it.y() + 0, it.z() - 1), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(-1, 1, 0, 0), boundaryFlag)) {
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 7);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() - 1, it.y() + 1, it.z() + 0), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(1, 1, 0, 0), boundaryFlag)) {
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 8);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 1, it.y() + 1, it.z() + 0), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(-1, -1, 0, 0), boundaryFlag)) {
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 9);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() - 1, it.y() - 1, it.z() + 0), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(1, -1, 0, 0), boundaryFlag)) {
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 10);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 1, it.y() - 1, it.z() + 0), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(0, 1, 1, 0), boundaryFlag)) {
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 11);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 0, it.y() + 1, it.z() + 1), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(0, -1, 1, 0), boundaryFlag)) {
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 12);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 0, it.y() - 1, it.z() + 1), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(-1, 0, 1, 0), boundaryFlag)) {
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 13);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() - 1, it.y() + 0, it.z() + 1), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(1, 0, 1, 0), boundaryFlag)) {
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 14);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 1, it.y() + 0, it.z() + 1), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(0, 1, -1, 0), boundaryFlag)) {
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 15);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 0, it.y() + 1, it.z() - 1), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(0, -1, -1, 0), boundaryFlag)) {
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 16);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 0, it.y() - 1, it.z() - 1), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(-1, 0, -1, 0), boundaryFlag)) {
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 17);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() - 1, it.y() + 0, it.z() - 1), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(1, 0, -1, 0), boundaryFlag)) {
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 18);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 1, it.y() + 0, it.z() - 1), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    indexVectors->syncGPU(); // called once, after all three lists are complete; NOTE(review): presumably uploads the CPU lists to the device - confirm in IndexVectors::syncGPU
+  }
+
+private:
+  void run_impl(IBlock *block, IndexVectors::Type type, // Declared only; `type` selects one of the index lists (IndexVectors::Type).
+                gpuStream_t stream = nullptr); // NOTE(review): presumably the shared backend of run()/inner()/outer() - confirm in the definition
+
+  BlockDataID indexVectorID;
+  std::function<Vector3<double>(
+      const Cell &, const shared_ptr<StructuredBlockForest> &, IBlock &)>
+      elementInitialiser;
+
+public:
+  BlockDataID pdfsID;
+};
+
+} // namespace lbm
+} // namespace walberla
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_single_precision.cpp b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_single_precision.cpp
index 36c70e20e91..33dbd780d87 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_single_precision.cpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_single_precision.cpp
@@ -17,9 +17,7 @@
 //! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2, lbmpy_walberla/pystencils_walberla from waLBerla commit 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
-
-#include <cmath>
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #include "Dynamic_UBB_single_precision.h"
 #include "core/DataTypes.h"
@@ -47,7 +45,7 @@ namespace lbm {
 #pragma diag_suppress 177
 #endif
 #endif
-
+// NOLINTBEGIN(readability-non-const-parameter*)
 namespace internal_efdc97602c407e557fff6737dd9b4d80 {
 static FUNC_PREFIX void dynamic_ubb_single_precision_boundary_Dynamic_UBB_single_precision(uint8_t *RESTRICT const _data_indexVector, float *RESTRICT _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int32_t indexVectorSize) {
 
@@ -68,11 +66,12 @@ static FUNC_PREFIX void dynamic_ubb_single_precision_boundary_Dynamic_UBB_single
     const float vel1Term = _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_1 * y + _stride_pdfs_1 + _stride_pdfs_2 * z + _stride_pdfs_2 + 11 * _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_1 * y + _stride_pdfs_1 + _stride_pdfs_2 * z + _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_1 * y + _stride_pdfs_1 + _stride_pdfs_2 * z - _stride_pdfs_2 + 15 * _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0 * x - _stride_pdfs_0 + _stride_pdfs_1 * y + _stride_pdfs_1 + _stride_pdfs_2 * z + 7 * _stride_pdfs_3];
     const float vel2Term = _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_1 * y + _stride_pdfs_2 * z + _stride_pdfs_2 + 5 * _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_1 * y - _stride_pdfs_1 + _stride_pdfs_2 * z + _stride_pdfs_2 + 12 * _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0 * x - _stride_pdfs_0 + _stride_pdfs_1 * y + _stride_pdfs_2 * z + _stride_pdfs_2 + 13 * _stride_pdfs_3];
     const float rho = vel0Term + vel1Term + vel2Term + _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_1 * y + _stride_pdfs_2 * z - _stride_pdfs_2 + 6 * _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_1 * y + _stride_pdfs_2 * z] + _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_1 * y - _stride_pdfs_1 + _stride_pdfs_2 * z + 2 * _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_1 * y - _stride_pdfs_1 + _stride_pdfs_2 * z - _stride_pdfs_2 + 16 * _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0 * x - _stride_pdfs_0 + _stride_pdfs_1 * y + _stride_pdfs_2 * z + 3 * _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0 * x - _stride_pdfs_0 + _stride_pdfs_1 * y + _stride_pdfs_2 * z - _stride_pdfs_2 + 17 * _stride_pdfs_3] + _data_pdfs[_stride_pdfs_0 * x - _stride_pdfs_0 + _stride_pdfs_1 * y - _stride_pdfs_1 + _stride_pdfs_2 * z + 9 * _stride_pdfs_3];
-    _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_0 * neighbour_offset_x[dir] + _stride_pdfs_1 * y + _stride_pdfs_1 * neighbour_offset_y[dir] + _stride_pdfs_2 * z + _stride_pdfs_2 * neighbour_offset_z[dir] + _stride_pdfs_3 * f_in_inv_dir_idx[dir]] = rho * (6.0f * ((float)(neighbour_offset_x[dir])) * *((float *)(&_data_indexVector[28 * ctr_0 + 16])) + 6.0f * ((float)(neighbour_offset_y[dir])) * *((float *)(&_data_indexVector[28 * ctr_0 + 20])) + 6.0f * ((float)(neighbour_offset_z[dir])) * *((float *)(&_data_indexVector[28 * ctr_0 + 24]))) * -1.0f * weights[dir] + _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_1 * y + _stride_pdfs_2 * z + _stride_pdfs_3 * dir];
+    _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_0 * neighbour_offset_x[dir] + _stride_pdfs_1 * y + _stride_pdfs_1 * neighbour_offset_y[dir] + _stride_pdfs_2 * z + _stride_pdfs_2 * neighbour_offset_z[dir] + _stride_pdfs_3 * f_in_inv_dir_idx[dir]] = -rho * (6.0f * ((float)(neighbour_offset_x[dir])) * *((float *)(&_data_indexVector[28 * ctr_0 + 16])) + 6.0f * ((float)(neighbour_offset_y[dir])) * *((float *)(&_data_indexVector[28 * ctr_0 + 20])) + 6.0f * ((float)(neighbour_offset_z[dir])) * *((float *)(&_data_indexVector[28 * ctr_0 + 24]))) * weights[dir] + _data_pdfs[_stride_pdfs_0 * x + _stride_pdfs_1 * y + _stride_pdfs_2 * z + _stride_pdfs_3 * dir];
   }
 }
 } // namespace internal_efdc97602c407e557fff6737dd9b4d80
 
+// NOLINTEND(readability-non-const-parameter*)
 #ifdef __GNUC__
 #pragma GCC diagnostic pop
 #endif
@@ -93,7 +92,7 @@ void Dynamic_UBB_single_precision::run_impl(IBlock *block, IndexVectors::Type ty
 
   auto pdfs = block->getData<field::GhostLayerField<float, 19>>(pdfsID);
 
-  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()))
   float *RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
   const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
   const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_single_precision.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_single_precision.h
index ab7f27e111c..ba75a5b49bc 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_single_precision.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_single_precision.h
@@ -17,12 +17,13 @@
 //! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2,
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
 // lbmpy_walberla/pystencils_walberla from waLBerla commit
-// 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #pragma once
 #include "core/DataTypes.h"
+#include "core/logging/Logging.h"
 
 #include "blockforest/StructuredBlockForest.h"
 #include "core/debug/Debug.h"
@@ -31,9 +32,7 @@
 #include "field/FlagField.h"
 #include "field/GhostLayerField.h"
 
-#include <cassert>
-#include <functional>
-#include <memory>
+#include <set>
 #include <vector>
 
 #ifdef __GNUC__
@@ -44,6 +43,10 @@
 #define RESTRICT
 #endif
 
+#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
+using walberla::half;
+#endif
+
 namespace walberla {
 namespace lbm {
 
@@ -87,11 +90,11 @@ class Dynamic_UBB_single_precision {
   };
 
   Dynamic_UBB_single_precision(
-      const std::shared_ptr<StructuredBlockForest> &blocks, BlockDataID pdfsID_,
-      std::function<Vector3<float>(
-          const Cell &, const std::shared_ptr<StructuredBlockForest> &,
-          IBlock &)> &velocityCallback)
-      : elementInitaliser(velocityCallback), pdfsID(pdfsID_) {
+      const shared_ptr<StructuredBlockForest> &blocks, BlockDataID pdfsID_,
+      std::function<Vector3<float>(const Cell &,
+                                   const shared_ptr<StructuredBlockForest> &,
+                                   IBlock &)> &velocityCallback)
+      : elementInitialiser(velocityCallback), pdfsID(pdfsID_) {
     auto createIdxVector = [](IBlock *const, StructuredBlockStorage *const) {
       return new IndexVectors();
     };
@@ -120,7 +123,7 @@ class Dynamic_UBB_single_precision {
   }
 
   template <typename FlagField_T>
-  void fillFromFlagField(const std::shared_ptr<StructuredBlockForest> &blocks,
+  void fillFromFlagField(const shared_ptr<StructuredBlockForest> &blocks,
                          ConstBlockDataID flagFieldID, FlagUID boundaryFlagUID,
                          FlagUID domainFlagUID) {
     for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
@@ -129,7 +132,7 @@ class Dynamic_UBB_single_precision {
   }
 
   template <typename FlagField_T>
-  void fillFromFlagField(const std::shared_ptr<StructuredBlockForest> &blocks,
+  void fillFromFlagField(const shared_ptr<StructuredBlockForest> &blocks,
                          IBlock *block, ConstBlockDataID flagFieldID,
                          FlagUID boundaryFlagUID, FlagUID domainFlagUID) {
     auto *indexVectors = block->getData<IndexVectors>(indexVectorID);
@@ -139,8 +142,9 @@ class Dynamic_UBB_single_precision {
 
     auto *flagField = block->getData<FlagField_T>(flagFieldID);
 
-    assert(flagField->flagExists(boundaryFlagUID) and
-           flagField->flagExists(domainFlagUID));
+    if (!(flagField->flagExists(boundaryFlagUID) &&
+          flagField->flagExists(domainFlagUID)))
+      return;
 
     auto boundaryFlag = flagField->getFlag(boundaryFlagUID);
     auto domainFlag = flagField->getFlag(domainFlagUID);
@@ -160,11 +164,11 @@ class Dynamic_UBB_single_precision {
 
       if (isFlagSet(it.neighbor(0, 0, 0, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 0);
-        Vector3<float> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 0, it.y() + 0, it.z() + 0), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -181,11 +185,11 @@ class Dynamic_UBB_single_precision {
 
       if (isFlagSet(it.neighbor(0, 1, 0, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 1);
-        Vector3<float> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 0, it.y() + 1, it.z() + 0), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -202,11 +206,11 @@ class Dynamic_UBB_single_precision {
 
       if (isFlagSet(it.neighbor(0, -1, 0, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 2);
-        Vector3<float> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 0, it.y() - 1, it.z() + 0), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -223,11 +227,11 @@ class Dynamic_UBB_single_precision {
 
       if (isFlagSet(it.neighbor(-1, 0, 0, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 3);
-        Vector3<float> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() - 1, it.y() + 0, it.z() + 0), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -244,11 +248,11 @@ class Dynamic_UBB_single_precision {
 
       if (isFlagSet(it.neighbor(1, 0, 0, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 4);
-        Vector3<float> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 1, it.y() + 0, it.z() + 0), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -265,11 +269,11 @@ class Dynamic_UBB_single_precision {
 
       if (isFlagSet(it.neighbor(0, 0, 1, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 5);
-        Vector3<float> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 0, it.y() + 0, it.z() + 1), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -286,11 +290,11 @@ class Dynamic_UBB_single_precision {
 
       if (isFlagSet(it.neighbor(0, 0, -1, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 6);
-        Vector3<float> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 0, it.y() + 0, it.z() - 1), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -307,11 +311,11 @@ class Dynamic_UBB_single_precision {
 
       if (isFlagSet(it.neighbor(-1, 1, 0, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 7);
-        Vector3<float> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() - 1, it.y() + 1, it.z() + 0), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -328,11 +332,11 @@ class Dynamic_UBB_single_precision {
 
       if (isFlagSet(it.neighbor(1, 1, 0, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 8);
-        Vector3<float> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 1, it.y() + 1, it.z() + 0), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -349,11 +353,11 @@ class Dynamic_UBB_single_precision {
 
       if (isFlagSet(it.neighbor(-1, -1, 0, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 9);
-        Vector3<float> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() - 1, it.y() - 1, it.z() + 0), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -370,11 +374,11 @@ class Dynamic_UBB_single_precision {
 
       if (isFlagSet(it.neighbor(1, -1, 0, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 10);
-        Vector3<float> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 1, it.y() - 1, it.z() + 0), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -391,11 +395,11 @@ class Dynamic_UBB_single_precision {
 
       if (isFlagSet(it.neighbor(0, 1, 1, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 11);
-        Vector3<float> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 0, it.y() + 1, it.z() + 1), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -412,11 +416,11 @@ class Dynamic_UBB_single_precision {
 
       if (isFlagSet(it.neighbor(0, -1, 1, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 12);
-        Vector3<float> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 0, it.y() - 1, it.z() + 1), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -433,11 +437,11 @@ class Dynamic_UBB_single_precision {
 
       if (isFlagSet(it.neighbor(-1, 0, 1, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 13);
-        Vector3<float> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() - 1, it.y() + 0, it.z() + 1), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -454,11 +458,11 @@ class Dynamic_UBB_single_precision {
 
       if (isFlagSet(it.neighbor(1, 0, 1, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 14);
-        Vector3<float> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 1, it.y() + 0, it.z() + 1), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -475,11 +479,11 @@ class Dynamic_UBB_single_precision {
 
       if (isFlagSet(it.neighbor(0, 1, -1, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 15);
-        Vector3<float> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 0, it.y() + 1, it.z() - 1), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -496,11 +500,11 @@ class Dynamic_UBB_single_precision {
 
       if (isFlagSet(it.neighbor(0, -1, -1, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 16);
-        Vector3<float> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 0, it.y() - 1, it.z() - 1), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -517,11 +521,11 @@ class Dynamic_UBB_single_precision {
 
       if (isFlagSet(it.neighbor(-1, 0, -1, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 17);
-        Vector3<float> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() - 1, it.y() + 0, it.z() - 1), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -538,11 +542,11 @@ class Dynamic_UBB_single_precision {
 
       if (isFlagSet(it.neighbor(1, 0, -1, 0), boundaryFlag)) {
         auto element = IndexInfo(it.x(), it.y(), it.z(), 18);
-        Vector3<float> InitialisatonAdditionalData = elementInitaliser(
+        auto const InitialisationAdditionalData = elementInitialiser(
             Cell(it.x() + 1, it.y() + 0, it.z() - 1), blocks, *block);
-        element.vel_0 = InitialisatonAdditionalData[0];
-        element.vel_1 = InitialisatonAdditionalData[1];
-        element.vel_2 = InitialisatonAdditionalData[2];
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
         indexVectorAll.push_back(element);
         if (inner.contains(it.x(), it.y(), it.z()))
           indexVectorInner.push_back(element);
@@ -559,12 +563,12 @@ class Dynamic_UBB_single_precision {
 
   BlockDataID indexVectorID;
   std::function<Vector3<float>(
-      const Cell &, const std::shared_ptr<StructuredBlockForest> &, IBlock &)>
-      elementInitaliser;
+      const Cell &, const shared_ptr<StructuredBlockForest> &, IBlock &)>
+      elementInitialiser;
 
 public:
   BlockDataID pdfsID;
 };
 
 } // namespace lbm
-} // namespace walberla
\ No newline at end of file
+} // namespace walberla
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_single_precisionCUDA.cu b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_single_precisionCUDA.cu
new file mode 100644
index 00000000000..a046e6f9a46
--- /dev/null
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_single_precisionCUDA.cu
@@ -0,0 +1,187 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file Dynamic_UBB_single_precisionCUDA.cu
+//! \\author pystencils
+//======================================================================================================================
+
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
+
+#include "Dynamic_UBB_single_precisionCUDA.h"
+#include "core/DataTypes.h"
+#include "core/Macros.h"
+#include "gpu/ErrorChecking.h"
+
+#define FUNC_PREFIX __global__
+
+using namespace std;
+
+namespace walberla {
+namespace lbm {
+
+#if defined(__NVCC__)
+#define RESTRICT __restrict__
+#if defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress 177 // unused variable
+#else
+#pragma push
+#pragma diag_suppress 177 // unused variable
+#endif
+#elif defined(__clang__)
+#if defined(__CUDA__)
+#if defined(__CUDA_ARCH__)
+// clang compiling CUDA code in device mode
+#define RESTRICT __restrict__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wstrict-aliasing"
+#pragma clang diagnostic ignored "-Wunused-variable"
+#pragma clang diagnostic ignored "-Wconversion"
+#pragma clang diagnostic ignored "-Wsign-compare"
+#else
+// clang compiling CUDA code in host mode
+#define RESTRICT __restrict__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wstrict-aliasing"
+#pragma clang diagnostic ignored "-Wunused-variable"
+#pragma clang diagnostic ignored "-Wconversion"
+#pragma clang diagnostic ignored "-Wsign-compare"
+#endif
+#endif
+#elif defined(__GNUC__) or defined(__GNUG__)
+#define RESTRICT __restrict__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#pragma GCC diagnostic ignored "-Wconversion"
+#elif defined(_MSC_VER)
+#define RESTRICT __restrict
+#else
+#define RESTRICT
+#endif
+
+// NOLINTBEGIN(readability-non-const-parameter*)
+namespace internal_dynamic_ubb_single_precisioncuda_boundary_Dynamic_UBB_single_precisionCUDA { // wraps the file-local generated kernel
+static FUNC_PREFIX __launch_bounds__(256) void dynamic_ubb_single_precisioncuda_boundary_Dynamic_UBB_single_precisionCUDA(uint8_t *RESTRICT const _data_indexVector, float *RESTRICT _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int32_t indexVectorSize) { // each thread handles one index-list record and writes exactly one PDF value
+
+  const int32_t f_in_inv_dir_idx[] = {0, 2, 1, 4, 3, 6, 5, 10, 9, 8, 7, 16, 15, 18, 17, 12, 11, 14, 13}; // per direction: index of the direction with the negated neighbour offset (see the three offset tables below)
+
+  const float weights[] = {0.33333333333333333f, 0.055555555555555556f, 0.055555555555555556f, 0.055555555555555556f, 0.055555555555555556f, 0.055555555555555556f, 0.055555555555555556f, 0.027777777777777778f, 0.027777777777777778f, 0.027777777777777778f, 0.027777777777777778f, 0.027777777777777778f, 0.027777777777777778f, 0.027777777777777778f, 0.027777777777777778f, 0.027777777777777778f, 0.027777777777777778f, 0.027777777777777778f, 0.027777777777777778f}; // 19 per-direction weights: 1/3, then 6 x 1/18, then 12 x 1/36
+
+  const int32_t neighbour_offset_x[] = {0, 0, 0, -1, 1, 0, 0, -1, 1, -1, 1, 0, 0, -1, 1, 0, 0, -1, 1}; // x component of the cell offset for each of the 19 directions
+  const int32_t neighbour_offset_y[] = {0, 1, -1, 0, 0, 0, 0, 1, 1, -1, -1, 1, -1, 0, 0, 1, -1, 0, 0}; // y component
+  const int32_t neighbour_offset_z[] = {0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, -1}; // z component
+
+  if (blockDim.x * blockIdx.x + threadIdx.x < indexVectorSize) { // threads past the end of the index list do nothing
+    uint8_t *RESTRICT _data_indexVector_10 = _data_indexVector;
+    const int32_t x = *((int32_t *)(&_data_indexVector_10[28 * blockDim.x * blockIdx.x + 28 * threadIdx.x])); // records are 28 bytes apart: int32 x, y, z, dir at byte offsets 0/4/8/12, floats at 16/20/24
+    uint8_t *RESTRICT _data_indexVector_14 = _data_indexVector + 4;
+    const int32_t y = *((int32_t *)(&_data_indexVector_14[28 * blockDim.x * blockIdx.x + 28 * threadIdx.x]));
+    uint8_t *RESTRICT _data_indexVector_18 = _data_indexVector + 8;
+    const int32_t z = *((int32_t *)(&_data_indexVector_18[28 * blockDim.x * blockIdx.x + 28 * threadIdx.x]));
+    uint8_t *RESTRICT _data_indexVector_112 = _data_indexVector + 12;
+    const int32_t dir = *((int32_t *)(&_data_indexVector_112[28 * blockDim.x * blockIdx.x + 28 * threadIdx.x]));
+    float *RESTRICT _data_pdfs_10_2m1_318 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z - _stride_pdfs_2 + 18 * _stride_pdfs_3;
+    float *RESTRICT _data_pdfs_10_20_34 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + 4 * _stride_pdfs_3;
+    float *RESTRICT _data_pdfs_11_20_38 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_1 + _stride_pdfs_2 * z + 8 * _stride_pdfs_3;
+    float *RESTRICT _data_pdfs_1m1_20_310 = _data_pdfs + _stride_pdfs_1 * y - _stride_pdfs_1 + _stride_pdfs_2 * z + 10 * _stride_pdfs_3;
+    float *RESTRICT _data_pdfs_10_21_314 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + _stride_pdfs_2 + 14 * _stride_pdfs_3;
+    const float vel0Term = _data_pdfs_10_20_34[_stride_pdfs_0 * x + _stride_pdfs_0] + _data_pdfs_10_21_314[_stride_pdfs_0 * x + _stride_pdfs_0] + _data_pdfs_10_2m1_318[_stride_pdfs_0 * x + _stride_pdfs_0] + _data_pdfs_11_20_38[_stride_pdfs_0 * x + _stride_pdfs_0] + _data_pdfs_1m1_20_310[_stride_pdfs_0 * x + _stride_pdfs_0]; // partial sum over f indices 4, 14, 18, 8, 10
+    float *RESTRICT _data_pdfs_11_2m1_315 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_1 + _stride_pdfs_2 * z - _stride_pdfs_2 + 15 * _stride_pdfs_3;
+    float *RESTRICT _data_pdfs_11_20_37 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_1 + _stride_pdfs_2 * z + 7 * _stride_pdfs_3;
+    float *RESTRICT _data_pdfs_11_20_31 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_1 + _stride_pdfs_2 * z + _stride_pdfs_3;
+    float *RESTRICT _data_pdfs_11_21_311 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_1 + _stride_pdfs_2 * z + _stride_pdfs_2 + 11 * _stride_pdfs_3;
+    const float vel1Term = _data_pdfs_11_20_31[_stride_pdfs_0 * x] + _data_pdfs_11_20_37[_stride_pdfs_0 * x - _stride_pdfs_0] + _data_pdfs_11_21_311[_stride_pdfs_0 * x] + _data_pdfs_11_2m1_315[_stride_pdfs_0 * x]; // partial sum over f indices 1, 7, 11, 15
+    float *RESTRICT _data_pdfs_1m1_21_312 = _data_pdfs + _stride_pdfs_1 * y - _stride_pdfs_1 + _stride_pdfs_2 * z + _stride_pdfs_2 + 12 * _stride_pdfs_3;
+    float *RESTRICT _data_pdfs_10_21_313 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + _stride_pdfs_2 + 13 * _stride_pdfs_3;
+    float *RESTRICT _data_pdfs_10_21_35 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + _stride_pdfs_2 + 5 * _stride_pdfs_3;
+    const float vel2Term = _data_pdfs_10_21_313[_stride_pdfs_0 * x - _stride_pdfs_0] + _data_pdfs_10_21_35[_stride_pdfs_0 * x] + _data_pdfs_1m1_21_312[_stride_pdfs_0 * x]; // partial sum over f indices 13, 5, 12
+    float *RESTRICT _data_pdfs_1m1_2m1_316 = _data_pdfs + _stride_pdfs_1 * y - _stride_pdfs_1 + _stride_pdfs_2 * z - _stride_pdfs_2 + 16 * _stride_pdfs_3;
+    float *RESTRICT _data_pdfs_10_2m1_317 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z - _stride_pdfs_2 + 17 * _stride_pdfs_3;
+    float *RESTRICT _data_pdfs_10_2m1_36 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z - _stride_pdfs_2 + 6 * _stride_pdfs_3;
+    float *RESTRICT _data_pdfs_10_20_30 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z;
+    float *RESTRICT _data_pdfs_1m1_20_39 = _data_pdfs + _stride_pdfs_1 * y - _stride_pdfs_1 + _stride_pdfs_2 * z + 9 * _stride_pdfs_3;
+    float *RESTRICT _data_pdfs_1m1_20_32 = _data_pdfs + _stride_pdfs_1 * y - _stride_pdfs_1 + _stride_pdfs_2 * z + 2 * _stride_pdfs_3;
+    float *RESTRICT _data_pdfs_10_20_33 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + 3 * _stride_pdfs_3;
+    const float rho = vel0Term + vel1Term + vel2Term + _data_pdfs_10_20_30[_stride_pdfs_0 * x] + _data_pdfs_10_20_33[_stride_pdfs_0 * x - _stride_pdfs_0] + _data_pdfs_10_2m1_317[_stride_pdfs_0 * x - _stride_pdfs_0] + _data_pdfs_10_2m1_36[_stride_pdfs_0 * x] + _data_pdfs_1m1_20_32[_stride_pdfs_0 * x] + _data_pdfs_1m1_20_39[_stride_pdfs_0 * x - _stride_pdfs_0] + _data_pdfs_1m1_2m1_316[_stride_pdfs_0 * x]; // rho = sum of 19 PDF reads, one per f index 0..18, each taken at a cell-shifted position
+    float *RESTRICT _data_pdfs51aa77f0c2cd7c8d = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_1 * neighbour_offset_y[dir] + _stride_pdfs_2 * z + _stride_pdfs_2 * neighbour_offset_z[dir] + _stride_pdfs_3 * f_in_inv_dir_idx[dir]; // destination row: neighbour cell in direction dir, f index = inverse of dir
+    uint8_t *RESTRICT _data_indexVector_116 = _data_indexVector + 16;
+    uint8_t *RESTRICT _data_indexVector_120 = _data_indexVector + 20;
+    uint8_t *RESTRICT _data_indexVector_124 = _data_indexVector + 24;
+    float *RESTRICT _data_pdfs_10_20cc174ab22360a76a = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + _stride_pdfs_3 * dir; // source row: cell (x, y, z), f index = dir
+    _data_pdfs51aa77f0c2cd7c8d[_stride_pdfs_0 * x + _stride_pdfs_0 * neighbour_offset_x[dir]] = -rho * (6.0f * ((float)(neighbour_offset_x[dir])) * *((float *)(&_data_indexVector_116[28 * blockDim.x * blockIdx.x + 28 * threadIdx.x])) + 6.0f * ((float)(neighbour_offset_y[dir])) * *((float *)(&_data_indexVector_120[28 * blockDim.x * blockIdx.x + 28 * threadIdx.x])) + 6.0f * ((float)(neighbour_offset_z[dir])) * *((float *)(&_data_indexVector_124[28 * blockDim.x * blockIdx.x + 28 * threadIdx.x]))) * weights[dir] + _data_pdfs_10_20cc174ab22360a76a[_stride_pdfs_0 * x]; // dest = src - rho * weights[dir] * 6 * (offset(dir) . (vel_0, vel_1, vel_2) of this record)
+  }
+}
+} // namespace internal_dynamic_ubb_single_precisioncuda_boundary_Dynamic_UBB_single_precisionCUDA
+
+// NOLINTEND(readability-non-const-parameter*)
+
+#if defined(__NVCC__) // test NVCC first, exactly like the push chain above: with a GCC/clang host nvcc also defines __GNUC__/__clang__, which previously skipped the nv_diagnostic pop
+#if defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
+#pragma nv_diagnostic pop
+#else
+#pragma pop
+#endif
+#elif defined(__clang__)
+#if defined(__CUDA__)
+#if defined(__CUDA_ARCH__)
+// clang compiling CUDA code in device mode
+#pragma clang diagnostic pop
+#else
+// clang compiling CUDA code in host mode
+#pragma clang diagnostic pop
+#endif
+#endif
+#elif defined(__GNUC__) or defined(__GNUG__)
+#pragma GCC diagnostic pop
+#endif
+
+void Dynamic_UBB_single_precisionCUDA::run_impl(IBlock *block, IndexVectors::Type type, gpuStream_t stream) { // launches the boundary kernel on `stream` for the index list of `block` selected by `type`
+  auto *indexVectors = block->getData<IndexVectors>(indexVectorID);
+  int32_t indexVectorSize = int32_c(indexVectors->indexVector(type).size()); // size is taken from the host-side list
+  if (indexVectorSize == 0) // empty list: skip the launch; this also keeps the modulo/division below from using a zero block size
+    return;
+
+  auto pointer = indexVectors->pointerGpu(type); // device copy of the list; only populated by IndexVectors::syncGPU()
+
+  uint8_t *_data_indexVector = reinterpret_cast<uint8_t *>(pointer); // the kernel addresses the records by raw byte offsets
+
+  auto pdfs = block->getData<gpu::GPUField<float>>(pdfsID);
+
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()))
+  float *RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
+  const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
+  const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
+  const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
+  const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
+  dim3 _block(uint32_c(((256 < indexVectorSize) ? 256 : indexVectorSize)), uint32_c(1), uint32_c(1)); // threads per block = min(256, indexVectorSize), matching the kernel's __launch_bounds__(256)
+  dim3 _grid(uint32_c(((indexVectorSize) % (((256 < indexVectorSize) ? 256 : indexVectorSize)) == 0 ? (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) : ((int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize))) + 1)), uint32_c(1), uint32_c(1)); // number of blocks = indexVectorSize / threads per block, rounded up
+  internal_dynamic_ubb_single_precisioncuda_boundary_Dynamic_UBB_single_precisionCUDA::dynamic_ubb_single_precisioncuda_boundary_Dynamic_UBB_single_precisionCUDA<<<_grid, _block, 0, stream>>>(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize);
+}
+
+void Dynamic_UBB_single_precisionCUDA::run(IBlock *block, gpuStream_t stream) { // applies the boundary kernel to the ALL index list of `block`
+  run_impl(block, IndexVectors::ALL, stream);
+}
+
+void Dynamic_UBB_single_precisionCUDA::inner(IBlock *block, gpuStream_t stream) { // applies the boundary kernel to the INNER index list of `block`
+  run_impl(block, IndexVectors::INNER, stream);
+}
+
+void Dynamic_UBB_single_precisionCUDA::outer(IBlock *block, gpuStream_t stream) { // applies the boundary kernel to the OUTER index list of `block`
+  run_impl(block, IndexVectors::OUTER, stream);
+}
+
+} // namespace lbm
+} // namespace walberla
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_single_precisionCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_single_precisionCUDA.h
new file mode 100644
index 00000000000..8380a4e33b5
--- /dev/null
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_single_precisionCUDA.h
@@ -0,0 +1,602 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file Dynamic_UBB_single_precisionCUDA.h
+//! \\author pystencils
+//======================================================================================================================
+
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
+// lbmpy_walberla/pystencils_walberla from waLBerla commit
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
+
+#pragma once
+#include "core/DataTypes.h"
+#include "core/logging/Logging.h"
+
+#include "blockforest/StructuredBlockForest.h"
+#include "core/debug/Debug.h"
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+#include "field/FlagField.h"
+#include "gpu/FieldCopy.h"
+#include "gpu/GPUField.h"
+#include "gpu/GPUWrapper.h"
+
+#include <set>
+#include <vector>
+
+#ifdef __GNUC__
+#define RESTRICT __restrict__
+#elif _MSC_VER
+#define RESTRICT __restrict
+#else
+#define RESTRICT
+#endif
+
+#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
+using walberla::half;
+#endif
+
+namespace walberla {
+namespace lbm {
+
+class Dynamic_UBB_single_precisionCUDA {
+public:
+  struct IndexInfo { // One index-list record: cell coordinates, direction index and three velocity components.
+    int32_t x; // cell coordinates as stored by fillFromFlagField
+    int32_t y;
+    int32_t z;
+    int32_t dir; // direction index; the GPU kernel uses it to index its per-direction tables
+    float vel_0; // velocity components, filled from the element initialiser callback
+    float vel_1;
+    float vel_2; // NOTE: the GPU kernel reads these seven 4-byte members by raw byte offset (0..24, stride 28), so member order and types must not change
+    IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_)
+        : x(x_), y(y_), z(z_), dir(dir_), vel_0(), vel_1(), vel_2() {} // vel_* are value-initialised, i.e. start at 0.0f
+    bool operator==(const IndexInfo &o) const { // integers compare exactly; floats go through floatIsEqual (defined elsewhere, presumably a tolerance compare - confirm)
+      return x == o.x && y == o.y && z == o.z && dir == o.dir &&
+             floatIsEqual(vel_0, o.vel_0) && floatIsEqual(vel_1, o.vel_1) &&
+             floatIsEqual(vel_2, o.vel_2);
+    }
+  };
+
+  class IndexVectors { // Host-side index lists (ALL / INNER / OUTER) together with their device copies.
+  public:
+    using CpuIndexVector = std::vector<IndexInfo>;
+
+    enum Type { ALL = 0, INNER = 1, OUTER = 2, NUM_TYPES = 3 }; // enumerator values are used as indices into cpuVectors_ / gpuVectors_
+
+    IndexVectors() = default;
+    bool operator==(IndexVectors const &other) const { // compares the host lists only; device pointers are ignored
+      return other.cpuVectors_ == cpuVectors_;
+    }
+
+    ~IndexVectors() { // releases every device buffer currently held in gpuVectors_
+      for (auto &gpuVec : gpuVectors_)
+        WALBERLA_GPU_CHECK(gpuFree(gpuVec));
+    }
+    CpuIndexVector &indexVector(Type t) { return cpuVectors_[t]; } // mutable access to one host list
+    IndexInfo *pointerCpu(Type t) { return cpuVectors_[t].data(); }
+
+    IndexInfo *pointerGpu(Type t) { return gpuVectors_[t]; } // only valid after syncGPU(): gpuVectors_ is empty until then
+    void syncGPU() { // frees the old device buffers, then allocates and uploads a fresh copy of every host list
+      for (auto &gpuVec : gpuVectors_)
+        WALBERLA_GPU_CHECK(gpuFree(gpuVec));
+      gpuVectors_.resize(cpuVectors_.size());
+
+      WALBERLA_ASSERT_EQUAL(cpuVectors_.size(), NUM_TYPES);
+      for (size_t i = 0; i < cpuVectors_.size(); ++i) {
+        auto &gpuVec = gpuVectors_[i];
+        auto &cpuVec = cpuVectors_[i];
+        WALBERLA_GPU_CHECK(
+            gpuMalloc(&gpuVec, sizeof(IndexInfo) * cpuVec.size()));
+        WALBERLA_GPU_CHECK(gpuMemcpy(gpuVec, cpuVec.data(), // data() is well-defined for an empty list, whereas &cpuVec[0] is undefined behaviour
+                                     sizeof(IndexInfo) * cpuVec.size(),
+                                     gpuMemcpyHostToDevice));
+      }
+    }
+
+  private:
+    std::vector<CpuIndexVector> cpuVectors_{NUM_TYPES}; // one host list per Type
+
+    using GpuIndexVector = IndexInfo *;
+    std::vector<GpuIndexVector> gpuVectors_; // raw device pointers, owned by this object
+  };
+
+  Dynamic_UBB_single_precisionCUDA( // Stores the velocity callback and PDF field id, then registers an IndexVectors block datum on `blocks`.
+      const shared_ptr<StructuredBlockForest> &blocks, BlockDataID pdfsID_,
+      std::function<Vector3<float>(const Cell &,
+                                   const shared_ptr<StructuredBlockForest> &,
+                                   IBlock &)> &velocityCallback) // taken by non-const lvalue reference, so callers must pass a named std::function object
+      : elementInitialiser(velocityCallback), pdfsID(pdfsID_) {
+    auto createIdxVector = [](IBlock *const, StructuredBlockStorage *const) { // factory handed to addStructuredBlockData; both arguments are unused
+      return new IndexVectors(); // NOTE(review): ownership of the raw pointer presumably passes to the block storage - confirm
+    };
+    indexVectorID = blocks->addStructuredBlockData<IndexVectors>(
+        createIdxVector, "IndexField_Dynamic_UBB_single_precisionCUDA");
+  };
+
+  void run(IBlock *block, gpuStream_t stream = nullptr);
+
+  void operator()(IBlock *block, gpuStream_t stream = nullptr) { // functor form; forwards to run()
+    run(block, stream);
+  }
+
+  void inner(IBlock *block, gpuStream_t stream = nullptr);
+
+  void outer(IBlock *block, gpuStream_t stream = nullptr);
+
+  std::function<void(IBlock *)> getSweep(gpuStream_t stream = nullptr) { // returns a callable that invokes run() on `stream`
+    return [this, stream](IBlock *b) { this->run(b, stream); }; // captures `this`: the returned functor must not outlive this object
+  }
+
+  std::function<void(IBlock *)> getInnerSweep(gpuStream_t stream = nullptr) { // returns a callable that invokes inner() on `stream`
+    return [this, stream](IBlock *b) { this->inner(b, stream); }; // captures `this`: the returned functor must not outlive this object
+  }
+
+  std::function<void(IBlock *)> getOuterSweep(gpuStream_t stream = nullptr) { // returns a callable that invokes outer() on `stream`
+    return [this, stream](IBlock *b) { this->outer(b, stream); }; // captures `this`: the returned functor must not outlive this object
+  }
+
+  template <typename FlagField_T>
+  void fillFromFlagField(const shared_ptr<StructuredBlockForest> &blocks,
+                         ConstBlockDataID flagFieldID, FlagUID boundaryFlagUID,
+                         FlagUID domainFlagUID) { // convenience overload: runs the per-block overload on every block iterated by `blocks`
+    for (auto &currentBlock : *blocks) // same begin()..end() traversal as an explicit iterator loop
+      fillFromFlagField<FlagField_T>(blocks, &currentBlock, flagFieldID,
+                                     boundaryFlagUID, domainFlagUID);
+  }
+
+  template <typename FlagField_T>
+  void fillFromFlagField(const shared_ptr<StructuredBlockForest> &blocks,  // per-block overload: rebuilds ALL/INNER/OUTER index vectors from the flag field
+                         IBlock *block, ConstBlockDataID flagFieldID,
+                         FlagUID boundaryFlagUID, FlagUID domainFlagUID) {
+    auto *indexVectors = block->getData<IndexVectors>(indexVectorID);
+    auto &indexVectorAll = indexVectors->indexVector(IndexVectors::ALL);
+    auto &indexVectorInner = indexVectors->indexVector(IndexVectors::INNER);
+    auto &indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER);
+
+    auto *flagField = block->getData<FlagField_T>(flagFieldID);
+
+    if (!(flagField->flagExists(boundaryFlagUID) &&  // early exit: both flags must be registered, otherwise nothing is touched
+          flagField->flagExists(domainFlagUID)))
+      return;  // note: the existing index vectors are left as they were (no clear, no syncGPU)
+
+    auto boundaryFlag = flagField->getFlag(boundaryFlagUID);
+    auto domainFlag = flagField->getFlag(domainFlagUID);
+
+    auto inner = flagField->xyzSize();
+    inner.expand(cell_idx_t(-1));  // NOTE(review): negative expand presumably shrinks the interval by one cell per side - confirm
+
+    indexVectorAll.clear();  // vectors are rebuilt from scratch on every call
+    indexVectorInner.clear();
+    indexVectorOuter.clear();
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(  // one full pass per direction, so entries end up grouped by direction index
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));  // iterator is requested with nrOfGhostLayers() - 1 ghost layers
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))  // only cells carrying the domain flag can produce entries
+        continue;
+
+      if (isFlagSet(it.neighbor(0, 0, 0, 0), boundaryFlag)) {  // direction 0: offset (0, 0, 0), i.e. the cell itself
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 0);  // stores the domain cell coordinates plus the direction index
+        auto const InitialisationAdditionalData = elementInitialiser(  // callback is evaluated at the neighbouring (boundary) cell
+            Cell(it.x() + 0, it.y() + 0, it.z() + 0), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);  // every entry goes into ALL ...
+        if (inner.contains(it.x(), it.y(), it.z()))  // ... and additionally into exactly one of INNER / OUTER
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(0, 1, 0, 0), boundaryFlag)) {  // direction 1: offset (0, +1, 0)
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 1);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 0, it.y() + 1, it.z() + 0), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(0, -1, 0, 0), boundaryFlag)) {  // direction 2: offset (0, -1, 0)
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 2);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 0, it.y() - 1, it.z() + 0), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(-1, 0, 0, 0), boundaryFlag)) {  // direction 3: offset (-1, 0, 0)
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 3);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() - 1, it.y() + 0, it.z() + 0), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(1, 0, 0, 0), boundaryFlag)) {  // direction 4: offset (+1, 0, 0)
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 4);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 1, it.y() + 0, it.z() + 0), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(0, 0, 1, 0), boundaryFlag)) {  // direction 5: offset (0, 0, +1)
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 5);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 0, it.y() + 0, it.z() + 1), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(0, 0, -1, 0), boundaryFlag)) {  // direction 6: offset (0, 0, -1)
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 6);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 0, it.y() + 0, it.z() - 1), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(-1, 1, 0, 0), boundaryFlag)) {  // direction 7: offset (-1, +1, 0)
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 7);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() - 1, it.y() + 1, it.z() + 0), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(1, 1, 0, 0), boundaryFlag)) {  // direction 8: offset (+1, +1, 0)
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 8);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 1, it.y() + 1, it.z() + 0), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(-1, -1, 0, 0), boundaryFlag)) {  // direction 9: offset (-1, -1, 0)
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 9);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() - 1, it.y() - 1, it.z() + 0), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(1, -1, 0, 0), boundaryFlag)) {  // direction 10: offset (+1, -1, 0)
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 10);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 1, it.y() - 1, it.z() + 0), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(0, 1, 1, 0), boundaryFlag)) {  // direction 11: offset (0, +1, +1)
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 11);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 0, it.y() + 1, it.z() + 1), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(0, -1, 1, 0), boundaryFlag)) {  // direction 12: offset (0, -1, +1)
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 12);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 0, it.y() - 1, it.z() + 1), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(-1, 0, 1, 0), boundaryFlag)) {  // direction 13: offset (-1, 0, +1)
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 13);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() - 1, it.y() + 0, it.z() + 1), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(1, 0, 1, 0), boundaryFlag)) {  // direction 14: offset (+1, 0, +1)
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 14);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 1, it.y() + 0, it.z() + 1), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(0, 1, -1, 0), boundaryFlag)) {  // direction 15: offset (0, +1, -1)
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 15);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 0, it.y() + 1, it.z() - 1), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(0, -1, -1, 0), boundaryFlag)) {  // direction 16: offset (0, -1, -1)
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 16);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 0, it.y() - 1, it.z() - 1), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(-1, 0, -1, 0), boundaryFlag)) {  // direction 17: offset (-1, 0, -1)
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 17);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() - 1, it.y() + 0, it.z() - 1), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    for (auto it = flagField->beginWithGhostLayerXYZ(
+             cell_idx_c(flagField->nrOfGhostLayers() - 1));
+         it != flagField->end(); ++it) {
+      if (!isFlagSet(it, domainFlag))
+        continue;
+
+      if (isFlagSet(it.neighbor(1, 0, -1, 0), boundaryFlag)) {  // direction 18: offset (+1, 0, -1)
+        auto element = IndexInfo(it.x(), it.y(), it.z(), 18);
+        auto const InitialisationAdditionalData = elementInitialiser(
+            Cell(it.x() + 1, it.y() + 0, it.z() - 1), blocks, *block);
+        element.vel_0 = InitialisationAdditionalData[0];
+        element.vel_1 = InitialisationAdditionalData[1];
+        element.vel_2 = InitialisationAdditionalData[2];
+        indexVectorAll.push_back(element);
+        if (inner.contains(it.x(), it.y(), it.z()))
+          indexVectorInner.push_back(element);
+        else
+          indexVectorOuter.push_back(element);
+      }
+    }
+
+    indexVectors->syncGPU();  // NOTE(review): presumably uploads the rebuilt host vectors to the device - confirm in IndexVectors
+  }
+
+private:
+  void run_impl(IBlock *block, IndexVectors::Type type,  // declared only; NOTE(review): presumably the shared backend of run()/inner()/outer() - confirm
+                gpuStream_t stream = nullptr);
+
+  BlockDataID indexVectorID;  // block-data id of the IndexVectors registered in the constructor
+  std::function<Vector3<float>(  // user callback; fillFromFlagField() copies its result into vel_0..vel_2 of each entry
+      const Cell &, const shared_ptr<StructuredBlockForest> &, IBlock &)>
+      elementInitialiser;
+
+public:
+  BlockDataID pdfsID;  // PDF field id passed to the constructor; publicly accessible
+};
+
+} // namespace lbm
+} // namespace walberla
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecision.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecision.h
index c05a863edba..c73cb58c148 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecision.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecision.h
@@ -18,12 +18,11 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-// kernel generated with pystencils v1.2, lbmpy v1.2,
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
 // lbmpy_walberla/pystencils_walberla from waLBerla commit
-// 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
-/**
- * @file
+/*
  * Lattice field accessors.
  * Adapted from the waLBerla source file
  * https://i10git.cs.fau.de/walberla/walberla/-/blob/a16141524c58ab88386e2a0f8fdd7c63c5edd704/python/lbmpy_walberla/templates/LatticeModel.tmpl.h
@@ -42,19 +41,18 @@
 
 #include <array>
 #include <cassert>
+#include <iterator>
 #include <tuple>
 #include <vector>
 
 #ifdef WALBERLA_CXX_COMPILER_IS_GNU
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-variable"
-#pragma GCC diagnostic ignored "-Wunused-parameter"
 #endif
 
 #ifdef WALBERLA_CXX_COMPILER_IS_CLANG
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wunused-variable"
-#pragma clang diagnostic ignored "-Wunused-parameter"
 #endif
 
 namespace walberla {
@@ -62,8 +60,8 @@ namespace lbm {
 namespace accessor {
 
 namespace Population {
-inline std::array<double, 19u>
-get(GhostLayerField<double, uint_t{19u}> const *pdf_field, Cell const &cell) {
+inline auto get(GhostLayerField<double, uint_t{19u}> const *pdf_field,
+                Cell const &cell) {
   double const &xyz0 = pdf_field->get(cell, uint_t{0u});
   std::array<double, 19u> pop;
   pop[0u] = pdf_field->getF(&xyz0, uint_t{0u});
@@ -112,8 +110,56 @@ inline void set(GhostLayerField<double, uint_t{19u}> *pdf_field,
   pdf_field->getF(&xyz0, uint_t{18u}) = pop[18u];
 }
 
-inline void broadcast(GhostLayerField<double, uint_t{19u}> *pdf_field,
-                      std::array<double, 19u> const &pop) {
+inline void set(GhostLayerField<double, uint_t{19u}> *pdf_field,  // writes 19 populations to one cell and refreshes that cell's velocity
+                GhostLayerField<double, uint_t{3u}> *velocity_field,
+                GhostLayerField<double, uint_t{3u}> const *force_field,
+                std::array<double, 19u> const &pop, Cell const &cell) {
+  auto &xyz0 = pdf_field->get(cell, uint_t{0u});
+  const double f_0 = pdf_field->getF(&xyz0, uint_t{0u}) = pop[0u];  // chained assignment: store into the field and keep a local copy
+  const double f_1 = pdf_field->getF(&xyz0, uint_t{1u}) = pop[1u];
+  const double f_2 = pdf_field->getF(&xyz0, uint_t{2u}) = pop[2u];
+  const double f_3 = pdf_field->getF(&xyz0, uint_t{3u}) = pop[3u];
+  const double f_4 = pdf_field->getF(&xyz0, uint_t{4u}) = pop[4u];
+  const double f_5 = pdf_field->getF(&xyz0, uint_t{5u}) = pop[5u];
+  const double f_6 = pdf_field->getF(&xyz0, uint_t{6u}) = pop[6u];
+  const double f_7 = pdf_field->getF(&xyz0, uint_t{7u}) = pop[7u];
+  const double f_8 = pdf_field->getF(&xyz0, uint_t{8u}) = pop[8u];
+  const double f_9 = pdf_field->getF(&xyz0, uint_t{9u}) = pop[9u];
+  const double f_10 = pdf_field->getF(&xyz0, uint_t{10u}) = pop[10u];
+  const double f_11 = pdf_field->getF(&xyz0, uint_t{11u}) = pop[11u];
+  const double f_12 = pdf_field->getF(&xyz0, uint_t{12u}) = pop[12u];
+  const double f_13 = pdf_field->getF(&xyz0, uint_t{13u}) = pop[13u];
+  const double f_14 = pdf_field->getF(&xyz0, uint_t{14u}) = pop[14u];
+  const double f_15 = pdf_field->getF(&xyz0, uint_t{15u}) = pop[15u];
+  const double f_16 = pdf_field->getF(&xyz0, uint_t{16u}) = pop[16u];
+  const double f_17 = pdf_field->getF(&xyz0, uint_t{17u}) = pop[17u];
+  const double f_18 = pdf_field->getF(&xyz0, uint_t{18u}) = pop[18u];
+  const auto x = cell.x();  // coordinates are only needed for the force_field lookups below
+  const auto y = cell.y();
+  const auto z = cell.z();
+  const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;  // partial sums are reused by both the momentum density and rho
+  const double momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+  const double vel1Term = f_1 + f_11 + f_15 + f_7;
+  const double momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+  const double vel2Term = f_12 + f_13 + f_5;
+  const double momdensity_2 =
+      f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+  const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term +  // sum of all 19 populations
+                     vel1Term + vel2Term;
+  const double md_0 =
+      force_field->get(x, y, z, 0) * 0.50000000000000000 + momdensity_0;  // half of the force value is added to the momentum density
+  const double md_1 =
+      force_field->get(x, y, z, 1) * 0.50000000000000000 + momdensity_1;
+  const double md_2 =
+      force_field->get(x, y, z, 2) * 0.50000000000000000 + momdensity_2;
+  const auto rho_inv = double{1} / rho;  // no guard against rho == 0
+  velocity_field->get(cell, uint_t{0u}) = md_0 * rho_inv;
+  velocity_field->get(cell, uint_t{1u}) = md_1 * rho_inv;
+  velocity_field->get(cell, uint_t{2u}) = md_2 * rho_inv;
+}
+
+inline void initialize(GhostLayerField<double, uint_t{19u}> *pdf_field,
+                       std::array<double, 19u> const &pop) {
   WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(pdf_field, {
     double &xyz0 = pdf_field->get(x, y, z, uint_t{0u});
     pdf_field->getF(&xyz0, uint_t{0u}) = pop[0u];
@@ -138,9 +184,8 @@ inline void broadcast(GhostLayerField<double, uint_t{19u}> *pdf_field,
   });
 }
 
-inline std::vector<double>
-get(GhostLayerField<double, uint_t{19u}> const *pdf_field,
-    CellInterval const &ci) {
+inline auto get(GhostLayerField<double, uint_t{19u}> const *pdf_field,
+                CellInterval const &ci) {
   std::vector<double> out;
   out.reserve(ci.numCells() * uint_t(19u));
   for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
@@ -175,31 +220,86 @@ get(GhostLayerField<double, uint_t{19u}> const *pdf_field,
 inline void set(GhostLayerField<double, uint_t{19u}> *pdf_field,
                 std::vector<double> const &values, CellInterval const &ci) {
   assert(uint_c(values.size()) == ci.numCells() * uint_t(19u));
-  auto values_ptr = values.data();
+  auto pop = values.data();  // read cursor into `values`; 19 consecutive entries per cell
+  for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {  // traversal order is x, then y, then z innermost; bounds are inclusive
+    for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+      for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+        double &xyz0 = pdf_field->get(x, y, z, uint_t{0u});
+        pdf_field->getF(&xyz0, uint_t{0u}) = pop[0u];
+        pdf_field->getF(&xyz0, uint_t{1u}) = pop[1u];
+        pdf_field->getF(&xyz0, uint_t{2u}) = pop[2u];
+        pdf_field->getF(&xyz0, uint_t{3u}) = pop[3u];
+        pdf_field->getF(&xyz0, uint_t{4u}) = pop[4u];
+        pdf_field->getF(&xyz0, uint_t{5u}) = pop[5u];
+        pdf_field->getF(&xyz0, uint_t{6u}) = pop[6u];
+        pdf_field->getF(&xyz0, uint_t{7u}) = pop[7u];
+        pdf_field->getF(&xyz0, uint_t{8u}) = pop[8u];
+        pdf_field->getF(&xyz0, uint_t{9u}) = pop[9u];
+        pdf_field->getF(&xyz0, uint_t{10u}) = pop[10u];
+        pdf_field->getF(&xyz0, uint_t{11u}) = pop[11u];
+        pdf_field->getF(&xyz0, uint_t{12u}) = pop[12u];
+        pdf_field->getF(&xyz0, uint_t{13u}) = pop[13u];
+        pdf_field->getF(&xyz0, uint_t{14u}) = pop[14u];
+        pdf_field->getF(&xyz0, uint_t{15u}) = pop[15u];
+        pdf_field->getF(&xyz0, uint_t{16u}) = pop[16u];
+        pdf_field->getF(&xyz0, uint_t{17u}) = pop[17u];
+        pdf_field->getF(&xyz0, uint_t{18u}) = pop[18u];
+        std::advance(pop, 19);  // step the cursor to the next cell's 19 values
+      }
+    }
+  }
+}
+
+inline void set(GhostLayerField<double, uint_t{19u}> *pdf_field,
+                GhostLayerField<double, uint_t{3u}> *velocity_field,
+                GhostLayerField<double, uint_t{3u}> const *force_field,
+                std::vector<double> const &values, CellInterval const &ci) {
+  assert(uint_c(values.size()) == ci.numCells() * uint_t(19u));
+  auto pop = values.data();
   for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
     for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
       for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
         double &xyz0 = pdf_field->get(x, y, z, uint_t{0u});
-        pdf_field->getF(&xyz0, uint_t{0u}) = values_ptr[0u];
-        pdf_field->getF(&xyz0, uint_t{1u}) = values_ptr[1u];
-        pdf_field->getF(&xyz0, uint_t{2u}) = values_ptr[2u];
-        pdf_field->getF(&xyz0, uint_t{3u}) = values_ptr[3u];
-        pdf_field->getF(&xyz0, uint_t{4u}) = values_ptr[4u];
-        pdf_field->getF(&xyz0, uint_t{5u}) = values_ptr[5u];
-        pdf_field->getF(&xyz0, uint_t{6u}) = values_ptr[6u];
-        pdf_field->getF(&xyz0, uint_t{7u}) = values_ptr[7u];
-        pdf_field->getF(&xyz0, uint_t{8u}) = values_ptr[8u];
-        pdf_field->getF(&xyz0, uint_t{9u}) = values_ptr[9u];
-        pdf_field->getF(&xyz0, uint_t{10u}) = values_ptr[10u];
-        pdf_field->getF(&xyz0, uint_t{11u}) = values_ptr[11u];
-        pdf_field->getF(&xyz0, uint_t{12u}) = values_ptr[12u];
-        pdf_field->getF(&xyz0, uint_t{13u}) = values_ptr[13u];
-        pdf_field->getF(&xyz0, uint_t{14u}) = values_ptr[14u];
-        pdf_field->getF(&xyz0, uint_t{15u}) = values_ptr[15u];
-        pdf_field->getF(&xyz0, uint_t{16u}) = values_ptr[16u];
-        pdf_field->getF(&xyz0, uint_t{17u}) = values_ptr[17u];
-        pdf_field->getF(&xyz0, uint_t{18u}) = values_ptr[18u];
-        values_ptr += 19u;
+        const double f_0 = pdf_field->getF(&xyz0, uint_t{0u}) = pop[0u];
+        const double f_1 = pdf_field->getF(&xyz0, uint_t{1u}) = pop[1u];
+        const double f_2 = pdf_field->getF(&xyz0, uint_t{2u}) = pop[2u];
+        const double f_3 = pdf_field->getF(&xyz0, uint_t{3u}) = pop[3u];
+        const double f_4 = pdf_field->getF(&xyz0, uint_t{4u}) = pop[4u];
+        const double f_5 = pdf_field->getF(&xyz0, uint_t{5u}) = pop[5u];
+        const double f_6 = pdf_field->getF(&xyz0, uint_t{6u}) = pop[6u];
+        const double f_7 = pdf_field->getF(&xyz0, uint_t{7u}) = pop[7u];
+        const double f_8 = pdf_field->getF(&xyz0, uint_t{8u}) = pop[8u];
+        const double f_9 = pdf_field->getF(&xyz0, uint_t{9u}) = pop[9u];
+        const double f_10 = pdf_field->getF(&xyz0, uint_t{10u}) = pop[10u];
+        const double f_11 = pdf_field->getF(&xyz0, uint_t{11u}) = pop[11u];
+        const double f_12 = pdf_field->getF(&xyz0, uint_t{12u}) = pop[12u];
+        const double f_13 = pdf_field->getF(&xyz0, uint_t{13u}) = pop[13u];
+        const double f_14 = pdf_field->getF(&xyz0, uint_t{14u}) = pop[14u];
+        const double f_15 = pdf_field->getF(&xyz0, uint_t{15u}) = pop[15u];
+        const double f_16 = pdf_field->getF(&xyz0, uint_t{16u}) = pop[16u];
+        const double f_17 = pdf_field->getF(&xyz0, uint_t{17u}) = pop[17u];
+        const double f_18 = pdf_field->getF(&xyz0, uint_t{18u}) = pop[18u];
+        const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+        const double momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+        const double vel1Term = f_1 + f_11 + f_15 + f_7;
+        const double momdensity_1 =
+            -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+        const double vel2Term = f_12 + f_13 + f_5;
+        const double momdensity_2 =
+            f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+        const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 +
+                           vel0Term + vel1Term + vel2Term;
+        const double md_0 =
+            force_field->get(x, y, z, 0) * 0.50000000000000000 + momdensity_0;
+        const double md_1 =
+            force_field->get(x, y, z, 1) * 0.50000000000000000 + momdensity_1;
+        const double md_2 =
+            force_field->get(x, y, z, 2) * 0.50000000000000000 + momdensity_2;
+        const auto rho_inv = double{1} / rho;
+        velocity_field->get(x, y, z, uint_t{0u}) = md_0 * rho_inv;
+        velocity_field->get(x, y, z, uint_t{1u}) = md_1 * rho_inv;
+        velocity_field->get(x, y, z, uint_t{2u}) = md_2 * rho_inv;
+        std::advance(pop, 19);
       }
     }
   }
@@ -207,8 +307,8 @@ inline void set(GhostLayerField<double, uint_t{19u}> *pdf_field,
 } // namespace Population
 
 namespace Vector {
-inline Vector3<double> get(GhostLayerField<double, uint_t{3u}> const *vec_field,
-                           Cell const &cell) {
+inline auto get(GhostLayerField<double, uint_t{3u}> const *vec_field,
+                Cell const &cell) {
   const double &xyz0 = vec_field->get(cell, uint_t{0u});
   Vector3<double> vec;
   vec[0] = vec_field->getF(&xyz0, uint_t{0u});
@@ -233,8 +333,8 @@ inline void add(GhostLayerField<double, uint_t{3u}> *vec_field,
   vec_field->getF(&xyz0, uint_t{2u}) += vec[2u];
 }
 
-inline void broadcast(GhostLayerField<double, uint_t{3u}> *vec_field,
-                      Vector3<double> const &vec) {
+inline void initialize(GhostLayerField<double, uint_t{3u}> *vec_field,
+                       Vector3<double> const &vec) {
   WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(vec_field, {
     double &xyz0 = vec_field->get(x, y, z, uint_t{0u});
     vec_field->getF(&xyz0, uint_t{0u}) = vec[0u];
@@ -253,9 +353,8 @@ inline void add_to_all(GhostLayerField<double, uint_t{3u}> *vec_field,
   });
 }
 
-inline std::vector<double>
-get(GhostLayerField<double, uint_t{3u}> const *vec_field,
-    CellInterval const &ci) {
+inline auto get(GhostLayerField<double, uint_t{3u}> const *vec_field,
+                CellInterval const &ci) {
   std::vector<double> out;
   out.reserve(ci.numCells() * uint_t(3u));
   for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
@@ -282,7 +381,7 @@ inline void set(GhostLayerField<double, uint_t{3u}> *vec_field,
         vec_field->getF(&xyz0, uint_t{0u}) = values_ptr[0u];
         vec_field->getF(&xyz0, uint_t{1u}) = values_ptr[1u];
         vec_field->getF(&xyz0, uint_t{2u}) = values_ptr[2u];
-        values_ptr += 3u;
+        std::advance(values_ptr, 3);
       }
     }
   }
@@ -291,8 +390,8 @@ inline void set(GhostLayerField<double, uint_t{3u}> *vec_field,
 
 namespace EquilibriumDistribution {
 inline double get(stencil::Direction const direction,
-                  Vector3<double> const &u = Vector3<double>(double(0.0)),
-                  double rho = double(1.0)) {
+                  Vector3<double> const &u = Vector3<double>(double{0}),
+                  double rho = double{1}) {
 
   using namespace stencil;
   switch (direction) {
@@ -569,7 +668,7 @@ inline void set(GhostLayerField<double, uint_t{19u}> *pdf_field,
                      vel1Term + vel2Term;
 
   // calculate current velocity (before density change)
-  const double conversion = double(1) / rho;
+  const double conversion = double{1} / rho;
   Vector3<double> velocity;
   velocity[0u] = momdensity_0 * conversion;
   velocity[1u] = momdensity_1 * conversion;
@@ -657,7 +756,7 @@ inline void set(GhostLayerField<double, uint_t{19u}> *pdf_field,
                            vel0Term + vel1Term + vel2Term;
 
         // calculate current velocity (before density change)
-        const double conversion = double(1) / rho;
+        const double conversion = double{1} / rho;
         Vector3<double> velocity;
         velocity[0u] = momdensity_0 * conversion;
         velocity[1u] = momdensity_1 * conversion;
@@ -672,7 +771,108 @@ inline void set(GhostLayerField<double, uint_t{19u}> *pdf_field,
 } // namespace Density
 
 namespace Velocity {
+inline auto get(GhostLayerField<double, uint_t{19u}> const *pdf_field,
+                GhostLayerField<double, uint_t{3u}> const *force_field,
+                Cell const &cell) { // returns the velocity of one cell as Vector3<double>: (momentum density + force / 2) / rho
+  const double &xyz0 = pdf_field->get(cell, uint_t{0u}); // f = 0 entry of the cell; getF() reads the other populations relative to it
+  const double f_0 = pdf_field->getF(&xyz0, uint_t{0u});
+  const double f_1 = pdf_field->getF(&xyz0, uint_t{1u});
+  const double f_2 = pdf_field->getF(&xyz0, uint_t{2u});
+  const double f_3 = pdf_field->getF(&xyz0, uint_t{3u});
+  const double f_4 = pdf_field->getF(&xyz0, uint_t{4u});
+  const double f_5 = pdf_field->getF(&xyz0, uint_t{5u});
+  const double f_6 = pdf_field->getF(&xyz0, uint_t{6u});
+  const double f_7 = pdf_field->getF(&xyz0, uint_t{7u});
+  const double f_8 = pdf_field->getF(&xyz0, uint_t{8u});
+  const double f_9 = pdf_field->getF(&xyz0, uint_t{9u});
+  const double f_10 = pdf_field->getF(&xyz0, uint_t{10u});
+  const double f_11 = pdf_field->getF(&xyz0, uint_t{11u});
+  const double f_12 = pdf_field->getF(&xyz0, uint_t{12u});
+  const double f_13 = pdf_field->getF(&xyz0, uint_t{13u});
+  const double f_14 = pdf_field->getF(&xyz0, uint_t{14u});
+  const double f_15 = pdf_field->getF(&xyz0, uint_t{15u});
+  const double f_16 = pdf_field->getF(&xyz0, uint_t{16u});
+  const double f_17 = pdf_field->getF(&xyz0, uint_t{17u});
+  const double f_18 = pdf_field->getF(&xyz0, uint_t{18u});
+  const auto x = cell.x(); // cell coordinates, needed for the force field lookups below
+  const auto y = cell.y();
+  const auto z = cell.z();
+  const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8; // partial sums reused by the momentum density and by rho
+  const double momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+  const double vel1Term = f_1 + f_11 + f_15 + f_7;
+  const double momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+  const double vel2Term = f_12 + f_13 + f_5;
+  const double momdensity_2 =
+      f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+  const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term +
+                     vel1Term + vel2Term; // density: sum of all 19 populations
+  const double md_0 =
+      force_field->get(x, y, z, 0) * 0.50000000000000000 + momdensity_0; // half of the stored force is added to each momentum density component
+  const double md_1 =
+      force_field->get(x, y, z, 1) * 0.50000000000000000 + momdensity_1;
+  const double md_2 =
+      force_field->get(x, y, z, 2) * 0.50000000000000000 + momdensity_2;
+  const double rho_inv = double{1} / rho; // NOTE(review): rho == 0 is not guarded against
+
+  return Vector3<double>(md_0 * rho_inv, md_1 * rho_inv, md_2 * rho_inv);
+}
+
+inline auto get(GhostLayerField<double, uint_t{19u}> const *pdf_field,
+                GhostLayerField<double, uint_t{3u}> const *force_field,
+                CellInterval const &ci) { // returns std::vector<double> holding the velocity of every cell in ci, 3 values per cell
+  std::vector<double> out;
+  out.reserve(ci.numCells() * uint_t(3u)); // one allocation for all 3 * numCells() values
+  for (auto x = ci.xMin(); x <= ci.xMax(); ++x) { // bounds are inclusive; x is the outermost and z the innermost loop
+    for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+      for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+        const double &xyz0 = pdf_field->get(x, y, z, uint_t{0u}); // f = 0 entry of the cell; getF() reads the other populations relative to it
+        const double f_0 = pdf_field->getF(&xyz0, uint_t{0u});
+        const double f_1 = pdf_field->getF(&xyz0, uint_t{1u});
+        const double f_2 = pdf_field->getF(&xyz0, uint_t{2u});
+        const double f_3 = pdf_field->getF(&xyz0, uint_t{3u});
+        const double f_4 = pdf_field->getF(&xyz0, uint_t{4u});
+        const double f_5 = pdf_field->getF(&xyz0, uint_t{5u});
+        const double f_6 = pdf_field->getF(&xyz0, uint_t{6u});
+        const double f_7 = pdf_field->getF(&xyz0, uint_t{7u});
+        const double f_8 = pdf_field->getF(&xyz0, uint_t{8u});
+        const double f_9 = pdf_field->getF(&xyz0, uint_t{9u});
+        const double f_10 = pdf_field->getF(&xyz0, uint_t{10u});
+        const double f_11 = pdf_field->getF(&xyz0, uint_t{11u});
+        const double f_12 = pdf_field->getF(&xyz0, uint_t{12u});
+        const double f_13 = pdf_field->getF(&xyz0, uint_t{13u});
+        const double f_14 = pdf_field->getF(&xyz0, uint_t{14u});
+        const double f_15 = pdf_field->getF(&xyz0, uint_t{15u});
+        const double f_16 = pdf_field->getF(&xyz0, uint_t{16u});
+        const double f_17 = pdf_field->getF(&xyz0, uint_t{17u});
+        const double f_18 = pdf_field->getF(&xyz0, uint_t{18u});
+        const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8; // partial sums reused by the momentum density and by rho
+        const double momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+        const double vel1Term = f_1 + f_11 + f_15 + f_7;
+        const double momdensity_1 =
+            -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+        const double vel2Term = f_12 + f_13 + f_5;
+        const double momdensity_2 =
+            f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+        const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 +
+                           vel0Term + vel1Term + vel2Term; // density: sum of all 19 populations
+        const double md_0 =
+            force_field->get(x, y, z, 0) * 0.50000000000000000 + momdensity_0; // half of the stored force is added to each momentum density component
+        const double md_1 =
+            force_field->get(x, y, z, 1) * 0.50000000000000000 + momdensity_1;
+        const double md_2 =
+            force_field->get(x, y, z, 2) * 0.50000000000000000 + momdensity_2;
+        const double rho_inv = double{1} / rho; // NOTE(review): rho == 0 is not guarded against
+        out.emplace_back(md_0 * rho_inv); // velocity components are appended in x, y, z order
+        out.emplace_back(md_1 * rho_inv);
+        out.emplace_back(md_2 * rho_inv);
+      }
+    }
+  }
+  return out;
+}
+
 inline void set(GhostLayerField<double, uint_t{19u}> *pdf_field,
+                GhostLayerField<double, uint_t{3u}> *velocity_field,
                 GhostLayerField<double, uint_t{3u}> const *force_field,
                 Vector3<double> const &u, Cell const &cell) {
   const double &xyz0 = pdf_field->get(cell, uint_t{0u});
@@ -710,15 +910,183 @@ inline void set(GhostLayerField<double, uint_t{19u}> *pdf_field,
       -force_field->get(x, y, z, 1) * 0.50000000000000000 / rho + u[1];
   const double u_2 =
       -force_field->get(x, y, z, 2) * 0.50000000000000000 / rho + u[2];
+  velocity_field->get(x, y, z, uint_t{0u}) = u[0u];
+  velocity_field->get(x, y, z, uint_t{1u}) = u[1u];
+  velocity_field->get(x, y, z, uint_t{2u}) = u[2u];
 
   Equilibrium::set(pdf_field, Vector3<double>(u_0, u_1, u_2), rho, cell);
 }
+
+inline void set(GhostLayerField<double, uint_t{19u}> *pdf_field,
+                GhostLayerField<double, uint_t{3u}> *velocity_field,
+                GhostLayerField<double, uint_t{3u}> const *force_field,
+                std::vector<double> const &values, CellInterval const &ci) { // per cell of ci: stores the given velocity in velocity_field and hands a force-corrected velocity plus the current density to Equilibrium::set
+  assert(uint_c(values.size()) == ci.numCells() * uint_t(3u)); // values must hold 3 components per cell
+  auto u = values.data(); // cursor over values; moved forward by 3 after every cell
+  for (auto x = ci.xMin(); x <= ci.xMax(); ++x) { // bounds are inclusive; x is the outermost and z the innermost loop
+    for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+      for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+        double &pdf_xyz0 = pdf_field->get(x, y, z, uint_t{0u}); // f = 0 entries; getF() addresses the other components relative to them
+        double &vel_xyz0 = velocity_field->get(x, y, z, uint_t{0u});
+        const double f_0 = pdf_field->getF(&pdf_xyz0, uint_t{0u});
+        const double f_1 = pdf_field->getF(&pdf_xyz0, uint_t{1u});
+        const double f_2 = pdf_field->getF(&pdf_xyz0, uint_t{2u});
+        const double f_3 = pdf_field->getF(&pdf_xyz0, uint_t{3u});
+        const double f_4 = pdf_field->getF(&pdf_xyz0, uint_t{4u});
+        const double f_5 = pdf_field->getF(&pdf_xyz0, uint_t{5u});
+        const double f_6 = pdf_field->getF(&pdf_xyz0, uint_t{6u});
+        const double f_7 = pdf_field->getF(&pdf_xyz0, uint_t{7u});
+        const double f_8 = pdf_field->getF(&pdf_xyz0, uint_t{8u});
+        const double f_9 = pdf_field->getF(&pdf_xyz0, uint_t{9u});
+        const double f_10 = pdf_field->getF(&pdf_xyz0, uint_t{10u});
+        const double f_11 = pdf_field->getF(&pdf_xyz0, uint_t{11u});
+        const double f_12 = pdf_field->getF(&pdf_xyz0, uint_t{12u});
+        const double f_13 = pdf_field->getF(&pdf_xyz0, uint_t{13u});
+        const double f_14 = pdf_field->getF(&pdf_xyz0, uint_t{14u});
+        const double f_15 = pdf_field->getF(&pdf_xyz0, uint_t{15u});
+        const double f_16 = pdf_field->getF(&pdf_xyz0, uint_t{16u});
+        const double f_17 = pdf_field->getF(&pdf_xyz0, uint_t{17u});
+        const double f_18 = pdf_field->getF(&pdf_xyz0, uint_t{18u});
+        const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8; // partial sums used only for rho here
+        const double vel1Term = f_1 + f_11 + f_15 + f_7;
+        const double vel2Term = f_12 + f_13 + f_5;
+        const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 +
+                           vel0Term + vel1Term + vel2Term; // density of the populations currently stored in the cell
+
+        const double u_0 =
+            -force_field->get(x, y, z, 0) * 0.50000000000000000 / rho + u[0]; // requested velocity minus half the force divided by rho; NOTE(review): rho == 0 is not guarded against
+        const double u_1 =
+            -force_field->get(x, y, z, 1) * 0.50000000000000000 / rho + u[1];
+        const double u_2 =
+            -force_field->get(x, y, z, 2) * 0.50000000000000000 / rho + u[2];
+        velocity_field->getF(&vel_xyz0, uint_t{0u}) = u[0u]; // the velocity field receives the requested velocity unchanged
+        velocity_field->getF(&vel_xyz0, uint_t{1u}) = u[1u];
+        velocity_field->getF(&vel_xyz0, uint_t{2u}) = u[2u];
+
+        std::advance(u, 3); // u_0..u_2 were computed above, so moving the cursor here is safe
+
+        Equilibrium::set(pdf_field, Vector3<double>(u_0, u_1, u_2), rho,
+                         Cell{x, y, z}); // called with the corrected velocity and the density computed above
+      }
+    }
+  }
+}
 } // namespace Velocity
 
+namespace Force {
+inline void set(GhostLayerField<double, uint_t{19u}> const *pdf_field,
+                GhostLayerField<double, uint_t{3u}> *velocity_field,
+                GhostLayerField<double, uint_t{3u}> *force_field,
+                Vector3<double> const &force, Cell const &cell) { // stores force in force_field at cell and recomputes the cell's velocity_field entry from the (unmodified) populations and the new force
+  double const &pdf_xyz0 = pdf_field->get(cell, uint_t{0u}); // f = 0 entries; getF() addresses the other components relative to them
+  double &vel_xyz0 = velocity_field->get(cell, uint_t{0u});
+  double &laf_xyz0 = force_field->get(cell, uint_t{0u});
+  const double f_0 = pdf_field->getF(&pdf_xyz0, uint_t{0u});
+  const double f_1 = pdf_field->getF(&pdf_xyz0, uint_t{1u});
+  const double f_2 = pdf_field->getF(&pdf_xyz0, uint_t{2u});
+  const double f_3 = pdf_field->getF(&pdf_xyz0, uint_t{3u});
+  const double f_4 = pdf_field->getF(&pdf_xyz0, uint_t{4u});
+  const double f_5 = pdf_field->getF(&pdf_xyz0, uint_t{5u});
+  const double f_6 = pdf_field->getF(&pdf_xyz0, uint_t{6u});
+  const double f_7 = pdf_field->getF(&pdf_xyz0, uint_t{7u});
+  const double f_8 = pdf_field->getF(&pdf_xyz0, uint_t{8u});
+  const double f_9 = pdf_field->getF(&pdf_xyz0, uint_t{9u});
+  const double f_10 = pdf_field->getF(&pdf_xyz0, uint_t{10u});
+  const double f_11 = pdf_field->getF(&pdf_xyz0, uint_t{11u});
+  const double f_12 = pdf_field->getF(&pdf_xyz0, uint_t{12u});
+  const double f_13 = pdf_field->getF(&pdf_xyz0, uint_t{13u});
+  const double f_14 = pdf_field->getF(&pdf_xyz0, uint_t{14u});
+  const double f_15 = pdf_field->getF(&pdf_xyz0, uint_t{15u});
+  const double f_16 = pdf_field->getF(&pdf_xyz0, uint_t{16u});
+  const double f_17 = pdf_field->getF(&pdf_xyz0, uint_t{17u});
+  const double f_18 = pdf_field->getF(&pdf_xyz0, uint_t{18u});
+  const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8; // partial sums reused by the momentum density and by rho
+  const double momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+  const double vel1Term = f_1 + f_11 + f_15 + f_7;
+  const double momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+  const double vel2Term = f_12 + f_13 + f_5;
+  const double momdensity_2 =
+      f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+  const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term +
+                     vel1Term + vel2Term; // density: sum of all 19 populations
+  const double md_0 = force[0u] * 0.50000000000000000 + momdensity_0; // uses the new force argument, not the value previously stored in force_field
+  const double md_1 = force[1u] * 0.50000000000000000 + momdensity_1;
+  const double md_2 = force[2u] * 0.50000000000000000 + momdensity_2;
+  auto const rho_inv = double{1} / rho; // NOTE(review): rho == 0 is not guarded against
+
+  force_field->getF(&laf_xyz0, uint_t{0u}) = force[0u]; // store the new force
+  force_field->getF(&laf_xyz0, uint_t{1u}) = force[1u];
+  force_field->getF(&laf_xyz0, uint_t{2u}) = force[2u];
+
+  velocity_field->getF(&vel_xyz0, uint_t{0u}) = md_0 * rho_inv; // store (momentum density + force / 2) / rho
+  velocity_field->getF(&vel_xyz0, uint_t{1u}) = md_1 * rho_inv;
+  velocity_field->getF(&vel_xyz0, uint_t{2u}) = md_2 * rho_inv;
+}
+
+inline void set(GhostLayerField<double, uint_t{19u}> const *pdf_field,
+                GhostLayerField<double, uint_t{3u}> *velocity_field,
+                GhostLayerField<double, uint_t{3u}> *force_field,
+                std::vector<double> const &values, CellInterval const &ci) { // per cell of ci: stores 3 force components from values and recomputes the cell velocity from the (unmodified) populations and the new force
+  assert(uint_c(values.size()) == ci.numCells() * uint_t(3u)); // values must hold 3 components per cell
+  auto force = values.data(); // cursor over values; moved forward by 3 after every cell
+  for (auto x = ci.xMin(); x <= ci.xMax(); ++x) { // bounds are inclusive; x is the outermost and z the innermost loop
+    for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+      for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+        double const &pdf_xyz0 = pdf_field->get(x, y, z, uint_t{0u}); // f = 0 entries; getF() addresses the other components relative to them
+        double &vel_xyz0 = velocity_field->get(x, y, z, uint_t{0u});
+        double &laf_xyz0 = force_field->get(x, y, z, uint_t{0u});
+        const double f_0 = pdf_field->getF(&pdf_xyz0, uint_t{0u});
+        const double f_1 = pdf_field->getF(&pdf_xyz0, uint_t{1u});
+        const double f_2 = pdf_field->getF(&pdf_xyz0, uint_t{2u});
+        const double f_3 = pdf_field->getF(&pdf_xyz0, uint_t{3u});
+        const double f_4 = pdf_field->getF(&pdf_xyz0, uint_t{4u});
+        const double f_5 = pdf_field->getF(&pdf_xyz0, uint_t{5u});
+        const double f_6 = pdf_field->getF(&pdf_xyz0, uint_t{6u});
+        const double f_7 = pdf_field->getF(&pdf_xyz0, uint_t{7u});
+        const double f_8 = pdf_field->getF(&pdf_xyz0, uint_t{8u});
+        const double f_9 = pdf_field->getF(&pdf_xyz0, uint_t{9u});
+        const double f_10 = pdf_field->getF(&pdf_xyz0, uint_t{10u});
+        const double f_11 = pdf_field->getF(&pdf_xyz0, uint_t{11u});
+        const double f_12 = pdf_field->getF(&pdf_xyz0, uint_t{12u});
+        const double f_13 = pdf_field->getF(&pdf_xyz0, uint_t{13u});
+        const double f_14 = pdf_field->getF(&pdf_xyz0, uint_t{14u});
+        const double f_15 = pdf_field->getF(&pdf_xyz0, uint_t{15u});
+        const double f_16 = pdf_field->getF(&pdf_xyz0, uint_t{16u});
+        const double f_17 = pdf_field->getF(&pdf_xyz0, uint_t{17u});
+        const double f_18 = pdf_field->getF(&pdf_xyz0, uint_t{18u});
+        const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8; // partial sums reused by the momentum density and by rho
+        const double momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+        const double vel1Term = f_1 + f_11 + f_15 + f_7;
+        const double momdensity_1 =
+            -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+        const double vel2Term = f_12 + f_13 + f_5;
+        const double momdensity_2 =
+            f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+        const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 +
+                           vel0Term + vel1Term + vel2Term; // density: sum of all 19 populations
+        const double md_0 = force[0u] * 0.50000000000000000 + momdensity_0; // uses the new force from values, not the value previously stored in force_field
+        const double md_1 = force[1u] * 0.50000000000000000 + momdensity_1;
+        const double md_2 = force[2u] * 0.50000000000000000 + momdensity_2;
+        auto const rho_inv = double{1} / rho; // NOTE(review): rho == 0 is not guarded against
+
+        force_field->getF(&laf_xyz0, uint_t{0u}) = force[0u]; // store the new force
+        force_field->getF(&laf_xyz0, uint_t{1u}) = force[1u];
+        force_field->getF(&laf_xyz0, uint_t{2u}) = force[2u];
+
+        velocity_field->getF(&vel_xyz0, uint_t{0u}) = md_0 * rho_inv; // store (momentum density + force / 2) / rho
+        velocity_field->getF(&vel_xyz0, uint_t{1u}) = md_1 * rho_inv;
+        velocity_field->getF(&vel_xyz0, uint_t{2u}) = md_2 * rho_inv;
+
+        std::advance(force, 3); // continue with the next cell's 3 components
+      }
+    }
+  }
+}
+} // namespace Force
+
 namespace MomentumDensity {
-inline Vector3<double>
-reduce(GhostLayerField<double, uint_t{19u}> const *pdf_field,
-       GhostLayerField<double, uint_t{3u}> const *force_field) {
+inline auto reduce(GhostLayerField<double, uint_t{19u}> const *pdf_field,
+                   GhostLayerField<double, uint_t{3u}> const *force_field) {
   Vector3<double> momentumDensity(double{0});
   WALBERLA_FOR_ALL_CELLS_XYZ(pdf_field, {
     const double &xyz0 = pdf_field->get(x, y, z, uint_t{0u});
@@ -767,8 +1135,8 @@ reduce(GhostLayerField<double, uint_t{19u}> const *pdf_field,
 } // namespace MomentumDensity
 
 namespace PressureTensor {
-inline Matrix3<double>
-get(GhostLayerField<double, uint_t{19u}> const *pdf_field, Cell const &cell) {
+inline auto get(GhostLayerField<double, uint_t{19u}> const *pdf_field,
+                Cell const &cell) {
   const double &xyz0 = pdf_field->get(cell, uint_t{0u});
   const double f_0 = pdf_field->getF(&xyz0, uint_t{0u});
   const double f_1 = pdf_field->getF(&xyz0, uint_t{1u});
@@ -817,6 +1185,63 @@ get(GhostLayerField<double, uint_t{19u}> const *pdf_field, Cell const &cell) {
 
   return pressureTensor;
 }
+
+inline auto get(GhostLayerField<double, uint_t{19u}> const *pdf_field,
+                CellInterval const &ci) { // returns std::vector<double> with 9 values (p_0..p_8) per cell of ci, each a signed sum of populations
+  std::vector<double> out;
+  out.reserve(ci.numCells() * uint_t(9u)); // one allocation for all 9 * numCells() values
+  for (auto x = ci.xMin(); x <= ci.xMax(); ++x) { // bounds are inclusive; x is the outermost and z the innermost loop
+    for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+      for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+        const double &xyz0 = pdf_field->get(x, y, z, uint_t{0u}); // f = 0 entry of the cell; getF() reads the other populations relative to it
+        const double f_0 = pdf_field->getF(&xyz0, uint_t{0u}); // NOTE(review): f_0 is read but not used by any p_i below
+        const double f_1 = pdf_field->getF(&xyz0, uint_t{1u});
+        const double f_2 = pdf_field->getF(&xyz0, uint_t{2u});
+        const double f_3 = pdf_field->getF(&xyz0, uint_t{3u});
+        const double f_4 = pdf_field->getF(&xyz0, uint_t{4u});
+        const double f_5 = pdf_field->getF(&xyz0, uint_t{5u});
+        const double f_6 = pdf_field->getF(&xyz0, uint_t{6u});
+        const double f_7 = pdf_field->getF(&xyz0, uint_t{7u});
+        const double f_8 = pdf_field->getF(&xyz0, uint_t{8u});
+        const double f_9 = pdf_field->getF(&xyz0, uint_t{9u});
+        const double f_10 = pdf_field->getF(&xyz0, uint_t{10u});
+        const double f_11 = pdf_field->getF(&xyz0, uint_t{11u});
+        const double f_12 = pdf_field->getF(&xyz0, uint_t{12u});
+        const double f_13 = pdf_field->getF(&xyz0, uint_t{13u});
+        const double f_14 = pdf_field->getF(&xyz0, uint_t{14u});
+        const double f_15 = pdf_field->getF(&xyz0, uint_t{15u});
+        const double f_16 = pdf_field->getF(&xyz0, uint_t{16u});
+        const double f_17 = pdf_field->getF(&xyz0, uint_t{17u});
+        const double f_18 = pdf_field->getF(&xyz0, uint_t{18u});
+        const double p_0 =
+            f_10 + f_13 + f_14 + f_17 + f_18 + f_3 + f_4 + f_7 + f_8 + f_9;
+        const double p_1 = -f_10 - f_7 + f_8 + f_9; // same expression as p_3
+        const double p_2 = -f_13 + f_14 + f_17 - f_18; // same expression as p_6
+        const double p_3 = -f_10 - f_7 + f_8 + f_9;
+        const double p_4 =
+            f_1 + f_10 + f_11 + f_12 + f_15 + f_16 + f_2 + f_7 + f_8 + f_9;
+        const double p_5 = f_11 - f_12 - f_15 + f_16; // same expression as p_7
+        const double p_6 = -f_13 + f_14 + f_17 - f_18;
+        const double p_7 = f_11 - f_12 - f_15 + f_16;
+        const double p_8 =
+            f_11 + f_12 + f_13 + f_14 + f_15 + f_16 + f_17 + f_18 + f_5 + f_6;
+
+        out.emplace_back(p_0); // values are appended in the order p_0..p_8; presumably a row-major 3x3 tensor - TODO confirm
+        out.emplace_back(p_1);
+        out.emplace_back(p_2);
+
+        out.emplace_back(p_3);
+        out.emplace_back(p_4);
+        out.emplace_back(p_5);
+
+        out.emplace_back(p_6);
+        out.emplace_back(p_7);
+        out.emplace_back(p_8);
+      }
+    }
+  }
+  return out;
+}
 } // namespace PressureTensor
 
 } // namespace accessor
@@ -829,4 +1254,4 @@ get(GhostLayerField<double, uint_t{19u}> const *pdf_field, Cell const &cell) {
 
 #ifdef WALBERLA_CXX_COMPILER_IS_CLANG
 #pragma clang diagnostic pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecisionCUDA.cu b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecisionCUDA.cu
new file mode 100644
index 00000000000..62dda871008
--- /dev/null
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecisionCUDA.cu
@@ -0,0 +1,1198 @@
+/*
+ * Copyright (C) 2023-2024 The ESPResSo project
+ * Copyright (C) 2020 The waLBerla project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
+
+/**
+ * @file
+ * Lattice field accessors.
+ * Adapted from the waLBerla source file
+ * https://i10git.cs.fau.de/walberla/walberla/-/blob/a16141524c58ab88386e2a0f8fdd7c63c5edd704/python/lbmpy_walberla/templates/LatticeModel.tmpl.h
+ */
+
+#include <core/DataTypes.h>
+#include <core/cell/Cell.h>
+#include <core/cell/CellInterval.h>
+#include <core/math/Matrix3.h>
+#include <core/math/Vector3.h>
+
+#include <field/iterators/IteratorMacros.h>
+
+#include <gpu/FieldAccessor.h>
+#include <gpu/FieldIndexing.h>
+#include <gpu/GPUField.h>
+#include <gpu/Kernel.h>
+
+#include <thrust/device_ptr.h>
+#include <thrust/device_vector.h>
+
+#include <array>
+#include <vector>
+
+#if defined(__NVCC__) // define RESTRICT per compiler and silence unused-variable diagnostics
+#define RESTRICT __restrict__
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress 177 // unused variable
+#elif defined(__clang__)
+#if defined(__CUDA__) // NOTE(review): this #if has no #else, so RESTRICT stays undefined when clang is not compiling CUDA code
+#if defined(__CUDA_ARCH__)
+// clang compiling CUDA code in device mode
+#define RESTRICT __restrict__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wunused-variable"
+#else
+// clang compiling CUDA code in host mode
+#define RESTRICT __restrict__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wunused-variable"
+#endif
+#endif
+#elif defined(__GNUC__) or defined(__GNUG__)
+#define RESTRICT __restrict__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#elif defined(_MSC_VER)
+#define RESTRICT __restrict
+#else
+#define RESTRICT
+#endif // NOTE(review): the diagnostic pushes above need matching pops later in the file - not visible here
+
+/**
+ * @brief Get linear index of flattened data with original layout @c fzyx.
+ * The x index comes from @p threadIdx and the y/z/f indices from @p blockIdx;
+ * extents come from @p gridDim and @p fOffset. @p blockDim is not used.
+ */
+static __forceinline__ __device__ uint getLinearIndex(uint3 blockIdx, uint3 threadIdx, uint3 gridDim, uint3 blockDim, uint fOffset) {
+  // strides of the z, y and x indices (the f index has unit stride)
+  auto const strideZ = fOffset;
+  auto const strideY = strideZ * gridDim.y;
+  auto const strideX = strideY * gridDim.x;
+  // unsigned arithmetic: regrouping the products cannot change the result
+  return blockIdx.z + blockIdx.y * strideZ + blockIdx.x * strideY +
+         threadIdx.x * strideX;
+}
+
+namespace walberla {
+namespace lbm {
+namespace accessor {
+
+namespace Population {
+// LCOV_EXCL_START
+__global__ void kernel_get( // copies the 19 populations at the accessor's position into pop
+    gpu::FieldAccessor<double> pdf,
+    double *RESTRICT pop) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 19u); // start of this position's 19 output values
+  pdf.set(blockIdx, threadIdx); // position the accessor from the block and thread indices
+  pop += offset;
+  if (pdf.isValidPosition()) { // nothing is written for positions the accessor reports as invalid
+    pop[0u] = pdf.get(0u);
+    pop[1u] = pdf.get(1u);
+    pop[2u] = pdf.get(2u);
+    pop[3u] = pdf.get(3u);
+    pop[4u] = pdf.get(4u);
+    pop[5u] = pdf.get(5u);
+    pop[6u] = pdf.get(6u);
+    pop[7u] = pdf.get(7u);
+    pop[8u] = pdf.get(8u);
+    pop[9u] = pdf.get(9u);
+    pop[10u] = pdf.get(10u);
+    pop[11u] = pdf.get(11u);
+    pop[12u] = pdf.get(12u);
+    pop[13u] = pdf.get(13u);
+    pop[14u] = pdf.get(14u);
+    pop[15u] = pdf.get(15u);
+    pop[16u] = pdf.get(16u);
+    pop[17u] = pdf.get(17u);
+    pop[18u] = pdf.get(18u);
+  }
+}
+
+__global__ void kernel_set( // writes 19 values from pop into the populations at the accessor's position
+    gpu::FieldAccessor<double> pdf,
+    double const *RESTRICT pop) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 19u); // start of this position's 19 input values
+  pdf.set(blockIdx, threadIdx); // position the accessor from the block and thread indices
+  pop += offset;
+  if (pdf.isValidPosition()) { // nothing is written for positions the accessor reports as invalid
+    pdf.get(0u) = pop[0u];
+    pdf.get(1u) = pop[1u];
+    pdf.get(2u) = pop[2u];
+    pdf.get(3u) = pop[3u];
+    pdf.get(4u) = pop[4u];
+    pdf.get(5u) = pop[5u];
+    pdf.get(6u) = pop[6u];
+    pdf.get(7u) = pop[7u];
+    pdf.get(8u) = pop[8u];
+    pdf.get(9u) = pop[9u];
+    pdf.get(10u) = pop[10u];
+    pdf.get(11u) = pop[11u];
+    pdf.get(12u) = pop[12u];
+    pdf.get(13u) = pop[13u];
+    pdf.get(14u) = pop[14u];
+    pdf.get(15u) = pop[15u];
+    pdf.get(16u) = pop[16u];
+    pdf.get(17u) = pop[17u];
+    pdf.get(18u) = pop[18u];
+  }
+}
+
+__global__ void kernel_broadcast( // writes the same 19 values pop[0..18] into the populations at every valid position
+    gpu::FieldAccessor<double> pdf,
+    double const *RESTRICT pop) {
+  pdf.set(blockIdx, threadIdx); // position the accessor; pop is not offset, so every position reads the same values
+  if (pdf.isValidPosition()) { // nothing is written for positions the accessor reports as invalid
+    pdf.get(0u) = pop[0u];
+    pdf.get(1u) = pop[1u];
+    pdf.get(2u) = pop[2u];
+    pdf.get(3u) = pop[3u];
+    pdf.get(4u) = pop[4u];
+    pdf.get(5u) = pop[5u];
+    pdf.get(6u) = pop[6u];
+    pdf.get(7u) = pop[7u];
+    pdf.get(8u) = pop[8u];
+    pdf.get(9u) = pop[9u];
+    pdf.get(10u) = pop[10u];
+    pdf.get(11u) = pop[11u];
+    pdf.get(12u) = pop[12u];
+    pdf.get(13u) = pop[13u];
+    pdf.get(14u) = pop[14u];
+    pdf.get(15u) = pop[15u];
+    pdf.get(16u) = pop[16u];
+    pdf.get(17u) = pop[17u];
+    pdf.get(18u) = pop[18u];
+  }
+}
+
+__global__ void kernel_set_vel( // writes 19 values from pop into pdf and recomputes velocity from them and the stored force
+    gpu::FieldAccessor<double> pdf,
+    gpu::FieldAccessor<double> velocity,
+    gpu::FieldAccessor<double> force,
+    double const *RESTRICT pop) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 19u); // start of this position's 19 input values
+  pdf.set(blockIdx, threadIdx); // all three accessors are positioned from the same block and thread indices
+  velocity.set(blockIdx, threadIdx);
+  force.set(blockIdx, threadIdx);
+  pop += offset;
+  if (pdf.isValidPosition()) { // only the pdf accessor is checked for validity
+    const double f_0 = pdf.get(0u) = pop[0u]; // store the new population and keep a local copy for the moments below
+    const double f_1 = pdf.get(1u) = pop[1u];
+    const double f_2 = pdf.get(2u) = pop[2u];
+    const double f_3 = pdf.get(3u) = pop[3u];
+    const double f_4 = pdf.get(4u) = pop[4u];
+    const double f_5 = pdf.get(5u) = pop[5u];
+    const double f_6 = pdf.get(6u) = pop[6u];
+    const double f_7 = pdf.get(7u) = pop[7u];
+    const double f_8 = pdf.get(8u) = pop[8u];
+    const double f_9 = pdf.get(9u) = pop[9u];
+    const double f_10 = pdf.get(10u) = pop[10u];
+    const double f_11 = pdf.get(11u) = pop[11u];
+    const double f_12 = pdf.get(12u) = pop[12u];
+    const double f_13 = pdf.get(13u) = pop[13u];
+    const double f_14 = pdf.get(14u) = pop[14u];
+    const double f_15 = pdf.get(15u) = pop[15u];
+    const double f_16 = pdf.get(16u) = pop[16u];
+    const double f_17 = pdf.get(17u) = pop[17u];
+    const double f_18 = pdf.get(18u) = pop[18u];
+    const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8; // partial sums reused by the momentum density and by rho
+    const double momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+    const double vel1Term = f_1 + f_11 + f_15 + f_7;
+    const double momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+    const double vel2Term = f_12 + f_13 + f_5;
+    const double momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+    const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term; // density: sum of all 19 populations
+    const double md_0 = force.get(0) * 0.50000000000000000 + momdensity_0; // half of the stored force is added to each momentum density component
+    const double md_1 = force.get(1) * 0.50000000000000000 + momdensity_1;
+    const double md_2 = force.get(2) * 0.50000000000000000 + momdensity_2;
+    const double rho_inv = double{1} / rho; // NOTE(review): rho == 0 is not guarded against
+    velocity.get(0u) = md_0 * rho_inv; // store (momentum density + force / 2) / rho
+    velocity.get(1u) = md_1 * rho_inv;
+    velocity.get(2u) = md_2 * rho_inv;
+  }
+}
+// LCOV_EXCL_STOP
+
+/** @brief Copy the 19 populations of a single cell from the GPU field to the host. */
+std::array<double, 19u> get(gpu::GPUField<double> const *pdf_field,
+                            Cell const &cell) {
+  // device-side staging buffer that the kernel fills
+  thrust::device_vector<double> staging(19u);
+  auto getter = gpu::make_kernel(kernel_get);
+  getter.addFieldIndexingParam(
+      gpu::FieldIndexing<double>::interval(*pdf_field, CellInterval(cell, cell)));
+  getter.addParam(thrust::raw_pointer_cast(staging.data()));
+  getter();
+  std::array<double, 19u> result;
+  thrust::copy(staging.begin(), staging.end(), result.data());
+  return result;
+}
+
+/** @brief Upload 19 host-side populations and write them into a single cell. */
+void set(gpu::GPUField<double> *pdf_field, std::array<double, 19u> const &pop,
+         Cell const &cell) {
+  CellInterval const single_cell(cell, cell);
+  // host -> device copy of the populations consumed by the kernel
+  thrust::device_vector<double> staging(pop.begin(), pop.end());
+  double const *const staging_ptr = thrust::raw_pointer_cast(staging.data());
+  auto setter = gpu::make_kernel(kernel_set);
+  setter.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, single_cell));
+  setter.addParam(staging_ptr);
+  setter();
+}
+
+/** @brief Write 19 populations into a single cell and refresh its velocity. */
+void set(gpu::GPUField<double> *pdf_field, gpu::GPUField<double> *velocity_field,
+         gpu::GPUField<double> const *force_field,
+         std::array<double, 19u> const &pop, Cell const &cell) {
+  using Indexing = gpu::FieldIndexing<double>;
+  CellInterval const single_cell(cell, cell);
+  thrust::device_vector<double> staging(pop.begin(), pop.end());
+  double const *const staging_ptr = thrust::raw_pointer_cast(staging.data());
+  auto setter = gpu::make_kernel(kernel_set_vel);
+  // parameters are added in kernel-signature order: pdf, velocity, force, pop
+  setter.addFieldIndexingParam(Indexing::interval(*pdf_field, single_cell));
+  setter.addFieldIndexingParam(Indexing::interval(*velocity_field, single_cell));
+  setter.addFieldIndexingParam(Indexing::interval(*force_field, single_cell));
+  setter.addParam(staging_ptr);
+  setter();
+}
+
+/** @brief Assign the same 19 populations to every cell, ghost layers included. */
+void initialize(gpu::GPUField<double> *pdf_field,
+                std::array<double, 19u> const &pop) {
+  thrust::device_vector<double> staging(pop.begin(), pop.end());
+  double const *const staging_ptr = thrust::raw_pointer_cast(staging.data());
+  CellInterval const whole_field = pdf_field->xyzSizeWithGhostLayer();
+  auto filler = gpu::make_kernel(kernel_broadcast);
+  filler.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, whole_field));
+  filler.addParam(staging_ptr);
+  filler();
+}
+
+/** @brief Copy the populations of all cells in @p ci to the host, 19 values per cell. */
+std::vector<double> get(gpu::GPUField<double> const *pdf_field,
+                        CellInterval const &ci) {
+  auto const n_values = ci.numCells() * 19u;
+  thrust::device_vector<double> staging(n_values);
+  auto getter = gpu::make_kernel(kernel_get);
+  getter.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
+  getter.addParam(thrust::raw_pointer_cast(staging.data()));
+  getter();
+  std::vector<double> result(n_values);
+  thrust::copy(staging.begin(), staging.end(), result.data());
+  return result;
+}
+
+/** @brief Upload @p values and write them into the cells of @p ci, 19 values per cell. */
+void set(gpu::GPUField<double> *pdf_field, std::vector<double> const &values,
+         CellInterval const &ci) {
+  // NOTE(review): values.size() is not checked against ci.numCells() * 19u here
+  thrust::device_vector<double> staging(values.begin(), values.end());
+  double const *const staging_ptr = thrust::raw_pointer_cast(staging.data());
+  auto setter = gpu::make_kernel(kernel_set);
+  setter.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
+  setter.addParam(staging_ptr);
+  setter();
+}
+
+/**
+ * @brief Upload @p values to the device and run kernel_set_vel over the cells
+ * in @p ci, passing the population, velocity and force fields to the kernel.
+ */
+void set(
+    gpu::GPUField<double> *pdf_field,
+    gpu::GPUField<double> *velocity_field,
+    gpu::GPUField<double> const *force_field,
+    std::vector<double> const &values,
+    CellInterval const &ci) {
+  thrust::device_vector<double> device_values(values.begin(), values.end());
+  double const *const device_values_ptr = thrust::raw_pointer_cast(device_values.data());
+  auto launcher = gpu::make_kernel(kernel_set_vel);
+  // the three fields are indexed over the same interval, in kernel argument order
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*velocity_field, ci));
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*force_field, ci));
+  launcher.addParam(device_values_ptr);
+  launcher();
+}
+} // namespace Population
+
+namespace Vector {
+// LCOV_EXCL_START
+/** @brief Copy the 3 components of each valid cell into @p u_out (3 slots per thread). */
+__global__ void kernel_get(
+    gpu::FieldAccessor<double> vec,
+    double *u_out) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
+  vec.set(blockIdx, threadIdx);
+  if (not vec.isValidPosition()) {
+    return;
+  }
+  double *const cell_out = u_out + offset;
+  for (uint i = 0u; i < 3u; ++i) {
+    cell_out[i] = vec.get(i);
+  }
+}
+
+/** @brief Overwrite the 3 components of each valid cell with its 3 slots of @p u_in. */
+__global__ void kernel_set(
+    gpu::FieldAccessor<double> vec,
+    double const *RESTRICT u_in) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
+  vec.set(blockIdx, threadIdx);
+  if (not vec.isValidPosition()) {
+    return;
+  }
+  double const *const cell_in = u_in + offset;
+  for (uint i = 0u; i < 3u; ++i) {
+    vec.get(i) = cell_in[i];
+  }
+}
+
+/** @brief Overwrite the 3 components of each valid cell with the same 3 values from @p u_in. */
+__global__ void kernel_broadcast(
+    gpu::FieldAccessor<double> vec,
+    double const *RESTRICT u_in) {
+  vec.set(blockIdx, threadIdx);
+  if (not vec.isValidPosition()) {
+    return;
+  }
+  // no per-thread offset: all threads read the first 3 entries
+  for (uint i = 0u; i < 3u; ++i) {
+    vec.get(i) = u_in[i];
+  }
+}
+
+/** @brief Add to the 3 components of each valid cell its 3 slots of @p u_in. */
+__global__ void kernel_add(
+    gpu::FieldAccessor<double> vec,
+    double const *RESTRICT u_in) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
+  vec.set(blockIdx, threadIdx);
+  if (not vec.isValidPosition()) {
+    return;
+  }
+  double const *const cell_in = u_in + offset;
+  for (uint i = 0u; i < 3u; ++i) {
+    vec.get(i) += cell_in[i];
+  }
+}
+
+/** @brief Add the same 3 values from @p u_in to the 3 components of each valid cell. */
+__global__ void kernel_broadcast_add(
+    gpu::FieldAccessor<double> vec,
+    double const *RESTRICT u_in) {
+  vec.set(blockIdx, threadIdx);
+  if (not vec.isValidPosition()) {
+    return;
+  }
+  // no per-thread offset: all threads read the first 3 entries
+  for (uint i = 0u; i < 3u; ++i) {
+    vec.get(i) += u_in[i];
+  }
+}
+// LCOV_EXCL_STOP
+
+/** @brief Download the 3-component vector stored in a single cell. */
+Vector3<double> get(
+    gpu::GPUField<double> const *vec_field,
+    Cell const &cell) {
+  // a one-cell interval lets the interval-based kernel serve single-cell reads
+  CellInterval const single_cell(cell, cell);
+  thrust::device_vector<double> device_buffer(3u);
+  auto launcher = gpu::make_kernel(kernel_get);
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*vec_field, single_cell));
+  launcher.addParam(thrust::raw_pointer_cast(device_buffer.data()));
+  launcher();
+  Vector3<double> result;
+  thrust::copy(device_buffer.begin(), device_buffer.end(), result.data());
+  return result;
+}
+
+/** @brief Overwrite the 3-component vector stored in a single cell. */
+void set(
+    gpu::GPUField<double> *vec_field,
+    Vector3<double> const &vec,
+    Cell const &cell) {
+  auto const first = vec.data();
+  thrust::device_vector<double> device_values(first, first + 3u);
+  double const *const device_values_ptr = thrust::raw_pointer_cast(device_values.data());
+  CellInterval const single_cell(cell, cell);
+  auto launcher = gpu::make_kernel(kernel_set);
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*vec_field, single_cell));
+  launcher.addParam(device_values_ptr);
+  launcher();
+}
+
+/** @brief Add @p vec to the 3-component vector stored in a single cell. */
+void add(
+    gpu::GPUField<double> *vec_field,
+    Vector3<double> const &vec,
+    Cell const &cell) {
+  auto const first = vec.data();
+  thrust::device_vector<double> device_values(first, first + 3u);
+  double const *const device_values_ptr = thrust::raw_pointer_cast(device_values.data());
+  CellInterval const single_cell(cell, cell);
+  auto launcher = gpu::make_kernel(kernel_add);
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*vec_field, single_cell));
+  launcher.addParam(device_values_ptr);
+  launcher();
+}
+
+/** @brief Write @p vec into every cell of the field (interval taken from xyzSizeWithGhostLayer()). */
+void initialize(
+    gpu::GPUField<double> *vec_field,
+    Vector3<double> const &vec) {
+  auto const first = vec.data();
+  thrust::device_vector<double> device_values(first, first + 3u);
+  double const *const device_values_ptr = thrust::raw_pointer_cast(device_values.data());
+  CellInterval const whole_field = vec_field->xyzSizeWithGhostLayer();
+  auto launcher = gpu::make_kernel(kernel_broadcast);
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*vec_field, whole_field));
+  launcher.addParam(device_values_ptr);
+  launcher();
+}
+
+/** @brief Add @p vec to every cell of the field (interval taken from xyzSizeWithGhostLayer()). */
+void add_to_all(
+    gpu::GPUField<double> *vec_field,
+    Vector3<double> const &vec) {
+  auto const first = vec.data();
+  thrust::device_vector<double> device_values(first, first + 3u);
+  double const *const device_values_ptr = thrust::raw_pointer_cast(device_values.data());
+  CellInterval const whole_field = vec_field->xyzSizeWithGhostLayer();
+  auto launcher = gpu::make_kernel(kernel_broadcast_add);
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*vec_field, whole_field));
+  launcher.addParam(device_values_ptr);
+  launcher();
+}
+
+/** @brief Download the vectors of all cells in @p ci (3 values per cell). */
+std::vector<double> get(
+    gpu::GPUField<double> const *vec_field,
+    CellInterval const &ci) {
+  auto const n_values = ci.numCells() * 3u;
+  thrust::device_vector<double> device_buffer(n_values);
+  auto launcher = gpu::make_kernel(kernel_get);
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*vec_field, ci));
+  launcher.addParam(thrust::raw_pointer_cast(device_buffer.data()));
+  launcher();
+  // copy the device results back to host memory
+  std::vector<double> host_buffer(n_values);
+  thrust::copy(device_buffer.begin(), device_buffer.end(), host_buffer.data());
+  return host_buffer;
+}
+
+/** @brief Upload @p values to the device and run kernel_set over the cells in @p ci. */
+void set(
+    gpu::GPUField<double> *vec_field,
+    std::vector<double> const &values,
+    CellInterval const &ci) {
+  thrust::device_vector<double> device_values(values.begin(), values.end());
+  double const *const device_values_ptr = thrust::raw_pointer_cast(device_values.data());
+  auto launcher = gpu::make_kernel(kernel_set);
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*vec_field, ci));
+  launcher.addParam(device_values_ptr);
+  launcher();
+}
+} // namespace Vector
+
+namespace Interpolation {
+// LCOV_EXCL_START
+/**
+ * @brief Calculate interpolation weights.
+ *
+ * For each of the 3 dimensions, shifts the coordinate by -0.5, takes its
+ * floor as the lower corner index and derives two linear weights that sum
+ * to 1 from the remaining fractional distance.
+ *
+ * @param[in]  pos      3 coordinates of the interpolation point
+ * @param[out] corner   3 lower-corner indices, each shifted by @p gl
+ * @param[out] weights  6 weights, laid out as [dim * 2 + {0: lower, 1: upper}]
+ * @param[in]  gl       offset added to every corner index
+ */
+static __forceinline__ __device__ void calculate_weights(
+    double const *RESTRICT const pos,
+    int *RESTRICT const corner,
+    double *RESTRICT const weights,
+    uint gl) {
+#pragma unroll
+  for (int dim = 0; dim < 3; ++dim) {
+    auto const fractional_index = pos[dim] - double{0.5};
+    // use the double-precision floor(): floorf() would first round the
+    // argument to float, which can round a value just below an integer up
+    // to that integer and shift the corner by one cell
+    auto const nmp = floor(fractional_index);
+    auto const distance = fractional_index - nmp - double{0.5};
+    corner[dim] = __double2int_rn(nmp) + static_cast<int>(gl);
+    weights[dim * 2 + 0] = double{0.5} - distance;
+    weights[dim * 2 + 1] = double{0.5} + distance;
+  }
+}
+
+/**
+ * @brief Interpolate a 3-component field at a list of positions.
+ *
+ * Each thread handles one position: it computes the corner and weights with
+ * calculate_weights() and accumulates the weighted values of the 2x2x2
+ * neighboring cells into @p vel.
+ *
+ * @param[in]     vec    field accessor
+ * @param[in]     pos    positions, 3 values per position
+ * @param[in,out] vel    output, 3 values per position; results are added with
+ *                       `+=`, so the buffer must hold the start values
+ *                       (zeros for a plain interpolation) on entry
+ * @param[in]     n_pos  number of positions
+ * @param[in]     gl     offset forwarded to calculate_weights()
+ */
+__global__ void kernel_get(
+    gpu::FieldAccessor<double> vec,
+    double const *RESTRICT const pos,
+    double *RESTRICT const vel,
+    uint n_pos,
+    uint gl) {
+
+  // flat index of the position handled by this thread
+  uint pos_index = blockIdx.y * gridDim.x * blockDim.x +
+                   blockDim.x * blockIdx.x + threadIdx.x;
+
+  // the accessor is set with zero block/thread indices for every thread;
+  // cells are then addressed through getNeighbor()
+  vec.set({0u, 0u, 0u}, {0u, 0u, 0u});
+  // threads beyond the last position do nothing
+  if (vec.isValidPosition() and pos_index < n_pos) {
+    auto const array_offset = pos_index * uint(3u);
+    int corner[3];
+    double weights[3][2];
+    calculate_weights(pos + array_offset, corner, &weights[0][0], gl);
+#pragma unroll
+    for (int i = 0; i < 2; i++) {
+      auto const cx = corner[0] + i;
+      auto const wx = weights[0][i];
+#pragma unroll
+      for (int j = 0; j < 2; j++) {
+        auto const cy = corner[1] + j;
+        auto const wxy = wx * weights[1][j];
+#pragma unroll
+        for (int k = 0; k < 2; k++) {
+          auto const cz = corner[2] + k;
+          // product of the three per-dimension weights
+          auto const weight = wxy * weights[2][k];
+          vel[array_offset + 0u] += weight * vec.getNeighbor(cx, cy, cz, 0u);
+          vel[array_offset + 1u] += weight * vec.getNeighbor(cx, cy, cz, 1u);
+          vel[array_offset + 2u] += weight * vec.getNeighbor(cx, cy, cz, 2u);
+        }
+      }
+    }
+  }
+}
+
+/**
+ * @brief Distribute 3-component values given at a list of positions onto a field.
+ *
+ * Each thread handles one position: it computes the corner and weights with
+ * calculate_weights() and adds the weighted value to the 2x2x2 neighboring
+ * cells. atomicAdd() is used for the additions, since several threads may
+ * target the same cell.
+ *
+ * @param[in,out] vec     field accessor; cell values are incremented
+ * @param[in]     pos     positions, 3 values per position
+ * @param[in]     forces  values to distribute, 3 values per position
+ * @param[in]     n_pos   number of positions
+ * @param[in]     gl      offset forwarded to calculate_weights()
+ */
+__global__ void kernel_set(
+    gpu::FieldAccessor<double> vec,
+    double const *RESTRICT const pos,
+    double const *RESTRICT const forces,
+    uint n_pos,
+    uint gl) {
+
+  // flat index of the position handled by this thread
+  uint pos_index = blockIdx.y * gridDim.x * blockDim.x +
+                   blockDim.x * blockIdx.x + threadIdx.x;
+
+  // the accessor is set with zero block/thread indices for every thread;
+  // cells are then addressed through getNeighbor()
+  vec.set({0u, 0u, 0u}, {0u, 0u, 0u});
+  // threads beyond the last position do nothing
+  if (vec.isValidPosition() and pos_index < n_pos) {
+    auto const array_offset = pos_index * uint(3u);
+    int corner[3];
+    double weights[3][2];
+    calculate_weights(pos + array_offset, corner, &weights[0][0], gl);
+#pragma unroll
+    for (int i = 0; i < 2; i++) {
+      auto const cx = corner[0] + i;
+      auto const wx = weights[0][i];
+#pragma unroll
+      for (int j = 0; j < 2; j++) {
+        auto const cy = corner[1] + j;
+        auto const wxy = wx * weights[1][j];
+#pragma unroll
+        for (int k = 0; k < 2; k++) {
+          auto const cz = corner[2] + k;
+          // product of the three per-dimension weights
+          auto const weight = wxy * weights[2][k];
+          atomicAdd(&vec.getNeighbor(cx, cy, cz, 0u),
+                    weight * forces[array_offset + 0u]);
+          atomicAdd(&vec.getNeighbor(cx, cy, cz, 1u),
+                    weight * forces[array_offset + 1u]);
+          atomicAdd(&vec.getNeighbor(cx, cy, cz, 2u),
+                    weight * forces[array_offset + 2u]);
+        }
+      }
+    }
+  }
+}
+// LCOV_EXCL_STOP
+
+/**
+ * @brief Build a 2D launch grid with a fixed number of blocks along y and
+ * enough blocks along x to provide at least @p threads_x threads in total.
+ */
+static dim3 calculate_dim_grid(uint const threads_x,
+                               uint const blocks_per_grid_y,
+                               uint const threads_per_block) {
+  assert(threads_x >= 1u);
+  assert(blocks_per_grid_y >= 1u);
+  assert(threads_per_block >= 1u);
+  // threads contributed by one grid column (all blocks sharing the same x index)
+  auto const threads_per_column = blocks_per_grid_y * threads_per_block;
+  // ceiling division: number of columns needed to cover threads_x
+  auto const n_columns = (threads_x + (threads_per_column - 1)) / threads_per_column;
+  return make_uint3(n_columns, blocks_per_grid_y, 1);
+}
+
+/**
+ * @brief Interpolate a 3-component field at a list of positions.
+ *
+ * @param vec_field  field to read from
+ * @param pos        flattened positions, 3 values per position
+ * @param gl         number of ghost layers to include in the field indexing
+ * @return flattened interpolated values, same size as @p pos
+ */
+std::vector<double>
+get(
+    gpu::GPUField<double> const *vec_field,
+    std::vector<double> const &pos,
+    uint gl) {
+  auto const n_pos = static_cast<uint>(pos.size() / 3ul);
+  if (n_pos == 0u) {
+    // nothing to interpolate: a kernel launch with a zero-sized grid is
+    // invalid, so return the (zero-filled) result directly
+    return std::vector<double>(pos.size(), 0.);
+  }
+
+  thrust::device_vector<double> dev_pos(pos.begin(), pos.end());
+  // value-initialized to zero; the kernel accumulates into this buffer
+  thrust::device_vector<double> dev_vel(pos.size());
+  auto const dev_pos_ptr = thrust::raw_pointer_cast(dev_pos.data());
+  auto const dev_vel_ptr = thrust::raw_pointer_cast(dev_vel.data());
+
+  auto const threads_per_block = uint(64u);
+  auto const dim_grid = calculate_dim_grid(n_pos, 4u, threads_per_block);
+  kernel_get<<<dim_grid, threads_per_block, 0u, nullptr>>>(
+      gpu::FieldIndexing<double>::withGhostLayerXYZ(*vec_field, gl).gpuAccess(),
+      dev_pos_ptr, dev_vel_ptr, n_pos, gl);
+
+  std::vector<double> out(pos.size());
+  thrust::copy(dev_vel.begin(), dev_vel.end(), out.data());
+  return out;
+}
+
+/**
+ * @brief Distribute 3-component values given at a list of positions onto a field.
+ *
+ * @param vec_field  field whose cells are incremented on the device
+ * @param pos        flattened positions, 3 values per position
+ * @param forces     flattened values to distribute, 3 values per position
+ * @param gl         number of ghost layers to include in the field indexing
+ */
+void set(
+    gpu::GPUField<double> const *vec_field,
+    std::vector<double> const &pos,
+    std::vector<double> const &forces,
+    uint gl) {
+  auto const n_pos = static_cast<uint>(pos.size() / 3ul);
+  if (n_pos == 0u) {
+    // nothing to distribute: a kernel launch with a zero-sized grid is invalid
+    return;
+  }
+
+  thrust::device_vector<double> dev_pos(pos.begin(), pos.end());
+  thrust::device_vector<double> dev_for(forces.begin(), forces.end());
+  auto const dev_pos_ptr = thrust::raw_pointer_cast(dev_pos.data());
+  auto const dev_for_ptr = thrust::raw_pointer_cast(dev_for.data());
+
+  auto const threads_per_block = uint(64u);
+  auto const dim_grid = calculate_dim_grid(n_pos, 4u, threads_per_block);
+  kernel_set<<<dim_grid, threads_per_block, 0u, nullptr>>>(
+      gpu::FieldIndexing<double>::withGhostLayerXYZ(*vec_field, gl).gpuAccess(),
+      dev_pos_ptr, dev_for_ptr, n_pos, gl);
+}
+} // namespace Interpolation
+
+namespace Equilibrium {
+// LCOV_EXCL_START
+/**
+ * @brief Overwrite the 19 populations of the cell the accessor points to.
+ *
+ * Every population is a polynomial in the velocity components (up to second
+ * order), scaled by @p rho.
+ * NOTE(review): the coefficients look like the D3Q19 second-order equilibrium
+ * distribution -- confirm against the code generator.
+ *
+ * @param pdf  accessor already positioned on the target cell
+ * @param u    3 velocity components
+ * @param rho  density
+ */
+__device__ void kernel_set_device(
+    gpu::FieldAccessor<double> pdf,
+    double const *RESTRICT const u,
+    double rho) {
+
+  pdf.get(0u) = rho * -0.33333333333333331 * (u[0] * u[0]) + rho * -0.33333333333333331 * (u[1] * u[1]) + rho * -0.33333333333333331 * (u[2] * u[2]) + rho * 0.33333333333333331;
+  pdf.get(1u) = rho * -0.16666666666666666 * (u[0] * u[0]) + rho * -0.16666666666666666 * (u[2] * u[2]) + rho * 0.055555555555555552 + rho * 0.16666666666666666 * u[1] + rho * 0.16666666666666666 * (u[1] * u[1]);
+  pdf.get(2u) = rho * -0.16666666666666666 * u[1] + rho * -0.16666666666666666 * (u[0] * u[0]) + rho * -0.16666666666666666 * (u[2] * u[2]) + rho * 0.055555555555555552 + rho * 0.16666666666666666 * (u[1] * u[1]);
+  pdf.get(3u) = rho * -0.16666666666666666 * u[0] + rho * -0.16666666666666666 * (u[1] * u[1]) + rho * -0.16666666666666666 * (u[2] * u[2]) + rho * 0.055555555555555552 + rho * 0.16666666666666666 * (u[0] * u[0]);
+  pdf.get(4u) = rho * -0.16666666666666666 * (u[1] * u[1]) + rho * -0.16666666666666666 * (u[2] * u[2]) + rho * 0.055555555555555552 + rho * 0.16666666666666666 * u[0] + rho * 0.16666666666666666 * (u[0] * u[0]);
+  pdf.get(5u) = rho * -0.16666666666666666 * (u[0] * u[0]) + rho * -0.16666666666666666 * (u[1] * u[1]) + rho * 0.055555555555555552 + rho * 0.16666666666666666 * u[2] + rho * 0.16666666666666666 * (u[2] * u[2]);
+  pdf.get(6u) = rho * -0.16666666666666666 * u[2] + rho * -0.16666666666666666 * (u[0] * u[0]) + rho * -0.16666666666666666 * (u[1] * u[1]) + rho * 0.055555555555555552 + rho * 0.16666666666666666 * (u[2] * u[2]);
+  pdf.get(7u) = rho * -0.083333333333333329 * u[0] + rho * -0.25 * u[0] * u[1] + rho * 0.027777777777777776 + rho * 0.083333333333333329 * u[1] + rho * 0.083333333333333329 * (u[0] * u[0]) + rho * 0.083333333333333329 * (u[1] * u[1]);
+  pdf.get(8u) = rho * 0.027777777777777776 + rho * 0.083333333333333329 * u[0] + rho * 0.083333333333333329 * u[1] + rho * 0.083333333333333329 * (u[0] * u[0]) + rho * 0.083333333333333329 * (u[1] * u[1]) + rho * 0.25 * u[0] * u[1];
+  pdf.get(9u) = rho * -0.083333333333333329 * u[0] + rho * -0.083333333333333329 * u[1] + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u[0] * u[0]) + rho * 0.083333333333333329 * (u[1] * u[1]) + rho * 0.25 * u[0] * u[1];
+  pdf.get(10u) = rho * -0.083333333333333329 * u[1] + rho * -0.25 * u[0] * u[1] + rho * 0.027777777777777776 + rho * 0.083333333333333329 * u[0] + rho * 0.083333333333333329 * (u[0] * u[0]) + rho * 0.083333333333333329 * (u[1] * u[1]);
+  pdf.get(11u) = rho * 0.027777777777777776 + rho * 0.083333333333333329 * u[1] + rho * 0.083333333333333329 * u[2] + rho * 0.083333333333333329 * (u[1] * u[1]) + rho * 0.083333333333333329 * (u[2] * u[2]) + rho * 0.25 * u[1] * u[2];
+  pdf.get(12u) = rho * -0.083333333333333329 * u[1] + rho * -0.25 * u[1] * u[2] + rho * 0.027777777777777776 + rho * 0.083333333333333329 * u[2] + rho * 0.083333333333333329 * (u[1] * u[1]) + rho * 0.083333333333333329 * (u[2] * u[2]);
+  pdf.get(13u) = rho * -0.083333333333333329 * u[0] + rho * -0.25 * u[0] * u[2] + rho * 0.027777777777777776 + rho * 0.083333333333333329 * u[2] + rho * 0.083333333333333329 * (u[0] * u[0]) + rho * 0.083333333333333329 * (u[2] * u[2]);
+  pdf.get(14u) = rho * 0.027777777777777776 + rho * 0.083333333333333329 * u[0] + rho * 0.083333333333333329 * u[2] + rho * 0.083333333333333329 * (u[0] * u[0]) + rho * 0.083333333333333329 * (u[2] * u[2]) + rho * 0.25 * u[0] * u[2];
+  pdf.get(15u) = rho * -0.083333333333333329 * u[2] + rho * -0.25 * u[1] * u[2] + rho * 0.027777777777777776 + rho * 0.083333333333333329 * u[1] + rho * 0.083333333333333329 * (u[1] * u[1]) + rho * 0.083333333333333329 * (u[2] * u[2]);
+  pdf.get(16u) = rho * -0.083333333333333329 * u[1] + rho * -0.083333333333333329 * u[2] + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u[1] * u[1]) + rho * 0.083333333333333329 * (u[2] * u[2]) + rho * 0.25 * u[1] * u[2];
+  pdf.get(17u) = rho * -0.083333333333333329 * u[0] + rho * -0.083333333333333329 * u[2] + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u[0] * u[0]) + rho * 0.083333333333333329 * (u[2] * u[2]) + rho * 0.25 * u[0] * u[2];
+  pdf.get(18u) = rho * -0.083333333333333329 * u[2] + rho * -0.25 * u[0] * u[2] + rho * 0.027777777777777776 + rho * 0.083333333333333329 * u[0] + rho * 0.083333333333333329 * (u[0] * u[0]) + rho * 0.083333333333333329 * (u[2] * u[2]);
+}
+// LCOV_EXCL_STOP
+} // namespace Equilibrium
+
+namespace Density {
+// LCOV_EXCL_START
+/**
+ * @brief Compute the density of each valid cell as the sum of its 19
+ * populations and store it in @p rho_out (1 value per thread, at the offset
+ * returned by getLinearIndex()).
+ */
+__global__ void kernel_get(
+    gpu::FieldAccessor<double> pdf,
+    double *RESTRICT rho_out) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 1u);
+  pdf.set(blockIdx, threadIdx);
+  rho_out += offset;
+  if (pdf.isValidPosition()) {
+    double const f_0 = pdf.get(0u);
+    double const f_1 = pdf.get(1u);
+    double const f_2 = pdf.get(2u);
+    double const f_3 = pdf.get(3u);
+    double const f_4 = pdf.get(4u);
+    double const f_5 = pdf.get(5u);
+    double const f_6 = pdf.get(6u);
+    double const f_7 = pdf.get(7u);
+    double const f_8 = pdf.get(8u);
+    double const f_9 = pdf.get(9u);
+    double const f_10 = pdf.get(10u);
+    double const f_11 = pdf.get(11u);
+    double const f_12 = pdf.get(12u);
+    double const f_13 = pdf.get(13u);
+    double const f_14 = pdf.get(14u);
+    double const f_15 = pdf.get(15u);
+    double const f_16 = pdf.get(16u);
+    double const f_17 = pdf.get(17u);
+    double const f_18 = pdf.get(18u);
+    // partial sums; together with the remaining terms below, every
+    // population enters rho exactly once
+    const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+    const double vel1Term = f_1 + f_11 + f_15 + f_7;
+    const double vel2Term = f_12 + f_13 + f_5;
+    const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
+    rho_out[0u] = rho;
+  }
+}
+
+/**
+ * @brief Change the density of each valid cell while keeping its velocity.
+ *
+ * The current velocity is computed from the populations (momentum density
+ * divided by density, without any force contribution), then the populations
+ * are overwritten by Equilibrium::kernel_set_device() using that velocity and
+ * the new density read from @p rho_in (1 value per thread).
+ */
+__global__ void kernel_set(
+    gpu::FieldAccessor<double> pdf,
+    double const *RESTRICT rho_in) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 1u);
+  pdf.set(blockIdx, threadIdx);
+  rho_in += offset;
+  if (pdf.isValidPosition()) {
+    double const f_0 = pdf.get(0u);
+    double const f_1 = pdf.get(1u);
+    double const f_2 = pdf.get(2u);
+    double const f_3 = pdf.get(3u);
+    double const f_4 = pdf.get(4u);
+    double const f_5 = pdf.get(5u);
+    double const f_6 = pdf.get(6u);
+    double const f_7 = pdf.get(7u);
+    double const f_8 = pdf.get(8u);
+    double const f_9 = pdf.get(9u);
+    double const f_10 = pdf.get(10u);
+    double const f_11 = pdf.get(11u);
+    double const f_12 = pdf.get(12u);
+    double const f_13 = pdf.get(13u);
+    double const f_14 = pdf.get(14u);
+    double const f_15 = pdf.get(15u);
+    double const f_16 = pdf.get(16u);
+    double const f_17 = pdf.get(17u);
+    double const f_18 = pdf.get(18u);
+    // velNTerm are partial sums shared by the momentum density and rho
+    const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+    const double momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+    const double vel1Term = f_1 + f_11 + f_15 + f_7;
+    const double momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+    const double vel2Term = f_12 + f_13 + f_5;
+    const double momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+    const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
+
+    // calculate current velocity (before density change)
+    double const rho_inv = double{1} / rho;
+    double const u_old[3] = {momdensity_0 * rho_inv, momdensity_1 * rho_inv, momdensity_2 * rho_inv};
+
+    // rebuild the populations from the old velocity and the new density
+    Equilibrium::kernel_set_device(pdf, u_old, rho_in[0u]);
+  }
+}
+// LCOV_EXCL_STOP
+
+/** @brief Download the density of a single cell. */
+double get(
+    gpu::GPUField<double> const *pdf_field,
+    Cell const &cell) {
+  // a one-cell interval lets the interval-based kernel serve single-cell reads
+  CellInterval const single_cell(cell, cell);
+  thrust::device_vector<double> device_buffer(1u);
+  auto launcher = gpu::make_kernel(kernel_get);
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, single_cell));
+  launcher.addParam(thrust::raw_pointer_cast(device_buffer.data()));
+  launcher();
+  // element access on a device_vector copies the value back to the host
+  return static_cast<double>(device_buffer[0u]);
+}
+
+/** @brief Set the density of a single cell by running kernel_set on a one-cell interval. */
+void set(
+    gpu::GPUField<double> *pdf_field,
+    const double rho,
+    Cell const &cell) {
+  thrust::device_vector<double> device_rho(1u, rho);
+  double const *const device_rho_ptr = thrust::raw_pointer_cast(device_rho.data());
+  CellInterval const single_cell(cell, cell);
+  auto launcher = gpu::make_kernel(kernel_set);
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, single_cell));
+  launcher.addParam(device_rho_ptr);
+  launcher();
+}
+
+/** @brief Download the densities of all cells in @p ci (1 value per cell). */
+std::vector<double> get(
+    gpu::GPUField<double> const *pdf_field,
+    CellInterval const &ci) {
+  thrust::device_vector<double> device_buffer(ci.numCells());
+  auto launcher = gpu::make_kernel(kernel_get);
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
+  launcher.addParam(thrust::raw_pointer_cast(device_buffer.data()));
+  launcher();
+  // copy the device results back to host memory
+  std::vector<double> host_buffer(device_buffer.size());
+  thrust::copy(device_buffer.begin(), device_buffer.end(), host_buffer.begin());
+  return host_buffer;
+}
+
+/** @brief Upload @p values to the device and run kernel_set over the cells in @p ci. */
+void set(
+    gpu::GPUField<double> *pdf_field,
+    std::vector<double> const &values,
+    CellInterval const &ci) {
+  thrust::device_vector<double> device_values(values.begin(), values.end());
+  double const *const device_values_ptr = thrust::raw_pointer_cast(device_values.data());
+  auto launcher = gpu::make_kernel(kernel_set);
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
+  launcher.addParam(device_values_ptr);
+  launcher();
+}
+} // namespace Density
+
+namespace Velocity {
+// LCOV_EXCL_START
+/**
+ * @brief Compute the velocity of each valid cell and store it in @p u_out
+ * (3 values per thread).
+ *
+ * The velocity is the momentum density obtained from the populations, plus
+ * half of the force stored in @p force, divided by the density.
+ */
+__global__ void kernel_get(
+    gpu::FieldAccessor<double> pdf,
+    gpu::FieldAccessor<double> force,
+    double *RESTRICT u_out) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
+  pdf.set(blockIdx, threadIdx);
+  force.set(blockIdx, threadIdx);
+  u_out += offset;
+  if (pdf.isValidPosition()) {
+    double const f_0 = pdf.get(0u);
+    double const f_1 = pdf.get(1u);
+    double const f_2 = pdf.get(2u);
+    double const f_3 = pdf.get(3u);
+    double const f_4 = pdf.get(4u);
+    double const f_5 = pdf.get(5u);
+    double const f_6 = pdf.get(6u);
+    double const f_7 = pdf.get(7u);
+    double const f_8 = pdf.get(8u);
+    double const f_9 = pdf.get(9u);
+    double const f_10 = pdf.get(10u);
+    double const f_11 = pdf.get(11u);
+    double const f_12 = pdf.get(12u);
+    double const f_13 = pdf.get(13u);
+    double const f_14 = pdf.get(14u);
+    double const f_15 = pdf.get(15u);
+    double const f_16 = pdf.get(16u);
+    double const f_17 = pdf.get(17u);
+    double const f_18 = pdf.get(18u);
+    // velNTerm are partial sums shared by the momentum density and rho
+    const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+    const double momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+    const double vel1Term = f_1 + f_11 + f_15 + f_7;
+    const double momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+    const double vel2Term = f_12 + f_13 + f_5;
+    const double momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+    const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
+    // add half of the force to the momentum density
+    const double md_0 = force.get(0) * 0.50000000000000000 + momdensity_0;
+    const double md_1 = force.get(1) * 0.50000000000000000 + momdensity_1;
+    const double md_2 = force.get(2) * 0.50000000000000000 + momdensity_2;
+    auto const rho_inv = double{1} / rho;
+    u_out[0u] = md_0 * rho_inv;
+    u_out[1u] = md_1 * rho_inv;
+    u_out[2u] = md_2 * rho_inv;
+  }
+}
+
+/**
+ * @brief Set the velocity of each valid cell from @p u_in (3 values per thread).
+ *
+ * The requested velocity is stored as-is in the velocity field. The
+ * populations are overwritten by Equilibrium::kernel_set_device() using the
+ * cell's current density and the requested velocity reduced by half of the
+ * stored force divided by the density.
+ */
+__global__ void kernel_set(
+    gpu::FieldAccessor<double> pdf,
+    gpu::FieldAccessor<double> velocity,
+    gpu::FieldAccessor<double> force,
+    double const *RESTRICT u_in) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
+  pdf.set(blockIdx, threadIdx);
+  velocity.set(blockIdx, threadIdx);
+  force.set(blockIdx, threadIdx);
+  u_in += offset;
+  if (pdf.isValidPosition()) {
+    double const f_0 = pdf.get(0u);
+    double const f_1 = pdf.get(1u);
+    double const f_2 = pdf.get(2u);
+    double const f_3 = pdf.get(3u);
+    double const f_4 = pdf.get(4u);
+    double const f_5 = pdf.get(5u);
+    double const f_6 = pdf.get(6u);
+    double const f_7 = pdf.get(7u);
+    double const f_8 = pdf.get(8u);
+    double const f_9 = pdf.get(9u);
+    double const f_10 = pdf.get(10u);
+    double const f_11 = pdf.get(11u);
+    double const f_12 = pdf.get(12u);
+    double const f_13 = pdf.get(13u);
+    double const f_14 = pdf.get(14u);
+    double const f_15 = pdf.get(15u);
+    double const f_16 = pdf.get(16u);
+    double const f_17 = pdf.get(17u);
+    double const f_18 = pdf.get(18u);
+    // alias of u_in (already advanced to this thread's slot)
+    double const *RESTRICT const u = u_in;
+    const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+    const double vel1Term = f_1 + f_11 + f_15 + f_7;
+    const double vel2Term = f_12 + f_13 + f_5;
+    // current density of the cell; it is kept unchanged
+    const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
+    // subtract the half-force contribution from the requested velocity
+    const double u_0 = -force.get(0) * 0.50000000000000000 / rho + u[0];
+    const double u_1 = -force.get(1) * 0.50000000000000000 / rho + u[1];
+    const double u_2 = -force.get(2) * 0.50000000000000000 / rho + u[2];
+    // the velocity field stores the requested velocity, not the corrected one
+    velocity.get(0u) = u_in[0u];
+    velocity.get(1u) = u_in[1u];
+    velocity.get(2u) = u_in[2u];
+
+    double u_new[3] = {u_0, u_1, u_2};
+
+    Equilibrium::kernel_set_device(pdf, u_new, rho);
+  }
+}
+// LCOV_EXCL_STOP
+
+/** @brief Download the velocity of a single cell, computed from the population and force fields. */
+Vector3<double> get(
+    gpu::GPUField<double> const *pdf_field,
+    gpu::GPUField<double> const *force_field,
+    Cell const &cell) {
+  // a one-cell interval lets the interval-based kernel serve single-cell reads
+  CellInterval const single_cell(cell, cell);
+  thrust::device_vector<double> device_buffer(3u);
+  auto launcher = gpu::make_kernel(kernel_get);
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, single_cell));
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*force_field, single_cell));
+  launcher.addParam(thrust::raw_pointer_cast(device_buffer.data()));
+  launcher();
+  Vector3<double> result;
+  thrust::copy(device_buffer.begin(), device_buffer.end(), result.data());
+  return result;
+}
+
+/**
+ * @brief Download the velocities of all cells in @p ci.
+ *
+ * @param pdf_field    population field
+ * @param force_field  force field
+ * @param ci           cells to read
+ * @return flattened velocities, 3 values per cell
+ */
+std::vector<double> get(
+    gpu::GPUField<double> const *pdf_field,
+    gpu::GPUField<double> const *force_field,
+    CellInterval const &ci) {
+  // the kernel writes 3 values per cell, so the buffer must scale with the
+  // interval size (a fixed size of 3 overflows for multi-cell intervals)
+  thrust::device_vector<double> dev_data(ci.numCells() * 3u);
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_get);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*force_field, ci));
+  kernel.addParam(dev_data_ptr);
+  kernel();
+  std::vector<double> out(dev_data.size());
+  thrust::copy(dev_data.begin(), dev_data.end(), out.data());
+  return out;
+}
+
+/**
+ * @brief Set the velocity of a single cell by running kernel_set on a one-cell
+ * interval of the population, velocity and force fields.
+ */
+void set(
+    gpu::GPUField<double> *pdf_field,
+    gpu::GPUField<double> *velocity_field,
+    gpu::GPUField<double> const *force_field,
+    Vector3<double> const &u,
+    Cell const &cell) {
+  auto const first = u.data();
+  thrust::device_vector<double> device_values(first, first + 3u);
+  double const *const device_values_ptr = thrust::raw_pointer_cast(device_values.data());
+  CellInterval const single_cell(cell, cell);
+  auto launcher = gpu::make_kernel(kernel_set);
+  // the three fields are indexed over the same interval, in kernel argument order
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, single_cell));
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*velocity_field, single_cell));
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*force_field, single_cell));
+  launcher.addParam(device_values_ptr);
+  launcher();
+}
+
+/**
+ * @brief Upload @p values to the device and run kernel_set over the cells in
+ * @p ci, passing the population, velocity and force fields to the kernel.
+ */
+void set(
+    gpu::GPUField<double> *pdf_field,
+    gpu::GPUField<double> *velocity_field,
+    gpu::GPUField<double> const *force_field,
+    std::vector<double> const &values,
+    CellInterval const &ci) {
+  thrust::device_vector<double> device_values(values.begin(), values.end());
+  double const *const device_values_ptr = thrust::raw_pointer_cast(device_values.data());
+  auto launcher = gpu::make_kernel(kernel_set);
+  // the three fields are indexed over the same interval, in kernel argument order
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*velocity_field, ci));
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*force_field, ci));
+  launcher.addParam(device_values_ptr);
+  launcher();
+}
+} // namespace Velocity
+
+namespace Force {
+// LCOV_EXCL_START
+/**
+ * @brief Set the force of each valid cell from @p f_in (3 values per thread)
+ * and refresh the stored velocity accordingly.
+ *
+ * The populations are only read. The velocity field is updated to the
+ * momentum density plus half of the new force, divided by the density.
+ */
+__global__ void kernel_set(
+    gpu::FieldAccessor<double> pdf,
+    gpu::FieldAccessor<double> velocity,
+    gpu::FieldAccessor<double> force,
+    double const *RESTRICT f_in) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
+  pdf.set(blockIdx, threadIdx);
+  velocity.set(blockIdx, threadIdx);
+  force.set(blockIdx, threadIdx);
+  f_in += offset;
+  if (pdf.isValidPosition()) {
+    double const f_0 = pdf.get(0u);
+    double const f_1 = pdf.get(1u);
+    double const f_2 = pdf.get(2u);
+    double const f_3 = pdf.get(3u);
+    double const f_4 = pdf.get(4u);
+    double const f_5 = pdf.get(5u);
+    double const f_6 = pdf.get(6u);
+    double const f_7 = pdf.get(7u);
+    double const f_8 = pdf.get(8u);
+    double const f_9 = pdf.get(9u);
+    double const f_10 = pdf.get(10u);
+    double const f_11 = pdf.get(11u);
+    double const f_12 = pdf.get(12u);
+    double const f_13 = pdf.get(13u);
+    double const f_14 = pdf.get(14u);
+    double const f_15 = pdf.get(15u);
+    double const f_16 = pdf.get(16u);
+    double const f_17 = pdf.get(17u);
+    double const f_18 = pdf.get(18u);
+    // velNTerm are partial sums shared by the momentum density and rho
+    const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+    const double momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+    const double vel1Term = f_1 + f_11 + f_15 + f_7;
+    const double momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+    const double vel2Term = f_12 + f_13 + f_5;
+    const double momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+    const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
+    // add half of the new force to the momentum density
+    const double md_0 = f_in[0u] * 0.50000000000000000 + momdensity_0;
+    const double md_1 = f_in[1u] * 0.50000000000000000 + momdensity_1;
+    const double md_2 = f_in[2u] * 0.50000000000000000 + momdensity_2;
+    auto const rho_inv = double{1} / rho;
+
+    force.get(0u) = f_in[0u];
+    force.get(1u) = f_in[1u];
+    force.get(2u) = f_in[2u];
+
+    velocity.get(0u) = md_0 * rho_inv;
+    velocity.get(1u) = md_1 * rho_inv;
+    velocity.get(2u) = md_2 * rho_inv;
+  }
+}
+// LCOV_EXCL_STOP
+
+/**
+ * @brief Set the force of a single cell by running kernel_set on a one-cell
+ * interval of the population, velocity and force fields.
+ * The new force is passed in @p u.
+ */
+void set(gpu::GPUField<double> const *pdf_field,
+         gpu::GPUField<double> *velocity_field,
+         gpu::GPUField<double> *force_field,
+         Vector3<double> const &u,
+         Cell const &cell) {
+  auto const first = u.data();
+  thrust::device_vector<double> device_values(first, first + 3u);
+  double const *const device_values_ptr = thrust::raw_pointer_cast(device_values.data());
+  CellInterval const single_cell(cell, cell);
+  auto launcher = gpu::make_kernel(kernel_set);
+  // the three fields are indexed over the same interval, in kernel argument order
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, single_cell));
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*velocity_field, single_cell));
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*force_field, single_cell));
+  launcher.addParam(device_values_ptr);
+  launcher();
+}
+
+/**
+ * @brief Upload @p values to the device and run kernel_set over the cells in
+ * @p ci, passing the population, velocity and force fields to the kernel.
+ */
+void set(gpu::GPUField<double> const *pdf_field,
+         gpu::GPUField<double> *velocity_field,
+         gpu::GPUField<double> *force_field,
+         std::vector<double> const &values,
+         CellInterval const &ci) {
+  thrust::device_vector<double> device_values(values.begin(), values.end());
+  double const *const device_values_ptr = thrust::raw_pointer_cast(device_values.data());
+  auto launcher = gpu::make_kernel(kernel_set);
+  // the three fields are indexed over the same interval, in kernel argument order
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*velocity_field, ci));
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*force_field, ci));
+  launcher.addParam(device_values_ptr);
+  launcher();
+}
+} // namespace Force
+
+namespace MomentumDensity {
+// LCOV_EXCL_START
+/**
+ * @brief Add the momentum density of each valid cell to an output buffer.
+ *
+ * For every thread at a valid position, the momentum density is computed
+ * from the 19 populations plus half of the force stored in @p force, and
+ * the three components are added (`+=`) to the thread's slot of @p out.
+ * The slot starts at the linear thread index computed with a stride of 3.
+ *
+ * @param pdf    accessor to the population field
+ * @param force  accessor to the force field
+ * @param out    device buffer that is accumulated into; existing content
+ *               is kept and incremented
+ */
+__global__ void kernel_sum(
+    gpu::FieldAccessor<double> pdf,
+    gpu::FieldAccessor<double> force,
+    double *RESTRICT out) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
+  pdf.set(blockIdx, threadIdx);
+  force.set(blockIdx, threadIdx);
+  out += offset;
+  if (pdf.isValidPosition()) {
+    // population 0 does not enter the momentum density and is not loaded
+    double const f_1 = pdf.get(1u);
+    double const f_2 = pdf.get(2u);
+    double const f_3 = pdf.get(3u);
+    double const f_4 = pdf.get(4u);
+    double const f_5 = pdf.get(5u);
+    double const f_6 = pdf.get(6u);
+    double const f_7 = pdf.get(7u);
+    double const f_8 = pdf.get(8u);
+    double const f_9 = pdf.get(9u);
+    double const f_10 = pdf.get(10u);
+    double const f_11 = pdf.get(11u);
+    double const f_12 = pdf.get(12u);
+    double const f_13 = pdf.get(13u);
+    double const f_14 = pdf.get(14u);
+    double const f_15 = pdf.get(15u);
+    double const f_16 = pdf.get(16u);
+    double const f_17 = pdf.get(17u);
+    double const f_18 = pdf.get(18u);
+    const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+    const double momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+    const double vel1Term = f_1 + f_11 + f_15 + f_7;
+    const double momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+    const double vel2Term = f_12 + f_13 + f_5;
+    const double momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+    // the density is not needed for the momentum density, so it is not computed
+    const double md_0 = force.get(0u) * 0.50000000000000000 + momdensity_0;
+    const double md_1 = force.get(1u) * 0.50000000000000000 + momdensity_1;
+    const double md_2 = force.get(2u) * 0.50000000000000000 + momdensity_2;
+    out[0u] += md_0;
+    out[1u] += md_1;
+    out[2u] += md_2;
+  }
+}
+// LCOV_EXCL_STOP
+
+/**
+ * @brief Sum the momentum density over the cells of the population field.
+ *
+ * A three-component device buffer is zero-initialized, then
+ * @c kernel_sum is launched once per cell visited by
+ * @c WALBERLA_FOR_ALL_CELLS_XYZ, each time on a one-cell interval and with
+ * the same output buffer, so the contributions add up. The buffer is
+ * finally copied back to the host.
+ *
+ * @param pdf_field    population field
+ * @param force_field  force field
+ * @return accumulated momentum density
+ */
+Vector3<double> reduce(
+    gpu::GPUField<double> const *pdf_field,
+    gpu::GPUField<double> const *force_field) {
+  thrust::device_vector<double> accumulator(3u, double{0});
+  double *const accumulator_ptr = thrust::raw_pointer_cast(accumulator.data());
+  WALBERLA_FOR_ALL_CELLS_XYZ(pdf_field, {
+    Cell const current(x, y, z);
+    CellInterval const single_cell(current, current);
+    auto launcher = gpu::make_kernel(kernel_sum);
+    launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, single_cell));
+    launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*force_field, single_cell));
+    launcher.addParam(accumulator_ptr);
+    launcher();
+  });
+  Vector3<double> total(double{0});
+  // the buffer holds exactly three entries
+  thrust::copy(accumulator.begin(), accumulator.end(), total.data());
+  return total;
+}
+} // namespace MomentumDensity
+
+namespace PressureTensor {
+// LCOV_EXCL_START
+/**
+ * @brief Write nine tensor entries per valid cell to an output buffer.
+ *
+ * Each thread at a valid position combines the populations of its cell
+ * into nine values and stores them in its slot of @p p_out, which starts
+ * at the linear thread index computed with a stride of 9. The entries are
+ * pairwise identical for the index pairs (1, 3), (2, 6) and (5, 7), so
+ * each of these is evaluated once and stored twice.
+ *
+ * @param pdf    accessor to the population field
+ * @param p_out  device buffer receiving nine values per cell
+ */
+__global__ void kernel_get(
+    gpu::FieldAccessor<double> pdf,
+    double *RESTRICT p_out) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 9u);
+  pdf.set(blockIdx, threadIdx);
+  p_out += offset;
+  if (pdf.isValidPosition()) {
+    // population 0 does not enter any of the nine entries and is not loaded
+    double const f_1 = pdf.get(1u);
+    double const f_2 = pdf.get(2u);
+    double const f_3 = pdf.get(3u);
+    double const f_4 = pdf.get(4u);
+    double const f_5 = pdf.get(5u);
+    double const f_6 = pdf.get(6u);
+    double const f_7 = pdf.get(7u);
+    double const f_8 = pdf.get(8u);
+    double const f_9 = pdf.get(9u);
+    double const f_10 = pdf.get(10u);
+    double const f_11 = pdf.get(11u);
+    double const f_12 = pdf.get(12u);
+    double const f_13 = pdf.get(13u);
+    double const f_14 = pdf.get(14u);
+    double const f_15 = pdf.get(15u);
+    double const f_16 = pdf.get(16u);
+    double const f_17 = pdf.get(17u);
+    double const f_18 = pdf.get(18u);
+    // entries 0, 4 and 8
+    const double p_0 = f_10 + f_13 + f_14 + f_17 + f_18 + f_3 + f_4 + f_7 + f_8 + f_9;
+    const double p_4 = f_1 + f_10 + f_11 + f_12 + f_15 + f_16 + f_2 + f_7 + f_8 + f_9;
+    const double p_8 = f_11 + f_12 + f_13 + f_14 + f_15 + f_16 + f_17 + f_18 + f_5 + f_6;
+    // entries shared by two positions each
+    const double p_1 = -f_10 - f_7 + f_8 + f_9;
+    const double p_2 = -f_13 + f_14 + f_17 - f_18;
+    const double p_5 = f_11 - f_12 - f_15 + f_16;
+    p_out[0u] = p_0;
+    p_out[1u] = p_1;
+    p_out[2u] = p_2;
+    p_out[3u] = p_1;
+    p_out[4u] = p_4;
+    p_out[5u] = p_5;
+    p_out[6u] = p_2;
+    p_out[7u] = p_5;
+    p_out[8u] = p_8;
+  }
+}
+// LCOV_EXCL_STOP
+
+/**
+ * @brief Get the nine tensor entries of a single cell.
+ *
+ * Launches @c kernel_get on the one-cell interval [cell, cell] with a
+ * nine-element device buffer and copies that buffer into the returned
+ * matrix.
+ *
+ * @param pdf_field  population field
+ * @param cell       cell to evaluate
+ * @return matrix filled from the nine values written by the kernel
+ */
+Matrix3<double> get(
+    gpu::GPUField<double> const *pdf_field,
+    Cell const &cell) {
+  CellInterval const single_cell(cell, cell);
+  thrust::device_vector<double> tensor_dev(9u);
+  double *const tensor_ptr = thrust::raw_pointer_cast(tensor_dev.data());
+  auto launcher = gpu::make_kernel(kernel_get);
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, single_cell));
+  launcher.addParam(tensor_ptr);
+  launcher();
+  Matrix3<double> tensor;
+  thrust::copy(tensor_dev.begin(), tensor_dev.end(), tensor.data());
+  return tensor;
+}
+
+/**
+ * @brief Get the tensor entries of all cells in a cell interval.
+ *
+ * Launches @c kernel_get on @p ci with a device buffer of nine values per
+ * cell and copies the whole buffer back to the host.
+ *
+ * @param pdf_field  population field
+ * @param ci         cell interval to evaluate
+ * @return flat buffer of size `9 * ci.numCells()`
+ */
+std::vector<double> get(
+    gpu::GPUField<double> const *pdf_field,
+    CellInterval const &ci) {
+  thrust::device_vector<double> tensors_dev(9u * ci.numCells());
+  double *const tensors_ptr = thrust::raw_pointer_cast(tensors_dev.data());
+  auto launcher = gpu::make_kernel(kernel_get);
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
+  launcher.addParam(tensors_ptr);
+  launcher();
+  // host buffer with the same number of entries as the device buffer
+  std::vector<double> tensors(tensors_dev.size());
+  thrust::copy(tensors_dev.begin(), tensors_dev.end(), tensors.data());
+  return tensors;
+}
+} // namespace PressureTensor
+
+} // namespace accessor
+} // namespace lbm
+} // namespace walberla
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecisionCUDA.cuh b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecisionCUDA.cuh
new file mode 100644
index 00000000000..6ac754f263c
--- /dev/null
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecisionCUDA.cuh
@@ -0,0 +1,170 @@
+/*
+ * Copyright (C) 2023-2024 The ESPResSo project
+ * Copyright (C) 2020 The waLBerla project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
+// lbmpy_walberla/pystencils_walberla from waLBerla commit
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
+
+/**
+ * @file
+ * Lattice field accessors.
+ * Adapted from the waLBerla source file
+ * https://i10git.cs.fau.de/walberla/walberla/-/blob/a16141524c58ab88386e2a0f8fdd7c63c5edd704/python/lbmpy_walberla/templates/LatticeModel.tmpl.h
+ */
+
+#pragma once
+
+#include <core/DataTypes.h>
+#include <core/cell/Cell.h>
+#include <core/cell/CellInterval.h>
+#include <core/math/Matrix3.h>
+#include <core/math/Vector3.h>
+
+#include <gpu/GPUField.h>
+
+#include <array>
+#include <tuple>
+#include <vector>
+
+namespace walberla {
+namespace lbm {
+namespace accessor {
+
+/**
+ * @brief Accessors for the populations of a GPU population field.
+ *
+ * Single-cell overloads exchange a fixed-size array of 19 values;
+ * interval overloads exchange a flat `std::vector<double>`.
+ * NOTE(review): the interval buffers presumably hold 19 consecutive
+ * values per cell -- confirm against the definitions.
+ */
+namespace Population {
+/** @brief Get populations from a single cell. */
+std::array<double, 19u> get(gpu::GPUField<double> const *pdf_field,
+                            Cell const &cell);
+/** @brief Set populations on a single cell. */
+void set(gpu::GPUField<double> *pdf_field, std::array<double, 19u> const &pop,
+         Cell const &cell);
+/**
+ * @brief Set populations and recalculate velocities on a single cell.
+ * The force field is only read (const pointer); the velocity field is
+ * written.
+ */
+void set(gpu::GPUField<double> *pdf_field,
+         gpu::GPUField<double> *velocity_field,
+         gpu::GPUField<double> const *force_field,
+         std::array<double, 19u> const &pop, Cell const &cell);
+/** @brief Initialize all cells with the same value. */
+void initialize(gpu::GPUField<double> *pdf_field,
+                std::array<double, 19u> const &pop);
+/** @brief Get populations from a cell interval. */
+std::vector<double> get(gpu::GPUField<double> const *pdf_field,
+                        CellInterval const &ci);
+/** @brief Set populations on a cell interval. */
+void set(gpu::GPUField<double> *pdf_field, std::vector<double> const &values,
+         CellInterval const &ci);
+/**
+ * @brief Set populations and recalculate velocities on a cell interval.
+ * The force field is only read (const pointer); the velocity field is
+ * written.
+ */
+void set(gpu::GPUField<double> *pdf_field,
+         gpu::GPUField<double> *velocity_field,
+         gpu::GPUField<double> const *force_field,
+         std::vector<double> const &values, CellInterval const &ci);
+} // namespace Population
+
+/**
+ * @brief Accessors for a GPU field that is read and written as vectors.
+ *
+ * Single-cell overloads exchange a `Vector3<double>`; interval overloads
+ * exchange a flat `std::vector<double>`.
+ * NOTE(review): the interval buffers presumably hold 3 consecutive values
+ * per cell -- confirm against the definitions.
+ */
+namespace Vector {
+/** @brief Get value from a single cell. */
+Vector3<double> get(gpu::GPUField<double> const *field, Cell const &cell);
+/** @brief Set value on a single cell. */
+void set(gpu::GPUField<double> *field, Vector3<double> const &vec,
+         Cell const &cell);
+/** @brief Add value to a single cell. */
+void add(gpu::GPUField<double> *field, Vector3<double> const &vec,
+         Cell const &cell);
+/** @brief Initialize all cells with the same value. */
+void initialize(gpu::GPUField<double> *field, Vector3<double> const &vec);
+/** @brief Add value to all cells. */
+void add_to_all(gpu::GPUField<double> *field, Vector3<double> const &vec);
+/** @brief Get values from a cell interval. */
+std::vector<double> get(gpu::GPUField<double> const *vec_field,
+                        CellInterval const &ci);
+/** @brief Set values on a cell interval. */
+void set(gpu::GPUField<double> *vec_field, std::vector<double> const &values,
+         CellInterval const &ci);
+
+} // namespace Vector
+
+/**
+ * @brief Position-based accessors for a vector field.
+ *
+ * Both functions take a flat buffer of positions @c pos and an unsigned
+ * parameter @c gl.
+ * NOTE(review): @c pos presumably holds 3 coordinates per position and
+ * @c gl presumably is the number of ghost layers -- confirm against the
+ * definitions, which are not part of this header.
+ */
+namespace Interpolation {
+/** @brief Get one result buffer for the positions in @p pos. */
+std::vector<double> get(gpu::GPUField<double> const *vec_field,
+                        std::vector<double> const &pos, uint gl);
+/**
+ * @brief Apply the values in @p forces at the positions in @p pos.
+ * NOTE(review): the field is passed as a pointer to const although the
+ * function is named `set` -- confirm how the definition writes to it.
+ */
+void set(gpu::GPUField<double> const *vec_field, std::vector<double> const &pos,
+         std::vector<double> const &forces, uint gl);
+} // namespace Interpolation
+
+/**
+ * @brief Density accessors operating on the population field.
+ *
+ * NOTE(review): the interval buffers presumably hold one value per cell --
+ * confirm against the definitions, which are not part of this header.
+ */
+namespace Density {
+/** @brief Get the density on a single cell. */
+double get(gpu::GPUField<double> const *pdf_field, Cell const &cell);
+/** @brief Set the density on a single cell; modifies the population field. */
+void set(gpu::GPUField<double> *pdf_field, double const rho, Cell const &cell);
+/** @brief Get densities from a cell interval. */
+std::vector<double> get(gpu::GPUField<double> const *pdf_field,
+                        CellInterval const &ci);
+/** @brief Set densities on a cell interval; modifies the population field. */
+void set(gpu::GPUField<double> *pdf_field, std::vector<double> const &values,
+         CellInterval const &ci);
+} // namespace Density
+
+/**
+ * @brief Velocity accessors operating on population and force fields.
+ *
+ * The getters only read the population and force fields. The setters
+ * take mutable population and velocity fields and a read-only force field.
+ * NOTE(review): the interval buffers presumably hold 3 consecutive values
+ * per cell -- confirm against the definitions.
+ */
+namespace Velocity {
+/** @brief Get the velocity on a single cell. */
+Vector3<double> get(gpu::GPUField<double> const *pdf_field,
+                    gpu::GPUField<double> const *force_field, Cell const &cell);
+/** @brief Get velocities from a cell interval. */
+std::vector<double> get(gpu::GPUField<double> const *pdf_field,
+                        gpu::GPUField<double> const *force_field,
+                        CellInterval const &ci);
+/** @brief Set the velocity @p u on a single cell. */
+void set(gpu::GPUField<double> *pdf_field,
+         gpu::GPUField<double> *velocity_field,
+         gpu::GPUField<double> const *force_field, Vector3<double> const &u,
+         Cell const &cell);
+/** @brief Set velocities on a cell interval. */
+void set(gpu::GPUField<double> *pdf_field,
+         gpu::GPUField<double> *velocity_field,
+         gpu::GPUField<double> const *force_field,
+         std::vector<double> const &values, CellInterval const &ci);
+} // namespace Velocity
+
+/**
+ * @brief Force setters.
+ *
+ * The population field is only read (const pointer), while the velocity
+ * and force fields are passed as mutable pointers.
+ * NOTE(review): the vector parameter of the single-cell overload is named
+ * @c u although it is handed to the force-setting kernel -- presumably it
+ * carries the force; confirm with the callers.
+ */
+namespace Force {
+/** @brief Set the force on a single cell. */
+void set(gpu::GPUField<double> const *pdf_field,
+         gpu::GPUField<double> *velocity_field,
+         gpu::GPUField<double> *force_field, Vector3<double> const &u,
+         Cell const &cell);
+/**
+ * @brief Set forces on a cell interval.
+ * NOTE(review): @p values presumably holds 3 consecutive values per cell.
+ */
+void set(gpu::GPUField<double> const *pdf_field,
+         gpu::GPUField<double> *velocity_field,
+         gpu::GPUField<double> *force_field, std::vector<double> const &values,
+         CellInterval const &ci);
+} // namespace Force
+
+/** @brief Combined density and velocity accessors for a single cell. */
+namespace DensityAndVelocity {
+/**
+ * @brief Get a (scalar, vector) pair for a single cell.
+ * Going by the namespace name, the pair is (density, velocity).
+ */
+std::tuple<double, Vector3<double>>
+get(gpu::GPUField<double> const *pdf_field,
+    gpu::GPUField<double> const *force_field, Cell const &cell);
+/**
+ * @brief Set velocity @p u and density @p rho on a single cell.
+ * NOTE(review): the force field is passed as a mutable pointer here --
+ * confirm whether the definition writes to it.
+ */
+void set(gpu::GPUField<double> *pdf_field, gpu::GPUField<double> *force_field,
+         Vector3<double> const &u, double const rho, Cell const &cell);
+} // namespace DensityAndVelocity
+
+/** @brief Combined density and momentum density getter for a single cell. */
+namespace DensityAndMomentumDensity {
+/**
+ * @brief Get a (scalar, vector) pair for a single cell.
+ * Going by the namespace name, the pair is (density, momentum density).
+ * Both fields are only read.
+ */
+std::tuple<double, Vector3<double>>
+get(gpu::GPUField<double> const *pdf_field,
+    gpu::GPUField<double> const *force_field, Cell const &cell);
+} // namespace DensityAndMomentumDensity
+
+/** @brief Reduction of the momentum density over a field. */
+namespace MomentumDensity {
+/**
+ * @brief Sum the momentum density over the cells of the population field.
+ * Both fields are only read; the result is returned as a single vector.
+ */
+Vector3<double> reduce(gpu::GPUField<double> const *pdf_field,
+                       gpu::GPUField<double> const *force_field);
+} // namespace MomentumDensity
+
+/** @brief Pressure tensor getters operating on the population field. */
+namespace PressureTensor {
+/** @brief Get the pressure tensor of a single cell as a 3x3 matrix. */
+Matrix3<double> get(gpu::GPUField<double> const *pdf_field, Cell const &cell);
+/**
+ * @brief Get the pressure tensors of a cell interval as a flat buffer.
+ * NOTE(review): presumably 9 consecutive values per cell -- confirm
+ * against the definition.
+ */
+std::vector<double> get(gpu::GPUField<double> const *pdf_field,
+                        CellInterval const &ci);
+} // namespace PressureTensor
+
+} // namespace accessor
+} // namespace lbm
+} // namespace walberla
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecision.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecision.h
index 1790c5b984f..a7711c4f780 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecision.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecision.h
@@ -18,12 +18,11 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-// kernel generated with pystencils v1.2, lbmpy v1.2,
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
 // lbmpy_walberla/pystencils_walberla from waLBerla commit
-// 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
-/**
- * @file
+/*
  * Lattice field accessors.
  * Adapted from the waLBerla source file
  * https://i10git.cs.fau.de/walberla/walberla/-/blob/a16141524c58ab88386e2a0f8fdd7c63c5edd704/python/lbmpy_walberla/templates/LatticeModel.tmpl.h
@@ -42,19 +41,18 @@
 
 #include <array>
 #include <cassert>
+#include <iterator>
 #include <tuple>
 #include <vector>
 
 #ifdef WALBERLA_CXX_COMPILER_IS_GNU
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-variable"
-#pragma GCC diagnostic ignored "-Wunused-parameter"
 #endif
 
 #ifdef WALBERLA_CXX_COMPILER_IS_CLANG
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wunused-variable"
-#pragma clang diagnostic ignored "-Wunused-parameter"
 #endif
 
 namespace walberla {
@@ -62,8 +60,8 @@ namespace lbm {
 namespace accessor {
 
 namespace Population {
-inline std::array<float, 19u>
-get(GhostLayerField<float, uint_t{19u}> const *pdf_field, Cell const &cell) {
+inline auto get(GhostLayerField<float, uint_t{19u}> const *pdf_field,
+                Cell const &cell) {
   float const &xyz0 = pdf_field->get(cell, uint_t{0u});
   std::array<float, 19u> pop;
   pop[0u] = pdf_field->getF(&xyz0, uint_t{0u});
@@ -112,8 +110,56 @@ inline void set(GhostLayerField<float, uint_t{19u}> *pdf_field,
   pdf_field->getF(&xyz0, uint_t{18u}) = pop[18u];
 }
 
-inline void broadcast(GhostLayerField<float, uint_t{19u}> *pdf_field,
-                      std::array<float, 19u> const &pop) {
+/**
+ * Store new populations on a single cell and recalculate its velocity.
+ *
+ * The 19 entries of `pop` are written to the cell of `pdf_field`. The
+ * velocity written to `velocity_field` is the momentum density of these
+ * populations plus half of the force read from `force_field`, divided by
+ * the sum of all populations. There is no guard against a zero sum.
+ */
+inline void set(GhostLayerField<float, uint_t{19u}> *pdf_field,
+                GhostLayerField<float, uint_t{3u}> *velocity_field,
+                GhostLayerField<float, uint_t{3u}> const *force_field,
+                std::array<float, 19u> const &pop, Cell const &cell) {
+  // overwrite the populations of the cell
+  float &pdf_xyz0 = pdf_field->get(cell, uint_t{0u});
+  for (uint_t q = 0u; q < uint_t{19u}; ++q) {
+    pdf_field->getF(&pdf_xyz0, q) = pop[q];
+  }
+  // the values just stored are those of pop, so the moments use it directly
+  auto const &f = pop;
+  float const vel0Term = f[10u] + f[14u] + f[18u] + f[4u] + f[8u];
+  float const momdensity_0 =
+      -f[13u] - f[17u] - f[3u] - f[7u] - f[9u] + vel0Term;
+  float const vel1Term = f[1u] + f[11u] + f[15u] + f[7u];
+  float const momdensity_1 =
+      -f[10u] - f[12u] - f[16u] - f[2u] + f[8u] - f[9u] + vel1Term;
+  float const vel2Term = f[12u] + f[13u] + f[5u];
+  float const momdensity_2 = f[11u] + f[14u] - f[15u] - f[16u] - f[17u] -
+                             f[18u] - f[6u] + vel2Term;
+  float const rho = f[0u] + f[16u] + f[17u] + f[2u] + f[3u] + f[6u] + f[9u] +
+                    vel0Term + vel1Term + vel2Term;
+  // all force components are read before the velocity field is written
+  float const md_0 = force_field->get(cell, uint_t{0u}) * 0.5f + momdensity_0;
+  float const md_1 = force_field->get(cell, uint_t{1u}) * 0.5f + momdensity_1;
+  float const md_2 = force_field->get(cell, uint_t{2u}) * 0.5f + momdensity_2;
+  float const rho_inv = float{1} / rho;
+  velocity_field->get(cell, uint_t{0u}) = md_0 * rho_inv;
+  velocity_field->get(cell, uint_t{1u}) = md_1 * rho_inv;
+  velocity_field->get(cell, uint_t{2u}) = md_2 * rho_inv;
+}
+
+inline void initialize(GhostLayerField<float, uint_t{19u}> *pdf_field,
+                       std::array<float, 19u> const &pop) {
   WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(pdf_field, {
     float &xyz0 = pdf_field->get(x, y, z, uint_t{0u});
     pdf_field->getF(&xyz0, uint_t{0u}) = pop[0u];
@@ -138,9 +184,8 @@ inline void broadcast(GhostLayerField<float, uint_t{19u}> *pdf_field,
   });
 }
 
-inline std::vector<float>
-get(GhostLayerField<float, uint_t{19u}> const *pdf_field,
-    CellInterval const &ci) {
+inline auto get(GhostLayerField<float, uint_t{19u}> const *pdf_field,
+                CellInterval const &ci) {
   std::vector<float> out;
   out.reserve(ci.numCells() * uint_t(19u));
   for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
@@ -175,31 +220,86 @@ get(GhostLayerField<float, uint_t{19u}> const *pdf_field,
 inline void set(GhostLayerField<float, uint_t{19u}> *pdf_field,
                 std::vector<float> const &values, CellInterval const &ci) {
   assert(uint_c(values.size()) == ci.numCells() * uint_t(19u));
-  auto values_ptr = values.data();
+  auto pop = values.data();
+  for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
+    for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+      for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+        float &xyz0 = pdf_field->get(x, y, z, uint_t{0u});
+        pdf_field->getF(&xyz0, uint_t{0u}) = pop[0u];
+        pdf_field->getF(&xyz0, uint_t{1u}) = pop[1u];
+        pdf_field->getF(&xyz0, uint_t{2u}) = pop[2u];
+        pdf_field->getF(&xyz0, uint_t{3u}) = pop[3u];
+        pdf_field->getF(&xyz0, uint_t{4u}) = pop[4u];
+        pdf_field->getF(&xyz0, uint_t{5u}) = pop[5u];
+        pdf_field->getF(&xyz0, uint_t{6u}) = pop[6u];
+        pdf_field->getF(&xyz0, uint_t{7u}) = pop[7u];
+        pdf_field->getF(&xyz0, uint_t{8u}) = pop[8u];
+        pdf_field->getF(&xyz0, uint_t{9u}) = pop[9u];
+        pdf_field->getF(&xyz0, uint_t{10u}) = pop[10u];
+        pdf_field->getF(&xyz0, uint_t{11u}) = pop[11u];
+        pdf_field->getF(&xyz0, uint_t{12u}) = pop[12u];
+        pdf_field->getF(&xyz0, uint_t{13u}) = pop[13u];
+        pdf_field->getF(&xyz0, uint_t{14u}) = pop[14u];
+        pdf_field->getF(&xyz0, uint_t{15u}) = pop[15u];
+        pdf_field->getF(&xyz0, uint_t{16u}) = pop[16u];
+        pdf_field->getF(&xyz0, uint_t{17u}) = pop[17u];
+        pdf_field->getF(&xyz0, uint_t{18u}) = pop[18u];
+        std::advance(pop, 19);
+      }
+    }
+  }
+}
+
+inline void set(GhostLayerField<float, uint_t{19u}> *pdf_field,
+                GhostLayerField<float, uint_t{3u}> *velocity_field,
+                GhostLayerField<float, uint_t{3u}> const *force_field,
+                std::vector<float> const &values, CellInterval const &ci) {
+  assert(uint_c(values.size()) == ci.numCells() * uint_t(19u));
+  auto pop = values.data();
   for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
     for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
       for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
         float &xyz0 = pdf_field->get(x, y, z, uint_t{0u});
-        pdf_field->getF(&xyz0, uint_t{0u}) = values_ptr[0u];
-        pdf_field->getF(&xyz0, uint_t{1u}) = values_ptr[1u];
-        pdf_field->getF(&xyz0, uint_t{2u}) = values_ptr[2u];
-        pdf_field->getF(&xyz0, uint_t{3u}) = values_ptr[3u];
-        pdf_field->getF(&xyz0, uint_t{4u}) = values_ptr[4u];
-        pdf_field->getF(&xyz0, uint_t{5u}) = values_ptr[5u];
-        pdf_field->getF(&xyz0, uint_t{6u}) = values_ptr[6u];
-        pdf_field->getF(&xyz0, uint_t{7u}) = values_ptr[7u];
-        pdf_field->getF(&xyz0, uint_t{8u}) = values_ptr[8u];
-        pdf_field->getF(&xyz0, uint_t{9u}) = values_ptr[9u];
-        pdf_field->getF(&xyz0, uint_t{10u}) = values_ptr[10u];
-        pdf_field->getF(&xyz0, uint_t{11u}) = values_ptr[11u];
-        pdf_field->getF(&xyz0, uint_t{12u}) = values_ptr[12u];
-        pdf_field->getF(&xyz0, uint_t{13u}) = values_ptr[13u];
-        pdf_field->getF(&xyz0, uint_t{14u}) = values_ptr[14u];
-        pdf_field->getF(&xyz0, uint_t{15u}) = values_ptr[15u];
-        pdf_field->getF(&xyz0, uint_t{16u}) = values_ptr[16u];
-        pdf_field->getF(&xyz0, uint_t{17u}) = values_ptr[17u];
-        pdf_field->getF(&xyz0, uint_t{18u}) = values_ptr[18u];
-        values_ptr += 19u;
+        const float f_0 = pdf_field->getF(&xyz0, uint_t{0u}) = pop[0u];
+        const float f_1 = pdf_field->getF(&xyz0, uint_t{1u}) = pop[1u];
+        const float f_2 = pdf_field->getF(&xyz0, uint_t{2u}) = pop[2u];
+        const float f_3 = pdf_field->getF(&xyz0, uint_t{3u}) = pop[3u];
+        const float f_4 = pdf_field->getF(&xyz0, uint_t{4u}) = pop[4u];
+        const float f_5 = pdf_field->getF(&xyz0, uint_t{5u}) = pop[5u];
+        const float f_6 = pdf_field->getF(&xyz0, uint_t{6u}) = pop[6u];
+        const float f_7 = pdf_field->getF(&xyz0, uint_t{7u}) = pop[7u];
+        const float f_8 = pdf_field->getF(&xyz0, uint_t{8u}) = pop[8u];
+        const float f_9 = pdf_field->getF(&xyz0, uint_t{9u}) = pop[9u];
+        const float f_10 = pdf_field->getF(&xyz0, uint_t{10u}) = pop[10u];
+        const float f_11 = pdf_field->getF(&xyz0, uint_t{11u}) = pop[11u];
+        const float f_12 = pdf_field->getF(&xyz0, uint_t{12u}) = pop[12u];
+        const float f_13 = pdf_field->getF(&xyz0, uint_t{13u}) = pop[13u];
+        const float f_14 = pdf_field->getF(&xyz0, uint_t{14u}) = pop[14u];
+        const float f_15 = pdf_field->getF(&xyz0, uint_t{15u}) = pop[15u];
+        const float f_16 = pdf_field->getF(&xyz0, uint_t{16u}) = pop[16u];
+        const float f_17 = pdf_field->getF(&xyz0, uint_t{17u}) = pop[17u];
+        const float f_18 = pdf_field->getF(&xyz0, uint_t{18u}) = pop[18u];
+        const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+        const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+        const float vel1Term = f_1 + f_11 + f_15 + f_7;
+        const float momdensity_1 =
+            -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+        const float vel2Term = f_12 + f_13 + f_5;
+        const float momdensity_2 =
+            f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+        const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term +
+                          vel1Term + vel2Term;
+        const float md_0 =
+            force_field->get(x, y, z, 0) * 0.50000000000000000f + momdensity_0;
+        const float md_1 =
+            force_field->get(x, y, z, 1) * 0.50000000000000000f + momdensity_1;
+        const float md_2 =
+            force_field->get(x, y, z, 2) * 0.50000000000000000f + momdensity_2;
+        const auto rho_inv = float{1} / rho;
+        velocity_field->get(x, y, z, uint_t{0u}) = md_0 * rho_inv;
+        velocity_field->get(x, y, z, uint_t{1u}) = md_1 * rho_inv;
+        velocity_field->get(x, y, z, uint_t{2u}) = md_2 * rho_inv;
+        std::advance(pop, 19);
       }
     }
   }
@@ -207,8 +307,8 @@ inline void set(GhostLayerField<float, uint_t{19u}> *pdf_field,
 } // namespace Population
 
 namespace Vector {
-inline Vector3<float> get(GhostLayerField<float, uint_t{3u}> const *vec_field,
-                          Cell const &cell) {
+inline auto get(GhostLayerField<float, uint_t{3u}> const *vec_field,
+                Cell const &cell) {
   const float &xyz0 = vec_field->get(cell, uint_t{0u});
   Vector3<float> vec;
   vec[0] = vec_field->getF(&xyz0, uint_t{0u});
@@ -233,8 +333,8 @@ inline void add(GhostLayerField<float, uint_t{3u}> *vec_field,
   vec_field->getF(&xyz0, uint_t{2u}) += vec[2u];
 }
 
-inline void broadcast(GhostLayerField<float, uint_t{3u}> *vec_field,
-                      Vector3<float> const &vec) {
+inline void initialize(GhostLayerField<float, uint_t{3u}> *vec_field,
+                       Vector3<float> const &vec) {
   WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(vec_field, {
     float &xyz0 = vec_field->get(x, y, z, uint_t{0u});
     vec_field->getF(&xyz0, uint_t{0u}) = vec[0u];
@@ -253,9 +353,8 @@ inline void add_to_all(GhostLayerField<float, uint_t{3u}> *vec_field,
   });
 }
 
-inline std::vector<float>
-get(GhostLayerField<float, uint_t{3u}> const *vec_field,
-    CellInterval const &ci) {
+inline auto get(GhostLayerField<float, uint_t{3u}> const *vec_field,
+                CellInterval const &ci) {
   std::vector<float> out;
   out.reserve(ci.numCells() * uint_t(3u));
   for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
@@ -282,7 +381,7 @@ inline void set(GhostLayerField<float, uint_t{3u}> *vec_field,
         vec_field->getF(&xyz0, uint_t{0u}) = values_ptr[0u];
         vec_field->getF(&xyz0, uint_t{1u}) = values_ptr[1u];
         vec_field->getF(&xyz0, uint_t{2u}) = values_ptr[2u];
-        values_ptr += 3u;
+        std::advance(values_ptr, 3);
       }
     }
   }
@@ -291,8 +390,8 @@ inline void set(GhostLayerField<float, uint_t{3u}> *vec_field,
 
 namespace EquilibriumDistribution {
 inline float get(stencil::Direction const direction,
-                 Vector3<float> const &u = Vector3<float>(float(0.0)),
-                 float rho = float(1.0)) {
+                 Vector3<float> const &u = Vector3<float>(float{0}),
+                 float rho = float{1}) {
 
   using namespace stencil;
   switch (direction) {
@@ -572,7 +671,7 @@ inline void set(GhostLayerField<float, uint_t{19u}> *pdf_field,
                     vel1Term + vel2Term;
 
   // calculate current velocity (before density change)
-  const float conversion = float(1) / rho;
+  const float conversion = float{1} / rho;
   Vector3<float> velocity;
   velocity[0u] = momdensity_0 * conversion;
   velocity[1u] = momdensity_1 * conversion;
@@ -660,7 +759,7 @@ inline void set(GhostLayerField<float, uint_t{19u}> *pdf_field,
                           vel1Term + vel2Term;
 
         // calculate current velocity (before density change)
-        const float conversion = float(1) / rho;
+        const float conversion = float{1} / rho;
         Vector3<float> velocity;
         velocity[0u] = momdensity_0 * conversion;
         velocity[1u] = momdensity_1 * conversion;
@@ -675,7 +774,108 @@ inline void set(GhostLayerField<float, uint_t{19u}> *pdf_field,
 } // namespace Density
 
 namespace Velocity {
+/**
+ * Get the velocity on a single cell.
+ *
+ * The velocity is the momentum density of the 19 populations plus half of
+ * the force read from `force_field`, divided by the sum of all
+ * populations. There is no guard against a zero sum.
+ * Returns a `Vector3<float>`.
+ */
+inline auto get(GhostLayerField<float, uint_t{19u}> const *pdf_field,
+                GhostLayerField<float, uint_t{3u}> const *force_field,
+                Cell const &cell) {
+  // load the populations of the cell
+  float const &pdf_xyz0 = pdf_field->get(cell, uint_t{0u});
+  std::array<float, 19u> f;
+  for (uint_t q = 0u; q < uint_t{19u}; ++q) {
+    f[q] = pdf_field->getF(&pdf_xyz0, q);
+  }
+  float const vel0Term = f[10u] + f[14u] + f[18u] + f[4u] + f[8u];
+  float const momdensity_0 =
+      -f[13u] - f[17u] - f[3u] - f[7u] - f[9u] + vel0Term;
+  float const vel1Term = f[1u] + f[11u] + f[15u] + f[7u];
+  float const momdensity_1 =
+      -f[10u] - f[12u] - f[16u] - f[2u] + f[8u] - f[9u] + vel1Term;
+  float const vel2Term = f[12u] + f[13u] + f[5u];
+  float const momdensity_2 = f[11u] + f[14u] - f[15u] - f[16u] - f[17u] -
+                             f[18u] - f[6u] + vel2Term;
+  float const rho = f[0u] + f[16u] + f[17u] + f[2u] + f[3u] + f[6u] + f[9u] +
+                    vel0Term + vel1Term + vel2Term;
+  // add half of the force to the momentum density
+  float const md_0 = force_field->get(cell, uint_t{0u}) * 0.5f + momdensity_0;
+  float const md_1 = force_field->get(cell, uint_t{1u}) * 0.5f + momdensity_1;
+  float const md_2 = force_field->get(cell, uint_t{2u}) * 0.5f + momdensity_2;
+  float const rho_inv = float{1} / rho;
+
+  return Vector3<float>(md_0 * rho_inv, md_1 * rho_inv, md_2 * rho_inv);
+}
+
+/**
+ * Get the velocities of all cells in a cell interval.
+ *
+ * Cells are visited with x as the outermost and z as the innermost loop;
+ * three values per cell are appended to the returned `std::vector<float>`.
+ * Each velocity is the momentum density of the 19 populations plus half
+ * of the force read from `force_field`, divided by the sum of all
+ * populations. There is no guard against a zero sum.
+ */
+inline auto get(GhostLayerField<float, uint_t{19u}> const *pdf_field,
+                GhostLayerField<float, uint_t{3u}> const *force_field,
+                CellInterval const &ci) {
+  std::vector<float> velocities;
+  velocities.reserve(ci.numCells() * uint_t(3u));
+  for (auto x = ci.xMin(); x <= ci.xMax(); ++x) {
+    for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+      for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+        // load the populations of the cell
+        float const &pdf_xyz0 = pdf_field->get(x, y, z, uint_t{0u});
+        std::array<float, 19u> f;
+        for (uint_t q = 0u; q < uint_t{19u}; ++q) {
+          f[q] = pdf_field->getF(&pdf_xyz0, q);
+        }
+        float const vel0Term = f[10u] + f[14u] + f[18u] + f[4u] + f[8u];
+        float const momdensity_0 =
+            -f[13u] - f[17u] - f[3u] - f[7u] - f[9u] + vel0Term;
+        float const vel1Term = f[1u] + f[11u] + f[15u] + f[7u];
+        float const momdensity_1 =
+            -f[10u] - f[12u] - f[16u] - f[2u] + f[8u] - f[9u] + vel1Term;
+        float const vel2Term = f[12u] + f[13u] + f[5u];
+        float const momdensity_2 = f[11u] + f[14u] - f[15u] - f[16u] -
+                                   f[17u] - f[18u] - f[6u] + vel2Term;
+        float const rho = f[0u] + f[16u] + f[17u] + f[2u] + f[3u] + f[6u] +
+                          f[9u] + vel0Term + vel1Term + vel2Term;
+        // add half of the force to the momentum density
+        float const md_0 =
+            force_field->get(x, y, z, uint_t{0u}) * 0.5f + momdensity_0;
+        float const md_1 =
+            force_field->get(x, y, z, uint_t{1u}) * 0.5f + momdensity_1;
+        float const md_2 =
+            force_field->get(x, y, z, uint_t{2u}) * 0.5f + momdensity_2;
+        float const rho_inv = float{1} / rho;
+        velocities.push_back(md_0 * rho_inv);
+        velocities.push_back(md_1 * rho_inv);
+        velocities.push_back(md_2 * rho_inv);
+      }
+    }
+  }
+  return velocities;
+}
+
 inline void set(GhostLayerField<float, uint_t{19u}> *pdf_field,
+                GhostLayerField<float, uint_t{3u}> *velocity_field,
                 GhostLayerField<float, uint_t{3u}> const *force_field,
                 Vector3<float> const &u, Cell const &cell) {
   const float &xyz0 = pdf_field->get(cell, uint_t{0u});
@@ -713,15 +913,183 @@ inline void set(GhostLayerField<float, uint_t{19u}> *pdf_field,
       -force_field->get(x, y, z, 1) * 0.50000000000000000f / rho + u[1];
   const float u_2 =
       -force_field->get(x, y, z, 2) * 0.50000000000000000f / rho + u[2];
+  velocity_field->get(x, y, z, uint_t{0u}) = u[0u];
+  velocity_field->get(x, y, z, uint_t{1u}) = u[1u];
+  velocity_field->get(x, y, z, uint_t{2u}) = u[2u];
 
   Equilibrium::set(pdf_field, Vector3<float>(u_0, u_1, u_2), rho, cell);
 }
+
+inline void set(GhostLayerField<float, uint_t{19u}> *pdf_field, // Set the velocity of every cell in ci and rebuild its PDFs.
+                GhostLayerField<float, uint_t{3u}> *velocity_field, // receives the requested velocities
+                GhostLayerField<float, uint_t{3u}> const *force_field, // read-only; used for the half-force shift
+                std::vector<float> const &values, CellInterval const &ci) { // values: 3 floats per cell
+  assert(uint_c(values.size()) == ci.numCells() * uint_t(3u));
+  auto u = values.data(); // cursor into values, advanced by 3 after each cell
+  for (auto x = ci.xMin(); x <= ci.xMax(); ++x) { // x is the slowest index, z the fastest
+    for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+      for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+        float &pdf_xyz0 = pdf_field->get(x, y, z, uint_t{0u});
+        float &vel_xyz0 = velocity_field->get(x, y, z, uint_t{0u});
+        const float f_0 = pdf_field->getF(&pdf_xyz0, uint_t{0u});
+        const float f_1 = pdf_field->getF(&pdf_xyz0, uint_t{1u});
+        const float f_2 = pdf_field->getF(&pdf_xyz0, uint_t{2u});
+        const float f_3 = pdf_field->getF(&pdf_xyz0, uint_t{3u});
+        const float f_4 = pdf_field->getF(&pdf_xyz0, uint_t{4u});
+        const float f_5 = pdf_field->getF(&pdf_xyz0, uint_t{5u});
+        const float f_6 = pdf_field->getF(&pdf_xyz0, uint_t{6u});
+        const float f_7 = pdf_field->getF(&pdf_xyz0, uint_t{7u});
+        const float f_8 = pdf_field->getF(&pdf_xyz0, uint_t{8u});
+        const float f_9 = pdf_field->getF(&pdf_xyz0, uint_t{9u});
+        const float f_10 = pdf_field->getF(&pdf_xyz0, uint_t{10u});
+        const float f_11 = pdf_field->getF(&pdf_xyz0, uint_t{11u});
+        const float f_12 = pdf_field->getF(&pdf_xyz0, uint_t{12u});
+        const float f_13 = pdf_field->getF(&pdf_xyz0, uint_t{13u});
+        const float f_14 = pdf_field->getF(&pdf_xyz0, uint_t{14u});
+        const float f_15 = pdf_field->getF(&pdf_xyz0, uint_t{15u});
+        const float f_16 = pdf_field->getF(&pdf_xyz0, uint_t{16u});
+        const float f_17 = pdf_field->getF(&pdf_xyz0, uint_t{17u});
+        const float f_18 = pdf_field->getF(&pdf_xyz0, uint_t{18u});
+        const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+        const float vel1Term = f_1 + f_11 + f_15 + f_7;
+        const float vel2Term = f_12 + f_13 + f_5;
+        const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + // rho is the sum of all 19 populations
+                          vel1Term + vel2Term;
+
+        const float u_0 = // requested velocity minus half the force divided by rho
+            -force_field->get(x, y, z, 0) * 0.50000000000000000f / rho + u[0];
+        const float u_1 =
+            -force_field->get(x, y, z, 1) * 0.50000000000000000f / rho + u[1];
+        const float u_2 =
+            -force_field->get(x, y, z, 2) * 0.50000000000000000f / rho + u[2];
+        velocity_field->getF(&vel_xyz0, uint_t{0u}) = u[0u]; // the velocity field stores the requested (unshifted) value
+        velocity_field->getF(&vel_xyz0, uint_t{1u}) = u[1u];
+        velocity_field->getF(&vel_xyz0, uint_t{2u}) = u[2u];
+
+        std::advance(u, 3); // move on to the next cell's triple
+
+        Equilibrium::set(pdf_field, Vector3<float>(u_0, u_1, u_2), rho, // called with the shifted velocity and the rho read above
+                         Cell{x, y, z});
+      }
+    }
+  }
+}
 } // namespace Velocity
 
+namespace Force {
+inline void set(GhostLayerField<float, uint_t{19u}> const *pdf_field, // Store a force on one cell and refresh that cell's velocity; PDFs are only read.
+                GhostLayerField<float, uint_t{3u}> *velocity_field, // overwritten with the recomputed velocity
+                GhostLayerField<float, uint_t{3u}> *force_field, // overwritten with force
+                Vector3<float> const &force, Cell const &cell) {
+  float const &pdf_xyz0 = pdf_field->get(cell, uint_t{0u});
+  float &vel_xyz0 = velocity_field->get(cell, uint_t{0u});
+  float &laf_xyz0 = force_field->get(cell, uint_t{0u});
+  const float f_0 = pdf_field->getF(&pdf_xyz0, uint_t{0u});
+  const float f_1 = pdf_field->getF(&pdf_xyz0, uint_t{1u});
+  const float f_2 = pdf_field->getF(&pdf_xyz0, uint_t{2u});
+  const float f_3 = pdf_field->getF(&pdf_xyz0, uint_t{3u});
+  const float f_4 = pdf_field->getF(&pdf_xyz0, uint_t{4u});
+  const float f_5 = pdf_field->getF(&pdf_xyz0, uint_t{5u});
+  const float f_6 = pdf_field->getF(&pdf_xyz0, uint_t{6u});
+  const float f_7 = pdf_field->getF(&pdf_xyz0, uint_t{7u});
+  const float f_8 = pdf_field->getF(&pdf_xyz0, uint_t{8u});
+  const float f_9 = pdf_field->getF(&pdf_xyz0, uint_t{9u});
+  const float f_10 = pdf_field->getF(&pdf_xyz0, uint_t{10u});
+  const float f_11 = pdf_field->getF(&pdf_xyz0, uint_t{11u});
+  const float f_12 = pdf_field->getF(&pdf_xyz0, uint_t{12u});
+  const float f_13 = pdf_field->getF(&pdf_xyz0, uint_t{13u});
+  const float f_14 = pdf_field->getF(&pdf_xyz0, uint_t{14u});
+  const float f_15 = pdf_field->getF(&pdf_xyz0, uint_t{15u});
+  const float f_16 = pdf_field->getF(&pdf_xyz0, uint_t{16u});
+  const float f_17 = pdf_field->getF(&pdf_xyz0, uint_t{17u});
+  const float f_18 = pdf_field->getF(&pdf_xyz0, uint_t{18u});
+  const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+  const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+  const float vel1Term = f_1 + f_11 + f_15 + f_7;
+  const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+  const float vel2Term = f_12 + f_13 + f_5;
+  const float momdensity_2 =
+      f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+  const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + // rho is the sum of all 19 populations
+                    vel1Term + vel2Term;
+  const float md_0 = force[0u] * 0.50000000000000000f + momdensity_0; // uses the new force argument, not the old field value
+  const float md_1 = force[1u] * 0.50000000000000000f + momdensity_1;
+  const float md_2 = force[2u] * 0.50000000000000000f + momdensity_2;
+  auto const rho_inv = float{1} / rho;
+
+  force_field->getF(&laf_xyz0, uint_t{0u}) = force[0u];
+  force_field->getF(&laf_xyz0, uint_t{1u}) = force[1u];
+  force_field->getF(&laf_xyz0, uint_t{2u}) = force[2u];
+
+  velocity_field->getF(&vel_xyz0, uint_t{0u}) = md_0 * rho_inv; // velocity = (momentum density + force / 2) / rho
+  velocity_field->getF(&vel_xyz0, uint_t{1u}) = md_1 * rho_inv;
+  velocity_field->getF(&vel_xyz0, uint_t{2u}) = md_2 * rho_inv;
+}
+
+inline void set(GhostLayerField<float, uint_t{19u}> const *pdf_field, // Store one force per cell of ci and refresh each cell's velocity; PDFs are only read.
+                GhostLayerField<float, uint_t{3u}> *velocity_field, // overwritten with the recomputed velocities
+                GhostLayerField<float, uint_t{3u}> *force_field, // overwritten with the values triples
+                std::vector<float> const &values, CellInterval const &ci) { // values: 3 floats per cell
+  assert(uint_c(values.size()) == ci.numCells() * uint_t(3u));
+  auto force = values.data(); // cursor into values, advanced by 3 after each cell
+  for (auto x = ci.xMin(); x <= ci.xMax(); ++x) { // x is the slowest index, z the fastest
+    for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+      for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+        float const &pdf_xyz0 = pdf_field->get(x, y, z, uint_t{0u});
+        float &vel_xyz0 = velocity_field->get(x, y, z, uint_t{0u});
+        float &laf_xyz0 = force_field->get(x, y, z, uint_t{0u});
+        const float f_0 = pdf_field->getF(&pdf_xyz0, uint_t{0u});
+        const float f_1 = pdf_field->getF(&pdf_xyz0, uint_t{1u});
+        const float f_2 = pdf_field->getF(&pdf_xyz0, uint_t{2u});
+        const float f_3 = pdf_field->getF(&pdf_xyz0, uint_t{3u});
+        const float f_4 = pdf_field->getF(&pdf_xyz0, uint_t{4u});
+        const float f_5 = pdf_field->getF(&pdf_xyz0, uint_t{5u});
+        const float f_6 = pdf_field->getF(&pdf_xyz0, uint_t{6u});
+        const float f_7 = pdf_field->getF(&pdf_xyz0, uint_t{7u});
+        const float f_8 = pdf_field->getF(&pdf_xyz0, uint_t{8u});
+        const float f_9 = pdf_field->getF(&pdf_xyz0, uint_t{9u});
+        const float f_10 = pdf_field->getF(&pdf_xyz0, uint_t{10u});
+        const float f_11 = pdf_field->getF(&pdf_xyz0, uint_t{11u});
+        const float f_12 = pdf_field->getF(&pdf_xyz0, uint_t{12u});
+        const float f_13 = pdf_field->getF(&pdf_xyz0, uint_t{13u});
+        const float f_14 = pdf_field->getF(&pdf_xyz0, uint_t{14u});
+        const float f_15 = pdf_field->getF(&pdf_xyz0, uint_t{15u});
+        const float f_16 = pdf_field->getF(&pdf_xyz0, uint_t{16u});
+        const float f_17 = pdf_field->getF(&pdf_xyz0, uint_t{17u});
+        const float f_18 = pdf_field->getF(&pdf_xyz0, uint_t{18u});
+        const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+        const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+        const float vel1Term = f_1 + f_11 + f_15 + f_7;
+        const float momdensity_1 =
+            -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+        const float vel2Term = f_12 + f_13 + f_5;
+        const float momdensity_2 =
+            f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+        const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + // rho is the sum of all 19 populations
+                          vel1Term + vel2Term;
+        const float md_0 = force[0u] * 0.50000000000000000f + momdensity_0; // uses the new force triple, not the old field value
+        const float md_1 = force[1u] * 0.50000000000000000f + momdensity_1;
+        const float md_2 = force[2u] * 0.50000000000000000f + momdensity_2;
+        auto const rho_inv = float{1} / rho;
+
+        force_field->getF(&laf_xyz0, uint_t{0u}) = force[0u];
+        force_field->getF(&laf_xyz0, uint_t{1u}) = force[1u];
+        force_field->getF(&laf_xyz0, uint_t{2u}) = force[2u];
+
+        velocity_field->getF(&vel_xyz0, uint_t{0u}) = md_0 * rho_inv; // velocity = (momentum density + force / 2) / rho
+        velocity_field->getF(&vel_xyz0, uint_t{1u}) = md_1 * rho_inv;
+        velocity_field->getF(&vel_xyz0, uint_t{2u}) = md_2 * rho_inv;
+
+        std::advance(force, 3); // move on to the next cell's triple
+      }
+    }
+  }
+}
+} // namespace Force
+
 namespace MomentumDensity {
-inline Vector3<float>
-reduce(GhostLayerField<float, uint_t{19u}> const *pdf_field,
-       GhostLayerField<float, uint_t{3u}> const *force_field) {
+inline auto reduce(GhostLayerField<float, uint_t{19u}> const *pdf_field,
+                   GhostLayerField<float, uint_t{3u}> const *force_field) {
   Vector3<float> momentumDensity(float{0});
   WALBERLA_FOR_ALL_CELLS_XYZ(pdf_field, {
     const float &xyz0 = pdf_field->get(x, y, z, uint_t{0u});
@@ -769,8 +1137,8 @@ reduce(GhostLayerField<float, uint_t{19u}> const *pdf_field,
 } // namespace MomentumDensity
 
 namespace PressureTensor {
-inline Matrix3<float> get(GhostLayerField<float, uint_t{19u}> const *pdf_field,
-                          Cell const &cell) {
+inline auto get(GhostLayerField<float, uint_t{19u}> const *pdf_field,
+                Cell const &cell) {
   const float &xyz0 = pdf_field->get(cell, uint_t{0u});
   const float f_0 = pdf_field->getF(&xyz0, uint_t{0u});
   const float f_1 = pdf_field->getF(&xyz0, uint_t{1u});
@@ -819,6 +1187,63 @@ inline Matrix3<float> get(GhostLayerField<float, uint_t{19u}> const *pdf_field,
 
   return pressureTensor;
 }
+
+inline auto get(GhostLayerField<float, uint_t{19u}> const *pdf_field, // Collect the 3x3 tensor p_0..p_8 of every cell in ci.
+                CellInterval const &ci) { // returns std::vector<float> holding 9 values per cell
+  std::vector<float> out;
+  out.reserve(ci.numCells() * uint_t(9u));
+  for (auto x = ci.xMin(); x <= ci.xMax(); ++x) { // x is the slowest index, z the fastest
+    for (auto y = ci.yMin(); y <= ci.yMax(); ++y) {
+      for (auto z = ci.zMin(); z <= ci.zMax(); ++z) {
+        const float &xyz0 = pdf_field->get(x, y, z, uint_t{0u});
+        const float f_0 = pdf_field->getF(&xyz0, uint_t{0u}); // f_0 does not enter any p_i below
+        const float f_1 = pdf_field->getF(&xyz0, uint_t{1u});
+        const float f_2 = pdf_field->getF(&xyz0, uint_t{2u});
+        const float f_3 = pdf_field->getF(&xyz0, uint_t{3u});
+        const float f_4 = pdf_field->getF(&xyz0, uint_t{4u});
+        const float f_5 = pdf_field->getF(&xyz0, uint_t{5u});
+        const float f_6 = pdf_field->getF(&xyz0, uint_t{6u});
+        const float f_7 = pdf_field->getF(&xyz0, uint_t{7u});
+        const float f_8 = pdf_field->getF(&xyz0, uint_t{8u});
+        const float f_9 = pdf_field->getF(&xyz0, uint_t{9u});
+        const float f_10 = pdf_field->getF(&xyz0, uint_t{10u});
+        const float f_11 = pdf_field->getF(&xyz0, uint_t{11u});
+        const float f_12 = pdf_field->getF(&xyz0, uint_t{12u});
+        const float f_13 = pdf_field->getF(&xyz0, uint_t{13u});
+        const float f_14 = pdf_field->getF(&xyz0, uint_t{14u});
+        const float f_15 = pdf_field->getF(&xyz0, uint_t{15u});
+        const float f_16 = pdf_field->getF(&xyz0, uint_t{16u});
+        const float f_17 = pdf_field->getF(&xyz0, uint_t{17u});
+        const float f_18 = pdf_field->getF(&xyz0, uint_t{18u});
+        const float p_0 =
+            f_10 + f_13 + f_14 + f_17 + f_18 + f_3 + f_4 + f_7 + f_8 + f_9;
+        const float p_1 = -f_10 - f_7 + f_8 + f_9;
+        const float p_2 = -f_13 + f_14 + f_17 - f_18;
+        const float p_3 = -f_10 - f_7 + f_8 + f_9; // same expression as p_1
+        const float p_4 =
+            f_1 + f_10 + f_11 + f_12 + f_15 + f_16 + f_2 + f_7 + f_8 + f_9;
+        const float p_5 = f_11 - f_12 - f_15 + f_16;
+        const float p_6 = -f_13 + f_14 + f_17 - f_18; // same expression as p_2
+        const float p_7 = f_11 - f_12 - f_15 + f_16; // same expression as p_5
+        const float p_8 =
+            f_11 + f_12 + f_13 + f_14 + f_15 + f_16 + f_17 + f_18 + f_5 + f_6;
+
+        out.emplace_back(p_0); // appended in index order p_0..p_8, three per group
+        out.emplace_back(p_1);
+        out.emplace_back(p_2);
+
+        out.emplace_back(p_3);
+        out.emplace_back(p_4);
+        out.emplace_back(p_5);
+
+        out.emplace_back(p_6);
+        out.emplace_back(p_7);
+        out.emplace_back(p_8);
+      }
+    }
+  }
+  return out;
+}
 } // namespace PressureTensor
 
 } // namespace accessor
@@ -831,4 +1256,4 @@ inline Matrix3<float> get(GhostLayerField<float, uint_t{19u}> const *pdf_field,
 
 #ifdef WALBERLA_CXX_COMPILER_IS_CLANG
 #pragma clang diagnostic pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecisionCUDA.cu b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecisionCUDA.cu
new file mode 100644
index 00000000000..e0c8dd81027
--- /dev/null
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecisionCUDA.cu
@@ -0,0 +1,1198 @@
+/*
+ * Copyright (C) 2023-2024 The ESPResSo project
+ * Copyright (C) 2020 The waLBerla project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
+
+/**
+ * @file
+ * Lattice field accessors.
+ * Adapted from the waLBerla source file
+ * https://i10git.cs.fau.de/walberla/walberla/-/blob/a16141524c58ab88386e2a0f8fdd7c63c5edd704/python/lbmpy_walberla/templates/LatticeModel.tmpl.h
+ */
+
+#include <core/DataTypes.h>
+#include <core/cell/Cell.h>
+#include <core/cell/CellInterval.h>
+#include <core/math/Matrix3.h>
+#include <core/math/Vector3.h>
+
+#include <field/iterators/IteratorMacros.h>
+
+#include <gpu/FieldAccessor.h>
+#include <gpu/FieldIndexing.h>
+#include <gpu/GPUField.h>
+#include <gpu/Kernel.h>
+
+#include <thrust/device_ptr.h>
+#include <thrust/device_vector.h>
+
+#include <array>
+#include <vector>
+
+#if defined(__NVCC__)
+#define RESTRICT __restrict__
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress 177 // unused variable
+#elif defined(__clang__)
+#if defined(__CUDA__)
+#if defined(__CUDA_ARCH__)
+// clang compiling CUDA code in device mode
+#define RESTRICT __restrict__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wunused-variable"
+#else
+// clang compiling CUDA code in host mode
+#define RESTRICT __restrict__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wunused-variable"
+#endif
+#endif
+#elif defined(__GNUC__) or defined(__GNUG__)
+#define RESTRICT __restrict__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#elif defined(_MSC_VER)
+#define RESTRICT __restrict
+#else
+#define RESTRICT
+#endif
+
+/** @brief Get linear index of flattened data with original layout @c fzyx. */
+static __forceinline__ __device__ uint getLinearIndex(uint3 blockIdx, uint3 threadIdx, uint3 gridDim, uint3 blockDim, uint fOffset) { // blockDim is accepted but not used
+  auto const x = threadIdx.x; // x comes from the thread index, y/z/f from the block index
+  auto const y = blockIdx.x;
+  auto const z = blockIdx.y;
+  auto const f = blockIdx.z;
+  auto const ySize = gridDim.x;
+  auto const zSize = gridDim.y;
+  auto const fSize = fOffset; // callers in this file pass the number of values per cell (19u or 3u)
+  return f + // f is fastest, then z, then y; x has the largest stride
+         z * fSize +
+         y * fSize * zSize +
+         x * fSize * zSize * ySize;
+}
+
+namespace walberla {
+namespace lbm {
+namespace accessor {
+
+namespace Population {
+// LCOV_EXCL_START
+__global__ void kernel_get( // Copy the 19 populations of this thread's cell into the flat buffer pop.
+    gpu::FieldAccessor<float> pdf,
+    float *RESTRICT pop) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 19u); // start of this cell's 19 slots in pop
+  pdf.set(blockIdx, threadIdx); // presumably positions the accessor on this thread's cell (waLBerla API, not shown here)
+  pop += offset; // pointer arithmetic only; pop is dereferenced solely inside the validity check
+  if (pdf.isValidPosition()) {
+    pop[0u] = pdf.get(0u);
+    pop[1u] = pdf.get(1u);
+    pop[2u] = pdf.get(2u);
+    pop[3u] = pdf.get(3u);
+    pop[4u] = pdf.get(4u);
+    pop[5u] = pdf.get(5u);
+    pop[6u] = pdf.get(6u);
+    pop[7u] = pdf.get(7u);
+    pop[8u] = pdf.get(8u);
+    pop[9u] = pdf.get(9u);
+    pop[10u] = pdf.get(10u);
+    pop[11u] = pdf.get(11u);
+    pop[12u] = pdf.get(12u);
+    pop[13u] = pdf.get(13u);
+    pop[14u] = pdf.get(14u);
+    pop[15u] = pdf.get(15u);
+    pop[16u] = pdf.get(16u);
+    pop[17u] = pdf.get(17u);
+    pop[18u] = pdf.get(18u);
+  }
+}
+
+__global__ void kernel_set( // Overwrite the 19 populations of this thread's cell from the flat buffer pop.
+    gpu::FieldAccessor<float> pdf,
+    float const *RESTRICT pop) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 19u); // start of this cell's 19 slots in pop
+  pdf.set(blockIdx, threadIdx); // presumably positions the accessor on this thread's cell (waLBerla API, not shown here)
+  pop += offset; // pointer arithmetic only; pop is dereferenced solely inside the validity check
+  if (pdf.isValidPosition()) {
+    pdf.get(0u) = pop[0u];
+    pdf.get(1u) = pop[1u];
+    pdf.get(2u) = pop[2u];
+    pdf.get(3u) = pop[3u];
+    pdf.get(4u) = pop[4u];
+    pdf.get(5u) = pop[5u];
+    pdf.get(6u) = pop[6u];
+    pdf.get(7u) = pop[7u];
+    pdf.get(8u) = pop[8u];
+    pdf.get(9u) = pop[9u];
+    pdf.get(10u) = pop[10u];
+    pdf.get(11u) = pop[11u];
+    pdf.get(12u) = pop[12u];
+    pdf.get(13u) = pop[13u];
+    pdf.get(14u) = pop[14u];
+    pdf.get(15u) = pop[15u];
+    pdf.get(16u) = pop[16u];
+    pdf.get(17u) = pop[17u];
+    pdf.get(18u) = pop[18u];
+  }
+}
+
+__global__ void kernel_broadcast( // Write the same 19 values pop[0..18] into every valid cell.
+    gpu::FieldAccessor<float> pdf,
+    float const *RESTRICT pop) { // no per-cell offset is applied: every thread reads the same 19 entries
+  pdf.set(blockIdx, threadIdx); // presumably positions the accessor on this thread's cell (waLBerla API, not shown here)
+  if (pdf.isValidPosition()) {
+    pdf.get(0u) = pop[0u];
+    pdf.get(1u) = pop[1u];
+    pdf.get(2u) = pop[2u];
+    pdf.get(3u) = pop[3u];
+    pdf.get(4u) = pop[4u];
+    pdf.get(5u) = pop[5u];
+    pdf.get(6u) = pop[6u];
+    pdf.get(7u) = pop[7u];
+    pdf.get(8u) = pop[8u];
+    pdf.get(9u) = pop[9u];
+    pdf.get(10u) = pop[10u];
+    pdf.get(11u) = pop[11u];
+    pdf.get(12u) = pop[12u];
+    pdf.get(13u) = pop[13u];
+    pdf.get(14u) = pop[14u];
+    pdf.get(15u) = pop[15u];
+    pdf.get(16u) = pop[16u];
+    pdf.get(17u) = pop[17u];
+    pdf.get(18u) = pop[18u];
+  }
+}
+
+__global__ void kernel_set_vel( // Overwrite a cell's 19 populations and recompute its velocity from them and the force field.
+    gpu::FieldAccessor<float> pdf,
+    gpu::FieldAccessor<float> velocity, // written
+    gpu::FieldAccessor<float> force, // only read
+    float const *RESTRICT pop) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 19u); // start of this cell's 19 slots in pop
+  pdf.set(blockIdx, threadIdx); // presumably positions each accessor on this thread's cell (waLBerla API, not shown here)
+  velocity.set(blockIdx, threadIdx);
+  force.set(blockIdx, threadIdx);
+  pop += offset; // pointer arithmetic only; pop is dereferenced solely inside the validity check
+  if (pdf.isValidPosition()) { // only pdf is checked; the other accessors were set with the same indices
+    const float f_0 = pdf.get(0u) = pop[0u]; // store into the field and keep a local copy in one statement
+    const float f_1 = pdf.get(1u) = pop[1u];
+    const float f_2 = pdf.get(2u) = pop[2u];
+    const float f_3 = pdf.get(3u) = pop[3u];
+    const float f_4 = pdf.get(4u) = pop[4u];
+    const float f_5 = pdf.get(5u) = pop[5u];
+    const float f_6 = pdf.get(6u) = pop[6u];
+    const float f_7 = pdf.get(7u) = pop[7u];
+    const float f_8 = pdf.get(8u) = pop[8u];
+    const float f_9 = pdf.get(9u) = pop[9u];
+    const float f_10 = pdf.get(10u) = pop[10u];
+    const float f_11 = pdf.get(11u) = pop[11u];
+    const float f_12 = pdf.get(12u) = pop[12u];
+    const float f_13 = pdf.get(13u) = pop[13u];
+    const float f_14 = pdf.get(14u) = pop[14u];
+    const float f_15 = pdf.get(15u) = pop[15u];
+    const float f_16 = pdf.get(16u) = pop[16u];
+    const float f_17 = pdf.get(17u) = pop[17u];
+    const float f_18 = pdf.get(18u) = pop[18u];
+    const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+    const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
+    const float vel1Term = f_1 + f_11 + f_15 + f_7;
+    const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
+    const float vel2Term = f_12 + f_13 + f_5;
+    const float momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
+    const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term; // sum of all 19 populations
+    const float md_0 = force.get(0) * 0.50000000000000000f + momdensity_0;
+    const float md_1 = force.get(1) * 0.50000000000000000f + momdensity_1;
+    const float md_2 = force.get(2) * 0.50000000000000000f + momdensity_2;
+    const float rho_inv = float{1} / rho;
+    velocity.get(0u) = md_0 * rho_inv; // velocity = (momentum density + force / 2) / rho
+    velocity.get(1u) = md_1 * rho_inv;
+    velocity.get(2u) = md_2 * rho_inv;
+  }
+}
+// LCOV_EXCL_STOP
+
+std::array<float, 19u> get( // Read the 19 populations of one cell of a GPU field into a host array.
+    gpu::GPUField<float> const *pdf_field,
+    Cell const &cell) {
+  CellInterval ci(cell, cell); // interval containing only the requested cell
+  thrust::device_vector<float> dev_data(19u); // device-side staging buffer for one cell
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_get);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
+  kernel.addParam(dev_data_ptr);
+  kernel(); // run kernel_get over ci
+  std::array<float, 19u> pop;
+  thrust::copy(dev_data.begin(), dev_data.end(), pop.data()); // device-to-host copy of the result
+  return pop;
+}
+
+void set( // Overwrite the 19 populations of one cell of a GPU field.
+    gpu::GPUField<float> *pdf_field,
+    std::array<float, 19u> const &pop,
+    Cell const &cell) {
+  thrust::device_vector<float> dev_data(pop.begin(), pop.end()); // host-to-device copy of the 19 values
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  CellInterval ci(cell, cell); // interval containing only the requested cell
+  auto kernel = gpu::make_kernel(kernel_set);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
+  kernel.addParam(const_cast<const float *>(dev_data_ptr)); // cast matches the kernel's float const * parameter
+  kernel(); // run kernel_set over ci
+}
+
+void set( // Overwrite the 19 populations of one cell and refresh that cell's velocity (kernel_set_vel).
+    gpu::GPUField<float> *pdf_field,
+    gpu::GPUField<float> *velocity_field,
+    gpu::GPUField<float> const *force_field,
+    std::array<float, 19u> const &pop,
+    Cell const &cell) {
+  thrust::device_vector<float> dev_data(pop.begin(), pop.end()); // host-to-device copy of the 19 values
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  CellInterval ci(cell, cell); // interval containing only the requested cell
+  auto kernel = gpu::make_kernel(kernel_set_vel);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci)); // parameters are added in the order of the kernel signature
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
+  kernel.addParam(const_cast<const float *>(dev_data_ptr));
+  kernel(); // run kernel_set_vel over ci
+}
+
+void initialize( // Fill every cell of the field with the same 19 populations (kernel_broadcast).
+    gpu::GPUField<float> *pdf_field,
+    std::array<float, 19u> const &pop) {
+  CellInterval ci = pdf_field->xyzSizeWithGhostLayer(); // target interval is whatever xyzSizeWithGhostLayer() reports
+  thrust::device_vector<float> dev_data(pop.begin(), pop.end()); // host-to-device copy of the 19 values
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_broadcast);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
+  kernel.addParam(const_cast<const float *>(dev_data_ptr));
+  kernel(); // run kernel_broadcast over ci
+}
+
+std::vector<float> get( // Read the populations of all cells in ci into a host vector, 19 values per cell.
+    gpu::GPUField<float> const *pdf_field,
+    CellInterval const &ci) {
+  thrust::device_vector<float> dev_data(ci.numCells() * 19u); // device-side staging buffer
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_get);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
+  kernel.addParam(dev_data_ptr);
+  kernel(); // run kernel_get over ci
+  std::vector<float> out(ci.numCells() * 19u);
+  thrust::copy(dev_data.begin(), dev_data.end(), out.data()); // device-to-host copy of the result
+  return out;
+}
+
+void set( // Overwrite the populations of all cells in ci from values (kernel_set reads 19 values per cell).
+    gpu::GPUField<float> *pdf_field,
+    std::vector<float> const &values, // NOTE(review): size is not checked against ci.numCells() * 19u here
+    CellInterval const &ci) {
+  thrust::device_vector<float> dev_data(values.begin(), values.end()); // host-to-device copy
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_set);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
+  kernel.addParam(const_cast<const float *>(dev_data_ptr));
+  kernel(); // run kernel_set over ci
+}
+
+void set( // Overwrite the populations of all cells in ci and refresh their velocities (kernel_set_vel).
+    gpu::GPUField<float> *pdf_field,
+    gpu::GPUField<float> *velocity_field,
+    gpu::GPUField<float> const *force_field,
+    std::vector<float> const &values, // NOTE(review): size is not checked against ci.numCells() * 19u here
+    CellInterval const &ci) {
+  thrust::device_vector<float> dev_data(values.begin(), values.end()); // host-to-device copy
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_set_vel);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci)); // parameters are added in the order of the kernel signature
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
+  kernel.addParam(const_cast<const float *>(dev_data_ptr));
+  kernel(); // run kernel_set_vel over ci
+}
+} // namespace Population
+
+namespace Vector {
+// LCOV_EXCL_START
+__global__ void kernel_get( // Copy the 3 components of this thread's cell into the flat buffer u_out.
+    gpu::FieldAccessor<float> vec,
+    float *u_out) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u); // start of this cell's 3 slots in u_out
+  vec.set(blockIdx, threadIdx); // presumably positions the accessor on this thread's cell (waLBerla API, not shown here)
+  u_out += offset; // pointer arithmetic only; dereferenced solely inside the validity check
+  if (vec.isValidPosition()) {
+    u_out[0u] = vec.get(0u);
+    u_out[1u] = vec.get(1u);
+    u_out[2u] = vec.get(2u);
+  }
+}
+
+__global__ void kernel_set( // Overwrite the 3 components of this thread's cell from the flat buffer u_in.
+    gpu::FieldAccessor<float> vec,
+    float const *RESTRICT u_in) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u); // start of this cell's 3 slots in u_in
+  vec.set(blockIdx, threadIdx); // presumably positions the accessor on this thread's cell (waLBerla API, not shown here)
+  u_in += offset; // pointer arithmetic only; dereferenced solely inside the validity check
+  if (vec.isValidPosition()) {
+    vec.get(0u) = u_in[0u];
+    vec.get(1u) = u_in[1u];
+    vec.get(2u) = u_in[2u];
+  }
+}
+
+__global__ void kernel_broadcast( // Write the same 3 values u_in[0..2] into every valid cell.
+    gpu::FieldAccessor<float> vec,
+    float const *RESTRICT u_in) { // no per-cell offset is applied: every thread reads the same 3 entries
+  vec.set(blockIdx, threadIdx); // presumably positions the accessor on this thread's cell (waLBerla API, not shown here)
+  if (vec.isValidPosition()) {
+    vec.get(0u) = u_in[0u];
+    vec.get(1u) = u_in[1u];
+    vec.get(2u) = u_in[2u];
+  }
+}
+
+__global__ void kernel_add( // Add this cell's 3 values from the flat buffer u_in onto the cell's components.
+    gpu::FieldAccessor<float> vec,
+    float const *RESTRICT u_in) {
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u); // start of this cell's 3 slots in u_in
+  vec.set(blockIdx, threadIdx); // presumably positions the accessor on this thread's cell (waLBerla API, not shown here)
+  u_in += offset; // pointer arithmetic only; dereferenced solely inside the validity check
+  if (vec.isValidPosition()) {
+    vec.get(0u) += u_in[0u]; // plain (non-atomic) read-modify-write
+    vec.get(1u) += u_in[1u];
+    vec.get(2u) += u_in[2u];
+  }
+}
+
+__global__ void kernel_broadcast_add( // Add the same 3 values u_in[0..2] onto every valid cell.
+    gpu::FieldAccessor<float> vec,
+    float const *RESTRICT u_in) { // no per-cell offset is applied: every thread reads the same 3 entries
+  vec.set(blockIdx, threadIdx); // presumably positions the accessor on this thread's cell (waLBerla API, not shown here)
+  if (vec.isValidPosition()) {
+    vec.get(0u) += u_in[0u]; // plain (non-atomic) read-modify-write
+    vec.get(1u) += u_in[1u];
+    vec.get(2u) += u_in[2u];
+  }
+}
+// LCOV_EXCL_STOP
+
+Vector3<float> get( // Read the 3 components of one cell of a GPU vector field into a host Vector3.
+    gpu::GPUField<float> const *vec_field,
+    Cell const &cell) {
+  CellInterval ci(cell, cell); // interval containing only the requested cell
+  thrust::device_vector<float> dev_data(3u); // device-side staging buffer for one cell
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_get);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
+  kernel.addParam(dev_data_ptr);
+  kernel(); // run kernel_get over ci
+  Vector3<float> vec;
+  thrust::copy(dev_data.begin(), dev_data.end(), vec.data()); // device-to-host copy of the result
+  return vec;
+}
+
+void set( // Overwrite the 3 components of one cell of a GPU vector field.
+    gpu::GPUField<float> *vec_field,
+    Vector3<float> const &vec,
+    Cell const &cell) {
+  CellInterval ci(cell, cell); // interval containing only the requested cell
+  thrust::device_vector<float> dev_data(vec.data(), vec.data() + 3u); // host-to-device copy of the 3 components
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_set);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
+  kernel.addParam(const_cast<const float *>(dev_data_ptr));
+  kernel(); // run kernel_set over ci
+}
+
+void add( // Add vec onto the 3 components of one cell of a GPU vector field.
+    gpu::GPUField<float> *vec_field,
+    Vector3<float> const &vec,
+    Cell const &cell) {
+  CellInterval ci(cell, cell); // interval containing only the requested cell
+  thrust::device_vector<float> dev_data(vec.data(), vec.data() + 3u); // host-to-device copy of the 3 components
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_add);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
+  kernel.addParam(const_cast<const float *>(dev_data_ptr));
+  kernel(); // run kernel_add over ci
+}
+
+void initialize( // Fill every cell of the vector field with the same 3 components (kernel_broadcast).
+    gpu::GPUField<float> *vec_field,
+    Vector3<float> const &vec) {
+  CellInterval ci = vec_field->xyzSizeWithGhostLayer(); // target interval is whatever xyzSizeWithGhostLayer() reports
+  thrust::device_vector<float> dev_data(vec.data(), vec.data() + 3u); // host-to-device copy of the 3 components
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_broadcast);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
+  kernel.addParam(const_cast<const float *>(dev_data_ptr));
+  kernel(); // run kernel_broadcast over ci
+}
+
+void add_to_all( // Add the same 3 components onto every cell of the vector field (kernel_broadcast_add).
+    gpu::GPUField<float> *vec_field,
+    Vector3<float> const &vec) {
+  CellInterval ci = vec_field->xyzSizeWithGhostLayer(); // target interval is whatever xyzSizeWithGhostLayer() reports
+  thrust::device_vector<float> dev_data(vec.data(), vec.data() + 3u); // host-to-device copy of the 3 components
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_broadcast_add);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
+  kernel.addParam(const_cast<const float *>(dev_data_ptr));
+  kernel(); // run kernel_broadcast_add over ci
+}
+
+std::vector<float> get( // Read the vectors of all cells in ci into a host vector, 3 values per cell.
+    gpu::GPUField<float> const *vec_field,
+    CellInterval const &ci) {
+  thrust::device_vector<float> dev_data(ci.numCells() * 3u); // device-side staging buffer
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_get);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
+  kernel.addParam(dev_data_ptr);
+  kernel(); // run kernel_get over ci
+  std::vector<float> out(ci.numCells() * 3u);
+  thrust::copy(dev_data.begin(), dev_data.end(), out.data()); // device-to-host copy of the result
+  return out;
+}
+
+void set( // Overwrite the vectors of all cells in ci from values (kernel_set reads 3 values per cell).
+    gpu::GPUField<float> *vec_field,
+    std::vector<float> const &values, // NOTE(review): size is not checked against ci.numCells() * 3u here
+    CellInterval const &ci) {
+  thrust::device_vector<float> dev_data(values.begin(), values.end()); // host-to-device copy
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_set);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
+  kernel.addParam(const_cast<const float *>(dev_data_ptr));
+  kernel(); // run kernel_set over ci
+}
+} // namespace Vector
+
+namespace Interpolation {
+// LCOV_EXCL_START
+/** @brief Calculate interpolation weights. */
+static __forceinline__ __device__ void calculate_weights( // Per dimension: one corner index and two weights that sum to 1.
+    float const *RESTRICT const pos, // 3 input coordinates
+    int *RESTRICT const corner, // 3 output indices
+    float *RESTRICT const weights, // 6 outputs, stored as weights[dim * 2 + {0, 1}]
+    uint gl) { // added to every corner index; presumably the ghost-layer count - TODO confirm against callers
+#pragma unroll
+  for (int dim = 0; dim < 3; ++dim) {
+    auto const fractional_index = pos[dim] - float{0.5};
+    auto const nmp = floorf(fractional_index);
+    auto const distance = fractional_index - nmp - float{0.5}; // lies in [-0.5, 0.5)
+    corner[dim] = __float2int_rn(nmp) + static_cast<int>(gl); // nmp is already integral, so rounding is exact
+    weights[dim * 2 + 0] = float{0.5} - distance; // weight for index corner[dim]
+    weights[dim * 2 + 1] = float{0.5} + distance; // weight for index corner[dim] + 1
+  }
+}
+
+__global__ void kernel_get( // For each position, accumulate the weighted field values of its 8 surrounding nodes into vel.
+    gpu::FieldAccessor<float> vec,
+    float const *RESTRICT const pos, // 3 floats per position
+    float *RESTRICT const vel, // 3 floats per position; accumulated with +=, so its prior contents are kept
+    uint n_pos, // number of positions; threads with pos_index >= n_pos do nothing
+    uint gl) { // forwarded to calculate_weights
+
+  uint pos_index = blockIdx.y * gridDim.x * blockDim.x + // one thread per position on a 2D grid of 1D blocks
+                   blockDim.x * blockIdx.x + threadIdx.x;
+
+  vec.set({0u, 0u, 0u}, {0u, 0u, 0u}); // same zero indices for every thread; getNeighbor offsets are presumably relative to that cell
+  if (vec.isValidPosition() and pos_index < n_pos) {
+    auto const array_offset = pos_index * uint(3u);
+    int corner[3];
+    float weights[3][2];
+    calculate_weights(pos + array_offset, corner, &weights[0][0], gl);
+#pragma unroll
+    for (int i = 0; i < 2; i++) { // i, j, k select the lower/upper node along x, y, z
+      auto const cx = corner[0] + i;
+      auto const wx = weights[0][i];
+#pragma unroll
+      for (int j = 0; j < 2; j++) {
+        auto const cy = corner[1] + j;
+        auto const wxy = wx * weights[1][j];
+#pragma unroll
+        for (int k = 0; k < 2; k++) {
+          auto const cz = corner[2] + k;
+          auto const weight = wxy * weights[2][k]; // product of the three per-axis weights
+          vel[array_offset + 0u] += weight * vec.getNeighbor(cx, cy, cz, 0u);
+          vel[array_offset + 1u] += weight * vec.getNeighbor(cx, cy, cz, 1u);
+          vel[array_offset + 2u] += weight * vec.getNeighbor(cx, cy, cz, 2u);
+        }
+      }
+    }
+  }
+}
+
+__global__ void kernel_set(  // adds a weighted share of every force to 8 neighboring cells
+    gpu::FieldAccessor<float> vec,
+    float const *RESTRICT const pos,  // in: flat array, 3 floats per position
+    float const *RESTRICT const forces,  // in: flat array, 3 floats per position
+    uint n_pos,  // number of positions; threads with a larger index do nothing
+    uint gl) {  // forwarded to calculate_weights
+
+  uint pos_index = blockIdx.y * gridDim.x * blockDim.x +
+                   blockDim.x * blockIdx.x + threadIdx.x;  // one thread per position
+
+  vec.set({0u, 0u, 0u}, {0u, 0u, 0u});  // same accessor origin in every thread; cells are reached via getNeighbor()
+  if (vec.isValidPosition() and pos_index < n_pos) {
+    auto const array_offset = pos_index * uint(3u);
+    int corner[3];
+    float weights[3][2];
+    calculate_weights(pos + array_offset, corner, &weights[0][0], gl);
+#pragma unroll
+    for (int i = 0; i < 2; i++) {
+      auto const cx = corner[0] + i;
+      auto const wx = weights[0][i];
+#pragma unroll
+      for (int j = 0; j < 2; j++) {
+        auto const cy = corner[1] + j;
+        auto const wxy = wx * weights[1][j];
+#pragma unroll
+        for (int k = 0; k < 2; k++) {
+          auto const cz = corner[2] + k;
+          auto const weight = wxy * weights[2][k];  // product of the three per-dimension weights
+          atomicAdd(&vec.getNeighbor(cx, cy, cz, 0u),  // atomic: presumably because several positions can target the same cell
+                    weight * forces[array_offset + 0u]);
+          atomicAdd(&vec.getNeighbor(cx, cy, cz, 1u),
+                    weight * forces[array_offset + 1u]);
+          atomicAdd(&vec.getNeighbor(cx, cy, cz, 2u),
+                    weight * forces[array_offset + 2u]);
+        }
+      }
+    }
+  }
+}
+// LCOV_EXCL_STOP
+
+/** @brief Build a launch grid with @p blocks_per_grid_y rows that covers @p threads_x threads. */
+static dim3 calculate_dim_grid(uint const threads_x, uint const blocks_per_grid_y,
+                               uint const threads_per_block) {
+  assert(threads_x >= 1u);
+  assert(blocks_per_grid_y >= 1u);
+  assert(threads_per_block >= 1u);
+  // threads provided by all blocks that share one x index of the grid
+  auto const threads_per_column = threads_per_block * blocks_per_grid_y;
+  return make_uint3((threads_x + threads_per_column - 1) / threads_per_column, blocks_per_grid_y, 1);
+}
+
+/** @brief Launch the interpolation kernel_get for @p pos and return its output (3 floats per position). */
+std::vector<float> get(gpu::GPUField<float> const *vec_field,
+                       std::vector<float> const &pos, uint gl) {
+  // positions are stored as consecutive triples
+  auto const n_pos = static_cast<uint>(pos.size() / 3ul);
+  uint const threads_per_block = 64u;
+
+  // device copy of the input and a value-initialized buffer the kernel adds into
+  thrust::device_vector<float> dev_pos(pos.begin(), pos.end());
+  thrust::device_vector<float> dev_vel(pos.size());
+
+  kernel_get<<<calculate_dim_grid(n_pos, 4u, threads_per_block),
+               threads_per_block, 0u, nullptr>>>(
+      gpu::FieldIndexing<float>::withGhostLayerXYZ(*vec_field, gl).gpuAccess(),
+      thrust::raw_pointer_cast(dev_pos.data()),
+      thrust::raw_pointer_cast(dev_vel.data()), n_pos, gl);
+
+  std::vector<float> velocities(pos.size());
+  thrust::copy(dev_vel.begin(), dev_vel.end(), velocities.data());
+  return velocities;
+}
+
+/** @brief Launch the interpolation kernel_set with the given positions and forces. */
+void set(gpu::GPUField<float> const *vec_field, std::vector<float> const &pos,
+         std::vector<float> const &forces, uint gl) {
+  // positions are stored as consecutive triples
+  auto const n_pos = static_cast<uint>(pos.size() / 3ul);
+  uint const threads_per_block = 64u;
+
+  // device copies of both inputs
+  thrust::device_vector<float> dev_pos(pos.begin(), pos.end());
+  thrust::device_vector<float> dev_for(forces.begin(), forces.end());
+
+  kernel_set<<<calculate_dim_grid(n_pos, 4u, threads_per_block),
+               threads_per_block, 0u, nullptr>>>(
+      gpu::FieldIndexing<float>::withGhostLayerXYZ(*vec_field, gl).gpuAccess(),
+      thrust::raw_pointer_cast(dev_pos.data()),
+      thrust::raw_pointer_cast(dev_for.data()), n_pos, gl);
+}
+} // namespace Interpolation
+
+namespace Equilibrium {
+// LCOV_EXCL_START
+__device__ void kernel_set_device(  // overwrites populations 0..18 with expressions in rho and u
+    gpu::FieldAccessor<float> pdf,  // accessor through which the 19 populations are written
+    float const *RESTRICT const u,  // in: u[0..2] are read
+    float rho) {  // in: common factor of every term
+
+  pdf.get(0u) = rho * -0.33333333333333331f * (u[0] * u[0]) + rho * -0.33333333333333331f * (u[1] * u[1]) + rho * -0.33333333333333331f * (u[2] * u[2]) + rho * 0.33333333333333331f;
+  pdf.get(1u) = rho * -0.16666666666666666f * (u[0] * u[0]) + rho * -0.16666666666666666f * (u[2] * u[2]) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * u[1] + rho * 0.16666666666666666f * (u[1] * u[1]);
+  pdf.get(2u) = rho * -0.16666666666666666f * u[1] + rho * -0.16666666666666666f * (u[0] * u[0]) + rho * -0.16666666666666666f * (u[2] * u[2]) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u[1] * u[1]);
+  pdf.get(3u) = rho * -0.16666666666666666f * u[0] + rho * -0.16666666666666666f * (u[1] * u[1]) + rho * -0.16666666666666666f * (u[2] * u[2]) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u[0] * u[0]);
+  pdf.get(4u) = rho * -0.16666666666666666f * (u[1] * u[1]) + rho * -0.16666666666666666f * (u[2] * u[2]) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * u[0] + rho * 0.16666666666666666f * (u[0] * u[0]);
+  pdf.get(5u) = rho * -0.16666666666666666f * (u[0] * u[0]) + rho * -0.16666666666666666f * (u[1] * u[1]) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * u[2] + rho * 0.16666666666666666f * (u[2] * u[2]);
+  pdf.get(6u) = rho * -0.16666666666666666f * u[2] + rho * -0.16666666666666666f * (u[0] * u[0]) + rho * -0.16666666666666666f * (u[1] * u[1]) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u[2] * u[2]);
+  pdf.get(7u) = rho * -0.083333333333333329f * u[0] + rho * -0.25f * u[0] * u[1] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[1] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[1] * u[1]);
+  pdf.get(8u) = rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[0] + rho * 0.083333333333333329f * u[1] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[1] * u[1]) + rho * 0.25f * u[0] * u[1];
+  pdf.get(9u) = rho * -0.083333333333333329f * u[0] + rho * -0.083333333333333329f * u[1] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[1] * u[1]) + rho * 0.25f * u[0] * u[1];
+  pdf.get(10u) = rho * -0.083333333333333329f * u[1] + rho * -0.25f * u[0] * u[1] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[0] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[1] * u[1]);
+  pdf.get(11u) = rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[1] + rho * 0.083333333333333329f * u[2] + rho * 0.083333333333333329f * (u[1] * u[1]) + rho * 0.083333333333333329f * (u[2] * u[2]) + rho * 0.25f * u[1] * u[2];
+  pdf.get(12u) = rho * -0.083333333333333329f * u[1] + rho * -0.25f * u[1] * u[2] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[2] + rho * 0.083333333333333329f * (u[1] * u[1]) + rho * 0.083333333333333329f * (u[2] * u[2]);
+  pdf.get(13u) = rho * -0.083333333333333329f * u[0] + rho * -0.25f * u[0] * u[2] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[2] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[2] * u[2]);
+  pdf.get(14u) = rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[0] + rho * 0.083333333333333329f * u[2] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[2] * u[2]) + rho * 0.25f * u[0] * u[2];
+  pdf.get(15u) = rho * -0.083333333333333329f * u[2] + rho * -0.25f * u[1] * u[2] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[1] + rho * 0.083333333333333329f * (u[1] * u[1]) + rho * 0.083333333333333329f * (u[2] * u[2]);
+  pdf.get(16u) = rho * -0.083333333333333329f * u[1] + rho * -0.083333333333333329f * u[2] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u[1] * u[1]) + rho * 0.083333333333333329f * (u[2] * u[2]) + rho * 0.25f * u[1] * u[2];
+  pdf.get(17u) = rho * -0.083333333333333329f * u[0] + rho * -0.083333333333333329f * u[2] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[2] * u[2]) + rho * 0.25f * u[0] * u[2];
+  pdf.get(18u) = rho * -0.083333333333333329f * u[2] + rho * -0.25f * u[0] * u[2] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[0] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[2] * u[2]);
+}
+// LCOV_EXCL_STOP
+} // namespace Equilibrium
+
+namespace Density {
+// LCOV_EXCL_START
+__global__ void kernel_get(  // writes the sum of the 19 populations of each valid cell
+    gpu::FieldAccessor<float> pdf,
+    float *RESTRICT rho_out) {  // out: one float per cell
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 1u);  // helper defined outside this chunk; presumably the flat cell index times 1 value per cell
+  pdf.set(blockIdx, threadIdx);
+  rho_out += offset;  // from here on rho_out[0] is this thread's output slot
+  if (pdf.isValidPosition()) {
+    float const f_0 = pdf.get(0u);
+    float const f_1 = pdf.get(1u);
+    float const f_2 = pdf.get(2u);
+    float const f_3 = pdf.get(3u);
+    float const f_4 = pdf.get(4u);
+    float const f_5 = pdf.get(5u);
+    float const f_6 = pdf.get(6u);
+    float const f_7 = pdf.get(7u);
+    float const f_8 = pdf.get(8u);
+    float const f_9 = pdf.get(9u);
+    float const f_10 = pdf.get(10u);
+    float const f_11 = pdf.get(11u);
+    float const f_12 = pdf.get(12u);
+    float const f_13 = pdf.get(13u);
+    float const f_14 = pdf.get(14u);
+    float const f_15 = pdf.get(15u);
+    float const f_16 = pdf.get(16u);
+    float const f_17 = pdf.get(17u);
+    float const f_18 = pdf.get(18u);
+    const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;  // partial sums; together with the line below they cover f_0..f_18 once each
+    const float vel1Term = f_1 + f_11 + f_15 + f_7;
+    const float vel2Term = f_12 + f_13 + f_5;
+    const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;  // sum of all 19 populations
+    rho_out[0u] = rho;
+  }
+}
+
+__global__ void kernel_set(  // rebuilds the populations of each valid cell for a new density, keeping the current velocity
+    gpu::FieldAccessor<float> pdf,
+    float const *RESTRICT rho_in) {  // in: one float per cell
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 1u);  // helper defined outside this chunk; presumably the flat cell index times 1 value per cell
+  pdf.set(blockIdx, threadIdx);
+  rho_in += offset;  // from here on rho_in[0] is this thread's input value
+  if (pdf.isValidPosition()) {
+    float const f_0 = pdf.get(0u);
+    float const f_1 = pdf.get(1u);
+    float const f_2 = pdf.get(2u);
+    float const f_3 = pdf.get(3u);
+    float const f_4 = pdf.get(4u);
+    float const f_5 = pdf.get(5u);
+    float const f_6 = pdf.get(6u);
+    float const f_7 = pdf.get(7u);
+    float const f_8 = pdf.get(8u);
+    float const f_9 = pdf.get(9u);
+    float const f_10 = pdf.get(10u);
+    float const f_11 = pdf.get(11u);
+    float const f_12 = pdf.get(12u);
+    float const f_13 = pdf.get(13u);
+    float const f_14 = pdf.get(14u);
+    float const f_15 = pdf.get(15u);
+    float const f_16 = pdf.get(16u);
+    float const f_17 = pdf.get(17u);
+    float const f_18 = pdf.get(18u);
+    const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+    const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;  // signed sum of 10 populations
+    const float vel1Term = f_1 + f_11 + f_15 + f_7;
+    const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;  // signed sum of 10 populations
+    const float vel2Term = f_12 + f_13 + f_5;
+    const float momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;  // signed sum of 10 populations
+    const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;  // sum of all 19 populations (the old density)
+
+    // calculate current velocity (before density change)
+    float const rho_inv = float{1} / rho;
+    float const u_old[3] = {momdensity_0 * rho_inv, momdensity_1 * rho_inv, momdensity_2 * rho_inv};
+
+    Equilibrium::kernel_set_device(pdf, u_old, rho_in[0u]);  // old velocity, new density
+  }
+}
+// LCOV_EXCL_STOP
+
+/** @brief Get the density (sum of the 19 populations) of a single cell. */
+float get(gpu::GPUField<float> const *pdf_field, Cell const &cell) {
+  // a one-cell interval needs a one-element output buffer
+  CellInterval const single_cell(cell, cell);
+  thrust::device_vector<float> dev_rho(1u);
+  auto launcher = gpu::make_kernel(kernel_get);
+  launcher.addFieldIndexingParam(
+      gpu::FieldIndexing<float>::interval(*pdf_field, single_cell));
+  launcher.addParam(thrust::raw_pointer_cast(dev_rho.data()));
+  launcher();
+  // element access copies the value from the device to the host
+  return dev_rho[0u];
+}
+
+/** @brief Launch kernel_set on a single cell with the new density @p rho. */
+void set(gpu::GPUField<float> *pdf_field, const float rho, Cell const &cell) {
+  CellInterval const single_cell(cell, cell);
+  // the kernel reads its input from device memory: upload the scalar
+  thrust::device_vector<float> dev_rho(1u, rho);
+  float const *const dev_rho_ptr = thrust::raw_pointer_cast(dev_rho.data());
+  auto launcher = gpu::make_kernel(kernel_set);
+  launcher.addFieldIndexingParam(
+      gpu::FieldIndexing<float>::interval(*pdf_field, single_cell));
+  launcher.addParam(dev_rho_ptr);
+  launcher();
+}
+
+/** @brief Get the density of every cell in @p ci (one value per cell). */
+std::vector<float> get(gpu::GPUField<float> const *pdf_field,
+                       CellInterval const &ci) {
+  thrust::device_vector<float> dev_rho(ci.numCells());
+  auto launcher = gpu::make_kernel(kernel_get);
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
+  launcher.addParam(thrust::raw_pointer_cast(dev_rho.data()));
+  launcher();
+  // copy the device results back to the host
+  std::vector<float> densities(dev_rho.size());
+  thrust::copy(dev_rho.begin(), dev_rho.end(), densities.begin());
+  return densities;
+}
+
+/** @brief Upload @p values (one per cell) and launch kernel_set on the cells of @p ci. */
+void set(gpu::GPUField<float> *pdf_field, std::vector<float> const &values,
+         CellInterval const &ci) {
+  thrust::device_vector<float> dev_rho(values.begin(), values.end());
+  // the kernel parameter is registered as a pointer-to-const
+  float const *const dev_rho_ptr = thrust::raw_pointer_cast(dev_rho.data());
+  auto launcher = gpu::make_kernel(kernel_set);
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
+  launcher.addParam(dev_rho_ptr);
+  launcher();
+}
+} // namespace Density
+
+namespace Velocity {
+// LCOV_EXCL_START
+__global__ void kernel_get(  // writes 3 velocity components for each valid cell
+    gpu::FieldAccessor<float> pdf,
+    gpu::FieldAccessor<float> force,
+    float *RESTRICT u_out) {  // out: 3 floats per cell
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);  // helper defined outside this chunk; presumably the flat cell index times 3 values per cell
+  pdf.set(blockIdx, threadIdx);
+  force.set(blockIdx, threadIdx);
+  u_out += offset;  // from here on u_out[0..2] are this thread's output slots
+  if (pdf.isValidPosition()) {
+    float const f_0 = pdf.get(0u);
+    float const f_1 = pdf.get(1u);
+    float const f_2 = pdf.get(2u);
+    float const f_3 = pdf.get(3u);
+    float const f_4 = pdf.get(4u);
+    float const f_5 = pdf.get(5u);
+    float const f_6 = pdf.get(6u);
+    float const f_7 = pdf.get(7u);
+    float const f_8 = pdf.get(8u);
+    float const f_9 = pdf.get(9u);
+    float const f_10 = pdf.get(10u);
+    float const f_11 = pdf.get(11u);
+    float const f_12 = pdf.get(12u);
+    float const f_13 = pdf.get(13u);
+    float const f_14 = pdf.get(14u);
+    float const f_15 = pdf.get(15u);
+    float const f_16 = pdf.get(16u);
+    float const f_17 = pdf.get(17u);
+    float const f_18 = pdf.get(18u);
+    const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+    const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;  // signed sum of 10 populations
+    const float vel1Term = f_1 + f_11 + f_15 + f_7;
+    const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;  // signed sum of 10 populations
+    const float vel2Term = f_12 + f_13 + f_5;
+    const float momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;  // signed sum of 10 populations
+    const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;  // sum of all 19 populations
+    const float md_0 = force.get(0) * 0.50000000000000000f + momdensity_0;  // half of the stored force component is added
+    const float md_1 = force.get(1) * 0.50000000000000000f + momdensity_1;
+    const float md_2 = force.get(2) * 0.50000000000000000f + momdensity_2;
+    auto const rho_inv = float{1} / rho;
+    u_out[0u] = md_0 * rho_inv;
+    u_out[1u] = md_1 * rho_inv;
+    u_out[2u] = md_2 * rho_inv;
+  }
+}
+
+__global__ void kernel_set(  // stores the requested velocity and rebuilds the populations of each valid cell
+    gpu::FieldAccessor<float> pdf,
+    gpu::FieldAccessor<float> velocity,
+    gpu::FieldAccessor<float> force,
+    float const *RESTRICT u_in) {  // in: 3 floats per cell
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);  // helper defined outside this chunk; presumably the flat cell index times 3 values per cell
+  pdf.set(blockIdx, threadIdx);
+  velocity.set(blockIdx, threadIdx);
+  force.set(blockIdx, threadIdx);
+  u_in += offset;  // from here on u_in[0..2] are this thread's input values
+  if (pdf.isValidPosition()) {
+    float const f_0 = pdf.get(0u);
+    float const f_1 = pdf.get(1u);
+    float const f_2 = pdf.get(2u);
+    float const f_3 = pdf.get(3u);
+    float const f_4 = pdf.get(4u);
+    float const f_5 = pdf.get(5u);
+    float const f_6 = pdf.get(6u);
+    float const f_7 = pdf.get(7u);
+    float const f_8 = pdf.get(8u);
+    float const f_9 = pdf.get(9u);
+    float const f_10 = pdf.get(10u);
+    float const f_11 = pdf.get(11u);
+    float const f_12 = pdf.get(12u);
+    float const f_13 = pdf.get(13u);
+    float const f_14 = pdf.get(14u);
+    float const f_15 = pdf.get(15u);
+    float const f_16 = pdf.get(16u);
+    float const f_17 = pdf.get(17u);
+    float const f_18 = pdf.get(18u);
+    float const *RESTRICT const u = u_in;  // alias of the input triple
+    const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+    const float vel1Term = f_1 + f_11 + f_15 + f_7;
+    const float vel2Term = f_12 + f_13 + f_5;
+    const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;  // sum of all 19 populations (density is left unchanged)
+    const float u_0 = -force.get(0) * 0.50000000000000000f / rho + u[0];  // requested velocity minus force / (2 rho)
+    const float u_1 = -force.get(1) * 0.50000000000000000f / rho + u[1];
+    const float u_2 = -force.get(2) * 0.50000000000000000f / rho + u[2];
+    velocity.get(0u) = u_in[0u];  // the velocity field stores the requested value as given
+    velocity.get(1u) = u_in[1u];
+    velocity.get(2u) = u_in[2u];
+
+    float u_new[3] = {u_0, u_1, u_2};
+
+    Equilibrium::kernel_set_device(pdf, u_new, rho);  // populations are rebuilt from the force-shifted velocity
+  }
+}
+// LCOV_EXCL_STOP
+
+/** @brief Get the velocity of a single cell. */
+Vector3<float> get(gpu::GPUField<float> const *pdf_field,
+                   gpu::GPUField<float> const *force_field, Cell const &cell) {
+  CellInterval const single_cell(cell, cell);
+  thrust::device_vector<float> dev_u(3u); // three components
+  auto launcher = gpu::make_kernel(kernel_get);
+  launcher.addFieldIndexingParam(
+      gpu::FieldIndexing<float>::interval(*pdf_field, single_cell));
+  launcher.addFieldIndexingParam(
+      gpu::FieldIndexing<float>::interval(*force_field, single_cell));
+  launcher.addParam(thrust::raw_pointer_cast(dev_u.data()));
+  launcher();
+  Vector3<float> velocity;
+  thrust::copy(dev_u.begin(), dev_u.end(), velocity.data());
+  return velocity;
+}
+
+/** @brief Get the velocities of all cells in @p ci as a flat array (3 values per cell). */
+std::vector<float> get(gpu::GPUField<float> const *pdf_field,
+                       gpu::GPUField<float> const *force_field,
+                       CellInterval const &ci) {
+  // kernel_get writes 3 floats per cell: the buffer must cover the whole interval
+  thrust::device_vector<float> dev_data(3u * ci.numCells());
+  auto kernel = gpu::make_kernel(kernel_get);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
+  kernel.addParam(thrust::raw_pointer_cast(dev_data.data()));
+  kernel();
+  std::vector<float> out(dev_data.size());
+  thrust::copy(dev_data.begin(), dev_data.end(), out.data());
+  return out;
+}
+
+/** @brief Upload the velocity @p u and launch kernel_set on a single cell. */
+void set(gpu::GPUField<float> *pdf_field, gpu::GPUField<float> *velocity_field,
+         gpu::GPUField<float> const *force_field, Vector3<float> const &u,
+         Cell const &cell) {
+  CellInterval const single_cell(cell, cell);
+  thrust::device_vector<float> dev_u(u.data(), u.data() + 3u);
+  // the kernel parameter is registered as a pointer-to-const
+  float const *const dev_u_ptr = thrust::raw_pointer_cast(dev_u.data());
+  auto launcher = gpu::make_kernel(kernel_set);
+  // field arguments follow the kernel signature: pdf, velocity, force
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, single_cell));
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, single_cell));
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, single_cell));
+  launcher.addParam(dev_u_ptr);
+  launcher();
+}
+
+/** @brief Upload @p values (3 per cell) and launch kernel_set on the cells of @p ci. */
+void set(gpu::GPUField<float> *pdf_field, gpu::GPUField<float> *velocity_field,
+         gpu::GPUField<float> const *force_field,
+         std::vector<float> const &values, CellInterval const &ci) {
+  thrust::device_vector<float> dev_u(values.begin(), values.end());
+  // the kernel parameter is registered as a pointer-to-const
+  float const *const dev_u_ptr = thrust::raw_pointer_cast(dev_u.data());
+  auto launcher = gpu::make_kernel(kernel_set);
+  // field arguments follow the kernel signature: pdf, velocity, force
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
+  launcher.addParam(dev_u_ptr);
+  launcher();
+}
+} // namespace Velocity
+
+namespace Force {
+// LCOV_EXCL_START
+__global__ void kernel_set(  // stores a new force and refreshes the velocity field of each valid cell; populations are only read
+    gpu::FieldAccessor<float> pdf,
+    gpu::FieldAccessor<float> velocity,
+    gpu::FieldAccessor<float> force,
+    float const *RESTRICT f_in) {  // in: 3 floats per cell
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);  // helper defined outside this chunk; presumably the flat cell index times 3 values per cell
+  pdf.set(blockIdx, threadIdx);
+  velocity.set(blockIdx, threadIdx);
+  force.set(blockIdx, threadIdx);
+  f_in += offset;  // from here on f_in[0..2] are this thread's input values
+  if (pdf.isValidPosition()) {
+    float const f_0 = pdf.get(0u);
+    float const f_1 = pdf.get(1u);
+    float const f_2 = pdf.get(2u);
+    float const f_3 = pdf.get(3u);
+    float const f_4 = pdf.get(4u);
+    float const f_5 = pdf.get(5u);
+    float const f_6 = pdf.get(6u);
+    float const f_7 = pdf.get(7u);
+    float const f_8 = pdf.get(8u);
+    float const f_9 = pdf.get(9u);
+    float const f_10 = pdf.get(10u);
+    float const f_11 = pdf.get(11u);
+    float const f_12 = pdf.get(12u);
+    float const f_13 = pdf.get(13u);
+    float const f_14 = pdf.get(14u);
+    float const f_15 = pdf.get(15u);
+    float const f_16 = pdf.get(16u);
+    float const f_17 = pdf.get(17u);
+    float const f_18 = pdf.get(18u);
+    const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+    const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;  // signed sum of 10 populations
+    const float vel1Term = f_1 + f_11 + f_15 + f_7;
+    const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;  // signed sum of 10 populations
+    const float vel2Term = f_12 + f_13 + f_5;
+    const float momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;  // signed sum of 10 populations
+    const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;  // sum of all 19 populations
+    const float md_0 = f_in[0u] * 0.50000000000000000f + momdensity_0;  // uses the new force, not the value stored in the field
+    const float md_1 = f_in[1u] * 0.50000000000000000f + momdensity_1;
+    const float md_2 = f_in[2u] * 0.50000000000000000f + momdensity_2;
+    auto const rho_inv = float{1} / rho;
+
+    force.get(0u) = f_in[0u];  // store the new force
+    force.get(1u) = f_in[1u];
+    force.get(2u) = f_in[2u];
+
+    velocity.get(0u) = md_0 * rho_inv;  // velocity recomputed with the new force
+    velocity.get(1u) = md_1 * rho_inv;
+    velocity.get(2u) = md_2 * rho_inv;
+  }
+}
+// LCOV_EXCL_STOP
+
+/** @brief Upload the three force components in @p u and launch kernel_set on a single cell. */
+void set(gpu::GPUField<float> const *pdf_field,
+         gpu::GPUField<float> *velocity_field, gpu::GPUField<float> *force_field,
+         Vector3<float> const &u, Cell const &cell) {
+  CellInterval const single_cell(cell, cell);
+  // `u` is handed to the kernel as its force input f_in
+  thrust::device_vector<float> dev_force(u.data(), u.data() + 3u);
+  float const *const dev_force_ptr = thrust::raw_pointer_cast(dev_force.data());
+  auto launcher = gpu::make_kernel(kernel_set);
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, single_cell));
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, single_cell));
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, single_cell));
+  launcher.addParam(dev_force_ptr);
+  launcher();
+}
+
+/** @brief Upload @p values (3 per cell) and launch kernel_set on the cells of @p ci. */
+void set(gpu::GPUField<float> const *pdf_field,
+         gpu::GPUField<float> *velocity_field, gpu::GPUField<float> *force_field,
+         std::vector<float> const &values, CellInterval const &ci) {
+  thrust::device_vector<float> dev_force(values.begin(), values.end());
+  float const *const dev_force_ptr = thrust::raw_pointer_cast(dev_force.data());
+  auto launcher = gpu::make_kernel(kernel_set);
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
+  launcher.addParam(dev_force_ptr);
+  launcher();
+}
+} // namespace Force
+
+namespace MomentumDensity {
+// LCOV_EXCL_START
+__global__ void kernel_sum(  // adds 3 force-corrected momentum density components of each valid cell to out
+    gpu::FieldAccessor<float> pdf,
+    gpu::FieldAccessor<float> force,
+    float *RESTRICT out) {  // in/out: accumulated with +=, so the caller must initialize it
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);  // helper defined outside this chunk; presumably the flat cell index times 3 values per cell
+  pdf.set(blockIdx, threadIdx);
+  force.set(blockIdx, threadIdx);
+  out += offset;
+  if (pdf.isValidPosition()) {
+    float const f_0 = pdf.get(0u);
+    float const f_1 = pdf.get(1u);
+    float const f_2 = pdf.get(2u);
+    float const f_3 = pdf.get(3u);
+    float const f_4 = pdf.get(4u);
+    float const f_5 = pdf.get(5u);
+    float const f_6 = pdf.get(6u);
+    float const f_7 = pdf.get(7u);
+    float const f_8 = pdf.get(8u);
+    float const f_9 = pdf.get(9u);
+    float const f_10 = pdf.get(10u);
+    float const f_11 = pdf.get(11u);
+    float const f_12 = pdf.get(12u);
+    float const f_13 = pdf.get(13u);
+    float const f_14 = pdf.get(14u);
+    float const f_15 = pdf.get(15u);
+    float const f_16 = pdf.get(16u);
+    float const f_17 = pdf.get(17u);
+    float const f_18 = pdf.get(18u);
+    const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
+    const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;  // signed sum of 10 populations
+    const float vel1Term = f_1 + f_11 + f_15 + f_7;
+    const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;  // signed sum of 10 populations
+    const float vel2Term = f_12 + f_13 + f_5;
+    const float momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;  // signed sum of 10 populations
+    const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;  // computed but not used below
+    const float md_0 = force.get(0) * 0.50000000000000000f + momdensity_0;  // half of the stored force component is added
+    const float md_1 = force.get(1) * 0.50000000000000000f + momdensity_1;
+    const float md_2 = force.get(2) * 0.50000000000000000f + momdensity_2;
+    out[0u] += md_0;  // plain (non-atomic) read-modify-write
+    out[1u] += md_1;
+    out[2u] += md_2;
+  }
+}
+// LCOV_EXCL_STOP
+
+/** @brief Accumulate kernel_sum over the cells of @p pdf_field, one single-cell launch per cell. */
+Vector3<float> reduce(gpu::GPUField<float> const *pdf_field,
+                      gpu::GPUField<float> const *force_field) {
+  thrust::device_vector<float> dev_sum(3u, float{0});
+  auto const dev_sum_ptr = thrust::raw_pointer_cast(dev_sum.data());
+  WALBERLA_FOR_ALL_CELLS_XYZ(pdf_field, {
+    Cell const here(x, y, z);
+    CellInterval const single_cell(here, here);
+    auto launcher = gpu::make_kernel(kernel_sum);
+    launcher.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, single_cell));
+    launcher.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, single_cell));
+    launcher.addParam(dev_sum_ptr);
+    launcher();
+  });
+  Vector3<float> total(float{0});
+  thrust::copy(dev_sum.begin(), dev_sum.begin() + 3u, total.data());
+  return total;
+}
+} // namespace MomentumDensity
+
+namespace PressureTensor {
+// LCOV_EXCL_START
+__global__ void kernel_get(  // writes 9 values per valid cell, each a signed sum of populations
+    gpu::FieldAccessor<float> pdf,
+    float *RESTRICT p_out) {  // out: 9 floats per cell
+  auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 9u);  // helper defined outside this chunk; presumably the flat cell index times 9 values per cell
+  pdf.set(blockIdx, threadIdx);
+  p_out += offset;  // from here on p_out[0..8] are this thread's output slots
+  if (pdf.isValidPosition()) {
+    float const f_0 = pdf.get(0u);
+    float const f_1 = pdf.get(1u);
+    float const f_2 = pdf.get(2u);
+    float const f_3 = pdf.get(3u);
+    float const f_4 = pdf.get(4u);
+    float const f_5 = pdf.get(5u);
+    float const f_6 = pdf.get(6u);
+    float const f_7 = pdf.get(7u);
+    float const f_8 = pdf.get(8u);
+    float const f_9 = pdf.get(9u);
+    float const f_10 = pdf.get(10u);
+    float const f_11 = pdf.get(11u);
+    float const f_12 = pdf.get(12u);
+    float const f_13 = pdf.get(13u);
+    float const f_14 = pdf.get(14u);
+    float const f_15 = pdf.get(15u);
+    float const f_16 = pdf.get(16u);
+    float const f_17 = pdf.get(17u);
+    float const f_18 = pdf.get(18u);
+    const float p_0 = f_10 + f_13 + f_14 + f_17 + f_18 + f_3 + f_4 + f_7 + f_8 + f_9;
+    const float p_1 = -f_10 - f_7 + f_8 + f_9;
+    const float p_2 = -f_13 + f_14 + f_17 - f_18;
+    const float p_3 = -f_10 - f_7 + f_8 + f_9;  // same expression as p_1
+    const float p_4 = f_1 + f_10 + f_11 + f_12 + f_15 + f_16 + f_2 + f_7 + f_8 + f_9;
+    const float p_5 = f_11 - f_12 - f_15 + f_16;
+    const float p_6 = -f_13 + f_14 + f_17 - f_18;  // same expression as p_2
+    const float p_7 = f_11 - f_12 - f_15 + f_16;  // same expression as p_5
+    const float p_8 = f_11 + f_12 + f_13 + f_14 + f_15 + f_16 + f_17 + f_18 + f_5 + f_6;
+    p_out[0u] = p_0;
+    p_out[1u] = p_1;
+    p_out[2u] = p_2;
+    p_out[3u] = p_3;
+    p_out[4u] = p_4;
+    p_out[5u] = p_5;
+    p_out[6u] = p_6;
+    p_out[7u] = p_7;
+    p_out[8u] = p_8;
+  }
+}
+// LCOV_EXCL_STOP
+
+/** @brief Get the 9 values computed by kernel_get for a single cell. */
+Matrix3<float> get(gpu::GPUField<float> const *pdf_field, Cell const &cell) {
+  CellInterval const single_cell(cell, cell);
+  // the kernel writes 9 floats per cell
+  thrust::device_vector<float> dev_tensor(9u);
+  auto launcher = gpu::make_kernel(kernel_get);
+  launcher.addFieldIndexingParam(
+      gpu::FieldIndexing<float>::interval(*pdf_field, single_cell));
+  launcher.addParam(thrust::raw_pointer_cast(dev_tensor.data()));
+  launcher();
+  Matrix3<float> tensor;
+  thrust::copy(dev_tensor.begin(), dev_tensor.end(), tensor.data());
+  return tensor;
+}
+
+/** @brief Get the 9 values computed by kernel_get for every cell in @p ci, as a flat array. */
+std::vector<float> get(gpu::GPUField<float> const *pdf_field,
+                       CellInterval const &ci) {
+  // the kernel writes 9 floats per cell
+  thrust::device_vector<float> dev_tensors(9u * ci.numCells());
+  auto launcher = gpu::make_kernel(kernel_get);
+  launcher.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
+  launcher.addParam(thrust::raw_pointer_cast(dev_tensors.data()));
+  launcher();
+  std::vector<float> tensors(dev_tensors.size());
+  thrust::copy(dev_tensors.begin(), dev_tensors.end(), tensors.data());
+  return tensors;
+}
+} // namespace PressureTensor
+
+} // namespace accessor
+} // namespace lbm
+} // namespace walberla
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecisionCUDA.cuh b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecisionCUDA.cuh
new file mode 100644
index 00000000000..9d25a37a855
--- /dev/null
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecisionCUDA.cuh
@@ -0,0 +1,166 @@
+/*
+ * Copyright (C) 2023-2024 The ESPResSo project
+ * Copyright (C) 2020 The waLBerla project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
+// lbmpy_walberla/pystencils_walberla from waLBerla commit
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
+
+/**
+ * @file
+ * Lattice field accessors.
+ * Adapted from the waLBerla source file
+ * https://i10git.cs.fau.de/walberla/walberla/-/blob/a16141524c58ab88386e2a0f8fdd7c63c5edd704/python/lbmpy_walberla/templates/LatticeModel.tmpl.h
+ */
+
+#pragma once
+
+#include <core/DataTypes.h>
+#include <core/cell/Cell.h>
+#include <core/cell/CellInterval.h>
+#include <core/math/Matrix3.h>
+#include <core/math/Vector3.h>
+
+#include <gpu/GPUField.h>
+
+#include <array>
+#include <tuple>
+#include <vector>
+
+namespace walberla {
+namespace lbm {
+namespace accessor {
+
+namespace Population { // accessors for the 19 float populations stored per cell
+/** @brief Get populations from a single cell. */
+std::array<float, 19u> get(gpu::GPUField<float> const *pdf_field,
+                           Cell const &cell);
+/** @brief Set populations on a single cell. */
+void set(gpu::GPUField<float> *pdf_field, std::array<float, 19u> const &pop,
+         Cell const &cell);
+/** @brief Set populations and recalculate velocities on a single cell. */
+void set(gpu::GPUField<float> *pdf_field, gpu::GPUField<float> *velocity_field,
+         gpu::GPUField<float> const *force_field,
+         std::array<float, 19u> const &pop, Cell const &cell);
+/** @brief Initialize all cells with the same value. */
+void initialize(gpu::GPUField<float> *pdf_field,
+                std::array<float, 19u> const &pop);
+/** @brief Get populations from a cell interval. */
+std::vector<float> get(gpu::GPUField<float> const *pdf_field,
+                       CellInterval const &ci);
+/** @brief Set populations on a cell interval. */
+void set(gpu::GPUField<float> *pdf_field, std::vector<float> const &values,
+         CellInterval const &ci);
+/** @brief Set populations and recalculate velocities on a cell interval. */
+void set(gpu::GPUField<float> *pdf_field, gpu::GPUField<float> *velocity_field,
+         gpu::GPUField<float> const *force_field,
+         std::vector<float> const &values, CellInterval const &ci);
+} // namespace Population
+
+namespace Vector { // accessors exchanging a Vector3<float> per cell, or flat float data per cell interval
+/** @brief Get value from a single cell. */
+Vector3<float> get(gpu::GPUField<float> const *field, Cell const &cell);
+/** @brief Set value on a single cell. */
+void set(gpu::GPUField<float> *field, Vector3<float> const &vec,
+         Cell const &cell);
+/** @brief Add value to a single cell. */
+void add(gpu::GPUField<float> *field, Vector3<float> const &vec,
+         Cell const &cell);
+/** @brief Initialize all cells with the same value. */
+void initialize(gpu::GPUField<float> *field, Vector3<float> const &vec);
+/** @brief Add value to all cells. */
+void add_to_all(gpu::GPUField<float> *field, Vector3<float> const &vec);
+/** @brief Get values from a cell interval. */
+std::vector<float> get(gpu::GPUField<float> const *vec_field,
+                       CellInterval const &ci);
+/** @brief Set values on a cell interval. */
+void set(gpu::GPUField<float> *vec_field, std::vector<float> const &values,
+         CellInterval const &ci);
+
+} // namespace Vector
+
+namespace Interpolation { // NOTE(review): pos looks like a flat list of positions and gl like a ghost layer count; confirm in the .cu file
+std::vector<float> get(gpu::GPUField<float> const *vec_field,
+                       std::vector<float> const &pos, uint gl);
+void set(gpu::GPUField<float> const *vec_field, std::vector<float> const &pos,
+         std::vector<float> const &forces, uint gl); // NOTE(review): field pointer is const although this is a setter; confirm this is intended
+} // namespace Interpolation
+
+namespace Density { // density accessors on the PDF field: one float per cell, std::vector<float> per cell interval
+float get(gpu::GPUField<float> const *pdf_field, Cell const &cell);
+void set(gpu::GPUField<float> *pdf_field, float const rho, Cell const &cell);
+std::vector<float> get(gpu::GPUField<float> const *pdf_field,
+                       CellInterval const &ci);
+void set(gpu::GPUField<float> *pdf_field, std::vector<float> const &values,
+         CellInterval const &ci);
+} // namespace Density
+
+namespace Velocity { // velocity accessors: getters take the PDF and force fields, setters also take the velocity field
+Vector3<float> get(gpu::GPUField<float> const *pdf_field,
+                   gpu::GPUField<float> const *force_field, Cell const &cell);
+std::vector<float> get(gpu::GPUField<float> const *pdf_field,
+                       gpu::GPUField<float> const *force_field,
+                       CellInterval const &ci);
+void set(gpu::GPUField<float> *pdf_field, gpu::GPUField<float> *velocity_field,
+         gpu::GPUField<float> const *force_field, Vector3<float> const &u,
+         Cell const &cell);
+void set(gpu::GPUField<float> *pdf_field, gpu::GPUField<float> *velocity_field,
+         gpu::GPUField<float> const *force_field,
+         std::vector<float> const &values, CellInterval const &ci);
+} // namespace Velocity
+
+namespace Force { // force setters: the PDF field is const here, the velocity and force fields are mutable
+void set(gpu::GPUField<float> const *pdf_field,
+         gpu::GPUField<float> *velocity_field,
+         gpu::GPUField<float> *force_field, Vector3<float> const &u,
+         Cell const &cell);
+void set(gpu::GPUField<float> const *pdf_field,
+         gpu::GPUField<float> *velocity_field,
+         gpu::GPUField<float> *force_field, std::vector<float> const &values,
+         CellInterval const &ci);
+} // namespace Force
+
+namespace DensityAndVelocity { // single-cell accessors: get returns a (float, Vector3<float>) tuple, set takes u and rho
+std::tuple<float, Vector3<float>> get(gpu::GPUField<float> const *pdf_field,
+                                      gpu::GPUField<float> const *force_field,
+                                      Cell const &cell);
+void set(gpu::GPUField<float> *pdf_field, gpu::GPUField<float> *force_field,
+         Vector3<float> const &u, float const rho, Cell const &cell);
+} // namespace DensityAndVelocity
+
+namespace DensityAndMomentumDensity { // single-cell getter returning a (float, Vector3<float>) tuple
+std::tuple<float, Vector3<float>> get(gpu::GPUField<float> const *pdf_field,
+                                      gpu::GPUField<float> const *force_field,
+                                      Cell const &cell);
+} // namespace DensityAndMomentumDensity
+
+namespace MomentumDensity { // reduce takes whole fields (no cell argument) and returns one Vector3<float>
+Vector3<float> reduce(gpu::GPUField<float> const *pdf_field,
+                      gpu::GPUField<float> const *force_field);
+} // namespace MomentumDensity
+
+namespace PressureTensor { // getters: Matrix3<float> for one cell, flat std::vector<float> for a cell interval
+Matrix3<float> get(gpu::GPUField<float> const *pdf_field, Cell const &cell);
+std::vector<float> get(gpu::GPUField<float> const *pdf_field,
+                       CellInterval const &ci);
+} // namespace PressureTensor
+
+} // namespace accessor
+} // namespace lbm
+} // namespace walberla
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecision.cpp b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecision.cpp
index 5407a10dc6f..25017078cd3 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecision.cpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecision.cpp
@@ -14,11 +14,10 @@
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
 //! \\file InitialPDFsSetterDoublePrecision.cpp
-//! \\ingroup lbm
-//! \\author lbmpy
+//! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2, lbmpy_walberla/pystencils_walberla from waLBerla commit 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #include <cmath>
 
@@ -50,80 +49,30 @@ namespace internal_2df07fce91f5444fc18533f996cd1a79 {
 static FUNC_PREFIX void initialpdfssetterdoubleprecision_initialpdfssetterdoubleprecision(double *RESTRICT const _data_force, double *RESTRICT _data_pdfs, double *RESTRICT const _data_velocity, int64_t const _size_force_0, int64_t const _size_force_1, int64_t const _size_force_2, int64_t const _stride_force_0, int64_t const _stride_force_1, int64_t const _stride_force_2, int64_t const _stride_force_3, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_velocity_0, int64_t const _stride_velocity_1, int64_t const _stride_velocity_2, int64_t const _stride_velocity_3, double rho_0) {
   const double rho = rho_0;
   for (int64_t ctr_2 = 0; ctr_2 < _size_force_2; ctr_2 += 1) {
-    double *RESTRICT _data_force_20_30 = _data_force + _stride_force_2 * ctr_2;
-    double *RESTRICT _data_velocity_20_30 = _data_velocity + _stride_velocity_2 * ctr_2;
-    double *RESTRICT _data_force_20_31 = _data_force + _stride_force_2 * ctr_2 + _stride_force_3;
-    double *RESTRICT _data_velocity_20_31 = _data_velocity + _stride_velocity_2 * ctr_2 + _stride_velocity_3;
-    double *RESTRICT _data_force_20_32 = _data_force + _stride_force_2 * ctr_2 + 2 * _stride_force_3;
-    double *RESTRICT _data_velocity_20_32 = _data_velocity + _stride_velocity_2 * ctr_2 + 2 * _stride_velocity_3;
-    double *RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2 * ctr_2;
-    double *RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_35 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_36 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_311 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_312 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_313 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_314 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_315 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_316 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_317 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_318 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3;
     for (int64_t ctr_1 = 0; ctr_1 < _size_force_1; ctr_1 += 1) {
-      double *RESTRICT _data_force_20_30_10 = _stride_force_1 * ctr_1 + _data_force_20_30;
-      double *RESTRICT _data_velocity_20_30_10 = _stride_velocity_1 * ctr_1 + _data_velocity_20_30;
-      double *RESTRICT _data_force_20_31_10 = _stride_force_1 * ctr_1 + _data_force_20_31;
-      double *RESTRICT _data_velocity_20_31_10 = _stride_velocity_1 * ctr_1 + _data_velocity_20_31;
-      double *RESTRICT _data_force_20_32_10 = _stride_force_1 * ctr_1 + _data_force_20_32;
-      double *RESTRICT _data_velocity_20_32_10 = _stride_velocity_1 * ctr_1 + _data_velocity_20_32;
-      double *RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_30;
-      double *RESTRICT _data_pdfs_20_31_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_31;
-      double *RESTRICT _data_pdfs_20_32_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_32;
-      double *RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_33;
-      double *RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_34;
-      double *RESTRICT _data_pdfs_20_35_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_35;
-      double *RESTRICT _data_pdfs_20_36_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_36;
-      double *RESTRICT _data_pdfs_20_37_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_37;
-      double *RESTRICT _data_pdfs_20_38_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_38;
-      double *RESTRICT _data_pdfs_20_39_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_39;
-      double *RESTRICT _data_pdfs_20_310_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_310;
-      double *RESTRICT _data_pdfs_20_311_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_311;
-      double *RESTRICT _data_pdfs_20_312_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_312;
-      double *RESTRICT _data_pdfs_20_313_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_313;
-      double *RESTRICT _data_pdfs_20_314_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_314;
-      double *RESTRICT _data_pdfs_20_315_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_315;
-      double *RESTRICT _data_pdfs_20_316_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_316;
-      double *RESTRICT _data_pdfs_20_317_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_317;
-      double *RESTRICT _data_pdfs_20_318_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_318;
       for (int64_t ctr_0 = 0; ctr_0 < _size_force_0; ctr_0 += 1) {
-        const double u_0 = -0.5 * ((1.0) / (rho)) * _data_force_20_30_10[_stride_force_0 * ctr_0] + _data_velocity_20_30_10[_stride_velocity_0 * ctr_0];
-        const double u_1 = -0.5 * ((1.0) / (rho)) * _data_force_20_31_10[_stride_force_0 * ctr_0] + _data_velocity_20_31_10[_stride_velocity_0 * ctr_0];
-        const double u_2 = -0.5 * ((1.0) / (rho)) * _data_force_20_32_10[_stride_force_0 * ctr_0] + _data_velocity_20_32_10[_stride_velocity_0 * ctr_0];
-        _data_pdfs_20_30_10[_stride_pdfs_0 * ctr_0] = rho * -0.33333333333333331 * (u_0 * u_0) + rho * -0.33333333333333331 * (u_1 * u_1) + rho * -0.33333333333333331 * (u_2 * u_2) + rho * 0.33333333333333331;
-        _data_pdfs_20_31_10[_stride_pdfs_0 * ctr_0] = rho * u_1 * 0.16666666666666666 + rho * -0.16666666666666666 * (u_0 * u_0) + rho * -0.16666666666666666 * (u_2 * u_2) + rho * 0.055555555555555552 + rho * 0.16666666666666666 * (u_1 * u_1);
-        _data_pdfs_20_32_10[_stride_pdfs_0 * ctr_0] = rho * u_1 * -0.16666666666666666 + rho * -0.16666666666666666 * (u_0 * u_0) + rho * -0.16666666666666666 * (u_2 * u_2) + rho * 0.055555555555555552 + rho * 0.16666666666666666 * (u_1 * u_1);
-        _data_pdfs_20_33_10[_stride_pdfs_0 * ctr_0] = rho * u_0 * -0.16666666666666666 + rho * -0.16666666666666666 * (u_1 * u_1) + rho * -0.16666666666666666 * (u_2 * u_2) + rho * 0.055555555555555552 + rho * 0.16666666666666666 * (u_0 * u_0);
-        _data_pdfs_20_34_10[_stride_pdfs_0 * ctr_0] = rho * u_0 * 0.16666666666666666 + rho * -0.16666666666666666 * (u_1 * u_1) + rho * -0.16666666666666666 * (u_2 * u_2) + rho * 0.055555555555555552 + rho * 0.16666666666666666 * (u_0 * u_0);
-        _data_pdfs_20_35_10[_stride_pdfs_0 * ctr_0] = rho * u_2 * 0.16666666666666666 + rho * -0.16666666666666666 * (u_0 * u_0) + rho * -0.16666666666666666 * (u_1 * u_1) + rho * 0.055555555555555552 + rho * 0.16666666666666666 * (u_2 * u_2);
-        _data_pdfs_20_36_10[_stride_pdfs_0 * ctr_0] = rho * u_2 * -0.16666666666666666 + rho * -0.16666666666666666 * (u_0 * u_0) + rho * -0.16666666666666666 * (u_1 * u_1) + rho * 0.055555555555555552 + rho * 0.16666666666666666 * (u_2 * u_2);
-        _data_pdfs_20_37_10[_stride_pdfs_0 * ctr_0] = rho * u_0 * u_1 * -0.25 + rho * u_0 * -0.083333333333333329 + rho * u_1 * 0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_0 * u_0) + rho * 0.083333333333333329 * (u_1 * u_1);
-        _data_pdfs_20_38_10[_stride_pdfs_0 * ctr_0] = rho * u_0 * u_1 * 0.25 + rho * u_0 * 0.083333333333333329 + rho * u_1 * 0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_0 * u_0) + rho * 0.083333333333333329 * (u_1 * u_1);
-        _data_pdfs_20_39_10[_stride_pdfs_0 * ctr_0] = rho * u_0 * u_1 * 0.25 + rho * u_0 * -0.083333333333333329 + rho * u_1 * -0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_0 * u_0) + rho * 0.083333333333333329 * (u_1 * u_1);
-        _data_pdfs_20_310_10[_stride_pdfs_0 * ctr_0] = rho * u_0 * u_1 * -0.25 + rho * u_0 * 0.083333333333333329 + rho * u_1 * -0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_0 * u_0) + rho * 0.083333333333333329 * (u_1 * u_1);
-        _data_pdfs_20_311_10[_stride_pdfs_0 * ctr_0] = rho * u_1 * u_2 * 0.25 + rho * u_1 * 0.083333333333333329 + rho * u_2 * 0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_1 * u_1) + rho * 0.083333333333333329 * (u_2 * u_2);
-        _data_pdfs_20_312_10[_stride_pdfs_0 * ctr_0] = rho * u_1 * u_2 * -0.25 + rho * u_1 * -0.083333333333333329 + rho * u_2 * 0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_1 * u_1) + rho * 0.083333333333333329 * (u_2 * u_2);
-        _data_pdfs_20_313_10[_stride_pdfs_0 * ctr_0] = rho * u_0 * u_2 * -0.25 + rho * u_0 * -0.083333333333333329 + rho * u_2 * 0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_0 * u_0) + rho * 0.083333333333333329 * (u_2 * u_2);
-        _data_pdfs_20_314_10[_stride_pdfs_0 * ctr_0] = rho * u_0 * u_2 * 0.25 + rho * u_0 * 0.083333333333333329 + rho * u_2 * 0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_0 * u_0) + rho * 0.083333333333333329 * (u_2 * u_2);
-        _data_pdfs_20_315_10[_stride_pdfs_0 * ctr_0] = rho * u_1 * u_2 * -0.25 + rho * u_1 * 0.083333333333333329 + rho * u_2 * -0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_1 * u_1) + rho * 0.083333333333333329 * (u_2 * u_2);
-        _data_pdfs_20_316_10[_stride_pdfs_0 * ctr_0] = rho * u_1 * u_2 * 0.25 + rho * u_1 * -0.083333333333333329 + rho * u_2 * -0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_1 * u_1) + rho * 0.083333333333333329 * (u_2 * u_2);
-        _data_pdfs_20_317_10[_stride_pdfs_0 * ctr_0] = rho * u_0 * u_2 * 0.25 + rho * u_0 * -0.083333333333333329 + rho * u_2 * -0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_0 * u_0) + rho * 0.083333333333333329 * (u_2 * u_2);
-        _data_pdfs_20_318_10[_stride_pdfs_0 * ctr_0] = rho * u_0 * u_2 * -0.25 + rho * u_0 * 0.083333333333333329 + rho * u_2 * -0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_0 * u_0) + rho * 0.083333333333333329 * (u_2 * u_2);
+        const double u_0 = -0.5 * ((1.0) / (rho)) * _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2] + _data_velocity[_stride_velocity_0 * ctr_0 + _stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2];
+        const double u_1 = -0.5 * ((1.0) / (rho)) * _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3] + _data_velocity[_stride_velocity_0 * ctr_0 + _stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + _stride_velocity_3];
+        const double u_2 = -0.5 * ((1.0) / (rho)) * _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3] + _data_velocity[_stride_velocity_0 * ctr_0 + _stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + 2 * _stride_velocity_3];
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2] = rho * -0.33333333333333331 * (u_0 * u_0) + rho * -0.33333333333333331 * (u_1 * u_1) + rho * -0.33333333333333331 * (u_2 * u_2) + rho * 0.33333333333333331;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3] = rho * u_1 * 0.16666666666666666 + rho * -0.16666666666666666 * (u_0 * u_0) + rho * -0.16666666666666666 * (u_2 * u_2) + rho * 0.055555555555555552 + rho * 0.16666666666666666 * (u_1 * u_1);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3] = rho * u_1 * -0.16666666666666666 + rho * -0.16666666666666666 * (u_0 * u_0) + rho * -0.16666666666666666 * (u_2 * u_2) + rho * 0.055555555555555552 + rho * 0.16666666666666666 * (u_1 * u_1);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3] = rho * u_0 * -0.16666666666666666 + rho * -0.16666666666666666 * (u_1 * u_1) + rho * -0.16666666666666666 * (u_2 * u_2) + rho * 0.055555555555555552 + rho * 0.16666666666666666 * (u_0 * u_0);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3] = rho * u_0 * 0.16666666666666666 + rho * -0.16666666666666666 * (u_1 * u_1) + rho * -0.16666666666666666 * (u_2 * u_2) + rho * 0.055555555555555552 + rho * 0.16666666666666666 * (u_0 * u_0);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3] = rho * u_2 * 0.16666666666666666 + rho * -0.16666666666666666 * (u_0 * u_0) + rho * -0.16666666666666666 * (u_1 * u_1) + rho * 0.055555555555555552 + rho * 0.16666666666666666 * (u_2 * u_2);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3] = rho * u_2 * -0.16666666666666666 + rho * -0.16666666666666666 * (u_0 * u_0) + rho * -0.16666666666666666 * (u_1 * u_1) + rho * 0.055555555555555552 + rho * 0.16666666666666666 * (u_2 * u_2);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3] = rho * u_0 * u_1 * -0.25 + rho * u_0 * -0.083333333333333329 + rho * u_1 * 0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_0 * u_0) + rho * 0.083333333333333329 * (u_1 * u_1);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3] = rho * u_0 * u_1 * 0.25 + rho * u_0 * 0.083333333333333329 + rho * u_1 * 0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_0 * u_0) + rho * 0.083333333333333329 * (u_1 * u_1);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3] = rho * u_0 * u_1 * 0.25 + rho * u_0 * -0.083333333333333329 + rho * u_1 * -0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_0 * u_0) + rho * 0.083333333333333329 * (u_1 * u_1);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3] = rho * u_0 * u_1 * -0.25 + rho * u_0 * 0.083333333333333329 + rho * u_1 * -0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_0 * u_0) + rho * 0.083333333333333329 * (u_1 * u_1);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3] = rho * u_1 * u_2 * 0.25 + rho * u_1 * 0.083333333333333329 + rho * u_2 * 0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_1 * u_1) + rho * 0.083333333333333329 * (u_2 * u_2);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3] = rho * u_1 * u_2 * -0.25 + rho * u_1 * -0.083333333333333329 + rho * u_2 * 0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_1 * u_1) + rho * 0.083333333333333329 * (u_2 * u_2);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3] = rho * u_0 * u_2 * -0.25 + rho * u_0 * -0.083333333333333329 + rho * u_2 * 0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_0 * u_0) + rho * 0.083333333333333329 * (u_2 * u_2);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3] = rho * u_0 * u_2 * 0.25 + rho * u_0 * 0.083333333333333329 + rho * u_2 * 0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_0 * u_0) + rho * 0.083333333333333329 * (u_2 * u_2);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3] = rho * u_1 * u_2 * -0.25 + rho * u_1 * 0.083333333333333329 + rho * u_2 * -0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_1 * u_1) + rho * 0.083333333333333329 * (u_2 * u_2);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3] = rho * u_1 * u_2 * 0.25 + rho * u_1 * -0.083333333333333329 + rho * u_2 * -0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_1 * u_1) + rho * 0.083333333333333329 * (u_2 * u_2);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3] = rho * u_0 * u_2 * 0.25 + rho * u_0 * -0.083333333333333329 + rho * u_2 * -0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_0 * u_0) + rho * 0.083333333333333329 * (u_2 * u_2);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3] = rho * u_0 * u_2 * -0.25 + rho * u_0 * 0.083333333333333329 + rho * u_2 * -0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_0 * u_0) + rho * 0.083333333333333329 * (u_2 * u_2);
       }
     }
   }
@@ -131,29 +80,30 @@ static FUNC_PREFIX void initialpdfssetterdoubleprecision_initialpdfssetterdouble
 } // namespace internal_2df07fce91f5444fc18533f996cd1a79
 
 void InitialPDFsSetterDoublePrecision::run(IBlock *block) {
+
   auto force = block->getData<field::GhostLayerField<double, 3>>(forceID);
   auto pdfs = block->getData<field::GhostLayerField<double, 19>>(pdfsID);
   auto velocity = block->getData<field::GhostLayerField<double, 3>>(velocityID);
 
   auto &rho_0 = this->rho_0_;
-  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()))
   double *RESTRICT const _data_force = force->dataAt(0, 0, 0, 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()))
   double *RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(velocity->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(velocity->nrOfGhostLayers()))
   double *RESTRICT const _data_velocity = velocity->dataAt(0, 0, 0, 0);
-  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(force->xSize()) + 0));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(force->xSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(force->ySize()) + 0));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(force->ySize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(force->zSize()) + 0));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(force->zSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
+  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(force->xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(force->xSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(force->ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(force->ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(force->zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(force->zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
   const int64_t _stride_force_0 = int64_t(force->xStride());
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
@@ -170,6 +120,7 @@ void InitialPDFsSetterDoublePrecision::run(IBlock *block) {
 }
 
 void InitialPDFsSetterDoublePrecision::runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block) {
+
   CellInterval ci = globalCellInterval;
   CellInterval blockBB = blocks->getBlockCellBB(*block);
   blockBB.expand(ghostLayers);
@@ -183,30 +134,30 @@ void InitialPDFsSetterDoublePrecision::runOnCellInterval(const shared_ptr<Struct
   auto velocity = block->getData<field::GhostLayerField<double, 3>>(velocityID);
 
   auto &rho_0 = this->rho_0_;
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()))
   double *RESTRICT const _data_force = force->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()))
   double *RESTRICT _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(velocity->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(velocity->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(velocity->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(velocity->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(velocity->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(velocity->nrOfGhostLayers()))
   double *RESTRICT const _data_velocity = velocity->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
-  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(ci.xSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(ci.ySize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(ci.zSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
+  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(ci.xSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(ci.ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(ci.zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
   const int64_t _stride_force_0 = int64_t(force->xStride());
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
@@ -231,4 +182,4 @@ void InitialPDFsSetterDoublePrecision::runOnCellInterval(const shared_ptr<Struct
 
 #if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
 #pragma warning pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecision.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecision.h
index 31b316dcbb8..c472f1f5112 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecision.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecision.h
@@ -17,12 +17,13 @@
 //! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2,
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
 // lbmpy_walberla/pystencils_walberla from waLBerla commit
-// 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #pragma once
 #include "core/DataTypes.h"
+#include "core/logging/Logging.h"
 
 #include "domain_decomposition/BlockDataID.h"
 #include "domain_decomposition/IBlock.h"
@@ -91,6 +92,9 @@ class InitialPDFsSetterDoublePrecision {
     };
   }
 
+  void configure(const shared_ptr<StructuredBlockStorage> &blocks,
+                 IBlock *block) {} // no-op: the body is empty and both arguments are unused
+
   BlockDataID forceID;
   BlockDataID pdfsID;
   BlockDataID velocityID;
@@ -103,4 +107,4 @@ class InitialPDFsSetterDoublePrecision {
 #if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
     (defined WALBERLA_CXX_COMPILER_IS_CLANG)
 #pragma GCC diagnostic pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecisionCUDA.cu b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecisionCUDA.cu
new file mode 100644
index 00000000000..394d053dd97
--- /dev/null
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecisionCUDA.cu
@@ -0,0 +1,188 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file InitialPDFsSetterDoublePrecisionCUDA.cu
+//! \\author pystencils
+//======================================================================================================================
+
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
+
+#include <cmath>
+
+#include "InitialPDFsSetterDoublePrecisionCUDA.h"
+#include "core/DataTypes.h"
+#include "core/Macros.h"
+
+#define FUNC_PREFIX __global__
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) || (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wfloat-equal"
+#pragma GCC diagnostic ignored "-Wshadow"
+#pragma GCC diagnostic ignored "-Wconversion"
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#endif
+
+#if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
+#pragma warning push
+#pragma warning(disable : 1599)
+#endif
+
+using namespace std;
+
+namespace walberla {
+namespace pystencils {
+
+namespace internal_initialpdfssetterdoubleprecisioncuda_initialpdfssetterdoubleprecisioncuda { // holds the generated GPU kernel launched by run() and runOnCellInterval()
+static FUNC_PREFIX __launch_bounds__(256) void initialpdfssetterdoubleprecisioncuda_initialpdfssetterdoubleprecisioncuda(double *RESTRICT const _data_force, double *RESTRICT _data_pdfs, double *RESTRICT const _data_velocity, int64_t const _size_force_0, int64_t const _size_force_1, int64_t const _size_force_2, int64_t const _stride_force_0, int64_t const _stride_force_1, int64_t const _stride_force_2, int64_t const _stride_force_3, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_velocity_0, int64_t const _stride_velocity_1, int64_t const _stride_velocity_2, int64_t const _stride_velocity_3, double rho_0) { // each thread handles one cell: reads 3 force and 3 velocity components and writes 19 pdf entries (offsets 0..18 along _stride_pdfs_3)
+  if (blockDim.x * blockIdx.x + threadIdx.x < _size_force_0 && blockDim.y * blockIdx.y + threadIdx.y < _size_force_1 && blockDim.z * blockIdx.z + threadIdx.z < _size_force_2) { // bounds guard: threads outside the _size_force_* extents do nothing
+    const int64_t ctr_0 = blockDim.x * blockIdx.x + threadIdx.x; // cell index along axis 0, relative to the data pointers
+    const int64_t ctr_1 = blockDim.y * blockIdx.y + threadIdx.y; // cell index along axis 1
+    const int64_t ctr_2 = blockDim.z * blockIdx.z + threadIdx.z; // cell index along axis 2
+    const double rho = rho_0; // the same density value is used for every cell
+    const double u_0 = -0.5 * ((1.0) / (rho)) * _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2] + _data_velocity[_stride_velocity_0 * ctr_0 + _stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2]; // u_i = velocity_i - 0.5 * force_i / rho (component 0)
+    const double u_1 = -0.5 * ((1.0) / (rho)) * _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3] + _data_velocity[_stride_velocity_0 * ctr_0 + _stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + _stride_velocity_3]; // component 1
+    const double u_2 = -0.5 * ((1.0) / (rho)) * _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3] + _data_velocity[_stride_velocity_0 * ctr_0 + _stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + 2 * _stride_velocity_3]; // component 2
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2] = rho * -0.33333333333333331 * (u_0 * u_0) + rho * -0.33333333333333331 * (u_1 * u_1) + rho * -0.33333333333333331 * (u_2 * u_2) + rho * 0.33333333333333331; // pdf 0; NOTE(review): the 1/3, 1/18, 1/36 prefactors below look like a D3Q19 second-order equilibrium — confirm against the lbmpy generator script
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3] = rho * u_1 * 0.16666666666666666 + rho * -0.16666666666666666 * (u_0 * u_0) + rho * -0.16666666666666666 * (u_2 * u_2) + rho * 0.055555555555555552 + rho * 0.16666666666666666 * (u_1 * u_1);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3] = rho * u_1 * -0.16666666666666666 + rho * -0.16666666666666666 * (u_0 * u_0) + rho * -0.16666666666666666 * (u_2 * u_2) + rho * 0.055555555555555552 + rho * 0.16666666666666666 * (u_1 * u_1);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3] = rho * u_0 * -0.16666666666666666 + rho * -0.16666666666666666 * (u_1 * u_1) + rho * -0.16666666666666666 * (u_2 * u_2) + rho * 0.055555555555555552 + rho * 0.16666666666666666 * (u_0 * u_0);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3] = rho * u_0 * 0.16666666666666666 + rho * -0.16666666666666666 * (u_1 * u_1) + rho * -0.16666666666666666 * (u_2 * u_2) + rho * 0.055555555555555552 + rho * 0.16666666666666666 * (u_0 * u_0);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3] = rho * u_2 * 0.16666666666666666 + rho * -0.16666666666666666 * (u_0 * u_0) + rho * -0.16666666666666666 * (u_1 * u_1) + rho * 0.055555555555555552 + rho * 0.16666666666666666 * (u_2 * u_2);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3] = rho * u_2 * -0.16666666666666666 + rho * -0.16666666666666666 * (u_0 * u_0) + rho * -0.16666666666666666 * (u_1 * u_1) + rho * 0.055555555555555552 + rho * 0.16666666666666666 * (u_2 * u_2);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3] = rho * u_0 * u_1 * -0.25 + rho * u_0 * -0.083333333333333329 + rho * u_1 * 0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_0 * u_0) + rho * 0.083333333333333329 * (u_1 * u_1);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3] = rho * u_0 * u_1 * 0.25 + rho * u_0 * 0.083333333333333329 + rho * u_1 * 0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_0 * u_0) + rho * 0.083333333333333329 * (u_1 * u_1);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3] = rho * u_0 * u_1 * 0.25 + rho * u_0 * -0.083333333333333329 + rho * u_1 * -0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_0 * u_0) + rho * 0.083333333333333329 * (u_1 * u_1);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3] = rho * u_0 * u_1 * -0.25 + rho * u_0 * 0.083333333333333329 + rho * u_1 * -0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_0 * u_0) + rho * 0.083333333333333329 * (u_1 * u_1);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3] = rho * u_1 * u_2 * 0.25 + rho * u_1 * 0.083333333333333329 + rho * u_2 * 0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_1 * u_1) + rho * 0.083333333333333329 * (u_2 * u_2);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3] = rho * u_1 * u_2 * -0.25 + rho * u_1 * -0.083333333333333329 + rho * u_2 * 0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_1 * u_1) + rho * 0.083333333333333329 * (u_2 * u_2);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3] = rho * u_0 * u_2 * -0.25 + rho * u_0 * -0.083333333333333329 + rho * u_2 * 0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_0 * u_0) + rho * 0.083333333333333329 * (u_2 * u_2);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3] = rho * u_0 * u_2 * 0.25 + rho * u_0 * 0.083333333333333329 + rho * u_2 * 0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_0 * u_0) + rho * 0.083333333333333329 * (u_2 * u_2);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3] = rho * u_1 * u_2 * -0.25 + rho * u_1 * 0.083333333333333329 + rho * u_2 * -0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_1 * u_1) + rho * 0.083333333333333329 * (u_2 * u_2);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3] = rho * u_1 * u_2 * 0.25 + rho * u_1 * -0.083333333333333329 + rho * u_2 * -0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_1 * u_1) + rho * 0.083333333333333329 * (u_2 * u_2);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3] = rho * u_0 * u_2 * 0.25 + rho * u_0 * -0.083333333333333329 + rho * u_2 * -0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_0 * u_0) + rho * 0.083333333333333329 * (u_2 * u_2);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3] = rho * u_0 * u_2 * -0.25 + rho * u_0 * 0.083333333333333329 + rho * u_2 * -0.083333333333333329 + rho * 0.027777777777777776 + rho * 0.083333333333333329 * (u_0 * u_0) + rho * 0.083333333333333329 * (u_2 * u_2);
+  }
+}
+} // namespace internal_initialpdfssetterdoubleprecisioncuda_initialpdfssetterdoubleprecisioncuda
+
+void InitialPDFsSetterDoublePrecisionCUDA::run(IBlock *block, gpuStream_t stream) { // launches the pdf-initialisation kernel for this block, starting at cell (0, 0, 0) and spanning force->xSize() x ySize() x zSize() cells
+
+  auto velocity = block->getData<gpu::GPUField<double>>(velocityID); // the three GPU fields are looked up on the block by their stored BlockDataIDs
+  auto force = block->getData<gpu::GPUField<double>>(forceID);
+  auto pdfs = block->getData<gpu::GPUField<double>>(pdfsID);
+
+  auto &rho_0 = this->rho_0_; // density member, passed by value to the kernel below
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()))
+  double *RESTRICT const _data_force = force->dataAt(0, 0, 0, 0); // base pointers are taken at cell (0, 0, 0), component 0
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()))
+  double *RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(velocity->nrOfGhostLayers()))
+  double *RESTRICT const _data_velocity = velocity->dataAt(0, 0, 0, 0);
+  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(force->xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(force->xSize()) + 0); // iteration extents come from the force field only
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(force->ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(force->ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(force->zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(force->zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  const int64_t _stride_force_0 = int64_t(force->xStride());
+  const int64_t _stride_force_1 = int64_t(force->yStride());
+  const int64_t _stride_force_2 = int64_t(force->zStride());
+  const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
+  const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
+  const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
+  const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
+  const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
+  const int64_t _stride_velocity_0 = int64_t(velocity->xStride());
+  const int64_t _stride_velocity_1 = int64_t(velocity->yStride());
+  const int64_t _stride_velocity_2 = int64_t(velocity->zStride());
+  const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride()));
+  dim3 _block(uint32_c(((128 < _size_force_0) ? 128 : _size_force_0)), uint32_c(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))), uint32_c(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))))); // block = (bx, by, bz) with bx = min(128, size_0), by = min(1024, size_1, 2 * (128 / bx)), bz = min(64, size_2, 256 / (bx * by)), all integer division; NOTE(review): bx and bx * by are divisors, so a zero-sized extent would divide by zero — confirm fields are never empty here
+  dim3 _grid(uint32_c(((_size_force_0) % (((128 < _size_force_0) ? 128 : _size_force_0)) == 0 ? (int64_t)(_size_force_0) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)) : ((int64_t)(_size_force_0) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))) + 1)), uint32_c(((_size_force_1) % (((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))) == 0 ? (int64_t)(_size_force_1) / (int64_t)(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))) : ((int64_t)(_size_force_1) / (int64_t)(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) + 1)), uint32_c(((_size_force_2) % (((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? 
_size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))))) == 0 ? (int64_t)(_size_force_2) / (int64_t)(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 
128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))))) : ((int64_t)(_size_force_2) / (int64_t)(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))))) + 1))); // grid = ceil(size_i / block_i) per axis, using the same block dimensions as _block
+  internal_initialpdfssetterdoubleprecisioncuda_initialpdfssetterdoubleprecisioncuda::initialpdfssetterdoubleprecisioncuda_initialpdfssetterdoubleprecisioncuda<<<_grid, _block, 0, stream>>>(_data_force, _data_pdfs, _data_velocity, _size_force_0, _size_force_1, _size_force_2, _stride_force_0, _stride_force_1, _stride_force_2, _stride_force_3, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_velocity_0, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3, rho_0); // launched on `stream`; the third launch argument (0) requests no dynamic shared memory
+}
+
+void InitialPDFsSetterDoublePrecisionCUDA::runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block, gpuStream_t stream) { // launches the same kernel as run(), restricted to the part of globalCellInterval that overlaps this block's bounding box expanded by ghostLayers
+
+  CellInterval ci = globalCellInterval; // work on a copy; the caller's interval is const
+  CellInterval blockBB = blocks->getBlockCellBB(*block);
+  blockBB.expand(ghostLayers);
+  ci.intersect(blockBB); // clip the request to the expanded block bounding box
+  blocks->transformGlobalToBlockLocalCellInterval(ci, *block); // NOTE(review): presumably rewrites ci in block-local cell coordinates, as the name says — confirm in StructuredBlockStorage
+  if (ci.empty())
+    return; // no overlap with this block: no kernel launch
+
+  auto velocity = block->getData<gpu::GPUField<double>>(velocityID); // the three GPU fields are looked up on the block by their stored BlockDataIDs
+  auto force = block->getData<gpu::GPUField<double>>(forceID);
+  auto pdfs = block->getData<gpu::GPUField<double>>(pdfsID);
+
+  auto &rho_0 = this->rho_0_; // density member, passed by value to the kernel below
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()))
+  double *RESTRICT const _data_force = force->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); // base pointers are taken at the interval's minimum corner, so kernel indices start at 0 there
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()))
+  double *RESTRICT _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(velocity->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(velocity->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(velocity->nrOfGhostLayers()))
+  double *RESTRICT const _data_velocity = velocity->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
+  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(ci.xSize()) + 0); // iteration extents are the clipped interval's sizes
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(ci.ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(ci.zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  const int64_t _stride_force_0 = int64_t(force->xStride());
+  const int64_t _stride_force_1 = int64_t(force->yStride());
+  const int64_t _stride_force_2 = int64_t(force->zStride());
+  const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
+  const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
+  const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
+  const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
+  const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
+  const int64_t _stride_velocity_0 = int64_t(velocity->xStride());
+  const int64_t _stride_velocity_1 = int64_t(velocity->yStride());
+  const int64_t _stride_velocity_2 = int64_t(velocity->zStride());
+  const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride()));
+  dim3 _block(uint32_c(((128 < _size_force_0) ? 128 : _size_force_0)), uint32_c(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))), uint32_c(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))))); // block = (bx, by, bz) with bx = min(128, size_0), by = min(1024, size_1, 2 * (128 / bx)), bz = min(64, size_2, 256 / (bx * by)), all integer division
+  dim3 _grid(uint32_c(((_size_force_0) % (((128 < _size_force_0) ? 128 : _size_force_0)) == 0 ? (int64_t)(_size_force_0) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)) : ((int64_t)(_size_force_0) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))) + 1)), uint32_c(((_size_force_1) % (((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))) == 0 ? (int64_t)(_size_force_1) / (int64_t)(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))) : ((int64_t)(_size_force_1) / (int64_t)(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) + 1)), uint32_c(((_size_force_2) % (((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? 
_size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))))) == 0 ? (int64_t)(_size_force_2) / (int64_t)(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 
128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))))) : ((int64_t)(_size_force_2) / (int64_t)(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))))) + 1))); // grid = ceil(size_i / block_i) per axis, using the same block dimensions as _block
+  internal_initialpdfssetterdoubleprecisioncuda_initialpdfssetterdoubleprecisioncuda::initialpdfssetterdoubleprecisioncuda_initialpdfssetterdoubleprecisioncuda<<<_grid, _block, 0, stream>>>(_data_force, _data_pdfs, _data_velocity, _size_force_0, _size_force_1, _size_force_2, _stride_force_0, _stride_force_1, _stride_force_2, _stride_force_3, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_velocity_0, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3, rho_0); // launched on `stream`; the third launch argument (0) requests no dynamic shared memory
+}
+
+} // namespace pystencils
+} // namespace walberla
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) || (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic pop
+#endif
+
+#if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
+#pragma warning pop
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecisionCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecisionCUDA.h
new file mode 100644
index 00000000000..6f4ab75275a
--- /dev/null
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterDoublePrecisionCUDA.h
@@ -0,0 +1,120 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file InitialPDFsSetterDoublePrecisionCUDA.h
+//! \\author pystencils
+//======================================================================================================================
+
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
+// lbmpy_walberla/pystencils_walberla from waLBerla commit
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
+
+#pragma once
+#include "core/DataTypes.h"
+#include "core/logging/Logging.h"
+
+#include "gpu/GPUField.h"
+#include "gpu/GPUWrapper.h"
+
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+#include "domain_decomposition/StructuredBlockStorage.h"
+#include "field/SwapableCompare.h"
+#include <set>
+
+#ifdef __GNUC__
+#define RESTRICT __restrict__
+#elif _MSC_VER
+#define RESTRICT __restrict
+#else
+#define RESTRICT
+#endif
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
+    (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wreorder"
+#endif
+
+namespace walberla {
+namespace pystencils {
+
+class InitialPDFsSetterDoublePrecisionCUDA { // Sweep functor: stores three block-data IDs plus rho_0 and exposes run()/runOnCellInterval() entry points that take an optional GPU stream.
+public:
+  InitialPDFsSetterDoublePrecisionCUDA(BlockDataID forceID_,
+                                       BlockDataID pdfsID_,
+                                       BlockDataID velocityID_, double rho_0)
+      : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_),
+        rho_0_(rho_0){}; // constructor only copies its arguments into the members; no block data is touched here
+
+  void run(IBlock *block, gpuStream_t stream = nullptr); // declared only; the body is not in this header (presumably in the matching .cu file -- confirm)
+
+  void runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks,
+                         const CellInterval &globalCellInterval,
+                         cell_idx_t ghostLayers, IBlock *block,
+                         gpuStream_t stream = nullptr); // declared only; NOTE(review): presumably restricts the sweep to globalCellInterval within the block -- verify against the definition
+
+  void operator()(IBlock *block, gpuStream_t stream = nullptr) { // call operator is a thin alias for run()
+    run(block, stream);
+  }
+
+  static std::function<void(IBlock *)>
+  getSweep(const shared_ptr<InitialPDFsSetterDoublePrecisionCUDA> &kernel) { // static factory: the lambda copies the shared_ptr, so the returned callable shares ownership of the kernel
+    return [kernel](IBlock *b) { kernel->run(b); }; // run() is called without a stream, i.e. with its default argument (nullptr)
+  }
+
+  static std::function<void(IBlock *, gpuStream_t)> getSweepOnCellInterval(
+      const shared_ptr<InitialPDFsSetterDoublePrecisionCUDA> &kernel,
+      const shared_ptr<StructuredBlockStorage> &blocks,
+      const CellInterval &globalCellInterval, cell_idx_t ghostLayers = 1) { // static factory for the cell-interval variant; all captures are by value
+    return [kernel, blocks, globalCellInterval,
+            ghostLayers](IBlock *b, gpuStream_t stream = nullptr) { // the lambda's default for stream is not reachable through the two-argument std::function return type; callers must pass a stream
+      kernel->runOnCellInterval(blocks, globalCellInterval, ghostLayers, b,
+                                stream);
+    };
+  }
+
+  std::function<void(IBlock *)> getSweep(gpuStream_t stream = nullptr) { // member factory: binds the given stream into the returned callable
+    return [this, stream](IBlock *b) { this->run(b, stream); }; // captures the raw this pointer, so the callable must not be invoked after this object is destroyed
+  }
+
+  std::function<void(IBlock *)>
+  getSweepOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks,
+                         const CellInterval &globalCellInterval,
+                         cell_idx_t ghostLayers = 1,
+                         gpuStream_t stream = nullptr) { // member factory for the cell-interval variant; binds blocks, interval, ghost-layer count and stream
+    return [this, blocks, globalCellInterval, ghostLayers, stream](IBlock *b) { // captures the raw this pointer; same lifetime caveat as the member getSweep()
+      this->runOnCellInterval(blocks, globalCellInterval, ghostLayers, b,
+                              stream);
+    };
+  }
+
+  void configure(const shared_ptr<StructuredBlockStorage> &blocks,
+                 IBlock *block) {} // intentionally empty: both parameters are ignored
+
+  BlockDataID forceID; // set from the constructor's forceID_ argument
+  BlockDataID pdfsID; // set from the constructor's pdfsID_ argument
+  BlockDataID velocityID; // set from the constructor's velocityID_ argument
+  double rho_0_; // set from the constructor's rho_0 argument
+};
+
+} // namespace pystencils
+} // namespace walberla
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
+    (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic pop
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecision.cpp b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecision.cpp
index 0c963fb65e5..d319d749f7e 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecision.cpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecision.cpp
@@ -14,11 +14,10 @@
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
 //! \\file InitialPDFsSetterSinglePrecision.cpp
-//! \\ingroup lbm
-//! \\author lbmpy
+//! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2, lbmpy_walberla/pystencils_walberla from waLBerla commit 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #include <cmath>
 
@@ -50,80 +49,30 @@ namespace internal_b8085d63d6b7e842485134abbac511e8 {
 static FUNC_PREFIX void initialpdfssettersingleprecision_initialpdfssettersingleprecision(float *RESTRICT const _data_force, float *RESTRICT _data_pdfs, float *RESTRICT const _data_velocity, int64_t const _size_force_0, int64_t const _size_force_1, int64_t const _size_force_2, int64_t const _stride_force_0, int64_t const _stride_force_1, int64_t const _stride_force_2, int64_t const _stride_force_3, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_velocity_0, int64_t const _stride_velocity_1, int64_t const _stride_velocity_2, int64_t const _stride_velocity_3, float rho_0) {
   const float rho = rho_0;
   for (int64_t ctr_2 = 0; ctr_2 < _size_force_2; ctr_2 += 1) {
-    float *RESTRICT _data_force_20_30 = _data_force + _stride_force_2 * ctr_2;
-    float *RESTRICT _data_velocity_20_30 = _data_velocity + _stride_velocity_2 * ctr_2;
-    float *RESTRICT _data_force_20_31 = _data_force + _stride_force_2 * ctr_2 + _stride_force_3;
-    float *RESTRICT _data_velocity_20_31 = _data_velocity + _stride_velocity_2 * ctr_2 + _stride_velocity_3;
-    float *RESTRICT _data_force_20_32 = _data_force + _stride_force_2 * ctr_2 + 2 * _stride_force_3;
-    float *RESTRICT _data_velocity_20_32 = _data_velocity + _stride_velocity_2 * ctr_2 + 2 * _stride_velocity_3;
-    float *RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2 * ctr_2;
-    float *RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_35 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_36 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_311 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_312 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_313 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_314 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_315 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_316 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_317 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_318 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3;
     for (int64_t ctr_1 = 0; ctr_1 < _size_force_1; ctr_1 += 1) {
-      float *RESTRICT _data_force_20_30_10 = _stride_force_1 * ctr_1 + _data_force_20_30;
-      float *RESTRICT _data_velocity_20_30_10 = _stride_velocity_1 * ctr_1 + _data_velocity_20_30;
-      float *RESTRICT _data_force_20_31_10 = _stride_force_1 * ctr_1 + _data_force_20_31;
-      float *RESTRICT _data_velocity_20_31_10 = _stride_velocity_1 * ctr_1 + _data_velocity_20_31;
-      float *RESTRICT _data_force_20_32_10 = _stride_force_1 * ctr_1 + _data_force_20_32;
-      float *RESTRICT _data_velocity_20_32_10 = _stride_velocity_1 * ctr_1 + _data_velocity_20_32;
-      float *RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_30;
-      float *RESTRICT _data_pdfs_20_31_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_31;
-      float *RESTRICT _data_pdfs_20_32_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_32;
-      float *RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_33;
-      float *RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_34;
-      float *RESTRICT _data_pdfs_20_35_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_35;
-      float *RESTRICT _data_pdfs_20_36_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_36;
-      float *RESTRICT _data_pdfs_20_37_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_37;
-      float *RESTRICT _data_pdfs_20_38_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_38;
-      float *RESTRICT _data_pdfs_20_39_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_39;
-      float *RESTRICT _data_pdfs_20_310_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_310;
-      float *RESTRICT _data_pdfs_20_311_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_311;
-      float *RESTRICT _data_pdfs_20_312_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_312;
-      float *RESTRICT _data_pdfs_20_313_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_313;
-      float *RESTRICT _data_pdfs_20_314_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_314;
-      float *RESTRICT _data_pdfs_20_315_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_315;
-      float *RESTRICT _data_pdfs_20_316_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_316;
-      float *RESTRICT _data_pdfs_20_317_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_317;
-      float *RESTRICT _data_pdfs_20_318_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_318;
       for (int64_t ctr_0 = 0; ctr_0 < _size_force_0; ctr_0 += 1) {
-        const float u_0 = -0.5f * ((1.0f) / (rho)) * _data_force_20_30_10[_stride_force_0 * ctr_0] + _data_velocity_20_30_10[_stride_velocity_0 * ctr_0];
-        const float u_1 = -0.5f * ((1.0f) / (rho)) * _data_force_20_31_10[_stride_force_0 * ctr_0] + _data_velocity_20_31_10[_stride_velocity_0 * ctr_0];
-        const float u_2 = -0.5f * ((1.0f) / (rho)) * _data_force_20_32_10[_stride_force_0 * ctr_0] + _data_velocity_20_32_10[_stride_velocity_0 * ctr_0];
-        _data_pdfs_20_30_10[_stride_pdfs_0 * ctr_0] = rho * -0.33333333333333331f * (u_0 * u_0) + rho * -0.33333333333333331f * (u_1 * u_1) + rho * -0.33333333333333331f * (u_2 * u_2) + rho * 0.33333333333333331f;
-        _data_pdfs_20_31_10[_stride_pdfs_0 * ctr_0] = rho * u_1 * 0.16666666666666666f + rho * -0.16666666666666666f * (u_0 * u_0) + rho * -0.16666666666666666f * (u_2 * u_2) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u_1 * u_1);
-        _data_pdfs_20_32_10[_stride_pdfs_0 * ctr_0] = rho * u_1 * -0.16666666666666666f + rho * -0.16666666666666666f * (u_0 * u_0) + rho * -0.16666666666666666f * (u_2 * u_2) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u_1 * u_1);
-        _data_pdfs_20_33_10[_stride_pdfs_0 * ctr_0] = rho * u_0 * -0.16666666666666666f + rho * -0.16666666666666666f * (u_1 * u_1) + rho * -0.16666666666666666f * (u_2 * u_2) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u_0 * u_0);
-        _data_pdfs_20_34_10[_stride_pdfs_0 * ctr_0] = rho * u_0 * 0.16666666666666666f + rho * -0.16666666666666666f * (u_1 * u_1) + rho * -0.16666666666666666f * (u_2 * u_2) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u_0 * u_0);
-        _data_pdfs_20_35_10[_stride_pdfs_0 * ctr_0] = rho * u_2 * 0.16666666666666666f + rho * -0.16666666666666666f * (u_0 * u_0) + rho * -0.16666666666666666f * (u_1 * u_1) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u_2 * u_2);
-        _data_pdfs_20_36_10[_stride_pdfs_0 * ctr_0] = rho * u_2 * -0.16666666666666666f + rho * -0.16666666666666666f * (u_0 * u_0) + rho * -0.16666666666666666f * (u_1 * u_1) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u_2 * u_2);
-        _data_pdfs_20_37_10[_stride_pdfs_0 * ctr_0] = rho * u_0 * u_1 * -0.25f + rho * u_0 * -0.083333333333333329f + rho * u_1 * 0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_0 * u_0) + rho * 0.083333333333333329f * (u_1 * u_1);
-        _data_pdfs_20_38_10[_stride_pdfs_0 * ctr_0] = rho * u_0 * u_1 * 0.25f + rho * u_0 * 0.083333333333333329f + rho * u_1 * 0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_0 * u_0) + rho * 0.083333333333333329f * (u_1 * u_1);
-        _data_pdfs_20_39_10[_stride_pdfs_0 * ctr_0] = rho * u_0 * u_1 * 0.25f + rho * u_0 * -0.083333333333333329f + rho * u_1 * -0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_0 * u_0) + rho * 0.083333333333333329f * (u_1 * u_1);
-        _data_pdfs_20_310_10[_stride_pdfs_0 * ctr_0] = rho * u_0 * u_1 * -0.25f + rho * u_0 * 0.083333333333333329f + rho * u_1 * -0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_0 * u_0) + rho * 0.083333333333333329f * (u_1 * u_1);
-        _data_pdfs_20_311_10[_stride_pdfs_0 * ctr_0] = rho * u_1 * u_2 * 0.25f + rho * u_1 * 0.083333333333333329f + rho * u_2 * 0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_1 * u_1) + rho * 0.083333333333333329f * (u_2 * u_2);
-        _data_pdfs_20_312_10[_stride_pdfs_0 * ctr_0] = rho * u_1 * u_2 * -0.25f + rho * u_1 * -0.083333333333333329f + rho * u_2 * 0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_1 * u_1) + rho * 0.083333333333333329f * (u_2 * u_2);
-        _data_pdfs_20_313_10[_stride_pdfs_0 * ctr_0] = rho * u_0 * u_2 * -0.25f + rho * u_0 * -0.083333333333333329f + rho * u_2 * 0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_0 * u_0) + rho * 0.083333333333333329f * (u_2 * u_2);
-        _data_pdfs_20_314_10[_stride_pdfs_0 * ctr_0] = rho * u_0 * u_2 * 0.25f + rho * u_0 * 0.083333333333333329f + rho * u_2 * 0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_0 * u_0) + rho * 0.083333333333333329f * (u_2 * u_2);
-        _data_pdfs_20_315_10[_stride_pdfs_0 * ctr_0] = rho * u_1 * u_2 * -0.25f + rho * u_1 * 0.083333333333333329f + rho * u_2 * -0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_1 * u_1) + rho * 0.083333333333333329f * (u_2 * u_2);
-        _data_pdfs_20_316_10[_stride_pdfs_0 * ctr_0] = rho * u_1 * u_2 * 0.25f + rho * u_1 * -0.083333333333333329f + rho * u_2 * -0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_1 * u_1) + rho * 0.083333333333333329f * (u_2 * u_2);
-        _data_pdfs_20_317_10[_stride_pdfs_0 * ctr_0] = rho * u_0 * u_2 * 0.25f + rho * u_0 * -0.083333333333333329f + rho * u_2 * -0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_0 * u_0) + rho * 0.083333333333333329f * (u_2 * u_2);
-        _data_pdfs_20_318_10[_stride_pdfs_0 * ctr_0] = rho * u_0 * u_2 * -0.25f + rho * u_0 * 0.083333333333333329f + rho * u_2 * -0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_0 * u_0) + rho * 0.083333333333333329f * (u_2 * u_2);
+        const float u_0 = -0.5f * ((1.0f) / (rho)) * _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2] + _data_velocity[_stride_velocity_0 * ctr_0 + _stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2];
+        const float u_1 = -0.5f * ((1.0f) / (rho)) * _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3] + _data_velocity[_stride_velocity_0 * ctr_0 + _stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + _stride_velocity_3];
+        const float u_2 = -0.5f * ((1.0f) / (rho)) * _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3] + _data_velocity[_stride_velocity_0 * ctr_0 + _stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + 2 * _stride_velocity_3];
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2] = rho * -0.33333333333333331f * (u_0 * u_0) + rho * -0.33333333333333331f * (u_1 * u_1) + rho * -0.33333333333333331f * (u_2 * u_2) + rho * 0.33333333333333331f;
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3] = rho * u_1 * 0.16666666666666666f + rho * -0.16666666666666666f * (u_0 * u_0) + rho * -0.16666666666666666f * (u_2 * u_2) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u_1 * u_1);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3] = rho * u_1 * -0.16666666666666666f + rho * -0.16666666666666666f * (u_0 * u_0) + rho * -0.16666666666666666f * (u_2 * u_2) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u_1 * u_1);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3] = rho * u_0 * -0.16666666666666666f + rho * -0.16666666666666666f * (u_1 * u_1) + rho * -0.16666666666666666f * (u_2 * u_2) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u_0 * u_0);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3] = rho * u_0 * 0.16666666666666666f + rho * -0.16666666666666666f * (u_1 * u_1) + rho * -0.16666666666666666f * (u_2 * u_2) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u_0 * u_0);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3] = rho * u_2 * 0.16666666666666666f + rho * -0.16666666666666666f * (u_0 * u_0) + rho * -0.16666666666666666f * (u_1 * u_1) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u_2 * u_2);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3] = rho * u_2 * -0.16666666666666666f + rho * -0.16666666666666666f * (u_0 * u_0) + rho * -0.16666666666666666f * (u_1 * u_1) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u_2 * u_2);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3] = rho * u_0 * u_1 * -0.25f + rho * u_0 * -0.083333333333333329f + rho * u_1 * 0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_0 * u_0) + rho * 0.083333333333333329f * (u_1 * u_1);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3] = rho * u_0 * u_1 * 0.25f + rho * u_0 * 0.083333333333333329f + rho * u_1 * 0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_0 * u_0) + rho * 0.083333333333333329f * (u_1 * u_1);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3] = rho * u_0 * u_1 * 0.25f + rho * u_0 * -0.083333333333333329f + rho * u_1 * -0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_0 * u_0) + rho * 0.083333333333333329f * (u_1 * u_1);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3] = rho * u_0 * u_1 * -0.25f + rho * u_0 * 0.083333333333333329f + rho * u_1 * -0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_0 * u_0) + rho * 0.083333333333333329f * (u_1 * u_1);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3] = rho * u_1 * u_2 * 0.25f + rho * u_1 * 0.083333333333333329f + rho * u_2 * 0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_1 * u_1) + rho * 0.083333333333333329f * (u_2 * u_2);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3] = rho * u_1 * u_2 * -0.25f + rho * u_1 * -0.083333333333333329f + rho * u_2 * 0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_1 * u_1) + rho * 0.083333333333333329f * (u_2 * u_2);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3] = rho * u_0 * u_2 * -0.25f + rho * u_0 * -0.083333333333333329f + rho * u_2 * 0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_0 * u_0) + rho * 0.083333333333333329f * (u_2 * u_2);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3] = rho * u_0 * u_2 * 0.25f + rho * u_0 * 0.083333333333333329f + rho * u_2 * 0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_0 * u_0) + rho * 0.083333333333333329f * (u_2 * u_2);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3] = rho * u_1 * u_2 * -0.25f + rho * u_1 * 0.083333333333333329f + rho * u_2 * -0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_1 * u_1) + rho * 0.083333333333333329f * (u_2 * u_2);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3] = rho * u_1 * u_2 * 0.25f + rho * u_1 * -0.083333333333333329f + rho * u_2 * -0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_1 * u_1) + rho * 0.083333333333333329f * (u_2 * u_2);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3] = rho * u_0 * u_2 * 0.25f + rho * u_0 * -0.083333333333333329f + rho * u_2 * -0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_0 * u_0) + rho * 0.083333333333333329f * (u_2 * u_2);
+        _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3] = rho * u_0 * u_2 * -0.25f + rho * u_0 * 0.083333333333333329f + rho * u_2 * -0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_0 * u_0) + rho * 0.083333333333333329f * (u_2 * u_2);
       }
     }
   }
@@ -131,29 +80,30 @@ static FUNC_PREFIX void initialpdfssettersingleprecision_initialpdfssettersingle
 } // namespace internal_b8085d63d6b7e842485134abbac511e8
 
 void InitialPDFsSetterSinglePrecision::run(IBlock *block) {
+
+  auto velocity = block->getData<field::GhostLayerField<float, 3>>(velocityID);
   auto pdfs = block->getData<field::GhostLayerField<float, 19>>(pdfsID);
   auto force = block->getData<field::GhostLayerField<float, 3>>(forceID);
-  auto velocity = block->getData<field::GhostLayerField<float, 3>>(velocityID);
 
   auto &rho_0 = this->rho_0_;
-  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()))
   float *RESTRICT const _data_force = force->dataAt(0, 0, 0, 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()))
   float *RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(velocity->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(velocity->nrOfGhostLayers()))
   float *RESTRICT const _data_velocity = velocity->dataAt(0, 0, 0, 0);
-  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(force->xSize()) + 0));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(force->xSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(force->ySize()) + 0));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(force->ySize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(force->zSize()) + 0));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(force->zSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
+  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(force->xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(force->xSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(force->ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(force->ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(force->zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(force->zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
   const int64_t _stride_force_0 = int64_t(force->xStride());
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
@@ -170,6 +120,7 @@ void InitialPDFsSetterSinglePrecision::run(IBlock *block) {
 }
 
 void InitialPDFsSetterSinglePrecision::runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block) {
+
   CellInterval ci = globalCellInterval;
   CellInterval blockBB = blocks->getBlockCellBB(*block);
   blockBB.expand(ghostLayers);
@@ -178,35 +129,35 @@ void InitialPDFsSetterSinglePrecision::runOnCellInterval(const shared_ptr<Struct
   if (ci.empty())
     return;
 
+  auto velocity = block->getData<field::GhostLayerField<float, 3>>(velocityID);
   auto pdfs = block->getData<field::GhostLayerField<float, 19>>(pdfsID);
   auto force = block->getData<field::GhostLayerField<float, 3>>(forceID);
-  auto velocity = block->getData<field::GhostLayerField<float, 3>>(velocityID);
 
   auto &rho_0 = this->rho_0_;
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()))
   float *RESTRICT const _data_force = force->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()))
   float *RESTRICT _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(velocity->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(velocity->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(velocity->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(velocity->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(velocity->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(velocity->nrOfGhostLayers()))
   float *RESTRICT const _data_velocity = velocity->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
-  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(ci.xSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(ci.ySize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(ci.zSize()) + 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
+  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(ci.xSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(ci.ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(ci.zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
   const int64_t _stride_force_0 = int64_t(force->xStride());
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
@@ -231,4 +182,4 @@ void InitialPDFsSetterSinglePrecision::runOnCellInterval(const shared_ptr<Struct
 
 #if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
 #pragma warning pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecision.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecision.h
index d6c4553c4e6..e1c3260bab9 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecision.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecision.h
@@ -17,12 +17,13 @@
 //! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2,
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
 // lbmpy_walberla/pystencils_walberla from waLBerla commit
-// 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #pragma once
 #include "core/DataTypes.h"
+#include "core/logging/Logging.h"
 
 #include "domain_decomposition/BlockDataID.h"
 #include "domain_decomposition/IBlock.h"
@@ -91,6 +92,9 @@ class InitialPDFsSetterSinglePrecision {
     };
   }
 
+  void configure(const shared_ptr<StructuredBlockStorage> &blocks,
+                 IBlock *block) {} // no-op: the body is empty and neither argument is used
+
   BlockDataID forceID;
   BlockDataID pdfsID;
   BlockDataID velocityID;
@@ -103,4 +107,4 @@ class InitialPDFsSetterSinglePrecision {
 #if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
     (defined WALBERLA_CXX_COMPILER_IS_CLANG)
 #pragma GCC diagnostic pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecisionCUDA.cu b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecisionCUDA.cu
new file mode 100644
index 00000000000..8054d90cd94
--- /dev/null
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecisionCUDA.cu
@@ -0,0 +1,188 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file InitialPDFsSetterSinglePrecisionCUDA.cu
+//! \\author pystencils
+//======================================================================================================================
+
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
+
+#include <cmath>
+
+#include "InitialPDFsSetterSinglePrecisionCUDA.h"
+#include "core/DataTypes.h"
+#include "core/Macros.h"
+
+#define FUNC_PREFIX __global__
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) || (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wfloat-equal"
+#pragma GCC diagnostic ignored "-Wshadow"
+#pragma GCC diagnostic ignored "-Wconversion"
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#endif
+
+#if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
+#pragma warning push
+#pragma warning(disable : 1599)
+#endif
+
+using namespace std;
+
+namespace walberla {
+namespace pystencils {
+
+namespace internal_initialpdfssettersingleprecisioncuda_initialpdfssettersingleprecisioncuda { // wraps the generated kernel that the host launchers below invoke
+static FUNC_PREFIX __launch_bounds__(256) void initialpdfssettersingleprecisioncuda_initialpdfssettersingleprecisioncuda(float *RESTRICT const _data_force, float *RESTRICT _data_pdfs, float *RESTRICT const _data_velocity, int64_t const _size_force_0, int64_t const _size_force_1, int64_t const _size_force_2, int64_t const _stride_force_0, int64_t const _stride_force_1, int64_t const _stride_force_2, int64_t const _stride_force_3, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_velocity_0, int64_t const _stride_velocity_1, int64_t const _stride_velocity_2, int64_t const _stride_velocity_3, float rho_0) {
+  if (blockDim.x * blockIdx.x + threadIdx.x < _size_force_0 && blockDim.y * blockIdx.y + threadIdx.y < _size_force_1 && blockDim.z * blockIdx.z + threadIdx.z < _size_force_2) { // bounds guard: a thread whose index lies outside any of the three extents does nothing
+    const int64_t ctr_0 = blockDim.x * blockIdx.x + threadIdx.x; // cell index along x, taken from the thread's position in the grid
+    const int64_t ctr_1 = blockDim.y * blockIdx.y + threadIdx.y; // cell index along y
+    const int64_t ctr_2 = blockDim.z * blockIdx.z + threadIdx.z; // cell index along z
+    const float rho = rho_0; // the same density value is used for every cell
+    const float u_0 = -0.5f * ((1.0f) / (rho)) * _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2] + _data_velocity[_stride_velocity_0 * ctr_0 + _stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2]; // u_0 = velocity component 0 - 0.5 * force component 0 / rho
+    const float u_1 = -0.5f * ((1.0f) / (rho)) * _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3] + _data_velocity[_stride_velocity_0 * ctr_0 + _stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + _stride_velocity_3]; // same expression for component 1 (one f-stride further)
+    const float u_2 = -0.5f * ((1.0f) / (rho)) * _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3] + _data_velocity[_stride_velocity_0 * ctr_0 + _stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + 2 * _stride_velocity_3]; // same expression for component 2 (two f-strides further)
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2] = rho * -0.33333333333333331f * (u_0 * u_0) + rho * -0.33333333333333331f * (u_1 * u_1) + rho * -0.33333333333333331f * (u_2 * u_2) + rho * 0.33333333333333331f; // first of 19 stores for this cell, at f-offsets 0..18 (multiples of _stride_pdfs_3); each is a polynomial in rho, u_0, u_1, u_2
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3] = rho * u_1 * 0.16666666666666666f + rho * -0.16666666666666666f * (u_0 * u_0) + rho * -0.16666666666666666f * (u_2 * u_2) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u_1 * u_1);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3] = rho * u_1 * -0.16666666666666666f + rho * -0.16666666666666666f * (u_0 * u_0) + rho * -0.16666666666666666f * (u_2 * u_2) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u_1 * u_1);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3] = rho * u_0 * -0.16666666666666666f + rho * -0.16666666666666666f * (u_1 * u_1) + rho * -0.16666666666666666f * (u_2 * u_2) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u_0 * u_0);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3] = rho * u_0 * 0.16666666666666666f + rho * -0.16666666666666666f * (u_1 * u_1) + rho * -0.16666666666666666f * (u_2 * u_2) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u_0 * u_0);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3] = rho * u_2 * 0.16666666666666666f + rho * -0.16666666666666666f * (u_0 * u_0) + rho * -0.16666666666666666f * (u_1 * u_1) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u_2 * u_2);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3] = rho * u_2 * -0.16666666666666666f + rho * -0.16666666666666666f * (u_0 * u_0) + rho * -0.16666666666666666f * (u_1 * u_1) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u_2 * u_2);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3] = rho * u_0 * u_1 * -0.25f + rho * u_0 * -0.083333333333333329f + rho * u_1 * 0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_0 * u_0) + rho * 0.083333333333333329f * (u_1 * u_1);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3] = rho * u_0 * u_1 * 0.25f + rho * u_0 * 0.083333333333333329f + rho * u_1 * 0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_0 * u_0) + rho * 0.083333333333333329f * (u_1 * u_1);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3] = rho * u_0 * u_1 * 0.25f + rho * u_0 * -0.083333333333333329f + rho * u_1 * -0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_0 * u_0) + rho * 0.083333333333333329f * (u_1 * u_1);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3] = rho * u_0 * u_1 * -0.25f + rho * u_0 * 0.083333333333333329f + rho * u_1 * -0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_0 * u_0) + rho * 0.083333333333333329f * (u_1 * u_1);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3] = rho * u_1 * u_2 * 0.25f + rho * u_1 * 0.083333333333333329f + rho * u_2 * 0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_1 * u_1) + rho * 0.083333333333333329f * (u_2 * u_2);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3] = rho * u_1 * u_2 * -0.25f + rho * u_1 * -0.083333333333333329f + rho * u_2 * 0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_1 * u_1) + rho * 0.083333333333333329f * (u_2 * u_2);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3] = rho * u_0 * u_2 * -0.25f + rho * u_0 * -0.083333333333333329f + rho * u_2 * 0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_0 * u_0) + rho * 0.083333333333333329f * (u_2 * u_2);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3] = rho * u_0 * u_2 * 0.25f + rho * u_0 * 0.083333333333333329f + rho * u_2 * 0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_0 * u_0) + rho * 0.083333333333333329f * (u_2 * u_2);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3] = rho * u_1 * u_2 * -0.25f + rho * u_1 * 0.083333333333333329f + rho * u_2 * -0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_1 * u_1) + rho * 0.083333333333333329f * (u_2 * u_2);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3] = rho * u_1 * u_2 * 0.25f + rho * u_1 * -0.083333333333333329f + rho * u_2 * -0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_1 * u_1) + rho * 0.083333333333333329f * (u_2 * u_2);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3] = rho * u_0 * u_2 * 0.25f + rho * u_0 * -0.083333333333333329f + rho * u_2 * -0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_0 * u_0) + rho * 0.083333333333333329f * (u_2 * u_2);
+    _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3] = rho * u_0 * u_2 * -0.25f + rho * u_0 * 0.083333333333333329f + rho * u_2 * -0.083333333333333329f + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u_0 * u_0) + rho * 0.083333333333333329f * (u_2 * u_2);
+  }
+}
+} // namespace internal_initialpdfssettersingleprecisioncuda_initialpdfssettersingleprecisioncuda
+
+void InitialPDFsSetterSinglePrecisionCUDA::run(IBlock *block, gpuStream_t stream) { // launches the generated kernel on `stream` over force->xSize() x ySize() x zSize() cells of `block`, starting at cell (0, 0, 0)
+
+  auto velocity = block->getData<gpu::GPUField<float>>(velocityID); // the three GPU fields are looked up on the block via the IDs stored in this object
+  auto pdfs = block->getData<gpu::GPUField<float>>(pdfsID);
+  auto force = block->getData<gpu::GPUField<float>>(forceID);
+
+  auto &rho_0 = this->rho_0_; // forwarded by value as the kernel's last argument
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()))
+  float *RESTRICT const _data_force = force->dataAt(0, 0, 0, 0); // base pointer at cell (0, 0, 0), f = 0; the kernel indexes relative to it
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()))
+  float *RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0); // non-const pointee: pdfs is the only field the kernel writes
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(velocity->nrOfGhostLayers()))
+  float *RESTRICT const _data_velocity = velocity->dataAt(0, 0, 0, 0);
+  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(force->xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(force->xSize()) + 0); // iteration extents are taken from the force field only
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(force->ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(force->ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(force->zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(force->zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  const int64_t _stride_force_0 = int64_t(force->xStride()); // per-field strides for x, y, z (suffix 0..2) and f (suffix 3)
+  const int64_t _stride_force_1 = int64_t(force->yStride());
+  const int64_t _stride_force_2 = int64_t(force->zStride());
+  const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
+  const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
+  const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
+  const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
+  const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
+  const int64_t _stride_velocity_0 = int64_t(velocity->xStride());
+  const int64_t _stride_velocity_1 = int64_t(velocity->yStride());
+  const int64_t _stride_velocity_2 = int64_t(velocity->zStride());
+  const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride()));
+  dim3 _block(uint32_c(((128 < _size_force_0) ? 128 : _size_force_0)), uint32_c(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))), uint32_c(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))))));
+  dim3 _grid(uint32_c(((_size_force_0) % (((128 < _size_force_0) ? 128 : _size_force_0)) == 0 ? (int64_t)(_size_force_0) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)) : ((int64_t)(_size_force_0) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))) + 1)), uint32_c(((_size_force_1) % (((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))) == 0 ? (int64_t)(_size_force_1) / (int64_t)(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))) : ((int64_t)(_size_force_1) / (int64_t)(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) + 1)), uint32_c(((_size_force_2) % (((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? 
_size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))))) == 0 ? (int64_t)(_size_force_2) / (int64_t)(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 
128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))))) : ((int64_t)(_size_force_2) / (int64_t)(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))))) + 1)));
+  internal_initialpdfssettersingleprecisioncuda_initialpdfssettersingleprecisioncuda::initialpdfssettersingleprecisioncuda_initialpdfssettersingleprecisioncuda<<<_grid, _block, 0, stream>>>(_data_force, _data_pdfs, _data_velocity, _size_force_0, _size_force_1, _size_force_2, _stride_force_0, _stride_force_1, _stride_force_2, _stride_force_3, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_velocity_0, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3, rho_0);
+}
+
+void InitialPDFsSetterSinglePrecisionCUDA::runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block, gpuStream_t stream) { // launches the generated kernel on `stream` over the part of globalCellInterval that overlaps `block` (including up to ghostLayers ghost cells)
+
+  CellInterval ci = globalCellInterval; // work on a copy; the caller's interval is const
+  CellInterval blockBB = blocks->getBlockCellBB(*block);
+  blockBB.expand(ghostLayers); // grow the block's bounding box so the interval may reach into ghost cells
+  ci.intersect(blockBB);
+  blocks->transformGlobalToBlockLocalCellInterval(ci, *block); // from here on ci is used with block-local field accessors
+  if (ci.empty()) // nothing of the requested interval overlaps this block: no kernel launch
+    return;
+
+  auto velocity = block->getData<gpu::GPUField<float>>(velocityID); // the three GPU fields are looked up on the block via the IDs stored in this object
+  auto pdfs = block->getData<gpu::GPUField<float>>(pdfsID);
+  auto force = block->getData<gpu::GPUField<float>>(forceID);
+
+  auto &rho_0 = this->rho_0_; // forwarded by value as the kernel's last argument
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()))
+  float *RESTRICT const _data_force = force->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); // base pointer at the interval's minimum corner, f = 0; the kernel indexes relative to it
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()))
+  float *RESTRICT _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); // non-const pointee: pdfs is the only field the kernel writes
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(velocity->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(velocity->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(velocity->nrOfGhostLayers()))
+  float *RESTRICT const _data_velocity = velocity->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
+  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
+  const int64_t _size_force_0 = int64_t(int64_c(ci.xSize()) + 0); // iteration extents come from the clipped interval, not from the field size
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
+  const int64_t _size_force_1 = int64_t(int64_c(ci.ySize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
+  const int64_t _size_force_2 = int64_t(int64_c(ci.zSize()) + 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  const int64_t _stride_force_0 = int64_t(force->xStride()); // per-field strides for x, y, z (suffix 0..2) and f (suffix 3)
+  const int64_t _stride_force_1 = int64_t(force->yStride());
+  const int64_t _stride_force_2 = int64_t(force->zStride());
+  const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
+  const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
+  const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
+  const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
+  const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
+  const int64_t _stride_velocity_0 = int64_t(velocity->xStride());
+  const int64_t _stride_velocity_1 = int64_t(velocity->yStride());
+  const int64_t _stride_velocity_2 = int64_t(velocity->zStride());
+  const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride()));
+  dim3 _block(uint32_c(((128 < _size_force_0) ? 128 : _size_force_0)), uint32_c(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))), uint32_c(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))))));
+  dim3 _grid(uint32_c(((_size_force_0) % (((128 < _size_force_0) ? 128 : _size_force_0)) == 0 ? (int64_t)(_size_force_0) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)) : ((int64_t)(_size_force_0) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))) + 1)), uint32_c(((_size_force_1) % (((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))) == 0 ? (int64_t)(_size_force_1) / (int64_t)(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))) : ((int64_t)(_size_force_1) / (int64_t)(((1024 < ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))) ? 1024 : ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) + 1)), uint32_c(((_size_force_2) % (((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? 
_size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))))) == 0 ? (int64_t)(_size_force_2) / (int64_t)(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 
128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))))) : ((int64_t)(_size_force_2) / (int64_t)(((64 < ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))) ? 64 : ((_size_force_2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0))))))) ? _size_force_2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0) * ((_size_force_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))) ? _size_force_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0) ? 128 : _size_force_0)))))))))) + 1)));
+  internal_initialpdfssettersingleprecisioncuda_initialpdfssettersingleprecisioncuda::initialpdfssettersingleprecisioncuda_initialpdfssettersingleprecisioncuda<<<_grid, _block, 0, stream>>>(_data_force, _data_pdfs, _data_velocity, _size_force_0, _size_force_1, _size_force_2, _stride_force_0, _stride_force_1, _stride_force_2, _stride_force_3, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_velocity_0, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3, rho_0);
+}
+
+} // namespace pystencils
+} // namespace walberla
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) || (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic pop
+#endif
+
+#if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
+#pragma warning pop
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecisionCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecisionCUDA.h
new file mode 100644
index 00000000000..a3faac64d68
--- /dev/null
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/InitialPDFsSetterSinglePrecisionCUDA.h
@@ -0,0 +1,120 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file InitialPDFsSetterSinglePrecisionCUDA.h
+//! \\author pystencils
+//======================================================================================================================
+
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
+// lbmpy_walberla/pystencils_walberla from waLBerla commit
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
+
+#pragma once
+#include "core/DataTypes.h"
+#include "core/logging/Logging.h"
+
+#include "gpu/GPUField.h"
+#include "gpu/GPUWrapper.h"
+
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+#include "domain_decomposition/StructuredBlockStorage.h"
+#include "field/SwapableCompare.h"
+#include <set>
+
+#ifdef __GNUC__
+#define RESTRICT __restrict__
+#elif _MSC_VER
+#define RESTRICT __restrict
+#else
+#define RESTRICT
+#endif
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
+    (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wreorder"
+#endif
+
+namespace walberla {
+namespace pystencils {
+
+class InitialPDFsSetterSinglePrecisionCUDA { // per-block GPU sweep wrapper
+public: // The constructor only stores the three field IDs and rho_0.
+  InitialPDFsSetterSinglePrecisionCUDA(BlockDataID forceID_,
+                                       BlockDataID pdfsID_,
+                                       BlockDataID velocityID_, float rho_0)
+      : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_),
+        rho_0_(rho_0){};
+  // Declared only; defined outside this header. `stream` defaults to nullptr.
+  void run(IBlock *block, gpuStream_t stream = nullptr);
+  // Declared only; takes a global cell interval and a ghost-layer count.
+  void runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks,
+                         const CellInterval &globalCellInterval,
+                         cell_idx_t ghostLayers, IBlock *block,
+                         gpuStream_t stream = nullptr);
+  // Call operator: forwards both arguments to run().
+  void operator()(IBlock *block, gpuStream_t stream = nullptr) {
+    run(block, stream);
+  }
+  // Returns a functor calling kernel->run(b); the shared_ptr is copied in.
+  static std::function<void(IBlock *)>
+  getSweep(const shared_ptr<InitialPDFsSetterSinglePrecisionCUDA> &kernel) {
+    return [kernel](IBlock *b) { kernel->run(b); };
+  }
+  // Returns a (block, stream) functor calling kernel->runOnCellInterval().
+  static std::function<void(IBlock *, gpuStream_t)> getSweepOnCellInterval(
+      const shared_ptr<InitialPDFsSetterSinglePrecisionCUDA> &kernel,
+      const shared_ptr<StructuredBlockStorage> &blocks,
+      const CellInterval &globalCellInterval, cell_idx_t ghostLayers = 1) {
+    return [kernel, blocks, globalCellInterval,
+            ghostLayers](IBlock *b, gpuStream_t stream = nullptr) {
+      kernel->runOnCellInterval(blocks, globalCellInterval, ghostLayers, b,
+                                stream);
+    };
+  }
+  // Functor bound to `this` and `stream`; *this must outlive the functor.
+  std::function<void(IBlock *)> getSweep(gpuStream_t stream = nullptr) {
+    return [this, stream](IBlock *b) { this->run(b, stream); };
+  }
+  // Cell-interval functor bound to `this`; *this must outlive the functor.
+  std::function<void(IBlock *)>
+  getSweepOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks,
+                         const CellInterval &globalCellInterval,
+                         cell_idx_t ghostLayers = 1,
+                         gpuStream_t stream = nullptr) {
+    return [this, blocks, globalCellInterval, ghostLayers, stream](IBlock *b) {
+      this->runOnCellInterval(blocks, globalCellInterval, ghostLayers, b,
+                              stream);
+    };
+  }
+  // No-op: the body is empty and both arguments are unused.
+  void configure(const shared_ptr<StructuredBlockStorage> &blocks,
+                 IBlock *block) {}
+  // Public state: the IDs passed to the constructor and the stored rho_0.
+  BlockDataID forceID;
+  BlockDataID pdfsID;
+  BlockDataID velocityID;
+  float rho_0_;
+};
+
+} // namespace pystencils
+} // namespace walberla
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
+    (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic pop
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecision.cpp b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecision.cpp
index 65bbfd9f8d7..9f6a75e72ce 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecision.cpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecision.cpp
@@ -14,11 +14,10 @@
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
 //! \\file StreamSweepDoublePrecision.cpp
-//! \\ingroup lbm
-//! \\author lbmpy
+//! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2, lbmpy_walberla/pystencils_walberla from waLBerla commit 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #include <cmath>
 
@@ -49,147 +48,59 @@ namespace pystencils {
 namespace internal_streamsweepdoubleprecision_streamsweepdoubleprecision {
 static FUNC_PREFIX void streamsweepdoubleprecision_streamsweepdoubleprecision(double *RESTRICT const _data_force, double *RESTRICT const _data_pdfs, double *RESTRICT _data_pdfs_tmp, double *RESTRICT _data_velocity, int64_t const _size_force_0, int64_t const _size_force_1, int64_t const _size_force_2, int64_t const _stride_force_0, int64_t const _stride_force_1, int64_t const _stride_force_2, int64_t const _stride_force_3, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_pdfs_tmp_0, int64_t const _stride_pdfs_tmp_1, int64_t const _stride_pdfs_tmp_2, int64_t const _stride_pdfs_tmp_3, int64_t const _stride_velocity_0, int64_t const _stride_velocity_1, int64_t const _stride_velocity_2, int64_t const _stride_velocity_3) {
   for (int64_t ctr_2 = 1; ctr_2 < _size_force_2 - 1; ctr_2 += 1) {
-    double *RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2 * ctr_2;
-    double *RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_2m1_35 = _data_pdfs + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 5 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_21_36 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 6 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_2m1_311 = _data_pdfs + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 11 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_2m1_312 = _data_pdfs + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 12 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_2m1_313 = _data_pdfs + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 13 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_2m1_314 = _data_pdfs + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 14 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_21_315 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 15 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_21_316 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 16 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_21_317 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 17 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_21_318 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 18 * _stride_pdfs_3;
-    double *RESTRICT _data_force_20_30 = _data_force + _stride_force_2 * ctr_2;
-    double *RESTRICT _data_force_20_31 = _data_force + _stride_force_2 * ctr_2 + _stride_force_3;
-    double *RESTRICT _data_force_20_32 = _data_force + _stride_force_2 * ctr_2 + 2 * _stride_force_3;
-    double *RESTRICT _data_velocity_20_30 = _data_velocity + _stride_velocity_2 * ctr_2;
-    double *RESTRICT _data_velocity_20_31 = _data_velocity + _stride_velocity_2 * ctr_2 + _stride_velocity_3;
-    double *RESTRICT _data_velocity_20_32 = _data_velocity + _stride_velocity_2 * ctr_2 + 2 * _stride_velocity_3;
-    double *RESTRICT _data_pdfs_tmp_20_30 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2;
-    double *RESTRICT _data_pdfs_tmp_20_31 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_32 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 2 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_33 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 3 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_34 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 4 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_35 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 5 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_36 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 6 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_37 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 7 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_38 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 8 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_39 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 9 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_310 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 10 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_311 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 11 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_312 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 12 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_313 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 13 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_314 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 14 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_315 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 15 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_316 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 16 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_317 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 17 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_318 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 18 * _stride_pdfs_tmp_3;
     for (int64_t ctr_1 = 1; ctr_1 < _size_force_1 - 1; ctr_1 += 1) {
-      double *RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_30;
-      double *RESTRICT _data_pdfs_20_31_1m1 = _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _data_pdfs_20_31;
-      double *RESTRICT _data_pdfs_20_32_11 = _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _data_pdfs_20_32;
-      double *RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_33;
-      double *RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_34;
-      double *RESTRICT _data_pdfs_2m1_35_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_2m1_35;
-      double *RESTRICT _data_pdfs_21_36_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_21_36;
-      double *RESTRICT _data_pdfs_20_37_1m1 = _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _data_pdfs_20_37;
-      double *RESTRICT _data_pdfs_20_38_1m1 = _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _data_pdfs_20_38;
-      double *RESTRICT _data_pdfs_20_39_11 = _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _data_pdfs_20_39;
-      double *RESTRICT _data_pdfs_20_310_11 = _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _data_pdfs_20_310;
-      double *RESTRICT _data_pdfs_2m1_311_1m1 = _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_311;
-      double *RESTRICT _data_pdfs_2m1_312_11 = _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_312;
-      double *RESTRICT _data_pdfs_2m1_313_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_2m1_313;
-      double *RESTRICT _data_pdfs_2m1_314_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_2m1_314;
-      double *RESTRICT _data_pdfs_21_315_1m1 = _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _data_pdfs_21_315;
-      double *RESTRICT _data_pdfs_21_316_11 = _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _data_pdfs_21_316;
-      double *RESTRICT _data_pdfs_21_317_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_21_317;
-      double *RESTRICT _data_pdfs_21_318_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_21_318;
-      double *RESTRICT _data_force_20_30_10 = _stride_force_1 * ctr_1 + _data_force_20_30;
-      double *RESTRICT _data_force_20_31_10 = _stride_force_1 * ctr_1 + _data_force_20_31;
-      double *RESTRICT _data_force_20_32_10 = _stride_force_1 * ctr_1 + _data_force_20_32;
-      double *RESTRICT _data_velocity_20_30_10 = _stride_velocity_1 * ctr_1 + _data_velocity_20_30;
-      double *RESTRICT _data_velocity_20_31_10 = _stride_velocity_1 * ctr_1 + _data_velocity_20_31;
-      double *RESTRICT _data_velocity_20_32_10 = _stride_velocity_1 * ctr_1 + _data_velocity_20_32;
-      double *RESTRICT _data_pdfs_tmp_20_30_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_30;
-      double *RESTRICT _data_pdfs_tmp_20_31_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_31;
-      double *RESTRICT _data_pdfs_tmp_20_32_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_32;
-      double *RESTRICT _data_pdfs_tmp_20_33_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_33;
-      double *RESTRICT _data_pdfs_tmp_20_34_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_34;
-      double *RESTRICT _data_pdfs_tmp_20_35_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_35;
-      double *RESTRICT _data_pdfs_tmp_20_36_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_36;
-      double *RESTRICT _data_pdfs_tmp_20_37_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_37;
-      double *RESTRICT _data_pdfs_tmp_20_38_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_38;
-      double *RESTRICT _data_pdfs_tmp_20_39_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_39;
-      double *RESTRICT _data_pdfs_tmp_20_310_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_310;
-      double *RESTRICT _data_pdfs_tmp_20_311_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_311;
-      double *RESTRICT _data_pdfs_tmp_20_312_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_312;
-      double *RESTRICT _data_pdfs_tmp_20_313_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_313;
-      double *RESTRICT _data_pdfs_tmp_20_314_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_314;
-      double *RESTRICT _data_pdfs_tmp_20_315_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_315;
-      double *RESTRICT _data_pdfs_tmp_20_316_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_316;
-      double *RESTRICT _data_pdfs_tmp_20_317_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_317;
-      double *RESTRICT _data_pdfs_tmp_20_318_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_318;
       for (int64_t ctr_0 = 1; ctr_0 < _size_force_0 - 1; ctr_0 += 1) {
-        const double streamed_0 = _data_pdfs_20_30_10[_stride_pdfs_0 * ctr_0];
-        const double streamed_1 = _data_pdfs_20_31_1m1[_stride_pdfs_0 * ctr_0];
-        const double streamed_2 = _data_pdfs_20_32_11[_stride_pdfs_0 * ctr_0];
-        const double streamed_3 = _data_pdfs_20_33_10[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0];
-        const double streamed_4 = _data_pdfs_20_34_10[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0];
-        const double streamed_5 = _data_pdfs_2m1_35_10[_stride_pdfs_0 * ctr_0];
-        const double streamed_6 = _data_pdfs_21_36_10[_stride_pdfs_0 * ctr_0];
-        const double streamed_7 = _data_pdfs_20_37_1m1[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0];
-        const double streamed_8 = _data_pdfs_20_38_1m1[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0];
-        const double streamed_9 = _data_pdfs_20_39_11[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0];
-        const double streamed_10 = _data_pdfs_20_310_11[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0];
-        const double streamed_11 = _data_pdfs_2m1_311_1m1[_stride_pdfs_0 * ctr_0];
-        const double streamed_12 = _data_pdfs_2m1_312_11[_stride_pdfs_0 * ctr_0];
-        const double streamed_13 = _data_pdfs_2m1_313_10[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0];
-        const double streamed_14 = _data_pdfs_2m1_314_10[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0];
-        const double streamed_15 = _data_pdfs_21_315_1m1[_stride_pdfs_0 * ctr_0];
-        const double streamed_16 = _data_pdfs_21_316_11[_stride_pdfs_0 * ctr_0];
-        const double streamed_17 = _data_pdfs_21_317_10[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0];
-        const double streamed_18 = _data_pdfs_21_318_10[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0];
+        const double streamed_0 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2];
+        const double streamed_1 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3];
+        const double streamed_2 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3];
+        const double streamed_3 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3];
+        const double streamed_4 = _data_pdfs[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3];
+        const double streamed_5 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 5 * _stride_pdfs_3];
+        const double streamed_6 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 6 * _stride_pdfs_3];
+        const double streamed_7 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3];
+        const double streamed_8 = _data_pdfs[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3];
+        const double streamed_9 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3];
+        const double streamed_10 = _data_pdfs[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3];
+        const double streamed_11 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 11 * _stride_pdfs_3];
+        const double streamed_12 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 12 * _stride_pdfs_3];
+        const double streamed_13 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 13 * _stride_pdfs_3];
+        const double streamed_14 = _data_pdfs[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 14 * _stride_pdfs_3];
+        const double streamed_15 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 15 * _stride_pdfs_3];
+        const double streamed_16 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 16 * _stride_pdfs_3];
+        const double streamed_17 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 17 * _stride_pdfs_3];
+        const double streamed_18 = _data_pdfs[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 18 * _stride_pdfs_3];
         const double vel0Term = streamed_10 + streamed_14 + streamed_18 + streamed_4 + streamed_8;
-        const double momdensity_0 = streamed_13 * -1.0 + streamed_17 * -1.0 + streamed_3 * -1.0 + streamed_7 * -1.0 + streamed_9 * -1.0 + vel0Term;
+        const double momdensity_0 = -streamed_13 - streamed_17 - streamed_3 - streamed_7 - streamed_9 + vel0Term;
         const double vel1Term = streamed_1 + streamed_11 + streamed_15 + streamed_7;
-        const double momdensity_1 = streamed_10 * -1.0 + streamed_12 * -1.0 + streamed_16 * -1.0 + streamed_2 * -1.0 + streamed_8 + streamed_9 * -1.0 + vel1Term;
+        const double momdensity_1 = -streamed_10 - streamed_12 - streamed_16 - streamed_2 + streamed_8 - streamed_9 + vel1Term;
         const double vel2Term = streamed_12 + streamed_13 + streamed_5;
         const double rho = streamed_0 + streamed_16 + streamed_17 + streamed_2 + streamed_3 + streamed_6 + streamed_9 + vel0Term + vel1Term + vel2Term;
-        const double momdensity_2 = streamed_11 + streamed_14 + streamed_15 * -1.0 + streamed_16 * -1.0 + streamed_17 * -1.0 + streamed_18 * -1.0 + streamed_6 * -1.0 + vel2Term;
-        const double u_0 = momdensity_0 * ((1.0) / (rho)) + 0.5 * ((1.0) / (rho)) * _data_force_20_30_10[_stride_force_0 * ctr_0];
-        const double u_1 = momdensity_1 * ((1.0) / (rho)) + 0.5 * ((1.0) / (rho)) * _data_force_20_31_10[_stride_force_0 * ctr_0];
-        const double u_2 = momdensity_2 * ((1.0) / (rho)) + 0.5 * ((1.0) / (rho)) * _data_force_20_32_10[_stride_force_0 * ctr_0];
-        _data_velocity_20_30_10[_stride_velocity_0 * ctr_0] = u_0;
-        _data_velocity_20_31_10[_stride_velocity_0 * ctr_0] = u_1;
-        _data_velocity_20_32_10[_stride_velocity_0 * ctr_0] = u_2;
-        _data_pdfs_tmp_20_30_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_0;
-        _data_pdfs_tmp_20_31_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_1;
-        _data_pdfs_tmp_20_32_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_2;
-        _data_pdfs_tmp_20_33_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_3;
-        _data_pdfs_tmp_20_34_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_4;
-        _data_pdfs_tmp_20_35_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_5;
-        _data_pdfs_tmp_20_36_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_6;
-        _data_pdfs_tmp_20_37_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_7;
-        _data_pdfs_tmp_20_38_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_8;
-        _data_pdfs_tmp_20_39_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_9;
-        _data_pdfs_tmp_20_310_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_10;
-        _data_pdfs_tmp_20_311_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_11;
-        _data_pdfs_tmp_20_312_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_12;
-        _data_pdfs_tmp_20_313_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_13;
-        _data_pdfs_tmp_20_314_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_14;
-        _data_pdfs_tmp_20_315_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_15;
-        _data_pdfs_tmp_20_316_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_16;
-        _data_pdfs_tmp_20_317_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_17;
-        _data_pdfs_tmp_20_318_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_18;
+        const double momdensity_2 = streamed_11 + streamed_14 - streamed_15 - streamed_16 - streamed_17 - streamed_18 - streamed_6 + vel2Term;
+        const double u_0 = momdensity_0 * ((1.0) / (rho)) + 0.5 * ((1.0) / (rho)) * _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2];
+        const double u_1 = momdensity_1 * ((1.0) / (rho)) + 0.5 * ((1.0) / (rho)) * _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3];
+        const double u_2 = momdensity_2 * ((1.0) / (rho)) + 0.5 * ((1.0) / (rho)) * _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3];
+        _data_velocity[_stride_velocity_0 * ctr_0 + _stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2] = u_0;
+        _data_velocity[_stride_velocity_0 * ctr_0 + _stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + _stride_velocity_3] = u_1;
+        _data_velocity[_stride_velocity_0 * ctr_0 + _stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + 2 * _stride_velocity_3] = u_2;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2] = streamed_0;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + _stride_pdfs_tmp_3] = streamed_1;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 2 * _stride_pdfs_tmp_3] = streamed_2;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 3 * _stride_pdfs_tmp_3] = streamed_3;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 4 * _stride_pdfs_tmp_3] = streamed_4;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 5 * _stride_pdfs_tmp_3] = streamed_5;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 6 * _stride_pdfs_tmp_3] = streamed_6;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 7 * _stride_pdfs_tmp_3] = streamed_7;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 8 * _stride_pdfs_tmp_3] = streamed_8;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 9 * _stride_pdfs_tmp_3] = streamed_9;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 10 * _stride_pdfs_tmp_3] = streamed_10;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 11 * _stride_pdfs_tmp_3] = streamed_11;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 12 * _stride_pdfs_tmp_3] = streamed_12;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 13 * _stride_pdfs_tmp_3] = streamed_13;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 14 * _stride_pdfs_tmp_3] = streamed_14;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 15 * _stride_pdfs_tmp_3] = streamed_15;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 16 * _stride_pdfs_tmp_3] = streamed_16;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 17 * _stride_pdfs_tmp_3] = streamed_17;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 18 * _stride_pdfs_tmp_3] = streamed_18;
       }
     }
   }
@@ -197,6 +108,7 @@ static FUNC_PREFIX void streamsweepdoubleprecision_streamsweepdoubleprecision(do
 } // namespace internal_streamsweepdoubleprecision_streamsweepdoubleprecision
 
 void StreamSweepDoublePrecision::run(IBlock *block) {
+
   auto force = block->getData<field::GhostLayerField<double, 3>>(forceID);
   auto pdfs = block->getData<field::GhostLayerField<double, 19>>(pdfsID);
   auto velocity = block->getData<field::GhostLayerField<double, 3>>(velocityID);
@@ -212,27 +124,27 @@ void StreamSweepDoublePrecision::run(IBlock *block) {
     }
   }
 
-  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(force->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(force->nrOfGhostLayers()))
   double *RESTRICT const _data_force = force->dataAt(-1, -1, -1, 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs->nrOfGhostLayers()))
   double *RESTRICT const _data_pdfs = pdfs->dataAt(-1, -1, -1, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs_tmp->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs_tmp->nrOfGhostLayers()))
   double *RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(-1, -1, -1, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs_tmp->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(velocity->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(pdfs_tmp->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(velocity->nrOfGhostLayers()))
   double *RESTRICT _data_velocity = velocity->dataAt(-1, -1, -1, 0);
-  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(force->xSize()) + 2));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(force->xSize()) + 2);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(force->ySize()) + 2));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(force->ySize()) + 2);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(force->zSize()) + 2));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(force->zSize()) + 2);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
+  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(force->xSize()) + 2))
+  const int64_t _size_force_0 = int64_t(int64_c(force->xSize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(force->ySize()) + 2))
+  const int64_t _size_force_1 = int64_t(int64_c(force->ySize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(force->zSize()) + 2))
+  const int64_t _size_force_2 = int64_t(int64_c(force->zSize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
   const int64_t _stride_force_0 = int64_t(force->xStride());
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
@@ -254,6 +166,7 @@ void StreamSweepDoublePrecision::run(IBlock *block) {
 }
 
 void StreamSweepDoublePrecision::runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block) {
+
   CellInterval ci = globalCellInterval;
   CellInterval blockBB = blocks->getBlockCellBB(*block);
   blockBB.expand(ghostLayers);
@@ -277,35 +190,35 @@ void StreamSweepDoublePrecision::runOnCellInterval(const shared_ptr<StructuredBl
     }
   }
 
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(force->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(force->nrOfGhostLayers()))
   double *RESTRICT const _data_force = force->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs->nrOfGhostLayers()))
   double *RESTRICT const _data_pdfs = pdfs->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()))
   double *RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs_tmp->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(velocity->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(velocity->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(velocity->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(pdfs_tmp->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(velocity->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(velocity->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(velocity->nrOfGhostLayers()))
   double *RESTRICT _data_velocity = velocity->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
-  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 2));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(ci.xSize()) + 2);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 2));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(ci.ySize()) + 2);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 2));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(ci.zSize()) + 2);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
+  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 2))
+  const int64_t _size_force_0 = int64_t(int64_c(ci.xSize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 2))
+  const int64_t _size_force_1 = int64_t(int64_c(ci.ySize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 2))
+  const int64_t _size_force_2 = int64_t(int64_c(ci.zSize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
   const int64_t _stride_force_0 = int64_t(force->xStride());
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
@@ -335,4 +248,4 @@ void StreamSweepDoublePrecision::runOnCellInterval(const shared_ptr<StructuredBl
 
 #if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
 #pragma warning pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecision.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecision.h
index de3d343cb2d..87fa2aa0336 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecision.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecision.h
@@ -17,12 +17,13 @@
 //! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2,
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
 // lbmpy_walberla/pystencils_walberla from waLBerla commit
-// 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #pragma once
 #include "core/DataTypes.h"
+#include "core/logging/Logging.h"
 
 #include "domain_decomposition/BlockDataID.h"
 #include "domain_decomposition/IBlock.h"
@@ -97,6 +98,9 @@ class StreamSweepDoublePrecision {
     };
   }
 
+  void configure(const shared_ptr<StructuredBlockStorage> &blocks,
+                 IBlock *block) {} // empty body: does nothing; neither argument is used
+
   BlockDataID forceID;
   BlockDataID pdfsID;
   BlockDataID velocityID;
@@ -113,4 +117,4 @@ class StreamSweepDoublePrecision {
 #if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
     (defined WALBERLA_CXX_COMPILER_IS_CLANG)
 #pragma GCC diagnostic pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionAVX.cpp b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionAVX.cpp
index 666330a7003..8b26558419e 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionAVX.cpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionAVX.cpp
@@ -14,11 +14,10 @@
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
 //! \\file StreamSweepDoublePrecisionAVX.cpp
-//! \\ingroup lbm
-//! \\author lbmpy
+//! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2, lbmpy_walberla/pystencils_walberla from waLBerla commit 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #include <cmath>
 
@@ -51,116 +50,28 @@ namespace pystencils {
 namespace internal_91e2c9bdb4c4fa8a405803890749bf98 {
 static FUNC_PREFIX void streamsweepdoubleprecisionavx_streamsweepdoubleprecisionavx(double *RESTRICT const _data_force, double *RESTRICT const _data_pdfs, double *RESTRICT _data_pdfs_tmp, double *RESTRICT _data_velocity, int64_t const _size_force_0, int64_t const _size_force_1, int64_t const _size_force_2, int64_t const _stride_force_1, int64_t const _stride_force_2, int64_t const _stride_force_3, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_pdfs_tmp_1, int64_t const _stride_pdfs_tmp_2, int64_t const _stride_pdfs_tmp_3, int64_t const _stride_velocity_1, int64_t const _stride_velocity_2, int64_t const _stride_velocity_3) {
   for (int64_t ctr_2 = 1; ctr_2 < _size_force_2 - 1; ctr_2 += 1) {
-    double *RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2 * ctr_2;
-    double *RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_2m1_35 = _data_pdfs + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 5 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_21_36 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 6 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_2m1_311 = _data_pdfs + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 11 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_2m1_312 = _data_pdfs + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 12 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_2m1_313 = _data_pdfs + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 13 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_2m1_314 = _data_pdfs + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 14 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_21_315 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 15 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_21_316 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 16 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_21_317 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 17 * _stride_pdfs_3;
-    double *RESTRICT _data_pdfs_21_318 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 18 * _stride_pdfs_3;
-    double *RESTRICT _data_force_20_30 = _data_force + _stride_force_2 * ctr_2;
-    double *RESTRICT _data_force_20_31 = _data_force + _stride_force_2 * ctr_2 + _stride_force_3;
-    double *RESTRICT _data_force_20_32 = _data_force + _stride_force_2 * ctr_2 + 2 * _stride_force_3;
-    double *RESTRICT _data_velocity_20_30 = _data_velocity + _stride_velocity_2 * ctr_2;
-    double *RESTRICT _data_velocity_20_31 = _data_velocity + _stride_velocity_2 * ctr_2 + _stride_velocity_3;
-    double *RESTRICT _data_velocity_20_32 = _data_velocity + _stride_velocity_2 * ctr_2 + 2 * _stride_velocity_3;
-    double *RESTRICT _data_pdfs_tmp_20_30 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2;
-    double *RESTRICT _data_pdfs_tmp_20_31 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_32 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 2 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_33 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 3 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_34 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 4 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_35 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 5 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_36 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 6 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_37 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 7 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_38 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 8 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_39 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 9 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_310 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 10 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_311 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 11 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_312 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 12 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_313 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 13 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_314 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 14 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_315 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 15 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_316 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 16 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_317 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 17 * _stride_pdfs_tmp_3;
-    double *RESTRICT _data_pdfs_tmp_20_318 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 18 * _stride_pdfs_tmp_3;
     for (int64_t ctr_1 = 1; ctr_1 < _size_force_1 - 1; ctr_1 += 1) {
-      double *RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_30;
-      double *RESTRICT _data_pdfs_20_31_1m1 = _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _data_pdfs_20_31;
-      double *RESTRICT _data_pdfs_20_32_11 = _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _data_pdfs_20_32;
-      double *RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_33;
-      double *RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_34;
-      double *RESTRICT _data_pdfs_2m1_35_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_2m1_35;
-      double *RESTRICT _data_pdfs_21_36_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_21_36;
-      double *RESTRICT _data_pdfs_20_37_1m1 = _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _data_pdfs_20_37;
-      double *RESTRICT _data_pdfs_20_38_1m1 = _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _data_pdfs_20_38;
-      double *RESTRICT _data_pdfs_20_39_11 = _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _data_pdfs_20_39;
-      double *RESTRICT _data_pdfs_20_310_11 = _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _data_pdfs_20_310;
-      double *RESTRICT _data_pdfs_2m1_311_1m1 = _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_311;
-      double *RESTRICT _data_pdfs_2m1_312_11 = _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_312;
-      double *RESTRICT _data_pdfs_2m1_313_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_2m1_313;
-      double *RESTRICT _data_pdfs_2m1_314_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_2m1_314;
-      double *RESTRICT _data_pdfs_21_315_1m1 = _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _data_pdfs_21_315;
-      double *RESTRICT _data_pdfs_21_316_11 = _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _data_pdfs_21_316;
-      double *RESTRICT _data_pdfs_21_317_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_21_317;
-      double *RESTRICT _data_pdfs_21_318_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_21_318;
-      double *RESTRICT _data_force_20_30_10 = _stride_force_1 * ctr_1 + _data_force_20_30;
-      double *RESTRICT _data_force_20_31_10 = _stride_force_1 * ctr_1 + _data_force_20_31;
-      double *RESTRICT _data_force_20_32_10 = _stride_force_1 * ctr_1 + _data_force_20_32;
-      double *RESTRICT _data_velocity_20_30_10 = _stride_velocity_1 * ctr_1 + _data_velocity_20_30;
-      double *RESTRICT _data_velocity_20_31_10 = _stride_velocity_1 * ctr_1 + _data_velocity_20_31;
-      double *RESTRICT _data_velocity_20_32_10 = _stride_velocity_1 * ctr_1 + _data_velocity_20_32;
-      double *RESTRICT _data_pdfs_tmp_20_30_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_30;
-      double *RESTRICT _data_pdfs_tmp_20_31_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_31;
-      double *RESTRICT _data_pdfs_tmp_20_32_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_32;
-      double *RESTRICT _data_pdfs_tmp_20_33_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_33;
-      double *RESTRICT _data_pdfs_tmp_20_34_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_34;
-      double *RESTRICT _data_pdfs_tmp_20_35_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_35;
-      double *RESTRICT _data_pdfs_tmp_20_36_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_36;
-      double *RESTRICT _data_pdfs_tmp_20_37_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_37;
-      double *RESTRICT _data_pdfs_tmp_20_38_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_38;
-      double *RESTRICT _data_pdfs_tmp_20_39_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_39;
-      double *RESTRICT _data_pdfs_tmp_20_310_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_310;
-      double *RESTRICT _data_pdfs_tmp_20_311_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_311;
-      double *RESTRICT _data_pdfs_tmp_20_312_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_312;
-      double *RESTRICT _data_pdfs_tmp_20_313_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_313;
-      double *RESTRICT _data_pdfs_tmp_20_314_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_314;
-      double *RESTRICT _data_pdfs_tmp_20_315_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_315;
-      double *RESTRICT _data_pdfs_tmp_20_316_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_316;
-      double *RESTRICT _data_pdfs_tmp_20_317_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_317;
-      double *RESTRICT _data_pdfs_tmp_20_318_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_318;
       {
         for (int64_t ctr_0 = 1; ctr_0 < (int64_t)((_size_force_0 - 2) / (4)) * (4) + 1; ctr_0 += 4) {
-          const __m256d streamed_0 = _mm256_load_pd(&_data_pdfs_20_30_10[ctr_0]);
-          const __m256d streamed_1 = _mm256_load_pd(&_data_pdfs_20_31_1m1[ctr_0]);
-          const __m256d streamed_2 = _mm256_load_pd(&_data_pdfs_20_32_11[ctr_0]);
-          const __m256d streamed_3 = _mm256_loadu_pd(&_data_pdfs_20_33_10[ctr_0 + 1]);
-          const __m256d streamed_4 = _mm256_loadu_pd(&_data_pdfs_20_34_10[ctr_0 - 1]);
-          const __m256d streamed_5 = _mm256_load_pd(&_data_pdfs_2m1_35_10[ctr_0]);
-          const __m256d streamed_6 = _mm256_load_pd(&_data_pdfs_21_36_10[ctr_0]);
-          const __m256d streamed_7 = _mm256_loadu_pd(&_data_pdfs_20_37_1m1[ctr_0 + 1]);
-          const __m256d streamed_8 = _mm256_loadu_pd(&_data_pdfs_20_38_1m1[ctr_0 - 1]);
-          const __m256d streamed_9 = _mm256_loadu_pd(&_data_pdfs_20_39_11[ctr_0 + 1]);
-          const __m256d streamed_10 = _mm256_loadu_pd(&_data_pdfs_20_310_11[ctr_0 - 1]);
-          const __m256d streamed_11 = _mm256_load_pd(&_data_pdfs_2m1_311_1m1[ctr_0]);
-          const __m256d streamed_12 = _mm256_load_pd(&_data_pdfs_2m1_312_11[ctr_0]);
-          const __m256d streamed_13 = _mm256_loadu_pd(&_data_pdfs_2m1_313_10[ctr_0 + 1]);
-          const __m256d streamed_14 = _mm256_loadu_pd(&_data_pdfs_2m1_314_10[ctr_0 - 1]);
-          const __m256d streamed_15 = _mm256_load_pd(&_data_pdfs_21_315_1m1[ctr_0]);
-          const __m256d streamed_16 = _mm256_load_pd(&_data_pdfs_21_316_11[ctr_0]);
-          const __m256d streamed_17 = _mm256_loadu_pd(&_data_pdfs_21_317_10[ctr_0 + 1]);
-          const __m256d streamed_18 = _mm256_loadu_pd(&_data_pdfs_21_318_10[ctr_0 - 1]);
+          const __m256d streamed_0 = _mm256_load_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + ctr_0]);
+          const __m256d streamed_1 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3 + ctr_0]);
+          const __m256d streamed_2 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3 + ctr_0]);
+          const __m256d streamed_3 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3 + ctr_0 + 1]);
+          const __m256d streamed_4 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3 + ctr_0 - 1]);
+          const __m256d streamed_5 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 5 * _stride_pdfs_3 + ctr_0]);
+          const __m256d streamed_6 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 6 * _stride_pdfs_3 + ctr_0]);
+          const __m256d streamed_7 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3 + ctr_0 + 1]);
+          const __m256d streamed_8 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3 + ctr_0 - 1]);
+          const __m256d streamed_9 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3 + ctr_0 + 1]);
+          const __m256d streamed_10 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3 + ctr_0 - 1]);
+          const __m256d streamed_11 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 11 * _stride_pdfs_3 + ctr_0]);
+          const __m256d streamed_12 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 12 * _stride_pdfs_3 + ctr_0]);
+          const __m256d streamed_13 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 13 * _stride_pdfs_3 + ctr_0 + 1]);
+          const __m256d streamed_14 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 14 * _stride_pdfs_3 + ctr_0 - 1]);
+          const __m256d streamed_15 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 15 * _stride_pdfs_3 + ctr_0]);
+          const __m256d streamed_16 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 16 * _stride_pdfs_3 + ctr_0]);
+          const __m256d streamed_17 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 17 * _stride_pdfs_3 + ctr_0 + 1]);
+          const __m256d streamed_18 = _mm256_loadu_pd(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 18 * _stride_pdfs_3 + ctr_0 - 1]);
           const __m256d vel0Term = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(streamed_10, streamed_14), streamed_18), streamed_4), streamed_8);
           const __m256d momdensity_0 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(streamed_13, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(streamed_17, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(streamed_3, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(streamed_7, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(streamed_9, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), vel0Term);
           const __m256d vel1Term = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(streamed_1, streamed_11), streamed_15), streamed_7);
@@ -168,84 +79,84 @@ static FUNC_PREFIX void streamsweepdoubleprecisionavx_streamsweepdoubleprecision
           const __m256d vel2Term = _mm256_add_pd(_mm256_add_pd(streamed_12, streamed_13), streamed_5);
           const __m256d rho = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(streamed_0, streamed_16), streamed_17), streamed_2), streamed_3), streamed_6), streamed_9), vel0Term), vel1Term), vel2Term);
           const __m256d momdensity_2 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(streamed_15, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(streamed_16, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(streamed_17, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(streamed_18, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(streamed_6, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), streamed_11), streamed_14), vel2Term);
-          const __m256d u_0 = _mm256_add_pd(_mm256_mul_pd(momdensity_0, _mm256_div_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), rho)), _mm256_mul_pd(_mm256_mul_pd(_mm256_set_pd(0.5, 0.5, 0.5, 0.5), _mm256_div_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), rho)), _mm256_load_pd(&_data_force_20_30_10[ctr_0])));
-          const __m256d u_1 = _mm256_add_pd(_mm256_mul_pd(momdensity_1, _mm256_div_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), rho)), _mm256_mul_pd(_mm256_mul_pd(_mm256_set_pd(0.5, 0.5, 0.5, 0.5), _mm256_div_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), rho)), _mm256_load_pd(&_data_force_20_31_10[ctr_0])));
-          const __m256d u_2 = _mm256_add_pd(_mm256_mul_pd(momdensity_2, _mm256_div_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), rho)), _mm256_mul_pd(_mm256_mul_pd(_mm256_set_pd(0.5, 0.5, 0.5, 0.5), _mm256_div_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), rho)), _mm256_load_pd(&_data_force_20_32_10[ctr_0])));
-          _mm256_store_pd(&_data_velocity_20_30_10[ctr_0], u_0);
-          _mm256_store_pd(&_data_velocity_20_31_10[ctr_0], u_1);
-          _mm256_store_pd(&_data_velocity_20_32_10[ctr_0], u_2);
-          _mm256_store_pd(&_data_pdfs_tmp_20_30_10[ctr_0], streamed_0);
-          _mm256_store_pd(&_data_pdfs_tmp_20_31_10[ctr_0], streamed_1);
-          _mm256_store_pd(&_data_pdfs_tmp_20_32_10[ctr_0], streamed_2);
-          _mm256_store_pd(&_data_pdfs_tmp_20_33_10[ctr_0], streamed_3);
-          _mm256_store_pd(&_data_pdfs_tmp_20_34_10[ctr_0], streamed_4);
-          _mm256_store_pd(&_data_pdfs_tmp_20_35_10[ctr_0], streamed_5);
-          _mm256_store_pd(&_data_pdfs_tmp_20_36_10[ctr_0], streamed_6);
-          _mm256_store_pd(&_data_pdfs_tmp_20_37_10[ctr_0], streamed_7);
-          _mm256_store_pd(&_data_pdfs_tmp_20_38_10[ctr_0], streamed_8);
-          _mm256_store_pd(&_data_pdfs_tmp_20_39_10[ctr_0], streamed_9);
-          _mm256_store_pd(&_data_pdfs_tmp_20_310_10[ctr_0], streamed_10);
-          _mm256_store_pd(&_data_pdfs_tmp_20_311_10[ctr_0], streamed_11);
-          _mm256_store_pd(&_data_pdfs_tmp_20_312_10[ctr_0], streamed_12);
-          _mm256_store_pd(&_data_pdfs_tmp_20_313_10[ctr_0], streamed_13);
-          _mm256_store_pd(&_data_pdfs_tmp_20_314_10[ctr_0], streamed_14);
-          _mm256_store_pd(&_data_pdfs_tmp_20_315_10[ctr_0], streamed_15);
-          _mm256_store_pd(&_data_pdfs_tmp_20_316_10[ctr_0], streamed_16);
-          _mm256_store_pd(&_data_pdfs_tmp_20_317_10[ctr_0], streamed_17);
-          _mm256_store_pd(&_data_pdfs_tmp_20_318_10[ctr_0], streamed_18);
+          const __m256d u_0 = _mm256_add_pd(_mm256_mul_pd(momdensity_0, _mm256_div_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), rho)), _mm256_mul_pd(_mm256_mul_pd(_mm256_set_pd(0.5, 0.5, 0.5, 0.5), _mm256_div_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), rho)), _mm256_load_pd(&_data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + ctr_0])));
+          const __m256d u_1 = _mm256_add_pd(_mm256_mul_pd(momdensity_1, _mm256_div_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), rho)), _mm256_mul_pd(_mm256_mul_pd(_mm256_set_pd(0.5, 0.5, 0.5, 0.5), _mm256_div_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), rho)), _mm256_loadu_pd(&_data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3 + ctr_0])));
+          const __m256d u_2 = _mm256_add_pd(_mm256_mul_pd(momdensity_2, _mm256_div_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), rho)), _mm256_mul_pd(_mm256_mul_pd(_mm256_set_pd(0.5, 0.5, 0.5, 0.5), _mm256_div_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), rho)), _mm256_loadu_pd(&_data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3 + ctr_0])));
+          _mm256_store_pd(&_data_velocity[_stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + ctr_0], u_0);
+          _mm256_storeu_pd(&_data_velocity[_stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + _stride_velocity_3 + ctr_0], u_1);
+          _mm256_storeu_pd(&_data_velocity[_stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + 2 * _stride_velocity_3 + ctr_0], u_2);
+          _mm256_store_pd(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + ctr_0], streamed_0);
+          _mm256_storeu_pd(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + _stride_pdfs_tmp_3 + ctr_0], streamed_1);
+          _mm256_storeu_pd(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 2 * _stride_pdfs_tmp_3 + ctr_0], streamed_2);
+          _mm256_storeu_pd(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 3 * _stride_pdfs_tmp_3 + ctr_0], streamed_3);
+          _mm256_store_pd(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 4 * _stride_pdfs_tmp_3 + ctr_0], streamed_4);
+          _mm256_storeu_pd(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 5 * _stride_pdfs_tmp_3 + ctr_0], streamed_5);
+          _mm256_storeu_pd(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 6 * _stride_pdfs_tmp_3 + ctr_0], streamed_6);
+          _mm256_storeu_pd(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 7 * _stride_pdfs_tmp_3 + ctr_0], streamed_7);
+          _mm256_store_pd(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 8 * _stride_pdfs_tmp_3 + ctr_0], streamed_8);
+          _mm256_storeu_pd(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 9 * _stride_pdfs_tmp_3 + ctr_0], streamed_9);
+          _mm256_storeu_pd(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 10 * _stride_pdfs_tmp_3 + ctr_0], streamed_10);
+          _mm256_storeu_pd(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 11 * _stride_pdfs_tmp_3 + ctr_0], streamed_11);
+          _mm256_store_pd(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 12 * _stride_pdfs_tmp_3 + ctr_0], streamed_12);
+          _mm256_storeu_pd(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 13 * _stride_pdfs_tmp_3 + ctr_0], streamed_13);
+          _mm256_storeu_pd(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 14 * _stride_pdfs_tmp_3 + ctr_0], streamed_14);
+          _mm256_storeu_pd(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 15 * _stride_pdfs_tmp_3 + ctr_0], streamed_15);
+          _mm256_store_pd(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 16 * _stride_pdfs_tmp_3 + ctr_0], streamed_16);
+          _mm256_storeu_pd(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 17 * _stride_pdfs_tmp_3 + ctr_0], streamed_17);
+          _mm256_storeu_pd(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 18 * _stride_pdfs_tmp_3 + ctr_0], streamed_18);
         }
         for (int64_t ctr_0 = (int64_t)((_size_force_0 - 2) / (4)) * (4) + 1; ctr_0 < _size_force_0 - 1; ctr_0 += 1) {
-          const double streamed_0 = _data_pdfs_20_30_10[ctr_0];
-          const double streamed_1 = _data_pdfs_20_31_1m1[ctr_0];
-          const double streamed_2 = _data_pdfs_20_32_11[ctr_0];
-          const double streamed_3 = _data_pdfs_20_33_10[ctr_0 + 1];
-          const double streamed_4 = _data_pdfs_20_34_10[ctr_0 - 1];
-          const double streamed_5 = _data_pdfs_2m1_35_10[ctr_0];
-          const double streamed_6 = _data_pdfs_21_36_10[ctr_0];
-          const double streamed_7 = _data_pdfs_20_37_1m1[ctr_0 + 1];
-          const double streamed_8 = _data_pdfs_20_38_1m1[ctr_0 - 1];
-          const double streamed_9 = _data_pdfs_20_39_11[ctr_0 + 1];
-          const double streamed_10 = _data_pdfs_20_310_11[ctr_0 - 1];
-          const double streamed_11 = _data_pdfs_2m1_311_1m1[ctr_0];
-          const double streamed_12 = _data_pdfs_2m1_312_11[ctr_0];
-          const double streamed_13 = _data_pdfs_2m1_313_10[ctr_0 + 1];
-          const double streamed_14 = _data_pdfs_2m1_314_10[ctr_0 - 1];
-          const double streamed_15 = _data_pdfs_21_315_1m1[ctr_0];
-          const double streamed_16 = _data_pdfs_21_316_11[ctr_0];
-          const double streamed_17 = _data_pdfs_21_317_10[ctr_0 + 1];
-          const double streamed_18 = _data_pdfs_21_318_10[ctr_0 - 1];
+          const double streamed_0 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + ctr_0];
+          const double streamed_1 = _data_pdfs[_stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3 + ctr_0];
+          const double streamed_2 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3 + ctr_0];
+          const double streamed_3 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3 + ctr_0 + 1];
+          const double streamed_4 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3 + ctr_0 - 1];
+          const double streamed_5 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 5 * _stride_pdfs_3 + ctr_0];
+          const double streamed_6 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 6 * _stride_pdfs_3 + ctr_0];
+          const double streamed_7 = _data_pdfs[_stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3 + ctr_0 + 1];
+          const double streamed_8 = _data_pdfs[_stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3 + ctr_0 - 1];
+          const double streamed_9 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3 + ctr_0 + 1];
+          const double streamed_10 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3 + ctr_0 - 1];
+          const double streamed_11 = _data_pdfs[_stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 11 * _stride_pdfs_3 + ctr_0];
+          const double streamed_12 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 12 * _stride_pdfs_3 + ctr_0];
+          const double streamed_13 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 13 * _stride_pdfs_3 + ctr_0 + 1];
+          const double streamed_14 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 14 * _stride_pdfs_3 + ctr_0 - 1];
+          const double streamed_15 = _data_pdfs[_stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 15 * _stride_pdfs_3 + ctr_0];
+          const double streamed_16 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 16 * _stride_pdfs_3 + ctr_0];
+          const double streamed_17 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 17 * _stride_pdfs_3 + ctr_0 + 1];
+          const double streamed_18 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 18 * _stride_pdfs_3 + ctr_0 - 1];
           const double vel0Term = streamed_10 + streamed_14 + streamed_18 + streamed_4 + streamed_8;
-          const double momdensity_0 = streamed_13 * -1.0 + streamed_17 * -1.0 + streamed_3 * -1.0 + streamed_7 * -1.0 + streamed_9 * -1.0 + vel0Term;
+          const double momdensity_0 = -streamed_13 - streamed_17 - streamed_3 - streamed_7 - streamed_9 + vel0Term;
           const double vel1Term = streamed_1 + streamed_11 + streamed_15 + streamed_7;
-          const double momdensity_1 = streamed_10 * -1.0 + streamed_12 * -1.0 + streamed_16 * -1.0 + streamed_2 * -1.0 + streamed_8 + streamed_9 * -1.0 + vel1Term;
+          const double momdensity_1 = -streamed_10 - streamed_12 - streamed_16 - streamed_2 + streamed_8 - streamed_9 + vel1Term;
           const double vel2Term = streamed_12 + streamed_13 + streamed_5;
           const double rho = streamed_0 + streamed_16 + streamed_17 + streamed_2 + streamed_3 + streamed_6 + streamed_9 + vel0Term + vel1Term + vel2Term;
-          const double momdensity_2 = streamed_11 + streamed_14 + streamed_15 * -1.0 + streamed_16 * -1.0 + streamed_17 * -1.0 + streamed_18 * -1.0 + streamed_6 * -1.0 + vel2Term;
-          const double u_0 = momdensity_0 * ((1.0) / (rho)) + 0.5 * ((1.0) / (rho)) * _data_force_20_30_10[ctr_0];
-          const double u_1 = momdensity_1 * ((1.0) / (rho)) + 0.5 * ((1.0) / (rho)) * _data_force_20_31_10[ctr_0];
-          const double u_2 = momdensity_2 * ((1.0) / (rho)) + 0.5 * ((1.0) / (rho)) * _data_force_20_32_10[ctr_0];
-          _data_velocity_20_30_10[ctr_0] = u_0;
-          _data_velocity_20_31_10[ctr_0] = u_1;
-          _data_velocity_20_32_10[ctr_0] = u_2;
-          _data_pdfs_tmp_20_30_10[ctr_0] = streamed_0;
-          _data_pdfs_tmp_20_31_10[ctr_0] = streamed_1;
-          _data_pdfs_tmp_20_32_10[ctr_0] = streamed_2;
-          _data_pdfs_tmp_20_33_10[ctr_0] = streamed_3;
-          _data_pdfs_tmp_20_34_10[ctr_0] = streamed_4;
-          _data_pdfs_tmp_20_35_10[ctr_0] = streamed_5;
-          _data_pdfs_tmp_20_36_10[ctr_0] = streamed_6;
-          _data_pdfs_tmp_20_37_10[ctr_0] = streamed_7;
-          _data_pdfs_tmp_20_38_10[ctr_0] = streamed_8;
-          _data_pdfs_tmp_20_39_10[ctr_0] = streamed_9;
-          _data_pdfs_tmp_20_310_10[ctr_0] = streamed_10;
-          _data_pdfs_tmp_20_311_10[ctr_0] = streamed_11;
-          _data_pdfs_tmp_20_312_10[ctr_0] = streamed_12;
-          _data_pdfs_tmp_20_313_10[ctr_0] = streamed_13;
-          _data_pdfs_tmp_20_314_10[ctr_0] = streamed_14;
-          _data_pdfs_tmp_20_315_10[ctr_0] = streamed_15;
-          _data_pdfs_tmp_20_316_10[ctr_0] = streamed_16;
-          _data_pdfs_tmp_20_317_10[ctr_0] = streamed_17;
-          _data_pdfs_tmp_20_318_10[ctr_0] = streamed_18;
+          const double momdensity_2 = streamed_11 + streamed_14 - streamed_15 - streamed_16 - streamed_17 - streamed_18 - streamed_6 + vel2Term;
+          const double u_0 = momdensity_0 * ((1.0) / (rho)) + 0.5 * ((1.0) / (rho)) * _data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + ctr_0];
+          const double u_1 = momdensity_1 * ((1.0) / (rho)) + 0.5 * ((1.0) / (rho)) * _data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3 + ctr_0];
+          const double u_2 = momdensity_2 * ((1.0) / (rho)) + 0.5 * ((1.0) / (rho)) * _data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3 + ctr_0];
+          _data_velocity[_stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + ctr_0] = u_0;
+          _data_velocity[_stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + _stride_velocity_3 + ctr_0] = u_1;
+          _data_velocity[_stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + 2 * _stride_velocity_3 + ctr_0] = u_2;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + ctr_0] = streamed_0;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + _stride_pdfs_tmp_3 + ctr_0] = streamed_1;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 2 * _stride_pdfs_tmp_3 + ctr_0] = streamed_2;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 3 * _stride_pdfs_tmp_3 + ctr_0] = streamed_3;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 4 * _stride_pdfs_tmp_3 + ctr_0] = streamed_4;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 5 * _stride_pdfs_tmp_3 + ctr_0] = streamed_5;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 6 * _stride_pdfs_tmp_3 + ctr_0] = streamed_6;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 7 * _stride_pdfs_tmp_3 + ctr_0] = streamed_7;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 8 * _stride_pdfs_tmp_3 + ctr_0] = streamed_8;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 9 * _stride_pdfs_tmp_3 + ctr_0] = streamed_9;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 10 * _stride_pdfs_tmp_3 + ctr_0] = streamed_10;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 11 * _stride_pdfs_tmp_3 + ctr_0] = streamed_11;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 12 * _stride_pdfs_tmp_3 + ctr_0] = streamed_12;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 13 * _stride_pdfs_tmp_3 + ctr_0] = streamed_13;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 14 * _stride_pdfs_tmp_3 + ctr_0] = streamed_14;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 15 * _stride_pdfs_tmp_3 + ctr_0] = streamed_15;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 16 * _stride_pdfs_tmp_3 + ctr_0] = streamed_16;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 17 * _stride_pdfs_tmp_3 + ctr_0] = streamed_17;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 18 * _stride_pdfs_tmp_3 + ctr_0] = streamed_18;
         }
       }
     }
@@ -254,6 +165,7 @@ static FUNC_PREFIX void streamsweepdoubleprecisionavx_streamsweepdoubleprecision
 } // namespace internal_91e2c9bdb4c4fa8a405803890749bf98
 
 void StreamSweepDoublePrecisionAVX::run(IBlock *block) {
+
   auto force = block->getData<field::GhostLayerField<double, 3>>(forceID);
   auto pdfs = block->getData<field::GhostLayerField<double, 19>>(pdfsID);
   auto velocity = block->getData<field::GhostLayerField<double, 3>>(velocityID);
@@ -269,34 +181,34 @@ void StreamSweepDoublePrecisionAVX::run(IBlock *block) {
     }
   }
 
-  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(force->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(force->nrOfGhostLayers()))
   double *RESTRICT const _data_force = force->dataAt(-1, -1, -1, 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs->nrOfGhostLayers()))
   double *RESTRICT const _data_pdfs = pdfs->dataAt(-1, -1, -1, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs_tmp->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs_tmp->nrOfGhostLayers()))
   double *RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(-1, -1, -1, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs_tmp->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs_tmp->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(velocity->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(pdfs_tmp->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs_tmp->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(velocity->nrOfGhostLayers()))
   double *RESTRICT _data_velocity = velocity->dataAt(-1, -1, -1, 0);
-  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)velocity->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(force->xSize()) + 2));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(force->xSize()) + 2);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(force->ySize()) + 2));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(force->ySize()) + 2);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(force->zSize()) + 2));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(force->zSize()) + 2);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
+  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)velocity->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(force->xSize()) + 2))
+  const int64_t _size_force_0 = int64_t(int64_c(force->xSize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(force->ySize()) + 2))
+  const int64_t _size_force_1 = int64_t(int64_c(force->ySize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(force->zSize()) + 2))
+  const int64_t _size_force_2 = int64_t(int64_c(force->zSize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
   const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
@@ -314,6 +226,7 @@ void StreamSweepDoublePrecisionAVX::run(IBlock *block) {
 }
 
 void StreamSweepDoublePrecisionAVX::runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block) {
+
   CellInterval ci = globalCellInterval;
   CellInterval blockBB = blocks->getBlockCellBB(*block);
   blockBB.expand(ghostLayers);
@@ -337,42 +250,42 @@ void StreamSweepDoublePrecisionAVX::runOnCellInterval(const shared_ptr<Structure
     }
   }
 
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(force->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(force->nrOfGhostLayers()))
   double *RESTRICT const _data_force = force->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs->nrOfGhostLayers()))
   double *RESTRICT const _data_pdfs = pdfs->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()))
   double *RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs_tmp->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs_tmp->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(velocity->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(velocity->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(velocity->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(pdfs_tmp->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs_tmp->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(velocity->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(velocity->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(velocity->nrOfGhostLayers()))
   double *RESTRICT _data_velocity = velocity->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
-  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)velocity->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 2));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(ci.xSize()) + 2);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 2));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(ci.ySize()) + 2);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 2));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(ci.zSize()) + 2);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
+  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)velocity->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 2))
+  const int64_t _size_force_0 = int64_t(int64_c(ci.xSize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 2))
+  const int64_t _size_force_1 = int64_t(int64_c(ci.ySize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 2))
+  const int64_t _size_force_2 = int64_t(int64_c(ci.zSize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
   const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
@@ -398,4 +311,4 @@ void StreamSweepDoublePrecisionAVX::runOnCellInterval(const shared_ptr<Structure
 
 #if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
 #pragma warning pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionAVX.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionAVX.h
index 5ff5ed1738c..52288021ad8 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionAVX.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionAVX.h
@@ -17,12 +17,13 @@
 //! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2,
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
 // lbmpy_walberla/pystencils_walberla from waLBerla commit
-// 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #pragma once
 #include "core/DataTypes.h"
+#include "core/logging/Logging.h"
 
 #include "domain_decomposition/BlockDataID.h"
 #include "domain_decomposition/IBlock.h"
@@ -96,6 +97,9 @@ class StreamSweepDoublePrecisionAVX {
     };
   }
 
+  void configure(const shared_ptr<StructuredBlockStorage> &blocks,
+                 IBlock *block) {}
+
   BlockDataID forceID;
   BlockDataID pdfsID;
   BlockDataID velocityID;
@@ -112,4 +116,4 @@ class StreamSweepDoublePrecisionAVX {
 #if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
     (defined WALBERLA_CXX_COMPILER_IS_CLANG)
 #pragma GCC diagnostic pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionCUDA.cu b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionCUDA.cu
new file mode 100644
index 00000000000..c0351452f9c
--- /dev/null
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionCUDA.cu
@@ -0,0 +1,254 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file StreamSweepDoublePrecisionCUDA.cu
+//! \\author pystencils
+//======================================================================================================================
+
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
+
+#include <cmath>
+
+#include "StreamSweepDoublePrecisionCUDA.h"
+#include "core/DataTypes.h"
+#include "core/Macros.h"
+
+#define FUNC_PREFIX __global__
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) || (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wfloat-equal"
+#pragma GCC diagnostic ignored "-Wshadow"
+#pragma GCC diagnostic ignored "-Wconversion"
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#endif
+
+#if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
+#pragma warning push
+#pragma warning(disable : 1599)
+#endif
+
+using namespace std;
+
+namespace walberla {
+namespace pystencils {
+
+namespace internal_streamsweepdoubleprecisioncuda_streamsweepdoubleprecisioncuda {
+static FUNC_PREFIX __launch_bounds__(256) void streamsweepdoubleprecisioncuda_streamsweepdoubleprecisioncuda(double *RESTRICT const _data_force, double *RESTRICT const _data_pdfs, double *RESTRICT _data_pdfs_tmp, double *RESTRICT _data_velocity, int64_t const _size_force_0, int64_t const _size_force_1, int64_t const _size_force_2, int64_t const _stride_force_0, int64_t const _stride_force_1, int64_t const _stride_force_2, int64_t const _stride_force_3, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_pdfs_tmp_0, int64_t const _stride_pdfs_tmp_1, int64_t const _stride_pdfs_tmp_2, int64_t const _stride_pdfs_tmp_3, int64_t const _stride_velocity_0, int64_t const _stride_velocity_1, int64_t const _stride_velocity_2, int64_t const _stride_velocity_3) { // each thread handles the single cell derived from its block/thread index
+  if (blockDim.x * blockIdx.x + threadIdx.x + 1 < _size_force_0 - 1 && blockDim.y * blockIdx.y + threadIdx.y + 1 < _size_force_1 - 1 && blockDim.z * blockIdx.z + threadIdx.z + 1 < _size_force_2 - 1) { // guard: only shifted indices in [1, _size_force_d - 2] do any work
+    const int64_t ctr_0 = blockDim.x * blockIdx.x + threadIdx.x + 1; // +1 shift: the lowest index touched relative to the data pointers is 1
+    const int64_t ctr_1 = blockDim.y * blockIdx.y + threadIdx.y + 1;
+    const int64_t ctr_2 = blockDim.z * blockIdx.z + threadIdx.z + 1;
+    const double streamed_0 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2]; // component 0 is read from the cell itself; components 1..18 below are read from cells offset by one stride
+    const double streamed_1 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3];
+    const double streamed_2 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3];
+    const double streamed_3 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3];
+    const double streamed_4 = _data_pdfs[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3];
+    const double streamed_5 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 5 * _stride_pdfs_3];
+    const double streamed_6 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 6 * _stride_pdfs_3];
+    const double streamed_7 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3];
+    const double streamed_8 = _data_pdfs[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3];
+    const double streamed_9 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3];
+    const double streamed_10 = _data_pdfs[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3];
+    const double streamed_11 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 11 * _stride_pdfs_3];
+    const double streamed_12 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 12 * _stride_pdfs_3];
+    const double streamed_13 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 13 * _stride_pdfs_3];
+    const double streamed_14 = _data_pdfs[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 14 * _stride_pdfs_3];
+    const double streamed_15 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 15 * _stride_pdfs_3];
+    const double streamed_16 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 16 * _stride_pdfs_3];
+    const double streamed_17 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 17 * _stride_pdfs_3];
+    const double streamed_18 = _data_pdfs[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 18 * _stride_pdfs_3];
+    const double vel0Term = streamed_10 + streamed_14 + streamed_18 + streamed_4 + streamed_8; // vel*Term: partial sums shared by rho and the momdensity_* expressions
+    const double momdensity_0 = -streamed_13 - streamed_17 - streamed_3 - streamed_7 - streamed_9 + vel0Term;
+    const double vel1Term = streamed_1 + streamed_11 + streamed_15 + streamed_7;
+    const double momdensity_1 = -streamed_10 - streamed_12 - streamed_16 - streamed_2 + streamed_8 - streamed_9 + vel1Term;
+    const double vel2Term = streamed_12 + streamed_13 + streamed_5;
+    const double rho = streamed_0 + streamed_16 + streamed_17 + streamed_2 + streamed_3 + streamed_6 + streamed_9 + vel0Term + vel1Term + vel2Term; // sum of all 19 streamed values
+    const double momdensity_2 = streamed_11 + streamed_14 - streamed_15 - streamed_16 - streamed_17 - streamed_18 - streamed_6 + vel2Term;
+    const double u_0 = momdensity_0 * ((1.0) / (rho)) + 0.5 * ((1.0) / (rho)) * _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2]; // u_d = (momdensity_d + 0.5 * force_d) / rho; no guard against rho == 0
+    const double u_1 = momdensity_1 * ((1.0) / (rho)) + 0.5 * ((1.0) / (rho)) * _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3];
+    const double u_2 = momdensity_2 * ((1.0) / (rho)) + 0.5 * ((1.0) / (rho)) * _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3];
+    _data_velocity[_stride_velocity_0 * ctr_0 + _stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2] = u_0; // the three velocity components are stored at the cell's own index
+    _data_velocity[_stride_velocity_0 * ctr_0 + _stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + _stride_velocity_3] = u_1;
+    _data_velocity[_stride_velocity_0 * ctr_0 + _stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + 2 * _stride_velocity_3] = u_2;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2] = streamed_0; // all 19 streamed values are written to the cell's own index in the separate output buffer
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + _stride_pdfs_tmp_3] = streamed_1;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 2 * _stride_pdfs_tmp_3] = streamed_2;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 3 * _stride_pdfs_tmp_3] = streamed_3;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 4 * _stride_pdfs_tmp_3] = streamed_4;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 5 * _stride_pdfs_tmp_3] = streamed_5;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 6 * _stride_pdfs_tmp_3] = streamed_6;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 7 * _stride_pdfs_tmp_3] = streamed_7;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 8 * _stride_pdfs_tmp_3] = streamed_8;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 9 * _stride_pdfs_tmp_3] = streamed_9;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 10 * _stride_pdfs_tmp_3] = streamed_10;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 11 * _stride_pdfs_tmp_3] = streamed_11;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 12 * _stride_pdfs_tmp_3] = streamed_12;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 13 * _stride_pdfs_tmp_3] = streamed_13;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 14 * _stride_pdfs_tmp_3] = streamed_14;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 15 * _stride_pdfs_tmp_3] = streamed_15;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 16 * _stride_pdfs_tmp_3] = streamed_16;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 17 * _stride_pdfs_tmp_3] = streamed_17;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 18 * _stride_pdfs_tmp_3] = streamed_18;
+  }
+}
+} // namespace internal_streamsweepdoubleprecisioncuda_streamsweepdoubleprecisioncuda
+
+void StreamSweepDoublePrecisionCUDA::run(IBlock *block, gpuStream_t stream) {
+
+  auto velocity = block->getData<gpu::GPUField<double>>(velocityID);
+  auto force = block->getData<gpu::GPUField<double>>(forceID);
+  auto pdfs = block->getData<gpu::GPUField<double>>(pdfsID);
+  gpu::GPUField<double> *pdfs_tmp;
+  {
+    // Getting temporary field pdfs_tmp
+    auto it = cache_pdfs_.find(pdfs);
+    if (it != cache_pdfs_.end()) {
+      pdfs_tmp = *it;
+    } else {
+      pdfs_tmp = pdfs->cloneUninitialized();
+      cache_pdfs_.insert(pdfs_tmp);
+    }
+  }
+
+  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(force->nrOfGhostLayers()))
+  double *RESTRICT const _data_force = force->dataAt(-1, -1, -1, 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs->nrOfGhostLayers()))
+  double *RESTRICT const _data_pdfs = pdfs->dataAt(-1, -1, -1, 0);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs_tmp->nrOfGhostLayers()))
+  double *RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(-1, -1, -1, 0);
+  WALBERLA_ASSERT_EQUAL(pdfs_tmp->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(velocity->nrOfGhostLayers()))
+  double *RESTRICT _data_velocity = velocity->dataAt(-1, -1, -1, 0);
+  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(force->xSize()) + 2))
+  const int64_t _size_force_0 = int64_t(int64_c(force->xSize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(force->ySize()) + 2))
+  const int64_t _size_force_1 = int64_t(int64_c(force->ySize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(force->zSize()) + 2))
+  const int64_t _size_force_2 = int64_t(int64_c(force->zSize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  const int64_t _stride_force_0 = int64_t(force->xStride());
+  const int64_t _stride_force_1 = int64_t(force->yStride());
+  const int64_t _stride_force_2 = int64_t(force->zStride());
+  const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
+  const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
+  const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
+  const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
+  const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
+  const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride());
+  const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride());
+  const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride());
+  const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride()));
+  const int64_t _stride_velocity_0 = int64_t(velocity->xStride());
+  const int64_t _stride_velocity_1 = int64_t(velocity->yStride());
+  const int64_t _stride_velocity_2 = int64_t(velocity->zStride());
+  const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride()));
+  dim3 _block(uint32_c(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)), uint32_c(((1024 < ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))) ? 1024 : ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))), uint32_c(((64 < ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))))) ? 64 : ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))))));
+  dim3 _grid(uint32_c(((_size_force_0 - 2) % (((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)) == 0 ? (int64_t)(_size_force_0 - 2) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)) : ((int64_t)(_size_force_0 - 2) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))) + 1)), uint32_c(((_size_force_1 - 2) % (((1024 < ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))) ? 1024 : ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))) == 0 ? (int64_t)(_size_force_1 - 2) / (int64_t)(((1024 < ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))) ? 1024 : ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))) : ((int64_t)(_size_force_1 - 2) / (int64_t)(((1024 < ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))) ? 1024 : ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) + 1)), uint32_c(((_size_force_2 - 2) % (((64 < ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 
128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))))) ? 64 : ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))))) == 0 ? (int64_t)(_size_force_2 - 2) / (int64_t)(((64 < ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))))) ? 
64 : ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))))) : ((int64_t)(_size_force_2 - 2) / (int64_t)(((64 < ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))))) ? 64 : ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? 
_size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))))))) + 1)));
+  internal_streamsweepdoubleprecisioncuda_streamsweepdoubleprecisioncuda::streamsweepdoubleprecisioncuda_streamsweepdoubleprecisioncuda<<<_grid, _block, 0, stream>>>(_data_force, _data_pdfs, _data_pdfs_tmp, _data_velocity, _size_force_0, _size_force_1, _size_force_2, _stride_force_0, _stride_force_1, _stride_force_2, _stride_force_3, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3, _stride_velocity_0, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3);
+  pdfs->swapDataPointers(pdfs_tmp);
+}
+
+void StreamSweepDoublePrecisionCUDA::runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block, gpuStream_t stream) {
+
+  CellInterval ci = globalCellInterval;
+  CellInterval blockBB = blocks->getBlockCellBB(*block);
+  blockBB.expand(ghostLayers);
+  ci.intersect(blockBB);
+  blocks->transformGlobalToBlockLocalCellInterval(ci, *block);
+  if (ci.empty())
+    return;
+
+  auto velocity = block->getData<gpu::GPUField<double>>(velocityID);
+  auto force = block->getData<gpu::GPUField<double>>(forceID);
+  auto pdfs = block->getData<gpu::GPUField<double>>(pdfsID);
+  gpu::GPUField<double> *pdfs_tmp;
+  {
+    // Getting temporary field pdfs_tmp
+    auto it = cache_pdfs_.find(pdfs);
+    if (it != cache_pdfs_.end()) {
+      pdfs_tmp = *it;
+    } else {
+      pdfs_tmp = pdfs->cloneUninitialized();
+      cache_pdfs_.insert(pdfs_tmp);
+    }
+  }
+
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(force->nrOfGhostLayers()))
+  double *RESTRICT const _data_force = force->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs->nrOfGhostLayers()))
+  double *RESTRICT const _data_pdfs = pdfs->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()))
+  double *RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
+  WALBERLA_ASSERT_EQUAL(pdfs_tmp->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(velocity->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(velocity->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(velocity->nrOfGhostLayers()))
+  double *RESTRICT _data_velocity = velocity->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
+  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 2))
+  const int64_t _size_force_0 = int64_t(int64_c(ci.xSize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 2))
+  const int64_t _size_force_1 = int64_t(int64_c(ci.ySize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 2))
+  const int64_t _size_force_2 = int64_t(int64_c(ci.zSize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  const int64_t _stride_force_0 = int64_t(force->xStride());
+  const int64_t _stride_force_1 = int64_t(force->yStride());
+  const int64_t _stride_force_2 = int64_t(force->zStride());
+  const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
+  const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
+  const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
+  const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
+  const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
+  const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride());
+  const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride());
+  const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride());
+  const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride()));
+  const int64_t _stride_velocity_0 = int64_t(velocity->xStride());
+  const int64_t _stride_velocity_1 = int64_t(velocity->yStride());
+  const int64_t _stride_velocity_2 = int64_t(velocity->zStride());
+  const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride()));
+  dim3 _block(uint32_c(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)), uint32_c(((1024 < ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))) ? 1024 : ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))), uint32_c(((64 < ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))))) ? 64 : ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))))));
+  dim3 _grid(uint32_c(((_size_force_0 - 2) % (((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)) == 0 ? (int64_t)(_size_force_0 - 2) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)) : ((int64_t)(_size_force_0 - 2) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))) + 1)), uint32_c(((_size_force_1 - 2) % (((1024 < ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))) ? 1024 : ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))) == 0 ? (int64_t)(_size_force_1 - 2) / (int64_t)(((1024 < ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))) ? 1024 : ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))) : ((int64_t)(_size_force_1 - 2) / (int64_t)(((1024 < ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))) ? 1024 : ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) + 1)), uint32_c(((_size_force_2 - 2) % (((64 < ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 
128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))))) ? 64 : ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))))) == 0 ? (int64_t)(_size_force_2 - 2) / (int64_t)(((64 < ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))))) ? 
64 : ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))))) : ((int64_t)(_size_force_2 - 2) / (int64_t)(((64 < ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))))) ? 64 : ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? 
_size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))))))) + 1)));
+  internal_streamsweepdoubleprecisioncuda_streamsweepdoubleprecisioncuda::streamsweepdoubleprecisioncuda_streamsweepdoubleprecisioncuda<<<_grid, _block, 0, stream>>>(_data_force, _data_pdfs, _data_pdfs_tmp, _data_velocity, _size_force_0, _size_force_1, _size_force_2, _stride_force_0, _stride_force_1, _stride_force_2, _stride_force_3, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3, _stride_velocity_0, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3);
+  pdfs->swapDataPointers(pdfs_tmp);
+}
+
+} // namespace pystencils
+} // namespace walberla
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) || (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic pop
+#endif
+
+#if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
+#pragma warning pop
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionCUDA.h
new file mode 100644
index 00000000000..217f90854e2
--- /dev/null
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepDoublePrecisionCUDA.h
@@ -0,0 +1,128 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file StreamSweepDoublePrecisionCUDA.h
+//! \author pystencils
+//======================================================================================================================
+
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
+// lbmpy_walberla/pystencils_walberla from waLBerla commit
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
+
+#pragma once
+#include "core/DataTypes.h"
+#include "core/logging/Logging.h"
+
+#include "gpu/GPUField.h"
+#include "gpu/GPUWrapper.h"
+
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+#include "domain_decomposition/StructuredBlockStorage.h"
+#include "field/SwapableCompare.h"
+#include <set>
+
+#ifdef __GNUC__
+#define RESTRICT __restrict__
+#elif _MSC_VER
+#define RESTRICT __restrict
+#else
+#define RESTRICT
+#endif
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
+    (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wreorder"
+#endif
+
+namespace walberla {
+namespace pystencils {
+// Sweep functor for the pystencils-generated double-precision CUDA stream kernel.
+class StreamSweepDoublePrecisionCUDA {
+public:
+  StreamSweepDoublePrecisionCUDA(BlockDataID forceID_, BlockDataID pdfsID_,
+                                 BlockDataID velocityID_)
+      : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_){}; // stores the three field IDs
+  // Frees the cached fields. NOTE(review): implicit copies would delete them twice; confirm none are made.
+  ~StreamSweepDoublePrecisionCUDA() {
+    for (auto p : cache_pdfs_) {
+      delete p;
+    }
+  }
+  // Runs the sweep on one block using the given GPU stream; defined out of line.
+  void run(IBlock *block, gpuStream_t stream = nullptr);
+  // Like run(), but takes a global cell interval and a ghost-layer count; defined out of line.
+  void runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks,
+                         const CellInterval &globalCellInterval,
+                         cell_idx_t ghostLayers, IBlock *block,
+                         gpuStream_t stream = nullptr);
+  // Call operator: forwards to run().
+  void operator()(IBlock *block, gpuStream_t stream = nullptr) {
+    run(block, stream);
+  }
+  // Returns a callable invoking kernel->run(b); the shared_ptr is captured by value.
+  static std::function<void(IBlock *)>
+  getSweep(const shared_ptr<StreamSweepDoublePrecisionCUDA> &kernel) {
+    return [kernel](IBlock *b) { kernel->run(b); };
+  }
+  // Returns a callable forwarding (block, stream) to kernel->runOnCellInterval().
+  static std::function<void(IBlock *, gpuStream_t)> getSweepOnCellInterval(
+      const shared_ptr<StreamSweepDoublePrecisionCUDA> &kernel,
+      const shared_ptr<StructuredBlockStorage> &blocks,
+      const CellInterval &globalCellInterval, cell_idx_t ghostLayers = 1) {
+    return [kernel, blocks, globalCellInterval,
+            ghostLayers](IBlock *b, gpuStream_t stream = nullptr) {
+      kernel->runOnCellInterval(blocks, globalCellInterval, ghostLayers, b,
+                                stream);
+    };
+  }
+  // Captures this and stream: the returned callable must not outlive this object.
+  std::function<void(IBlock *)> getSweep(gpuStream_t stream = nullptr) {
+    return [this, stream](IBlock *b) { this->run(b, stream); };
+  }
+  // Cell-interval counterpart of the member getSweep(); also captures this.
+  std::function<void(IBlock *)>
+  getSweepOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks,
+                         const CellInterval &globalCellInterval,
+                         cell_idx_t ghostLayers = 1,
+                         gpuStream_t stream = nullptr) {
+    return [this, blocks, globalCellInterval, ghostLayers, stream](IBlock *b) {
+      this->runOnCellInterval(blocks, globalCellInterval, ghostLayers, b,
+                              stream);
+    };
+  }
+  // No-op: the body is empty and both arguments are unused.
+  void configure(const shared_ptr<StructuredBlockStorage> &blocks,
+                 IBlock *block) {}
+  // Block data IDs of the fields, as passed to the constructor.
+  BlockDataID forceID;
+  BlockDataID pdfsID;
+  BlockDataID velocityID;
+
+private:
+  std::set<gpu::GPUField<double> *,
+           field::SwapableCompare<gpu::GPUField<double> *>>
+      cache_pdfs_; // raw pointers; deleted in the destructor
+};
+
+} // namespace pystencils
+} // namespace walberla
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
+    (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic pop
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecision.cpp b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecision.cpp
index ea431d01eb1..7a72d3fc836 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecision.cpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecision.cpp
@@ -14,11 +14,10 @@
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
 //! \\file StreamSweepSinglePrecision.cpp
-//! \\ingroup lbm
-//! \\author lbmpy
+//! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2, lbmpy_walberla/pystencils_walberla from waLBerla commit 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #include <cmath>
 
@@ -49,147 +48,59 @@ namespace pystencils {
 namespace internal_streamsweepsingleprecision_streamsweepsingleprecision {
 static FUNC_PREFIX void streamsweepsingleprecision_streamsweepsingleprecision(float *RESTRICT const _data_force, float *RESTRICT const _data_pdfs, float *RESTRICT _data_pdfs_tmp, float *RESTRICT _data_velocity, int64_t const _size_force_0, int64_t const _size_force_1, int64_t const _size_force_2, int64_t const _stride_force_0, int64_t const _stride_force_1, int64_t const _stride_force_2, int64_t const _stride_force_3, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_pdfs_tmp_0, int64_t const _stride_pdfs_tmp_1, int64_t const _stride_pdfs_tmp_2, int64_t const _stride_pdfs_tmp_3, int64_t const _stride_velocity_0, int64_t const _stride_velocity_1, int64_t const _stride_velocity_2, int64_t const _stride_velocity_3) {
   for (int64_t ctr_2 = 1; ctr_2 < _size_force_2 - 1; ctr_2 += 1) {
-    float *RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2 * ctr_2;
-    float *RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_2m1_35 = _data_pdfs + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 5 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_21_36 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 6 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_2m1_311 = _data_pdfs + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 11 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_2m1_312 = _data_pdfs + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 12 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_2m1_313 = _data_pdfs + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 13 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_2m1_314 = _data_pdfs + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 14 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_21_315 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 15 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_21_316 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 16 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_21_317 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 17 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_21_318 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 18 * _stride_pdfs_3;
-    float *RESTRICT _data_force_20_30 = _data_force + _stride_force_2 * ctr_2;
-    float *RESTRICT _data_force_20_31 = _data_force + _stride_force_2 * ctr_2 + _stride_force_3;
-    float *RESTRICT _data_force_20_32 = _data_force + _stride_force_2 * ctr_2 + 2 * _stride_force_3;
-    float *RESTRICT _data_velocity_20_30 = _data_velocity + _stride_velocity_2 * ctr_2;
-    float *RESTRICT _data_velocity_20_31 = _data_velocity + _stride_velocity_2 * ctr_2 + _stride_velocity_3;
-    float *RESTRICT _data_velocity_20_32 = _data_velocity + _stride_velocity_2 * ctr_2 + 2 * _stride_velocity_3;
-    float *RESTRICT _data_pdfs_tmp_20_30 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2;
-    float *RESTRICT _data_pdfs_tmp_20_31 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_32 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 2 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_33 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 3 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_34 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 4 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_35 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 5 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_36 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 6 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_37 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 7 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_38 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 8 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_39 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 9 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_310 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 10 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_311 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 11 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_312 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 12 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_313 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 13 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_314 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 14 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_315 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 15 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_316 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 16 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_317 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 17 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_318 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 18 * _stride_pdfs_tmp_3;
     for (int64_t ctr_1 = 1; ctr_1 < _size_force_1 - 1; ctr_1 += 1) {
-      float *RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_30;
-      float *RESTRICT _data_pdfs_20_31_1m1 = _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _data_pdfs_20_31;
-      float *RESTRICT _data_pdfs_20_32_11 = _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _data_pdfs_20_32;
-      float *RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_33;
-      float *RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_34;
-      float *RESTRICT _data_pdfs_2m1_35_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_2m1_35;
-      float *RESTRICT _data_pdfs_21_36_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_21_36;
-      float *RESTRICT _data_pdfs_20_37_1m1 = _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _data_pdfs_20_37;
-      float *RESTRICT _data_pdfs_20_38_1m1 = _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _data_pdfs_20_38;
-      float *RESTRICT _data_pdfs_20_39_11 = _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _data_pdfs_20_39;
-      float *RESTRICT _data_pdfs_20_310_11 = _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _data_pdfs_20_310;
-      float *RESTRICT _data_pdfs_2m1_311_1m1 = _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_311;
-      float *RESTRICT _data_pdfs_2m1_312_11 = _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_312;
-      float *RESTRICT _data_pdfs_2m1_313_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_2m1_313;
-      float *RESTRICT _data_pdfs_2m1_314_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_2m1_314;
-      float *RESTRICT _data_pdfs_21_315_1m1 = _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _data_pdfs_21_315;
-      float *RESTRICT _data_pdfs_21_316_11 = _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _data_pdfs_21_316;
-      float *RESTRICT _data_pdfs_21_317_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_21_317;
-      float *RESTRICT _data_pdfs_21_318_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_21_318;
-      float *RESTRICT _data_force_20_30_10 = _stride_force_1 * ctr_1 + _data_force_20_30;
-      float *RESTRICT _data_force_20_31_10 = _stride_force_1 * ctr_1 + _data_force_20_31;
-      float *RESTRICT _data_force_20_32_10 = _stride_force_1 * ctr_1 + _data_force_20_32;
-      float *RESTRICT _data_velocity_20_30_10 = _stride_velocity_1 * ctr_1 + _data_velocity_20_30;
-      float *RESTRICT _data_velocity_20_31_10 = _stride_velocity_1 * ctr_1 + _data_velocity_20_31;
-      float *RESTRICT _data_velocity_20_32_10 = _stride_velocity_1 * ctr_1 + _data_velocity_20_32;
-      float *RESTRICT _data_pdfs_tmp_20_30_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_30;
-      float *RESTRICT _data_pdfs_tmp_20_31_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_31;
-      float *RESTRICT _data_pdfs_tmp_20_32_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_32;
-      float *RESTRICT _data_pdfs_tmp_20_33_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_33;
-      float *RESTRICT _data_pdfs_tmp_20_34_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_34;
-      float *RESTRICT _data_pdfs_tmp_20_35_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_35;
-      float *RESTRICT _data_pdfs_tmp_20_36_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_36;
-      float *RESTRICT _data_pdfs_tmp_20_37_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_37;
-      float *RESTRICT _data_pdfs_tmp_20_38_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_38;
-      float *RESTRICT _data_pdfs_tmp_20_39_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_39;
-      float *RESTRICT _data_pdfs_tmp_20_310_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_310;
-      float *RESTRICT _data_pdfs_tmp_20_311_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_311;
-      float *RESTRICT _data_pdfs_tmp_20_312_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_312;
-      float *RESTRICT _data_pdfs_tmp_20_313_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_313;
-      float *RESTRICT _data_pdfs_tmp_20_314_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_314;
-      float *RESTRICT _data_pdfs_tmp_20_315_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_315;
-      float *RESTRICT _data_pdfs_tmp_20_316_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_316;
-      float *RESTRICT _data_pdfs_tmp_20_317_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_317;
-      float *RESTRICT _data_pdfs_tmp_20_318_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_318;
       for (int64_t ctr_0 = 1; ctr_0 < _size_force_0 - 1; ctr_0 += 1) {
-        const float streamed_0 = _data_pdfs_20_30_10[_stride_pdfs_0 * ctr_0];
-        const float streamed_1 = _data_pdfs_20_31_1m1[_stride_pdfs_0 * ctr_0];
-        const float streamed_2 = _data_pdfs_20_32_11[_stride_pdfs_0 * ctr_0];
-        const float streamed_3 = _data_pdfs_20_33_10[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0];
-        const float streamed_4 = _data_pdfs_20_34_10[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0];
-        const float streamed_5 = _data_pdfs_2m1_35_10[_stride_pdfs_0 * ctr_0];
-        const float streamed_6 = _data_pdfs_21_36_10[_stride_pdfs_0 * ctr_0];
-        const float streamed_7 = _data_pdfs_20_37_1m1[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0];
-        const float streamed_8 = _data_pdfs_20_38_1m1[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0];
-        const float streamed_9 = _data_pdfs_20_39_11[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0];
-        const float streamed_10 = _data_pdfs_20_310_11[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0];
-        const float streamed_11 = _data_pdfs_2m1_311_1m1[_stride_pdfs_0 * ctr_0];
-        const float streamed_12 = _data_pdfs_2m1_312_11[_stride_pdfs_0 * ctr_0];
-        const float streamed_13 = _data_pdfs_2m1_313_10[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0];
-        const float streamed_14 = _data_pdfs_2m1_314_10[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0];
-        const float streamed_15 = _data_pdfs_21_315_1m1[_stride_pdfs_0 * ctr_0];
-        const float streamed_16 = _data_pdfs_21_316_11[_stride_pdfs_0 * ctr_0];
-        const float streamed_17 = _data_pdfs_21_317_10[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0];
-        const float streamed_18 = _data_pdfs_21_318_10[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0];
+        const float streamed_0 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2];
+        const float streamed_1 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3];
+        const float streamed_2 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3];
+        const float streamed_3 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3];
+        const float streamed_4 = _data_pdfs[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3];
+        const float streamed_5 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 5 * _stride_pdfs_3];
+        const float streamed_6 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 6 * _stride_pdfs_3];
+        const float streamed_7 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3];
+        const float streamed_8 = _data_pdfs[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3];
+        const float streamed_9 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3];
+        const float streamed_10 = _data_pdfs[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3];
+        const float streamed_11 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 11 * _stride_pdfs_3];
+        const float streamed_12 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 12 * _stride_pdfs_3];
+        const float streamed_13 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 13 * _stride_pdfs_3];
+        const float streamed_14 = _data_pdfs[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 14 * _stride_pdfs_3];
+        const float streamed_15 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 15 * _stride_pdfs_3];
+        const float streamed_16 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 16 * _stride_pdfs_3];
+        const float streamed_17 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 17 * _stride_pdfs_3];
+        const float streamed_18 = _data_pdfs[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 18 * _stride_pdfs_3];
         const float vel0Term = streamed_10 + streamed_14 + streamed_18 + streamed_4 + streamed_8;
-        const float momdensity_0 = streamed_13 * -1.0f + streamed_17 * -1.0f + streamed_3 * -1.0f + streamed_7 * -1.0f + streamed_9 * -1.0f + vel0Term;
+        const float momdensity_0 = -streamed_13 - streamed_17 - streamed_3 - streamed_7 - streamed_9 + vel0Term;
         const float vel1Term = streamed_1 + streamed_11 + streamed_15 + streamed_7;
-        const float momdensity_1 = streamed_10 * -1.0f + streamed_12 * -1.0f + streamed_16 * -1.0f + streamed_2 * -1.0f + streamed_8 + streamed_9 * -1.0f + vel1Term;
+        const float momdensity_1 = -streamed_10 - streamed_12 - streamed_16 - streamed_2 + streamed_8 - streamed_9 + vel1Term;
         const float vel2Term = streamed_12 + streamed_13 + streamed_5;
         const float rho = streamed_0 + streamed_16 + streamed_17 + streamed_2 + streamed_3 + streamed_6 + streamed_9 + vel0Term + vel1Term + vel2Term;
-        const float momdensity_2 = streamed_11 + streamed_14 + streamed_15 * -1.0f + streamed_16 * -1.0f + streamed_17 * -1.0f + streamed_18 * -1.0f + streamed_6 * -1.0f + vel2Term;
-        const float u_0 = momdensity_0 * ((1.0f) / (rho)) + 0.5f * ((1.0f) / (rho)) * _data_force_20_30_10[_stride_force_0 * ctr_0];
-        const float u_1 = momdensity_1 * ((1.0f) / (rho)) + 0.5f * ((1.0f) / (rho)) * _data_force_20_31_10[_stride_force_0 * ctr_0];
-        const float u_2 = momdensity_2 * ((1.0f) / (rho)) + 0.5f * ((1.0f) / (rho)) * _data_force_20_32_10[_stride_force_0 * ctr_0];
-        _data_velocity_20_30_10[_stride_velocity_0 * ctr_0] = u_0;
-        _data_velocity_20_31_10[_stride_velocity_0 * ctr_0] = u_1;
-        _data_velocity_20_32_10[_stride_velocity_0 * ctr_0] = u_2;
-        _data_pdfs_tmp_20_30_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_0;
-        _data_pdfs_tmp_20_31_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_1;
-        _data_pdfs_tmp_20_32_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_2;
-        _data_pdfs_tmp_20_33_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_3;
-        _data_pdfs_tmp_20_34_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_4;
-        _data_pdfs_tmp_20_35_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_5;
-        _data_pdfs_tmp_20_36_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_6;
-        _data_pdfs_tmp_20_37_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_7;
-        _data_pdfs_tmp_20_38_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_8;
-        _data_pdfs_tmp_20_39_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_9;
-        _data_pdfs_tmp_20_310_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_10;
-        _data_pdfs_tmp_20_311_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_11;
-        _data_pdfs_tmp_20_312_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_12;
-        _data_pdfs_tmp_20_313_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_13;
-        _data_pdfs_tmp_20_314_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_14;
-        _data_pdfs_tmp_20_315_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_15;
-        _data_pdfs_tmp_20_316_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_16;
-        _data_pdfs_tmp_20_317_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_17;
-        _data_pdfs_tmp_20_318_10[_stride_pdfs_tmp_0 * ctr_0] = streamed_18;
+        const float momdensity_2 = streamed_11 + streamed_14 - streamed_15 - streamed_16 - streamed_17 - streamed_18 - streamed_6 + vel2Term;
+        const float u_0 = momdensity_0 * ((1.0f) / (rho)) + 0.5f * ((1.0f) / (rho)) * _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2];
+        const float u_1 = momdensity_1 * ((1.0f) / (rho)) + 0.5f * ((1.0f) / (rho)) * _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3];
+        const float u_2 = momdensity_2 * ((1.0f) / (rho)) + 0.5f * ((1.0f) / (rho)) * _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3];
+        _data_velocity[_stride_velocity_0 * ctr_0 + _stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2] = u_0;
+        _data_velocity[_stride_velocity_0 * ctr_0 + _stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + _stride_velocity_3] = u_1;
+        _data_velocity[_stride_velocity_0 * ctr_0 + _stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + 2 * _stride_velocity_3] = u_2;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2] = streamed_0;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + _stride_pdfs_tmp_3] = streamed_1;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 2 * _stride_pdfs_tmp_3] = streamed_2;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 3 * _stride_pdfs_tmp_3] = streamed_3;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 4 * _stride_pdfs_tmp_3] = streamed_4;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 5 * _stride_pdfs_tmp_3] = streamed_5;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 6 * _stride_pdfs_tmp_3] = streamed_6;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 7 * _stride_pdfs_tmp_3] = streamed_7;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 8 * _stride_pdfs_tmp_3] = streamed_8;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 9 * _stride_pdfs_tmp_3] = streamed_9;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 10 * _stride_pdfs_tmp_3] = streamed_10;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 11 * _stride_pdfs_tmp_3] = streamed_11;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 12 * _stride_pdfs_tmp_3] = streamed_12;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 13 * _stride_pdfs_tmp_3] = streamed_13;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 14 * _stride_pdfs_tmp_3] = streamed_14;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 15 * _stride_pdfs_tmp_3] = streamed_15;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 16 * _stride_pdfs_tmp_3] = streamed_16;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 17 * _stride_pdfs_tmp_3] = streamed_17;
+        _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 18 * _stride_pdfs_tmp_3] = streamed_18;
       }
     }
   }
@@ -197,9 +108,10 @@ static FUNC_PREFIX void streamsweepsingleprecision_streamsweepsingleprecision(fl
 } // namespace internal_streamsweepsingleprecision_streamsweepsingleprecision
 
 void StreamSweepSinglePrecision::run(IBlock *block) {
+
+  auto velocity = block->getData<field::GhostLayerField<float, 3>>(velocityID);
   auto pdfs = block->getData<field::GhostLayerField<float, 19>>(pdfsID);
   auto force = block->getData<field::GhostLayerField<float, 3>>(forceID);
-  auto velocity = block->getData<field::GhostLayerField<float, 3>>(velocityID);
   field::GhostLayerField<float, 19> *pdfs_tmp;
   {
     // Getting temporary field pdfs_tmp
@@ -212,27 +124,27 @@ void StreamSweepSinglePrecision::run(IBlock *block) {
     }
   }
 
-  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(force->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(force->nrOfGhostLayers()))
   float *RESTRICT const _data_force = force->dataAt(-1, -1, -1, 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs->nrOfGhostLayers()))
   float *RESTRICT const _data_pdfs = pdfs->dataAt(-1, -1, -1, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs_tmp->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs_tmp->nrOfGhostLayers()))
   float *RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(-1, -1, -1, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs_tmp->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(velocity->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(pdfs_tmp->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(velocity->nrOfGhostLayers()))
   float *RESTRICT _data_velocity = velocity->dataAt(-1, -1, -1, 0);
-  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(force->xSize()) + 2));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(force->xSize()) + 2);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(force->ySize()) + 2));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(force->ySize()) + 2);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(force->zSize()) + 2));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(force->zSize()) + 2);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
+  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(force->xSize()) + 2))
+  const int64_t _size_force_0 = int64_t(int64_c(force->xSize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(force->ySize()) + 2))
+  const int64_t _size_force_1 = int64_t(int64_c(force->ySize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(force->zSize()) + 2))
+  const int64_t _size_force_2 = int64_t(int64_c(force->zSize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
   const int64_t _stride_force_0 = int64_t(force->xStride());
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
@@ -254,6 +166,7 @@ void StreamSweepSinglePrecision::run(IBlock *block) {
 }
 
 void StreamSweepSinglePrecision::runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block) {
+
   CellInterval ci = globalCellInterval;
   CellInterval blockBB = blocks->getBlockCellBB(*block);
   blockBB.expand(ghostLayers);
@@ -262,9 +175,9 @@ void StreamSweepSinglePrecision::runOnCellInterval(const shared_ptr<StructuredBl
   if (ci.empty())
     return;
 
+  auto velocity = block->getData<field::GhostLayerField<float, 3>>(velocityID);
   auto pdfs = block->getData<field::GhostLayerField<float, 19>>(pdfsID);
   auto force = block->getData<field::GhostLayerField<float, 3>>(forceID);
-  auto velocity = block->getData<field::GhostLayerField<float, 3>>(velocityID);
   field::GhostLayerField<float, 19> *pdfs_tmp;
   {
     // Getting temporary field pdfs_tmp
@@ -277,35 +190,35 @@ void StreamSweepSinglePrecision::runOnCellInterval(const shared_ptr<StructuredBl
     }
   }
 
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(force->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(force->nrOfGhostLayers()))
   float *RESTRICT const _data_force = force->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs->nrOfGhostLayers()))
   float *RESTRICT const _data_pdfs = pdfs->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()))
   float *RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs_tmp->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(velocity->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(velocity->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(velocity->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(pdfs_tmp->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(velocity->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(velocity->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(velocity->nrOfGhostLayers()))
   float *RESTRICT _data_velocity = velocity->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
-  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 2));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(ci.xSize()) + 2);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 2));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(ci.ySize()) + 2);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 2));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(ci.zSize()) + 2);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
+  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 2))
+  const int64_t _size_force_0 = int64_t(int64_c(ci.xSize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 2))
+  const int64_t _size_force_1 = int64_t(int64_c(ci.ySize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 2))
+  const int64_t _size_force_2 = int64_t(int64_c(ci.zSize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
   const int64_t _stride_force_0 = int64_t(force->xStride());
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
@@ -335,4 +248,4 @@ void StreamSweepSinglePrecision::runOnCellInterval(const shared_ptr<StructuredBl
 
 #if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
 #pragma warning pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecision.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecision.h
index 3cb474e0b33..4425265fc6c 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecision.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecision.h
@@ -17,12 +17,13 @@
 //! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2,
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
 // lbmpy_walberla/pystencils_walberla from waLBerla commit
-// 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #pragma once
 #include "core/DataTypes.h"
+#include "core/logging/Logging.h"
 
 #include "domain_decomposition/BlockDataID.h"
 #include "domain_decomposition/IBlock.h"
@@ -97,6 +98,9 @@ class StreamSweepSinglePrecision {
     };
   }
 
+  void configure(const shared_ptr<StructuredBlockStorage> &blocks,
+                 IBlock *block) {}
+
   BlockDataID forceID;
   BlockDataID pdfsID;
   BlockDataID velocityID;
@@ -113,4 +117,4 @@ class StreamSweepSinglePrecision {
 #if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
     (defined WALBERLA_CXX_COMPILER_IS_CLANG)
 #pragma GCC diagnostic pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionAVX.cpp b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionAVX.cpp
index a6778800989..cd7cc6d231f 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionAVX.cpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionAVX.cpp
@@ -14,11 +14,10 @@
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
 //! \\file StreamSweepSinglePrecisionAVX.cpp
-//! \\ingroup lbm
-//! \\author lbmpy
+//! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2, lbmpy_walberla/pystencils_walberla from waLBerla commit 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #include <cmath>
 
@@ -51,116 +50,28 @@ namespace pystencils {
 namespace internal_5e7ed0276adbfbb1ac4789ac0a0f54c4 {
 static FUNC_PREFIX void streamsweepsingleprecisionavx_streamsweepsingleprecisionavx(float *RESTRICT const _data_force, float *RESTRICT const _data_pdfs, float *RESTRICT _data_pdfs_tmp, float *RESTRICT _data_velocity, int64_t const _size_force_0, int64_t const _size_force_1, int64_t const _size_force_2, int64_t const _stride_force_1, int64_t const _stride_force_2, int64_t const _stride_force_3, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_pdfs_tmp_1, int64_t const _stride_pdfs_tmp_2, int64_t const _stride_pdfs_tmp_3, int64_t const _stride_velocity_1, int64_t const _stride_velocity_2, int64_t const _stride_velocity_3) {
   for (int64_t ctr_2 = 1; ctr_2 < _size_force_2 - 1; ctr_2 += 1) {
-    float *RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2 * ctr_2;
-    float *RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_2m1_35 = _data_pdfs + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 5 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_21_36 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 6 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_2m1_311 = _data_pdfs + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 11 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_2m1_312 = _data_pdfs + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 12 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_2m1_313 = _data_pdfs + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 13 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_2m1_314 = _data_pdfs + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 14 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_21_315 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 15 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_21_316 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 16 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_21_317 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 17 * _stride_pdfs_3;
-    float *RESTRICT _data_pdfs_21_318 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 18 * _stride_pdfs_3;
-    float *RESTRICT _data_force_20_30 = _data_force + _stride_force_2 * ctr_2;
-    float *RESTRICT _data_force_20_31 = _data_force + _stride_force_2 * ctr_2 + _stride_force_3;
-    float *RESTRICT _data_force_20_32 = _data_force + _stride_force_2 * ctr_2 + 2 * _stride_force_3;
-    float *RESTRICT _data_velocity_20_30 = _data_velocity + _stride_velocity_2 * ctr_2;
-    float *RESTRICT _data_velocity_20_31 = _data_velocity + _stride_velocity_2 * ctr_2 + _stride_velocity_3;
-    float *RESTRICT _data_velocity_20_32 = _data_velocity + _stride_velocity_2 * ctr_2 + 2 * _stride_velocity_3;
-    float *RESTRICT _data_pdfs_tmp_20_30 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2;
-    float *RESTRICT _data_pdfs_tmp_20_31 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_32 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 2 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_33 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 3 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_34 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 4 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_35 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 5 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_36 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 6 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_37 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 7 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_38 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 8 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_39 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 9 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_310 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 10 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_311 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 11 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_312 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 12 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_313 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 13 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_314 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 14 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_315 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 15 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_316 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 16 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_317 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 17 * _stride_pdfs_tmp_3;
-    float *RESTRICT _data_pdfs_tmp_20_318 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 18 * _stride_pdfs_tmp_3;
     for (int64_t ctr_1 = 1; ctr_1 < _size_force_1 - 1; ctr_1 += 1) {
-      float *RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_30;
-      float *RESTRICT _data_pdfs_20_31_1m1 = _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _data_pdfs_20_31;
-      float *RESTRICT _data_pdfs_20_32_11 = _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _data_pdfs_20_32;
-      float *RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_33;
-      float *RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_34;
-      float *RESTRICT _data_pdfs_2m1_35_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_2m1_35;
-      float *RESTRICT _data_pdfs_21_36_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_21_36;
-      float *RESTRICT _data_pdfs_20_37_1m1 = _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _data_pdfs_20_37;
-      float *RESTRICT _data_pdfs_20_38_1m1 = _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _data_pdfs_20_38;
-      float *RESTRICT _data_pdfs_20_39_11 = _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _data_pdfs_20_39;
-      float *RESTRICT _data_pdfs_20_310_11 = _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _data_pdfs_20_310;
-      float *RESTRICT _data_pdfs_2m1_311_1m1 = _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_311;
-      float *RESTRICT _data_pdfs_2m1_312_11 = _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_312;
-      float *RESTRICT _data_pdfs_2m1_313_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_2m1_313;
-      float *RESTRICT _data_pdfs_2m1_314_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_2m1_314;
-      float *RESTRICT _data_pdfs_21_315_1m1 = _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _data_pdfs_21_315;
-      float *RESTRICT _data_pdfs_21_316_11 = _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _data_pdfs_21_316;
-      float *RESTRICT _data_pdfs_21_317_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_21_317;
-      float *RESTRICT _data_pdfs_21_318_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_21_318;
-      float *RESTRICT _data_force_20_30_10 = _stride_force_1 * ctr_1 + _data_force_20_30;
-      float *RESTRICT _data_force_20_31_10 = _stride_force_1 * ctr_1 + _data_force_20_31;
-      float *RESTRICT _data_force_20_32_10 = _stride_force_1 * ctr_1 + _data_force_20_32;
-      float *RESTRICT _data_velocity_20_30_10 = _stride_velocity_1 * ctr_1 + _data_velocity_20_30;
-      float *RESTRICT _data_velocity_20_31_10 = _stride_velocity_1 * ctr_1 + _data_velocity_20_31;
-      float *RESTRICT _data_velocity_20_32_10 = _stride_velocity_1 * ctr_1 + _data_velocity_20_32;
-      float *RESTRICT _data_pdfs_tmp_20_30_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_30;
-      float *RESTRICT _data_pdfs_tmp_20_31_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_31;
-      float *RESTRICT _data_pdfs_tmp_20_32_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_32;
-      float *RESTRICT _data_pdfs_tmp_20_33_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_33;
-      float *RESTRICT _data_pdfs_tmp_20_34_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_34;
-      float *RESTRICT _data_pdfs_tmp_20_35_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_35;
-      float *RESTRICT _data_pdfs_tmp_20_36_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_36;
-      float *RESTRICT _data_pdfs_tmp_20_37_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_37;
-      float *RESTRICT _data_pdfs_tmp_20_38_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_38;
-      float *RESTRICT _data_pdfs_tmp_20_39_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_39;
-      float *RESTRICT _data_pdfs_tmp_20_310_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_310;
-      float *RESTRICT _data_pdfs_tmp_20_311_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_311;
-      float *RESTRICT _data_pdfs_tmp_20_312_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_312;
-      float *RESTRICT _data_pdfs_tmp_20_313_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_313;
-      float *RESTRICT _data_pdfs_tmp_20_314_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_314;
-      float *RESTRICT _data_pdfs_tmp_20_315_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_315;
-      float *RESTRICT _data_pdfs_tmp_20_316_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_316;
-      float *RESTRICT _data_pdfs_tmp_20_317_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_317;
-      float *RESTRICT _data_pdfs_tmp_20_318_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_318;
       {
         for (int64_t ctr_0 = 1; ctr_0 < (int64_t)((_size_force_0 - 2) / (8)) * (8) + 1; ctr_0 += 8) {
-          const __m256 streamed_0 = _mm256_load_ps(&_data_pdfs_20_30_10[ctr_0]);
-          const __m256 streamed_1 = _mm256_load_ps(&_data_pdfs_20_31_1m1[ctr_0]);
-          const __m256 streamed_2 = _mm256_load_ps(&_data_pdfs_20_32_11[ctr_0]);
-          const __m256 streamed_3 = _mm256_loadu_ps(&_data_pdfs_20_33_10[ctr_0 + 1]);
-          const __m256 streamed_4 = _mm256_loadu_ps(&_data_pdfs_20_34_10[ctr_0 - 1]);
-          const __m256 streamed_5 = _mm256_load_ps(&_data_pdfs_2m1_35_10[ctr_0]);
-          const __m256 streamed_6 = _mm256_load_ps(&_data_pdfs_21_36_10[ctr_0]);
-          const __m256 streamed_7 = _mm256_loadu_ps(&_data_pdfs_20_37_1m1[ctr_0 + 1]);
-          const __m256 streamed_8 = _mm256_loadu_ps(&_data_pdfs_20_38_1m1[ctr_0 - 1]);
-          const __m256 streamed_9 = _mm256_loadu_ps(&_data_pdfs_20_39_11[ctr_0 + 1]);
-          const __m256 streamed_10 = _mm256_loadu_ps(&_data_pdfs_20_310_11[ctr_0 - 1]);
-          const __m256 streamed_11 = _mm256_load_ps(&_data_pdfs_2m1_311_1m1[ctr_0]);
-          const __m256 streamed_12 = _mm256_load_ps(&_data_pdfs_2m1_312_11[ctr_0]);
-          const __m256 streamed_13 = _mm256_loadu_ps(&_data_pdfs_2m1_313_10[ctr_0 + 1]);
-          const __m256 streamed_14 = _mm256_loadu_ps(&_data_pdfs_2m1_314_10[ctr_0 - 1]);
-          const __m256 streamed_15 = _mm256_load_ps(&_data_pdfs_21_315_1m1[ctr_0]);
-          const __m256 streamed_16 = _mm256_load_ps(&_data_pdfs_21_316_11[ctr_0]);
-          const __m256 streamed_17 = _mm256_loadu_ps(&_data_pdfs_21_317_10[ctr_0 + 1]);
-          const __m256 streamed_18 = _mm256_loadu_ps(&_data_pdfs_21_318_10[ctr_0 - 1]);
+          const __m256 streamed_0 = _mm256_load_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + ctr_0]);
+          const __m256 streamed_1 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3 + ctr_0]);
+          const __m256 streamed_2 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3 + ctr_0]);
+          const __m256 streamed_3 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3 + ctr_0 + 1]);
+          const __m256 streamed_4 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3 + ctr_0 - 1]);
+          const __m256 streamed_5 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 5 * _stride_pdfs_3 + ctr_0]);
+          const __m256 streamed_6 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 6 * _stride_pdfs_3 + ctr_0]);
+          const __m256 streamed_7 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3 + ctr_0 + 1]);
+          const __m256 streamed_8 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3 + ctr_0 - 1]);
+          const __m256 streamed_9 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3 + ctr_0 + 1]);
+          const __m256 streamed_10 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3 + ctr_0 - 1]);
+          const __m256 streamed_11 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 11 * _stride_pdfs_3 + ctr_0]);
+          const __m256 streamed_12 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 12 * _stride_pdfs_3 + ctr_0]);
+          const __m256 streamed_13 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 13 * _stride_pdfs_3 + ctr_0 + 1]);
+          const __m256 streamed_14 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 14 * _stride_pdfs_3 + ctr_0 - 1]);
+          const __m256 streamed_15 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 15 * _stride_pdfs_3 + ctr_0]);
+          const __m256 streamed_16 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 16 * _stride_pdfs_3 + ctr_0]);
+          const __m256 streamed_17 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 17 * _stride_pdfs_3 + ctr_0 + 1]);
+          const __m256 streamed_18 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 18 * _stride_pdfs_3 + ctr_0 - 1]);
           const __m256 vel0Term = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(streamed_10, streamed_14), streamed_18), streamed_4), streamed_8);
           const __m256 momdensity_0 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(streamed_13, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(streamed_17, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(streamed_3, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(streamed_7, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(streamed_9, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), vel0Term);
           const __m256 vel1Term = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(streamed_1, streamed_11), streamed_15), streamed_7);
@@ -168,84 +79,84 @@ static FUNC_PREFIX void streamsweepsingleprecisionavx_streamsweepsingleprecision
           const __m256 vel2Term = _mm256_add_ps(_mm256_add_ps(streamed_12, streamed_13), streamed_5);
           const __m256 rho = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(streamed_0, streamed_16), streamed_17), streamed_2), streamed_3), streamed_6), streamed_9), vel0Term), vel1Term), vel2Term);
           const __m256 momdensity_2 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(streamed_15, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(streamed_16, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(streamed_17, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(streamed_18, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(streamed_6, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), streamed_11), streamed_14), vel2Term);
-          const __m256 u_0 = _mm256_add_ps(_mm256_mul_ps(momdensity_0, _mm256_div_ps(_mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f), rho)), _mm256_mul_ps(_mm256_mul_ps(_mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f), _mm256_div_ps(_mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f), rho)), _mm256_load_ps(&_data_force_20_30_10[ctr_0])));
-          const __m256 u_1 = _mm256_add_ps(_mm256_mul_ps(momdensity_1, _mm256_div_ps(_mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f), rho)), _mm256_mul_ps(_mm256_mul_ps(_mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f), _mm256_div_ps(_mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f), rho)), _mm256_load_ps(&_data_force_20_31_10[ctr_0])));
-          const __m256 u_2 = _mm256_add_ps(_mm256_mul_ps(momdensity_2, _mm256_div_ps(_mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f), rho)), _mm256_mul_ps(_mm256_mul_ps(_mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f), _mm256_div_ps(_mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f), rho)), _mm256_load_ps(&_data_force_20_32_10[ctr_0])));
-          _mm256_store_ps(&_data_velocity_20_30_10[ctr_0], u_0);
-          _mm256_store_ps(&_data_velocity_20_31_10[ctr_0], u_1);
-          _mm256_store_ps(&_data_velocity_20_32_10[ctr_0], u_2);
-          _mm256_store_ps(&_data_pdfs_tmp_20_30_10[ctr_0], streamed_0);
-          _mm256_store_ps(&_data_pdfs_tmp_20_31_10[ctr_0], streamed_1);
-          _mm256_store_ps(&_data_pdfs_tmp_20_32_10[ctr_0], streamed_2);
-          _mm256_store_ps(&_data_pdfs_tmp_20_33_10[ctr_0], streamed_3);
-          _mm256_store_ps(&_data_pdfs_tmp_20_34_10[ctr_0], streamed_4);
-          _mm256_store_ps(&_data_pdfs_tmp_20_35_10[ctr_0], streamed_5);
-          _mm256_store_ps(&_data_pdfs_tmp_20_36_10[ctr_0], streamed_6);
-          _mm256_store_ps(&_data_pdfs_tmp_20_37_10[ctr_0], streamed_7);
-          _mm256_store_ps(&_data_pdfs_tmp_20_38_10[ctr_0], streamed_8);
-          _mm256_store_ps(&_data_pdfs_tmp_20_39_10[ctr_0], streamed_9);
-          _mm256_store_ps(&_data_pdfs_tmp_20_310_10[ctr_0], streamed_10);
-          _mm256_store_ps(&_data_pdfs_tmp_20_311_10[ctr_0], streamed_11);
-          _mm256_store_ps(&_data_pdfs_tmp_20_312_10[ctr_0], streamed_12);
-          _mm256_store_ps(&_data_pdfs_tmp_20_313_10[ctr_0], streamed_13);
-          _mm256_store_ps(&_data_pdfs_tmp_20_314_10[ctr_0], streamed_14);
-          _mm256_store_ps(&_data_pdfs_tmp_20_315_10[ctr_0], streamed_15);
-          _mm256_store_ps(&_data_pdfs_tmp_20_316_10[ctr_0], streamed_16);
-          _mm256_store_ps(&_data_pdfs_tmp_20_317_10[ctr_0], streamed_17);
-          _mm256_store_ps(&_data_pdfs_tmp_20_318_10[ctr_0], streamed_18);
+          const __m256 u_0 = _mm256_add_ps(_mm256_mul_ps(momdensity_0, _mm256_div_ps(_mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f), rho)), _mm256_mul_ps(_mm256_mul_ps(_mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f), _mm256_div_ps(_mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f), rho)), _mm256_load_ps(&_data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + ctr_0])));
+          const __m256 u_1 = _mm256_add_ps(_mm256_mul_ps(momdensity_1, _mm256_div_ps(_mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f), rho)), _mm256_mul_ps(_mm256_mul_ps(_mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f), _mm256_div_ps(_mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f), rho)), _mm256_loadu_ps(&_data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3 + ctr_0])));
+          const __m256 u_2 = _mm256_add_ps(_mm256_mul_ps(momdensity_2, _mm256_div_ps(_mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f), rho)), _mm256_mul_ps(_mm256_mul_ps(_mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f), _mm256_div_ps(_mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f), rho)), _mm256_loadu_ps(&_data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3 + ctr_0])));
+          _mm256_store_ps(&_data_velocity[_stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + ctr_0], u_0);
+          _mm256_storeu_ps(&_data_velocity[_stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + _stride_velocity_3 + ctr_0], u_1);
+          _mm256_storeu_ps(&_data_velocity[_stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + 2 * _stride_velocity_3 + ctr_0], u_2);
+          _mm256_store_ps(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + ctr_0], streamed_0);
+          _mm256_storeu_ps(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + _stride_pdfs_tmp_3 + ctr_0], streamed_1);
+          _mm256_storeu_ps(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 2 * _stride_pdfs_tmp_3 + ctr_0], streamed_2);
+          _mm256_storeu_ps(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 3 * _stride_pdfs_tmp_3 + ctr_0], streamed_3);
+          _mm256_storeu_ps(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 4 * _stride_pdfs_tmp_3 + ctr_0], streamed_4);
+          _mm256_storeu_ps(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 5 * _stride_pdfs_tmp_3 + ctr_0], streamed_5);
+          _mm256_storeu_ps(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 6 * _stride_pdfs_tmp_3 + ctr_0], streamed_6);
+          _mm256_storeu_ps(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 7 * _stride_pdfs_tmp_3 + ctr_0], streamed_7);
+          _mm256_store_ps(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 8 * _stride_pdfs_tmp_3 + ctr_0], streamed_8);
+          _mm256_storeu_ps(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 9 * _stride_pdfs_tmp_3 + ctr_0], streamed_9);
+          _mm256_storeu_ps(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 10 * _stride_pdfs_tmp_3 + ctr_0], streamed_10);
+          _mm256_storeu_ps(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 11 * _stride_pdfs_tmp_3 + ctr_0], streamed_11);
+          _mm256_storeu_ps(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 12 * _stride_pdfs_tmp_3 + ctr_0], streamed_12);
+          _mm256_storeu_ps(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 13 * _stride_pdfs_tmp_3 + ctr_0], streamed_13);
+          _mm256_storeu_ps(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 14 * _stride_pdfs_tmp_3 + ctr_0], streamed_14);
+          _mm256_storeu_ps(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 15 * _stride_pdfs_tmp_3 + ctr_0], streamed_15);
+          _mm256_store_ps(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 16 * _stride_pdfs_tmp_3 + ctr_0], streamed_16);
+          _mm256_storeu_ps(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 17 * _stride_pdfs_tmp_3 + ctr_0], streamed_17);
+          _mm256_storeu_ps(&_data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 18 * _stride_pdfs_tmp_3 + ctr_0], streamed_18);
         }
         for (int64_t ctr_0 = (int64_t)((_size_force_0 - 2) / (8)) * (8) + 1; ctr_0 < _size_force_0 - 1; ctr_0 += 1) {
-          const float streamed_0 = _data_pdfs_20_30_10[ctr_0];
-          const float streamed_1 = _data_pdfs_20_31_1m1[ctr_0];
-          const float streamed_2 = _data_pdfs_20_32_11[ctr_0];
-          const float streamed_3 = _data_pdfs_20_33_10[ctr_0 + 1];
-          const float streamed_4 = _data_pdfs_20_34_10[ctr_0 - 1];
-          const float streamed_5 = _data_pdfs_2m1_35_10[ctr_0];
-          const float streamed_6 = _data_pdfs_21_36_10[ctr_0];
-          const float streamed_7 = _data_pdfs_20_37_1m1[ctr_0 + 1];
-          const float streamed_8 = _data_pdfs_20_38_1m1[ctr_0 - 1];
-          const float streamed_9 = _data_pdfs_20_39_11[ctr_0 + 1];
-          const float streamed_10 = _data_pdfs_20_310_11[ctr_0 - 1];
-          const float streamed_11 = _data_pdfs_2m1_311_1m1[ctr_0];
-          const float streamed_12 = _data_pdfs_2m1_312_11[ctr_0];
-          const float streamed_13 = _data_pdfs_2m1_313_10[ctr_0 + 1];
-          const float streamed_14 = _data_pdfs_2m1_314_10[ctr_0 - 1];
-          const float streamed_15 = _data_pdfs_21_315_1m1[ctr_0];
-          const float streamed_16 = _data_pdfs_21_316_11[ctr_0];
-          const float streamed_17 = _data_pdfs_21_317_10[ctr_0 + 1];
-          const float streamed_18 = _data_pdfs_21_318_10[ctr_0 - 1];
+          const float streamed_0 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + ctr_0];
+          const float streamed_1 = _data_pdfs[_stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3 + ctr_0];
+          const float streamed_2 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3 + ctr_0];
+          const float streamed_3 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3 + ctr_0 + 1];
+          const float streamed_4 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3 + ctr_0 - 1];
+          const float streamed_5 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 5 * _stride_pdfs_3 + ctr_0];
+          const float streamed_6 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 6 * _stride_pdfs_3 + ctr_0];
+          const float streamed_7 = _data_pdfs[_stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3 + ctr_0 + 1];
+          const float streamed_8 = _data_pdfs[_stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3 + ctr_0 - 1];
+          const float streamed_9 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3 + ctr_0 + 1];
+          const float streamed_10 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3 + ctr_0 - 1];
+          const float streamed_11 = _data_pdfs[_stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 11 * _stride_pdfs_3 + ctr_0];
+          const float streamed_12 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 12 * _stride_pdfs_3 + ctr_0];
+          const float streamed_13 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 13 * _stride_pdfs_3 + ctr_0 + 1];
+          const float streamed_14 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 14 * _stride_pdfs_3 + ctr_0 - 1];
+          const float streamed_15 = _data_pdfs[_stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 15 * _stride_pdfs_3 + ctr_0];
+          const float streamed_16 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 16 * _stride_pdfs_3 + ctr_0];
+          const float streamed_17 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 17 * _stride_pdfs_3 + ctr_0 + 1];
+          const float streamed_18 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 18 * _stride_pdfs_3 + ctr_0 - 1];
           const float vel0Term = streamed_10 + streamed_14 + streamed_18 + streamed_4 + streamed_8;
-          const float momdensity_0 = streamed_13 * -1.0f + streamed_17 * -1.0f + streamed_3 * -1.0f + streamed_7 * -1.0f + streamed_9 * -1.0f + vel0Term;
+          const float momdensity_0 = -streamed_13 - streamed_17 - streamed_3 - streamed_7 - streamed_9 + vel0Term;
           const float vel1Term = streamed_1 + streamed_11 + streamed_15 + streamed_7;
-          const float momdensity_1 = streamed_10 * -1.0f + streamed_12 * -1.0f + streamed_16 * -1.0f + streamed_2 * -1.0f + streamed_8 + streamed_9 * -1.0f + vel1Term;
+          const float momdensity_1 = -streamed_10 - streamed_12 - streamed_16 - streamed_2 + streamed_8 - streamed_9 + vel1Term;
           const float vel2Term = streamed_12 + streamed_13 + streamed_5;
           const float rho = streamed_0 + streamed_16 + streamed_17 + streamed_2 + streamed_3 + streamed_6 + streamed_9 + vel0Term + vel1Term + vel2Term;
-          const float momdensity_2 = streamed_11 + streamed_14 + streamed_15 * -1.0f + streamed_16 * -1.0f + streamed_17 * -1.0f + streamed_18 * -1.0f + streamed_6 * -1.0f + vel2Term;
-          const float u_0 = momdensity_0 * ((1.0f) / (rho)) + 0.5f * ((1.0f) / (rho)) * _data_force_20_30_10[ctr_0];
-          const float u_1 = momdensity_1 * ((1.0f) / (rho)) + 0.5f * ((1.0f) / (rho)) * _data_force_20_31_10[ctr_0];
-          const float u_2 = momdensity_2 * ((1.0f) / (rho)) + 0.5f * ((1.0f) / (rho)) * _data_force_20_32_10[ctr_0];
-          _data_velocity_20_30_10[ctr_0] = u_0;
-          _data_velocity_20_31_10[ctr_0] = u_1;
-          _data_velocity_20_32_10[ctr_0] = u_2;
-          _data_pdfs_tmp_20_30_10[ctr_0] = streamed_0;
-          _data_pdfs_tmp_20_31_10[ctr_0] = streamed_1;
-          _data_pdfs_tmp_20_32_10[ctr_0] = streamed_2;
-          _data_pdfs_tmp_20_33_10[ctr_0] = streamed_3;
-          _data_pdfs_tmp_20_34_10[ctr_0] = streamed_4;
-          _data_pdfs_tmp_20_35_10[ctr_0] = streamed_5;
-          _data_pdfs_tmp_20_36_10[ctr_0] = streamed_6;
-          _data_pdfs_tmp_20_37_10[ctr_0] = streamed_7;
-          _data_pdfs_tmp_20_38_10[ctr_0] = streamed_8;
-          _data_pdfs_tmp_20_39_10[ctr_0] = streamed_9;
-          _data_pdfs_tmp_20_310_10[ctr_0] = streamed_10;
-          _data_pdfs_tmp_20_311_10[ctr_0] = streamed_11;
-          _data_pdfs_tmp_20_312_10[ctr_0] = streamed_12;
-          _data_pdfs_tmp_20_313_10[ctr_0] = streamed_13;
-          _data_pdfs_tmp_20_314_10[ctr_0] = streamed_14;
-          _data_pdfs_tmp_20_315_10[ctr_0] = streamed_15;
-          _data_pdfs_tmp_20_316_10[ctr_0] = streamed_16;
-          _data_pdfs_tmp_20_317_10[ctr_0] = streamed_17;
-          _data_pdfs_tmp_20_318_10[ctr_0] = streamed_18;
+          const float momdensity_2 = streamed_11 + streamed_14 - streamed_15 - streamed_16 - streamed_17 - streamed_18 - streamed_6 + vel2Term;
+          const float u_0 = momdensity_0 * ((1.0f) / (rho)) + 0.5f * ((1.0f) / (rho)) * _data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + ctr_0];
+          const float u_1 = momdensity_1 * ((1.0f) / (rho)) + 0.5f * ((1.0f) / (rho)) * _data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3 + ctr_0];
+          const float u_2 = momdensity_2 * ((1.0f) / (rho)) + 0.5f * ((1.0f) / (rho)) * _data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3 + ctr_0];
+          _data_velocity[_stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + ctr_0] = u_0;
+          _data_velocity[_stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + _stride_velocity_3 + ctr_0] = u_1;
+          _data_velocity[_stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + 2 * _stride_velocity_3 + ctr_0] = u_2;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + ctr_0] = streamed_0;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + _stride_pdfs_tmp_3 + ctr_0] = streamed_1;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 2 * _stride_pdfs_tmp_3 + ctr_0] = streamed_2;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 3 * _stride_pdfs_tmp_3 + ctr_0] = streamed_3;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 4 * _stride_pdfs_tmp_3 + ctr_0] = streamed_4;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 5 * _stride_pdfs_tmp_3 + ctr_0] = streamed_5;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 6 * _stride_pdfs_tmp_3 + ctr_0] = streamed_6;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 7 * _stride_pdfs_tmp_3 + ctr_0] = streamed_7;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 8 * _stride_pdfs_tmp_3 + ctr_0] = streamed_8;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 9 * _stride_pdfs_tmp_3 + ctr_0] = streamed_9;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 10 * _stride_pdfs_tmp_3 + ctr_0] = streamed_10;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 11 * _stride_pdfs_tmp_3 + ctr_0] = streamed_11;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 12 * _stride_pdfs_tmp_3 + ctr_0] = streamed_12;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 13 * _stride_pdfs_tmp_3 + ctr_0] = streamed_13;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 14 * _stride_pdfs_tmp_3 + ctr_0] = streamed_14;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 15 * _stride_pdfs_tmp_3 + ctr_0] = streamed_15;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 16 * _stride_pdfs_tmp_3 + ctr_0] = streamed_16;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 17 * _stride_pdfs_tmp_3 + ctr_0] = streamed_17;
+          _data_pdfs_tmp[_stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 18 * _stride_pdfs_tmp_3 + ctr_0] = streamed_18;
         }
       }
     }
@@ -254,9 +165,10 @@ static FUNC_PREFIX void streamsweepsingleprecisionavx_streamsweepsingleprecision
 } // namespace internal_5e7ed0276adbfbb1ac4789ac0a0f54c4
 
 void StreamSweepSinglePrecisionAVX::run(IBlock *block) {
+
+  auto velocity = block->getData<field::GhostLayerField<float, 3>>(velocityID);
   auto pdfs = block->getData<field::GhostLayerField<float, 19>>(pdfsID);
   auto force = block->getData<field::GhostLayerField<float, 3>>(forceID);
-  auto velocity = block->getData<field::GhostLayerField<float, 3>>(velocityID);
   field::GhostLayerField<float, 19> *pdfs_tmp;
   {
     // Getting temporary field pdfs_tmp
@@ -269,34 +181,34 @@ void StreamSweepSinglePrecisionAVX::run(IBlock *block) {
     }
   }
 
-  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(force->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(force->nrOfGhostLayers()))
   float *RESTRICT const _data_force = force->dataAt(-1, -1, -1, 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs->nrOfGhostLayers()))
   float *RESTRICT const _data_pdfs = pdfs->dataAt(-1, -1, -1, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs_tmp->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs_tmp->nrOfGhostLayers()))
   float *RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(-1, -1, -1, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs_tmp->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs_tmp->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(velocity->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(pdfs_tmp->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs_tmp->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(velocity->nrOfGhostLayers()))
   float *RESTRICT _data_velocity = velocity->dataAt(-1, -1, -1, 0);
-  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)velocity->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(force->xSize()) + 2));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(force->xSize()) + 2);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(force->ySize()) + 2));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(force->ySize()) + 2);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(force->zSize()) + 2));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(force->zSize()) + 2);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
+  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)velocity->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(force->xSize()) + 2))
+  const int64_t _size_force_0 = int64_t(int64_c(force->xSize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(force->ySize()) + 2))
+  const int64_t _size_force_1 = int64_t(int64_c(force->ySize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(force->zSize()) + 2))
+  const int64_t _size_force_2 = int64_t(int64_c(force->zSize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
   const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
@@ -314,6 +226,7 @@ void StreamSweepSinglePrecisionAVX::run(IBlock *block) {
 }
 
 void StreamSweepSinglePrecisionAVX::runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block) {
+
   CellInterval ci = globalCellInterval;
   CellInterval blockBB = blocks->getBlockCellBB(*block);
   blockBB.expand(ghostLayers);
@@ -322,9 +235,9 @@ void StreamSweepSinglePrecisionAVX::runOnCellInterval(const shared_ptr<Structure
   if (ci.empty())
     return;
 
+  auto velocity = block->getData<field::GhostLayerField<float, 3>>(velocityID);
   auto pdfs = block->getData<field::GhostLayerField<float, 19>>(pdfsID);
   auto force = block->getData<field::GhostLayerField<float, 3>>(forceID);
-  auto velocity = block->getData<field::GhostLayerField<float, 3>>(velocityID);
   field::GhostLayerField<float, 19> *pdfs_tmp;
   {
     // Getting temporary field pdfs_tmp
@@ -337,42 +250,42 @@ void StreamSweepSinglePrecisionAVX::runOnCellInterval(const shared_ptr<Structure
     }
   }
 
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(force->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(force->nrOfGhostLayers()));
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(force->nrOfGhostLayers()))
   float *RESTRICT const _data_force = force->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs->nrOfGhostLayers()))
   float *RESTRICT const _data_pdfs = pdfs->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()))
   float *RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
-  WALBERLA_ASSERT_EQUAL(pdfs_tmp->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs_tmp->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(velocity->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(velocity->nrOfGhostLayers()));
-  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(velocity->nrOfGhostLayers()));
+  WALBERLA_ASSERT_EQUAL(pdfs_tmp->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs_tmp->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(velocity->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(velocity->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(velocity->nrOfGhostLayers()))
   float *RESTRICT _data_velocity = velocity->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
-  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)velocity->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 2));
-  const int64_t _size_force_0 = int64_t(cell_idx_c(ci.xSize()) + 2);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 2));
-  const int64_t _size_force_1 = int64_t(cell_idx_c(ci.ySize()) + 2);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
-  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 2));
-  const int64_t _size_force_2 = int64_t(cell_idx_c(ci.zSize()) + 2);
-  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
-  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
+  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)velocity->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 2))
+  const int64_t _size_force_0 = int64_t(int64_c(ci.xSize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 2))
+  const int64_t _size_force_1 = int64_t(int64_c(ci.ySize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 2))
+  const int64_t _size_force_2 = int64_t(int64_c(ci.zSize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
   const int64_t _stride_force_1 = int64_t(force->yStride());
   const int64_t _stride_force_2 = int64_t(force->zStride());
   const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
@@ -398,4 +311,4 @@ void StreamSweepSinglePrecisionAVX::runOnCellInterval(const shared_ptr<Structure
 
 #if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
 #pragma warning pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionAVX.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionAVX.h
index 57285414ff5..095c86c6cb9 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionAVX.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionAVX.h
@@ -17,12 +17,13 @@
 //! \\author pystencils
 //======================================================================================================================
 
-// kernel generated with pystencils v1.2, lbmpy v1.2,
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
 // lbmpy_walberla/pystencils_walberla from waLBerla commit
-// 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
 
 #pragma once
 #include "core/DataTypes.h"
+#include "core/logging/Logging.h"
 
 #include "domain_decomposition/BlockDataID.h"
 #include "domain_decomposition/IBlock.h"
@@ -96,6 +97,9 @@ class StreamSweepSinglePrecisionAVX {
     };
   }
 
+  void configure(const shared_ptr<StructuredBlockStorage> &blocks,
+                 IBlock *block) {}
+
   BlockDataID forceID;
   BlockDataID pdfsID;
   BlockDataID velocityID;
@@ -112,4 +116,4 @@ class StreamSweepSinglePrecisionAVX {
 #if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
     (defined WALBERLA_CXX_COMPILER_IS_CLANG)
 #pragma GCC diagnostic pop
-#endif
\ No newline at end of file
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionCUDA.cu b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionCUDA.cu
new file mode 100644
index 00000000000..f16ecffca6c
--- /dev/null
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionCUDA.cu
@@ -0,0 +1,254 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file StreamSweepSinglePrecisionCUDA.cu
+//! \\author pystencils
+//======================================================================================================================
+
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
+
+#include <cmath>
+
+#include "StreamSweepSinglePrecisionCUDA.h"
+#include "core/DataTypes.h"
+#include "core/Macros.h"
+
+#define FUNC_PREFIX __global__
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) || (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wfloat-equal"
+#pragma GCC diagnostic ignored "-Wshadow"
+#pragma GCC diagnostic ignored "-Wconversion"
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#endif
+
+#if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
+#pragma warning push
+#pragma warning(disable : 1599)
+#endif
+
+using namespace std;
+
+namespace walberla {
+namespace pystencils {
+
+namespace internal_streamsweepsingleprecisioncuda_streamsweepsingleprecisioncuda {
+static FUNC_PREFIX __launch_bounds__(256) void streamsweepsingleprecisioncuda_streamsweepsingleprecisioncuda(float *RESTRICT const _data_force, float *RESTRICT const _data_pdfs, float *RESTRICT _data_pdfs_tmp, float *RESTRICT _data_velocity, int64_t const _size_force_0, int64_t const _size_force_1, int64_t const _size_force_2, int64_t const _stride_force_0, int64_t const _stride_force_1, int64_t const _stride_force_2, int64_t const _stride_force_3, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_pdfs_tmp_0, int64_t const _stride_pdfs_tmp_1, int64_t const _stride_pdfs_tmp_2, int64_t const _stride_pdfs_tmp_3, int64_t const _stride_velocity_0, int64_t const _stride_velocity_1, int64_t const _stride_velocity_2, int64_t const _stride_velocity_3) { // GPU kernel: each in-range thread reads 19 values from _data_pdfs around its cell, writes 3 velocity components to _data_velocity and copies the 19 values into _data_pdfs_tmp
+  if (blockDim.x * blockIdx.x + threadIdx.x + 1 < _size_force_0 - 1 && blockDim.y * blockIdx.y + threadIdx.y + 1 < _size_force_1 - 1 && blockDim.z * blockIdx.z + threadIdx.z + 1 < _size_force_2 - 1) { // guard: only indices 1 .. _size_force_d - 2 do work; surplus threads fall through
+    const int64_t ctr_0 = blockDim.x * blockIdx.x + threadIdx.x + 1; // the +1 skips index 0 along each axis
+    const int64_t ctr_1 = blockDim.y * blockIdx.y + threadIdx.y + 1;
+    const int64_t ctr_2 = blockDim.z * blockIdx.z + threadIdx.z + 1;
+    const float streamed_0 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2]; // slot 0 is read from the cell itself
+    const float streamed_1 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3]; // slots 1..18: slot q (offset q * _stride_pdfs_3) is read from a cell shifted by +/- one stride along one or two axes
+    const float streamed_2 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3];
+    const float streamed_3 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3];
+    const float streamed_4 = _data_pdfs[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3];
+    const float streamed_5 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 5 * _stride_pdfs_3];
+    const float streamed_6 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 6 * _stride_pdfs_3];
+    const float streamed_7 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3];
+    const float streamed_8 = _data_pdfs[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3];
+    const float streamed_9 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3];
+    const float streamed_10 = _data_pdfs[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3];
+    const float streamed_11 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 11 * _stride_pdfs_3];
+    const float streamed_12 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 12 * _stride_pdfs_3];
+    const float streamed_13 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 13 * _stride_pdfs_3];
+    const float streamed_14 = _data_pdfs[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 14 * _stride_pdfs_3];
+    const float streamed_15 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 15 * _stride_pdfs_3];
+    const float streamed_16 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 16 * _stride_pdfs_3];
+    const float streamed_17 = _data_pdfs[_stride_pdfs_0 * ctr_0 + _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 17 * _stride_pdfs_3];
+    const float streamed_18 = _data_pdfs[_stride_pdfs_0 * ctr_0 - _stride_pdfs_0 + _stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 18 * _stride_pdfs_3];
+    const float vel0Term = streamed_10 + streamed_14 + streamed_18 + streamed_4 + streamed_8; // partial sum shared by momdensity_0 and rho
+    const float momdensity_0 = -streamed_13 - streamed_17 - streamed_3 - streamed_7 - streamed_9 + vel0Term;
+    const float vel1Term = streamed_1 + streamed_11 + streamed_15 + streamed_7; // partial sum shared by momdensity_1 and rho
+    const float momdensity_1 = -streamed_10 - streamed_12 - streamed_16 - streamed_2 + streamed_8 - streamed_9 + vel1Term;
+    const float vel2Term = streamed_12 + streamed_13 + streamed_5; // partial sum shared by momdensity_2 and rho
+    const float rho = streamed_0 + streamed_16 + streamed_17 + streamed_2 + streamed_3 + streamed_6 + streamed_9 + vel0Term + vel1Term + vel2Term; // adds up to the sum of all 19 streamed values
+    const float momdensity_2 = streamed_11 + streamed_14 - streamed_15 - streamed_16 - streamed_17 - streamed_18 - streamed_6 + vel2Term;
+    const float u_0 = momdensity_0 * ((1.0f) / (rho)) + 0.5f * ((1.0f) / (rho)) * _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2]; // u_d = (momdensity_d + 0.5 * force_d) / rho; rho is not checked for zero
+    const float u_1 = momdensity_1 * ((1.0f) / (rho)) + 0.5f * ((1.0f) / (rho)) * _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3];
+    const float u_2 = momdensity_2 * ((1.0f) / (rho)) + 0.5f * ((1.0f) / (rho)) * _data_force[_stride_force_0 * ctr_0 + _stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3];
+    _data_velocity[_stride_velocity_0 * ctr_0 + _stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2] = u_0; // store the three velocity components for this cell
+    _data_velocity[_stride_velocity_0 * ctr_0 + _stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + _stride_velocity_3] = u_1;
+    _data_velocity[_stride_velocity_0 * ctr_0 + _stride_velocity_1 * ctr_1 + _stride_velocity_2 * ctr_2 + 2 * _stride_velocity_3] = u_2;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2] = streamed_0; // write all 19 values to this cell of pdfs_tmp, slot q at offset q * _stride_pdfs_tmp_3
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + _stride_pdfs_tmp_3] = streamed_1;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 2 * _stride_pdfs_tmp_3] = streamed_2;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 3 * _stride_pdfs_tmp_3] = streamed_3;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 4 * _stride_pdfs_tmp_3] = streamed_4;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 5 * _stride_pdfs_tmp_3] = streamed_5;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 6 * _stride_pdfs_tmp_3] = streamed_6;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 7 * _stride_pdfs_tmp_3] = streamed_7;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 8 * _stride_pdfs_tmp_3] = streamed_8;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 9 * _stride_pdfs_tmp_3] = streamed_9;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 10 * _stride_pdfs_tmp_3] = streamed_10;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 11 * _stride_pdfs_tmp_3] = streamed_11;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 12 * _stride_pdfs_tmp_3] = streamed_12;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 13 * _stride_pdfs_tmp_3] = streamed_13;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 14 * _stride_pdfs_tmp_3] = streamed_14;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 15 * _stride_pdfs_tmp_3] = streamed_15;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 16 * _stride_pdfs_tmp_3] = streamed_16;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 17 * _stride_pdfs_tmp_3] = streamed_17;
+    _data_pdfs_tmp[_stride_pdfs_tmp_0 * ctr_0 + _stride_pdfs_tmp_1 * ctr_1 + _stride_pdfs_tmp_2 * ctr_2 + 18 * _stride_pdfs_tmp_3] = streamed_18;
+  }
+}
+} // namespace internal_streamsweepsingleprecisioncuda_streamsweepsingleprecisioncuda
+
+void StreamSweepSinglePrecisionCUDA::run(IBlock *block, gpuStream_t stream) { // runs the stream kernel over the block's full xSize x ySize x zSize extent on `stream`, then swaps pdfs with its temporary field
+
+  auto velocity = block->getData<gpu::GPUField<float>>(velocityID);
+  auto pdfs = block->getData<gpu::GPUField<float>>(pdfsID);
+  auto force = block->getData<gpu::GPUField<float>>(forceID);
+  gpu::GPUField<float> *pdfs_tmp;
+  {
+    // Get the temporary field pdfs_tmp: look it up in cache_pdfs_ using pdfs as key; on a miss, clone pdfs uninitialized and cache the clone
+    auto it = cache_pdfs_.find(pdfs);
+    if (it != cache_pdfs_.end()) {
+      pdfs_tmp = *it;
+    } else {
+      pdfs_tmp = pdfs->cloneUninitialized();
+      cache_pdfs_.insert(pdfs_tmp);
+    }
+  }
+
+  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(force->nrOfGhostLayers())) // index -1 is accessed below, so at least one ghost layer is required
+  float *RESTRICT const _data_force = force->dataAt(-1, -1, -1, 0); // base pointer handed to the kernel starts at -1 on each axis (argument order presumably x, y, z, f - confirm)
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs->nrOfGhostLayers()))
+  float *RESTRICT const _data_pdfs = pdfs->dataAt(-1, -1, -1, 0);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs_tmp->nrOfGhostLayers()))
+  float *RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(-1, -1, -1, 0);
+  WALBERLA_ASSERT_EQUAL(pdfs_tmp->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(velocity->nrOfGhostLayers()))
+  float *RESTRICT _data_velocity = velocity->dataAt(-1, -1, -1, 0);
+  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(force->xSize()) + 2))
+  const int64_t _size_force_0 = int64_t(int64_c(force->xSize()) + 2); // extent passed to the kernel: field size plus one extra cell on each side
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(force->ySize()) + 2))
+  const int64_t _size_force_1 = int64_t(int64_c(force->ySize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(force->zSize()) + 2))
+  const int64_t _size_force_2 = int64_t(int64_c(force->zSize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  const int64_t _stride_force_0 = int64_t(force->xStride()); // per-field element strides forwarded to the kernel's index arithmetic
+  const int64_t _stride_force_1 = int64_t(force->yStride());
+  const int64_t _stride_force_2 = int64_t(force->zStride());
+  const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
+  const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
+  const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
+  const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
+  const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
+  const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride());
+  const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride());
+  const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride());
+  const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride()));
+  const int64_t _stride_velocity_0 = int64_t(velocity->xStride());
+  const int64_t _stride_velocity_1 = int64_t(velocity->yStride());
+  const int64_t _stride_velocity_2 = int64_t(velocity->zStride());
+  const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride())); // the next two statements build the launch configuration: _block is a clamped per-axis thread count, _grid the rounded-up block count covering _size_force_d - 2 cells
+  dim3 _block(uint32_c(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)), uint32_c(((1024 < ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))) ? 1024 : ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))), uint32_c(((64 < ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))))) ? 64 : ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))))));
+  dim3 _grid(uint32_c(((_size_force_0 - 2) % (((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)) == 0 ? (int64_t)(_size_force_0 - 2) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)) : ((int64_t)(_size_force_0 - 2) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))) + 1)), uint32_c(((_size_force_1 - 2) % (((1024 < ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))) ? 1024 : ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))) == 0 ? (int64_t)(_size_force_1 - 2) / (int64_t)(((1024 < ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))) ? 1024 : ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))) : ((int64_t)(_size_force_1 - 2) / (int64_t)(((1024 < ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))) ? 1024 : ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) + 1)), uint32_c(((_size_force_2 - 2) % (((64 < ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 
128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))))) ? 64 : ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))))) == 0 ? (int64_t)(_size_force_2 - 2) / (int64_t)(((64 < ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))))) ? 
64 : ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))))) : ((int64_t)(_size_force_2 - 2) / (int64_t)(((64 < ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))))) ? 64 : ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? 
_size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))))))) + 1)));
+  internal_streamsweepsingleprecisioncuda_streamsweepsingleprecisioncuda::streamsweepsingleprecisioncuda_streamsweepsingleprecisioncuda<<<_grid, _block, 0, stream>>>(_data_force, _data_pdfs, _data_pdfs_tmp, _data_velocity, _size_force_0, _size_force_1, _size_force_2, _stride_force_0, _stride_force_1, _stride_force_2, _stride_force_3, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3, _stride_velocity_0, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3); // launched on the caller-provided stream with 0 bytes of dynamic shared memory
+  pdfs->swapDataPointers(pdfs_tmp); // exchange the buffers of pdfs and pdfs_tmp so pdfs refers to the data the kernel writes (semantics taken from the method name - confirm)
+}
+
+void StreamSweepSinglePrecisionCUDA::runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block, gpuStream_t stream) { // same sweep as run(), restricted to the part of globalCellInterval that lies in this block
+
+  CellInterval ci = globalCellInterval; // work on a copy: it is clipped and converted to block-local coordinates below
+  CellInterval blockBB = blocks->getBlockCellBB(*block);
+  blockBB.expand(ghostLayers); // allow the interval to reach ghostLayers cells beyond the block's bounding box
+  ci.intersect(blockBB);
+  blocks->transformGlobalToBlockLocalCellInterval(ci, *block);
+  if (ci.empty())
+    return; // nothing of the requested interval lies in this block
+
+  auto velocity = block->getData<gpu::GPUField<float>>(velocityID);
+  auto pdfs = block->getData<gpu::GPUField<float>>(pdfsID);
+  auto force = block->getData<gpu::GPUField<float>>(forceID);
+  gpu::GPUField<float> *pdfs_tmp;
+  {
+    // Get the temporary field pdfs_tmp: look it up in cache_pdfs_ using pdfs as key; on a miss, clone pdfs uninitialized and cache the clone
+    auto it = cache_pdfs_.find(pdfs);
+    if (it != cache_pdfs_.end()) {
+      pdfs_tmp = *it;
+    } else {
+      pdfs_tmp = pdfs->cloneUninitialized();
+      cache_pdfs_.insert(pdfs_tmp);
+    }
+  }
+
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(force->nrOfGhostLayers())) // the cell one below the interval minimum is accessed, so it must still lie within the ghost layers
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(force->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(force->nrOfGhostLayers()))
+  float *RESTRICT const _data_force = force->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0); // base pointer handed to the kernel starts one cell below the interval minimum on each axis (argument order presumably x, y, z, f - confirm)
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs->nrOfGhostLayers()))
+  float *RESTRICT const _data_pdfs = pdfs->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
+  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()))
+  float *RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
+  WALBERLA_ASSERT_EQUAL(pdfs_tmp->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(velocity->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(velocity->nrOfGhostLayers()))
+  WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(velocity->nrOfGhostLayers()))
+  float *RESTRICT _data_velocity = velocity->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
+  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 2))
+  const int64_t _size_force_0 = int64_t(int64_c(ci.xSize()) + 2); // extent passed to the kernel: interval size plus one extra cell on each side
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 2))
+  const int64_t _size_force_1 = int64_t(int64_c(ci.ySize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 2))
+  const int64_t _size_force_2 = int64_t(int64_c(ci.zSize()) + 2);
+  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
+  const int64_t _stride_force_0 = int64_t(force->xStride()); // per-field element strides forwarded to the kernel's index arithmetic
+  const int64_t _stride_force_1 = int64_t(force->yStride());
+  const int64_t _stride_force_2 = int64_t(force->zStride());
+  const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
+  const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
+  const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
+  const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
+  const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
+  const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride());
+  const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride());
+  const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride());
+  const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride()));
+  const int64_t _stride_velocity_0 = int64_t(velocity->xStride());
+  const int64_t _stride_velocity_1 = int64_t(velocity->yStride());
+  const int64_t _stride_velocity_2 = int64_t(velocity->zStride());
+  const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride())); // the next two statements build the launch configuration: _block is a clamped per-axis thread count, _grid the rounded-up block count covering _size_force_d - 2 cells
+  dim3 _block(uint32_c(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)), uint32_c(((1024 < ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))) ? 1024 : ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))), uint32_c(((64 < ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))))) ? 64 : ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))))));
+  dim3 _grid(uint32_c(((_size_force_0 - 2) % (((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)) == 0 ? (int64_t)(_size_force_0 - 2) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)) : ((int64_t)(_size_force_0 - 2) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))) + 1)), uint32_c(((_size_force_1 - 2) % (((1024 < ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))) ? 1024 : ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))) == 0 ? (int64_t)(_size_force_1 - 2) / (int64_t)(((1024 < ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))) ? 1024 : ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))) : ((int64_t)(_size_force_1 - 2) / (int64_t)(((1024 < ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))) ? 1024 : ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) + 1)), uint32_c(((_size_force_2 - 2) % (((64 < ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 
128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))))) ? 64 : ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))))) == 0 ? (int64_t)(_size_force_2 - 2) / (int64_t)(((64 < ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))))) ? 
64 : ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))))) : ((int64_t)(_size_force_2 - 2) / (int64_t)(((64 < ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))))) ? 64 : ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2) * ((_size_force_1 - 2 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))) ? 
_size_force_1 - 2 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_force_0 - 2) ? 128 : _size_force_0 - 2)))))))))) + 1)));
+  internal_streamsweepsingleprecisioncuda_streamsweepsingleprecisioncuda::streamsweepsingleprecisioncuda_streamsweepsingleprecisioncuda<<<_grid, _block, 0, stream>>>(_data_force, _data_pdfs, _data_pdfs_tmp, _data_velocity, _size_force_0, _size_force_1, _size_force_2, _stride_force_0, _stride_force_1, _stride_force_2, _stride_force_3, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3, _stride_velocity_0, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3); // launched on the caller-provided stream with 0 bytes of dynamic shared memory
+  pdfs->swapDataPointers(pdfs_tmp); // NOTE(review): whole buffers are exchanged although the kernel only wrote cells inside ci - confirm callers expect this
+}
+
+} // namespace pystencils
+} // namespace walberla
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) || (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic pop
+#endif
+
+#if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
+#pragma warning pop
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionCUDA.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionCUDA.h
new file mode 100644
index 00000000000..302b62816b7
--- /dev/null
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/StreamSweepSinglePrecisionCUDA.h
@@ -0,0 +1,128 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file StreamSweepSinglePrecisionCUDA.h
+//! \\author pystencils
+//======================================================================================================================
+
+// kernel generated with pystencils v1.3.3, lbmpy v1.3.3,
+// lbmpy_walberla/pystencils_walberla from waLBerla commit
+// b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
+
+#pragma once
+#include "core/DataTypes.h"
+#include "core/logging/Logging.h"
+
+#include "gpu/GPUField.h"
+#include "gpu/GPUWrapper.h"
+
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+#include "domain_decomposition/StructuredBlockStorage.h"
+#include "field/SwapableCompare.h"
+#include <set>
+
+#ifdef __GNUC__
+#define RESTRICT __restrict__
+#elif _MSC_VER
+#define RESTRICT __restrict
+#else
+#define RESTRICT
+#endif
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
+    (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wreorder"
+#endif
+
+namespace walberla {
+namespace pystencils {
+
+class StreamSweepSinglePrecisionCUDA { // sweep functor around the generated single-precision CUDA stream kernel
+public:
+  StreamSweepSinglePrecisionCUDA(BlockDataID forceID_, BlockDataID pdfsID_,
+                                 BlockDataID velocityID_)
+      : forceID(forceID_), pdfsID(pdfsID_), velocityID(velocityID_){}; // only stores the three block-data handles
+  // NOTE(review): no copy control is declared; a copied object would share cache_pdfs_ and delete it twice.
+  ~StreamSweepSinglePrecisionCUDA() {
+    for (auto p : cache_pdfs_) {
+      delete p; // the cache owns its raw pointers
+    }
+  }
+
+  void run(IBlock *block, gpuStream_t stream = nullptr); // defined out of line (not visible in this header)
+
+  void runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks,
+                         const CellInterval &globalCellInterval,
+                         cell_idx_t ghostLayers, IBlock *block,
+                         gpuStream_t stream = nullptr); // defined out of line (not visible in this header)
+
+  void operator()(IBlock *block, gpuStream_t stream = nullptr) {
+    run(block, stream); // calling the object is the same as calling run()
+  }
+  // Static factories: the returned callable keeps `kernel` alive through the captured shared_ptr.
+  static std::function<void(IBlock *)>
+  getSweep(const shared_ptr<StreamSweepSinglePrecisionCUDA> &kernel) {
+    return [kernel](IBlock *b) { kernel->run(b); }; // runs with the default (nullptr) stream
+  }
+
+  static std::function<void(IBlock *, gpuStream_t)> getSweepOnCellInterval(
+      const shared_ptr<StreamSweepSinglePrecisionCUDA> &kernel,
+      const shared_ptr<StructuredBlockStorage> &blocks,
+      const CellInterval &globalCellInterval, cell_idx_t ghostLayers = 1) {
+    return [kernel, blocks, globalCellInterval,
+            ghostLayers](IBlock *b, gpuStream_t stream = nullptr) {
+      kernel->runOnCellInterval(blocks, globalCellInterval, ghostLayers, b,
+                                stream);
+    };
+  }
+  // Member factories: the returned callable captures `this` and must not outlive this object.
+  std::function<void(IBlock *)> getSweep(gpuStream_t stream = nullptr) {
+    return [this, stream](IBlock *b) { this->run(b, stream); };
+  }
+
+  std::function<void(IBlock *)>
+  getSweepOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks,
+                         const CellInterval &globalCellInterval,
+                         cell_idx_t ghostLayers = 1,
+                         gpuStream_t stream = nullptr) {
+    return [this, blocks, globalCellInterval, ghostLayers, stream](IBlock *b) {
+      this->runOnCellInterval(blocks, globalCellInterval, ghostLayers, b,
+                              stream);
+    };
+  }
+
+  void configure(const shared_ptr<StructuredBlockStorage> &blocks,
+                 IBlock *block) {} // intentionally empty: this kernel has nothing to configure
+
+  BlockDataID forceID;
+  BlockDataID pdfsID;
+  BlockDataID velocityID;
+
+private:
+  std::set<gpu::GPUField<float> *,
+           field::SwapableCompare<gpu::GPUField<float> *>>
+      cache_pdfs_; // freed in the destructor; presumably the temporary PDF fields used by run() -- confirm in the .cu
+};
+
+} // namespace pystencils
+} // namespace walberla
+
+#if (defined WALBERLA_CXX_COMPILER_IS_GNU) ||                                  \
+    (defined WALBERLA_CXX_COMPILER_IS_CLANG)
+#pragma GCC diagnostic pop
+#endif
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/philox_rand.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/philox_rand.h
index 3c97a91f984..fd9437a9869 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/philox_rand.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/philox_rand.h
@@ -35,11 +35,17 @@
 #endif
 #endif
 
-#ifndef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__clang__) && defined(__CUDA__)
+#if defined(__clang__) && defined(QUALIFIERS)
+#undef QUALIFIERS
+#endif
+#define QUALIFIERS static __forceinline__ __device__
+#else
+#if defined(__clang__) && defined(QUALIFIERS)
+#undef QUALIFIERS
+#endif
 #define QUALIFIERS inline
 #include "myintrin.h"
-#else
-#define QUALIFIERS static __forceinline__ __device__
 #endif
 
 #define PHILOX_W32_0 (0x9E3779B9)
diff --git a/src/walberla_bridge/src/lattice_boltzmann/lb_kernels.cuh b/src/walberla_bridge/src/lattice_boltzmann/lb_kernels.cuh
new file mode 100644
index 00000000000..f9dc9ae83d2
--- /dev/null
+++ b/src/walberla_bridge/src/lattice_boltzmann/lb_kernels.cuh
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2021-2024 The ESPResSo project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "walberla_bridge/Architecture.hpp"
+
+#include "lb_kernels.hpp"
+
+#include "generated_kernels/Dynamic_UBB_double_precisionCUDA.h"
+#include "generated_kernels/Dynamic_UBB_single_precisionCUDA.h"
+#include "generated_kernels/FieldAccessorsDoublePrecisionCUDA.cuh"
+#include "generated_kernels/FieldAccessorsSinglePrecisionCUDA.cuh"
+#include "generated_kernels/InitialPDFsSetterDoublePrecisionCUDA.h"
+#include "generated_kernels/InitialPDFsSetterSinglePrecisionCUDA.h"
+#include "generated_kernels/StreamSweepDoublePrecisionCUDA.h"
+#include "generated_kernels/StreamSweepSinglePrecisionCUDA.h"
+
+#include "generated_kernels/CollideSweepDoublePrecisionLeesEdwardsCUDA.h"
+#include "generated_kernels/CollideSweepDoublePrecisionThermalizedCUDA.h"
+#include "generated_kernels/CollideSweepSinglePrecisionLeesEdwardsCUDA.h"
+#include "generated_kernels/CollideSweepSinglePrecisionThermalizedCUDA.h"
+
+namespace walberla {
+namespace detail {
+// GPU specialisations; the primary templates presumably come from lb_kernels.hpp -- confirm.
+using lbmpy::Arch;
+// Generated CUDA kernel classes used for double-precision LB on the GPU.
+template <> struct KernelTrait<double, Arch::GPU> {
+  using CollisionModelThermalized =
+      pystencils::CollideSweepDoublePrecisionThermalizedCUDA;
+  using CollisionModelLeesEdwards =
+      pystencils::CollideSweepDoublePrecisionLeesEdwardsCUDA;
+  using StreamSweep = pystencils::StreamSweepDoublePrecisionCUDA;
+  using InitialPDFsSetter = pystencils::InitialPDFsSetterDoublePrecisionCUDA;
+};
+// Generated CUDA kernel classes used for single-precision LB on the GPU.
+template <> struct KernelTrait<float, Arch::GPU> {
+  using CollisionModelThermalized =
+      pystencils::CollideSweepSinglePrecisionThermalizedCUDA;
+  using CollisionModelLeesEdwards =
+      pystencils::CollideSweepSinglePrecisionLeesEdwardsCUDA;
+  using StreamSweep = pystencils::StreamSweepSinglePrecisionCUDA;
+  using InitialPDFsSetter = pystencils::InitialPDFsSetterSinglePrecisionCUDA;
+};
+// Generated CUDA boundary-condition classes, one per floating-point precision.
+template <> struct BoundaryHandlingTrait<double, Arch::GPU> {
+  using Dynamic_UBB = lbm::Dynamic_UBB_double_precisionCUDA;
+};
+
+template <> struct BoundaryHandlingTrait<float, Arch::GPU> {
+  using Dynamic_UBB = lbm::Dynamic_UBB_single_precisionCUDA;
+};
+
+} // namespace detail
+} // namespace walberla
diff --git a/src/walberla_bridge/src/lattice_boltzmann/lb_walberla_init.cpp b/src/walberla_bridge/src/lattice_boltzmann/lb_walberla_init.cpp
index 9ce57049b42..fc07c5814fd 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/lb_walberla_init.cpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/lb_walberla_init.cpp
@@ -27,8 +27,8 @@
 #include <memory>
 
 std::shared_ptr<LBWalberlaBase>
-new_lb_walberla(std::shared_ptr<LatticeWalberla> const &lattice,
-                double viscosity, double density, bool single_precision) {
+new_lb_walberla_cpu(std::shared_ptr<LatticeWalberla> const &lattice,
+                    double viscosity, double density, bool single_precision) {
   if (single_precision) {
     return std::make_shared<walberla::LBWalberlaImpl<float, lbmpy::Arch::CPU>>(
         lattice, viscosity, density);
diff --git a/src/walberla_bridge/src/lattice_boltzmann/lb_walberla_init.cu b/src/walberla_bridge/src/lattice_boltzmann/lb_walberla_init.cu
index 9e75c37075d..651f59da6cb 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/lb_walberla_init.cu
+++ b/src/walberla_bridge/src/lattice_boltzmann/lb_walberla_init.cu
@@ -16,3 +16,47 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
+
+#include <walberla_bridge/lattice_boltzmann/lb_walberla_init.hpp>
+
+#if defined(__NVCC__)
+#define RESTRICT __restrict__
+#if defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress 554 // no implicit or explicit cast
+#else
+#pragma push
+#pragma diag_suppress 554 // no implicit or explicit cast
+#endif
+#endif
+
+#include "LBWalberlaImpl.hpp"
+
+#if defined(__NVCC__)
+#if defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
+#pragma nv_diagnostic pop
+#else
+#pragma pop
+#endif
+#endif
+
+#include <walberla_bridge/Architecture.hpp>
+#include <walberla_bridge/LatticeWalberla.hpp>
+#include <walberla_bridge/lattice_boltzmann/LBWalberlaBase.hpp>
+
+#include <gpu/DeviceSelectMPI.h>
+
+#include <memory>
+
+std::shared_ptr<LBWalberlaBase>
+new_lb_walberla_gpu(std::shared_ptr<LatticeWalberla> const &lattice,
+                    double viscosity, double density, bool single_precision) {
+  // build the GPU implementation for the floating-point type of the tag `fp`
+  auto const make = [&](auto fp) -> std::shared_ptr<LBWalberlaBase> {
+    using Impl = walberla::LBWalberlaImpl<decltype(fp), lbmpy::Arch::GPU>;
+    return std::make_shared<Impl>(lattice, viscosity, density);
+  };
+  return single_precision ? make(float{}) : make(double{});
+}
+
+void set_device_id_per_rank() { walberla::gpu::selectDeviceBasedOnMpiRank(); } // thin forwarder; presumably binds each MPI rank to a GPU -- see gpu/DeviceSelectMPI.h
diff --git a/src/walberla_bridge/src/utils/boundary.hpp b/src/walberla_bridge/src/utils/boundary.hpp
index 7d3f3cdb07f..719c028aa43 100644
--- a/src/walberla_bridge/src/utils/boundary.hpp
+++ b/src/walberla_bridge/src/utils/boundary.hpp
@@ -87,9 +87,10 @@ void set_boundary_from_grid(BoundaryModel &boundary,
   auto const grid_size = lattice.get_grid_dimensions();
   auto const offset = lattice.get_local_grid_range().first;
   auto const gl = static_cast<int>(lattice.get_ghost_layers());
-  assert(raster_flat.size() == Utils::product(grid_size));
-  auto const n_y = grid_size[1];
-  auto const n_z = grid_size[2];
+  assert(raster_flat.size() ==
+         static_cast<std::size_t>(Utils::product(grid_size)));
+  auto const n_y = static_cast<std::size_t>(grid_size[1]);
+  auto const n_z = static_cast<std::size_t>(grid_size[2]);
 
   for (auto const &block : *lattice.get_blocks()) {
     auto const [size_i, size_j, size_k] = boundary.block_dims(block);
@@ -100,7 +101,9 @@ void set_boundary_from_grid(BoundaryModel &boundary,
         for (int k = -gl; k < size_k + gl; ++k) {
           auto const node = offset + Utils::Vector3i{{i, j, k}};
           auto const idx = (node + grid_size) % grid_size;
-          auto const index = idx[0] * n_y * n_z + idx[1] * n_z + idx[2];
+          auto const index = static_cast<std::size_t>(idx[0]) * n_y * n_z +
+                             static_cast<std::size_t>(idx[1]) * n_z +
+                             static_cast<std::size_t>(idx[2]);
           if (raster_flat[index]) {
             auto const &value = data_flat[index];
             auto const bc = get_block_and_cell(lattice, node, true);
diff --git a/src/walberla_bridge/tests/CMakeLists.txt b/src/walberla_bridge/tests/CMakeLists.txt
index 4cbf9ce15d7..83a7d9d2ee4 100644
--- a/src/walberla_bridge/tests/CMakeLists.txt
+++ b/src/walberla_bridge/tests/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2020-2023 The ESPResSo project
+# Copyright (C) 2020-2024 The ESPResSo project
 #
 # This file is part of ESPResSo.
 #
@@ -17,52 +17,44 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-include(unit_test)
-
-function(ESPRESSO_WALBERLA_UNIT_TEST)
-  cmake_parse_arguments(TEST "" "NAME;NUM_PROC" "SRC;DEPENDS" ${ARGN})
-  unit_test(NAME ${TEST_NAME} NUM_PROC ${TEST_NUM_PROC} SRC ${TEST_SRC} DEPENDS
-            ${TEST_DEPENDS} espresso::walberla espresso::walberla::cpp_flags
-            espresso::utils)
+include(espresso_unit_test)
+
+function(ESPRESSO_ADD_TEST) # register one waLBerla bridge unit test
+  cmake_parse_arguments(TEST "" "SRC;NAME;NUM_PROC" "DEPENDS" ${ARGN})
+  espresso_unit_test( # NOTE(review): callers omit NAME; presumably this call sets TEST_NAME from SRC - confirm
+    SRC ${TEST_SRC} NAME ${TEST_NAME} NUM_PROC ${TEST_NUM_PROC} DEPENDS
+    ${TEST_DEPENDS} espresso::walberla espresso::utils)
+  if("${TEST_SRC}" MATCHES ".*\\.cu$") # "\\." is a literal dot; "\." collapses to "." (any character)
+    target_link_libraries(${TEST_NAME} PRIVATE espresso::walberla::cuda_flags
+                                               espresso::walberla_cuda)
+  else()
+    target_link_libraries(${TEST_NAME} PRIVATE espresso::walberla::cpp_flags)
+  endif()
   set_target_properties(${TEST_NAME} PROPERTIES CXX_CLANG_TIDY "")
   target_include_directories(${TEST_NAME} PRIVATE ${WALBERLA_INCLUDE_DIRS}
                                                   ${walberla_BINARY_DIR}/src)
   target_link_libraries(${TEST_NAME} PRIVATE ${WALBERLA_LIBS})
 endfunction()
 
-espresso_walberla_unit_test(NAME ResourceManager_test SRC
-                            ResourceManager_test.cpp)
-
-espresso_walberla_unit_test(NAME lb_kernels_unit_tests SRC
-                            lb_kernels_unit_tests.cpp)
-
-espresso_walberla_unit_test(NAME ek_kernels_unit_tests SRC
-                            ek_kernels_unit_tests.cpp)
-
-espresso_walberla_unit_test(
-  NAME LatticeWalberla_unit_tests SRC LatticeWalberla_unit_tests.cpp DEPENDS
-  Boost::boost Boost::mpi NUM_PROC 2)
-
-espresso_walberla_unit_test(
-  NAME LBWalberlaImpl_unit_tests SRC LBWalberlaImpl_unit_tests.cpp DEPENDS
-  Boost::boost Boost::mpi NUM_PROC 2)
-
-espresso_walberla_unit_test(
-  NAME LBWalberlaImpl_bspline_tests SRC LBWalberlaImpl_bspline_tests.cpp
-  DEPENDS Boost::mpi NUM_PROC 2)
+espresso_add_test(SRC ResourceManager_test.cpp)
+espresso_add_test(SRC lb_kernels_unit_tests.cpp)
+espresso_add_test(SRC ek_kernels_unit_tests.cpp)
+espresso_add_test(SRC LatticeWalberla_unit_tests.cpp DEPENDS Boost::mpi
+                  NUM_PROC 2)
+espresso_add_test(SRC LBWalberlaImpl_unit_tests.cpp DEPENDS Boost::mpi NUM_PROC
+                  2)
+espresso_add_test(SRC LBWalberlaImpl_bspline_tests.cpp DEPENDS Boost::mpi
+                  NUM_PROC 2)
+espresso_add_test(SRC LBWalberlaImpl_flow_tests.cpp DEPENDS Boost::mpi)
+espresso_add_test(SRC LBWalberlaImpl_lees_edwards_tests.cpp DEPENDS Boost::mpi)
+espresso_add_test(SRC EKinWalberlaImpl_unit_tests.cpp DEPENDS Boost::mpi
+                  NUM_PROC 2)
 
 if(NOT (ESPRESSO_BUILD_WITH_ASAN OR ESPRESSO_BUILD_WITH_UBSAN))
-  espresso_walberla_unit_test(
-    NAME LBWalberlaImpl_statistical_tests SRC
-    LBWalberlaImpl_statistical_tests.cpp DEPENDS Boost::mpi)
+  espresso_add_test(SRC LBWalberlaImpl_statistical_tests.cpp DEPENDS Boost::mpi)
 endif()
 
-espresso_walberla_unit_test(NAME LBWalberlaImpl_flow_tests SRC
-                            LBWalberlaImpl_flow_tests.cpp DEPENDS Boost::mpi)
-
-espresso_walberla_unit_test(NAME LBWalberlaImpl_lees_edwards_test SRC
-                            LBWalberlaImpl_lees_edwards.cpp DEPENDS Boost::mpi)
-
-espresso_walberla_unit_test(
-  NAME EKinWalberlaImpl_unit_tests SRC EKinWalberlaImpl_unit_tests.cpp DEPENDS
-  Boost::boost Boost::mpi NUM_PROC 2)
+if(ESPRESSO_BUILD_WITH_CUDA AND WALBERLA_BUILD_WITH_CUDA)
+  espresso_add_test(SRC LBWalberlaImpl_field_accessors_tests.cu DEPENDS
+                    Boost::mpi)
+endif()
diff --git a/src/walberla_bridge/tests/LBWalberlaImpl_field_accessors_tests.cu b/src/walberla_bridge/tests/LBWalberlaImpl_field_accessors_tests.cu
new file mode 100644
index 00000000000..30c98ab2e7a
--- /dev/null
+++ b/src/walberla_bridge/tests/LBWalberlaImpl_field_accessors_tests.cu
@@ -0,0 +1,392 @@
+/*
+ * Copyright (C) 2024 The ESPResSo project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#define BOOST_TEST_MODULE "field accessors test"
+#define BOOST_TEST_DYN_LINK
+#define BOOST_TEST_NO_MAIN
+
+#if defined(__NVCC__)
+// Fix for https://i10git.cs.fau.de/walberla/walberla/-/issues/244
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress 554 // unreachable conversion operator
+#endif
+
+#include <boost/test/data/monomorphic.hpp>
+#include <boost/test/data/test_case.hpp>
+#include <boost/test/unit_test.hpp>
+
+#include "../src/lattice_boltzmann/LBWalberlaImpl.hpp"
+#include "../src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecision.h"
+#include "../src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecisionCUDA.cuh"
+#include "../src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecision.h"
+#include "../src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecisionCUDA.cuh"
+
+#include <walberla_bridge/Architecture.hpp>
+#include <walberla_bridge/BlockAndCell.hpp>
+
+#include <utils/Vector.hpp>
+
+#include <boost/mpi/collectives/all_reduce.hpp>
+#include <boost/mpi/communicator.hpp>
+#include <boost/mpl/list.hpp>
+
+#include <mpi.h>
+
+#include <cuda.h>
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cmath>
+#include <concepts>
+#include <cstddef>
+#include <initializer_list>
+#include <iterator>
+#include <memory>
+#include <numeric>
+#include <ranges>
+#include <span>
+#include <sstream>
+#include <vector>
+
+static Utils::Vector3i mpi_shape;
+
+// Suite precondition: true only if CUDA device 0 has compute capability >= 3.
+boost::test_tools::assertion_result has_gpu(boost::unit_test::test_unit_id) {
+  int n_devices = 0;
+  // a failed query (no driver, no device) must disable the suite, not crash
+  if (cudaGetDeviceCount(&n_devices) != cudaSuccess or n_devices <= 0) {
+    return false;
+  }
+  cudaDeviceProp prop;
+  if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
+    return false; // `prop` was not filled in; do not read it
+  }
+  return prop.major >= 3;
+}
+
+/**
+ * Element-wise comparison of two equally sized, non-empty contiguous ranges.
+ * Returns a failed predicate describing the first index whose absolute
+ * difference is not within @p atol; NaN differences count as mismatches.
+ */
+template <std::ranges::contiguous_range R>
+  requires std::ranges::sized_range<R>
+boost::test_tools::predicate_result almost_equal(R const &val, R const &ref,
+                                                 typename R::value_type atol) {
+  assert(val.size() == ref.size());
+  assert(not val.empty());
+  // format at most the first 6 elements, e.g. "[0, 1, 2, 3, 4, 5, ...]"
+  auto const print_first_n = [](R const &v) {
+    auto constexpr n = std::size_t{6u};
+    std::stringstream stream;
+    stream << "[";
+    for (std::size_t i = 0u, end = std::min(v.size(), n); i < end; ++i) {
+      stream << (i ? ", " : "") << v[i];
+    }
+    stream << ((v.size() > n) ? ", ...]" : "]");
+    return stream.str();
+  };
+  boost::test_tools::predicate_result res(true);
+  for (std::size_t i = 0u; i < val.size(); ++i) {
+    // negated `<=` rather than `>`: every comparison with NaN is false
+    if (auto const diff = std::abs(val[i] - ref[i]); not(diff <= atol)) {
+      res = false;
+      res.message() << "val{" << print_first_n(val) << "} and ref{"
+                    << print_first_n(ref) << "} mismatch: val[" << i << "]{"
+                    << val[i] << "} != ref[" << i << "]{" << ref[i]
+                    << "} (difference{" << diff << "} > delta{" << atol << "})";
+      break;
+    }
+  }
+  return res;
+}
+
+template <typename T> // element-wise comparison of two waLBerla 3-vectors
+boost::test_tools::predicate_result
+almost_equal(walberla::Vector3<T> const &a, walberla::Vector3<T> const &b, T atol) {
+  auto const lhs = std::span(a.data(), 3ul), rhs = std::span(b.data(), 3ul);
+  return almost_equal(lhs, rhs, atol); // delegate to the range overload
+}
+
+template <typename T> // element-wise comparison of two waLBerla 3x3 matrices
+boost::test_tools::predicate_result
+almost_equal(walberla::Matrix3<T> const &a, walberla::Matrix3<T> const &b, T atol) {
+  auto const lhs = std::span(a.data(), 9ul), rhs = std::span(b.data(), 9ul);
+  return almost_equal(lhs, rhs, atol); // delegate to the range overload
+}
+
+template <std::floating_point T> // scalars are compared as 1-element ranges
+boost::test_tools::predicate_result almost_equal(T a, T b, T atol) {
+  auto const lhs = std::span(&a, 1ul), rhs = std::span(&b, 1ul);
+  return almost_equal(lhs, rhs, atol);
+}
+
+template <typename FT, lbmpy::Arch Architecture>
+class LBWalberlaImplTest : public walberla::LBWalberlaImpl<FT, Architecture> { // test-only subclass that re-exports selected base members as public
+public:
+  using Base = walberla::LBWalberlaImpl<FT, Architecture>;
+  using Base::Base; // inherit the base-class constructors
+  using Base::m_density; // members below are presumably protected in Base -- confirm
+  using Base::m_last_applied_force_field_id;
+  using Base::m_pdf_field_id;
+  using Base::m_velocity_field_id;
+};
+
+template <typename FT, lbmpy::Arch Architecture> struct Fixture { // one lattice + one LB fluid per precision/architecture
+  std::shared_ptr<::LatticeWalberla> lattice;
+  std::shared_ptr<LBWalberlaImplTest<FT, Architecture>> lbfluid;
+
+  Fixture() {
+    auto const grid_dim = Utils::Vector3i::broadcast(4); // 4 x 4 x 4 nodes
+    auto const viscosity = FT(1.5);
+    auto const density = FT(0.9);
+    lattice = std::make_shared<::LatticeWalberla>(grid_dim, mpi_shape, 1u); // 1u: presumably the ghost-layer count -- confirm
+    lbfluid = std::make_shared<LBWalberlaImplTest<FT, Architecture>>(
+        lattice, viscosity, density);
+  }
+  // Run the accessor checks at node (0, 0, 0): once for a Cell, once for a one-cell CellInterval.
+  void runTest() {
+    auto const bc = walberla::get_block_and_cell(*lattice, {0, 0, 0}, false);
+    BOOST_REQUIRE(bc); // fatal on failure: bc is dereferenced right below
+    auto const cell = bc->cell;
+    auto const ci = walberla::CellInterval(cell, cell);
+    auto &block = *(bc->block);
+    check_getters_setters(block, cell);
+    check_getters_setters(block, ci);
+  }
+  // Round-trip checks of the lbm::accessor getters and setters on the cell(s) designated by `it`.
+  template <typename CellIt>
+  void check_getters_setters(walberla::IBlock &block, CellIt const &it) {
+    using namespace walberla;
+    using PdfField = LBWalberlaImplTest<FT, Architecture>::PdfField;
+    using VectorField = LBWalberlaImplTest<FT, Architecture>::VectorField;
+
+    auto constexpr is_interval = std::is_same_v<CellIt, CellInterval>;
+    auto constexpr epsilon = std::numeric_limits<FT>::epsilon();
+    auto constexpr exact = FT{0};
+    // reference values: std::vector for intervals, Vector3/Matrix3 for a single cell
+    auto const make_ref_vector = [](std::initializer_list<FT> values) {
+      if constexpr (is_interval) {
+        assert(values.size() % 3ul == 0ul);
+        return std::vector<FT>(values);
+      } else {
+        assert(values.size() == 3ul);
+        return Vector3<FT>(std::data(values));
+      }
+    };
+    auto const make_ref_matrix = [](std::initializer_list<FT> values) {
+      if constexpr (is_interval) {
+        assert(values.size() % 9ul == 0ul);
+        return std::vector<FT>(values);
+      } else {
+        assert(values.size() == 9ul);
+        return Matrix3<FT>(std::data(values));
+      }
+    };
+    auto const to_number = [](auto const &value) { // unwrap a scalar or a one-element container
+      if constexpr (std::is_same_v<decltype(value), FT const &>) {
+        return value;
+      } else {
+        assert(value.size() == 1ul);
+        return value[0u];
+      }
+    };
+
+    auto const density = lbfluid->m_density;
+    auto pdf_field = block.template getData<PdfField>(lbfluid->m_pdf_field_id);
+    auto velocity_field =
+        block.template getData<VectorField>(lbfluid->m_velocity_field_id);
+    auto force_field = block.template getData<VectorField>(
+        lbfluid->m_last_applied_force_field_id);
+
+    std::conditional_t<is_interval, std::vector<FT>, std::array<FT, 19u>>
+        cur_pop = lbm::accessor::Population::get(pdf_field, it);
+    std::conditional_t<is_interval, std::vector<FT>, Vector3<FT>> cur_vel;
+    std::conditional_t<is_interval, std::vector<FT>, Vector3<FT>> cur_laf;
+    // re-initialise the PDF field from the populations read at entry, and both vector fields from zero
+    auto const reset_fields = [&, initial_pop = cur_pop]() {
+      auto const null_vector = Vector3<FT>(FT{0});
+      std::array<FT, 19u> pop;
+      for (std::size_t i = 0ul; i < pop.size(); ++i) {
+        pop[i] = initial_pop[i];
+      }
+      lbm::accessor::Population::initialize(pdf_field, pop);
+      lbm::accessor::Vector::initialize(force_field, null_vector);
+      lbm::accessor::Vector::initialize(velocity_field, null_vector);
+    };
+
+    { // part 1: populations, pressure tensor, force and density accessors
+      auto diag = FT{0};
+      auto const zero = FT{0};
+      auto const old_pop = lbm::accessor::Population::get(pdf_field, it);
+      auto const old_pre = lbm::accessor::PressureTensor::get(pdf_field, it);
+      auto const old_laf = lbm::accessor::Vector::get(force_field, it);
+      auto const old_rho = lbm::accessor::Density::get(pdf_field, it);
+      auto ref_pop = old_pop;
+      std::transform(old_pop.begin(), old_pop.end(), ref_pop.begin(),
+                     [](auto const &f) { return FT{2} * f; }); // doubled populations
+      lbm::accessor::Population::set(pdf_field, ref_pop, it);
+      auto const new_pop = lbm::accessor::Population::get(pdf_field, it);
+      auto const new_pre = lbm::accessor::PressureTensor::get(pdf_field, it);
+      auto const new_rho = lbm::accessor::Density::get(pdf_field, it);
+      BOOST_CHECK(almost_equal(new_pop, ref_pop, exact));
+      // clang-format off
+      diag = density * (FT{1} / FT{3});
+      auto const old_pre_ref = make_ref_matrix({diag, zero, zero,
+                                                zero, diag, zero,
+                                                zero, zero, diag});
+      BOOST_CHECK(almost_equal(old_pre, old_pre_ref, epsilon));
+      diag = density * (FT{2} / FT{3});
+      auto const new_pre_ref = make_ref_matrix({diag, zero, zero,
+                                                zero, diag, zero,
+                                                zero, zero, diag});
+      BOOST_CHECK(almost_equal(new_pre, new_pre_ref, epsilon));
+      // clang-format on
+      auto const old_laf_ref = make_ref_vector({FT{0}, FT{0}, FT{0}});
+      auto const new_laf_ref = make_ref_vector({FT{2}, FT{3}, FT{4}});
+      lbm::accessor::Vector::set(force_field, new_laf_ref, it);
+      auto const new_laf = lbm::accessor::Vector::get(force_field, it);
+      BOOST_CHECK(almost_equal(old_laf, old_laf_ref, exact));
+      BOOST_CHECK(almost_equal(new_laf, new_laf_ref, exact));
+      lbm::accessor::Vector::set(force_field, old_laf_ref, it);
+      cur_laf = lbm::accessor::Vector::get(force_field, it);
+      BOOST_CHECK(almost_equal(cur_laf, old_laf_ref, exact));
+      lbm::accessor::Vector::add_to_all(force_field,
+                                        Vector3<FT>(new_laf_ref.data()));
+      cur_laf = lbm::accessor::Vector::get(force_field, it);
+      BOOST_CHECK(almost_equal(cur_laf, new_laf_ref, exact));
+      lbm::accessor::Vector::set(force_field, old_laf_ref, it);
+      cur_laf = lbm::accessor::Vector::get(force_field, it);
+      BOOST_CHECK(almost_equal(cur_laf, old_laf_ref, exact));
+      lbm::accessor::Density::set(pdf_field, {FT{7} * density}, it);
+      auto const cur_rho = lbm::accessor::Density::get(pdf_field, it);
+      BOOST_CHECK(
+          almost_equal(to_number(old_rho), density * FT{1}, FT{20} * epsilon));
+      BOOST_CHECK(
+          almost_equal(to_number(new_rho), density * FT{2}, FT{20} * epsilon));
+      BOOST_CHECK(
+          almost_equal(to_number(cur_rho), density * FT{7}, FT{20} * epsilon));
+    }
+    reset_fields();
+    { // part 2: setters that update several coupled fields in one call
+      // update cached velocities and recalculate populations in a single step
+      auto const old_pop = lbm::accessor::Population::get(pdf_field, it);
+      auto const old_vel = lbm::accessor::Vector::get(velocity_field, it);
+      auto const ref_vel = make_ref_vector({FT(0.001), FT(0.002), FT(0.003)});
+      lbm::accessor::Velocity::set(pdf_field, velocity_field, force_field,
+                                   ref_vel, it);
+      auto const new_pop = lbm::accessor::Population::get(pdf_field, it);
+      auto const new_vel = lbm::accessor::Vector::get(velocity_field, it);
+      BOOST_CHECK(almost_equal(new_vel, ref_vel, epsilon));
+      auto const new_mom =
+          lbm::accessor::MomentumDensity::reduce(pdf_field, force_field);
+      auto const ref_mom = Vector3<FT>(
+          ref_vel[0u] * density, ref_vel[1u] * density, ref_vel[2u] * density);
+      BOOST_CHECK(almost_equal(new_mom, ref_mom, FT{20} * epsilon));
+      // update populations and recalculate cached velocities in a single step
+      lbm::accessor::Population::set(pdf_field, velocity_field, force_field,
+                                     old_pop, it);
+      cur_pop = lbm::accessor::Population::get(pdf_field, it);
+      cur_vel = lbm::accessor::Vector::get(velocity_field, it);
+      BOOST_CHECK(almost_equal(cur_pop, old_pop, exact));
+      BOOST_CHECK(almost_equal(cur_vel, old_vel, epsilon));
+      cur_vel = lbm::accessor::Velocity::get(pdf_field, force_field, it);
+      BOOST_CHECK(almost_equal(cur_vel, old_vel, epsilon));
+      lbm::accessor::Population::set(pdf_field, velocity_field, force_field,
+                                     new_pop, it);
+      cur_pop = lbm::accessor::Population::get(pdf_field, it);
+      cur_vel = lbm::accessor::Vector::get(velocity_field, it);
+      BOOST_CHECK(almost_equal(cur_pop, new_pop, exact));
+      BOOST_CHECK(almost_equal(cur_vel, new_vel, epsilon));
+      cur_vel = lbm::accessor::Velocity::get(pdf_field, force_field, it);
+      BOOST_CHECK(almost_equal(cur_vel, new_vel, epsilon));
+      // update forces and recalculate cached velocities in a single step
+      auto const ref_laf = make_ref_vector({ref_vel[0u] * FT{2} * density,
+                                            ref_vel[1u] * FT{2} * density,
+                                            ref_vel[2u] * FT{2} * density});
+      lbm::accessor::Population::set(pdf_field, velocity_field, force_field,
+                                     old_pop, it);
+      lbm::accessor::Force::set(pdf_field, velocity_field, force_field, ref_laf,
+                                it);
+      cur_pop = lbm::accessor::Population::get(pdf_field, it);
+      cur_vel = lbm::accessor::Vector::get(velocity_field, it);
+      cur_laf = lbm::accessor::Vector::get(force_field, it);
+      BOOST_CHECK(almost_equal(cur_pop, old_pop, exact));
+      BOOST_CHECK(almost_equal(cur_vel, new_vel, epsilon));
+      BOOST_CHECK(almost_equal(cur_laf, ref_laf, epsilon));
+      cur_vel = lbm::accessor::Velocity::get(pdf_field, force_field, it);
+      BOOST_CHECK(almost_equal(cur_vel, new_vel, epsilon));
+      // update velocities and recalculate populations in a single step
+      lbm::accessor::Population::set(pdf_field, velocity_field, force_field,
+                                     old_pop, it);
+      lbm::accessor::Velocity::set(pdf_field, velocity_field, force_field,
+                                   ref_vel, it);
+      cur_pop = lbm::accessor::Population::get(pdf_field, it);
+      cur_vel = lbm::accessor::Vector::get(velocity_field, it);
+      cur_laf = lbm::accessor::Vector::get(force_field, it);
+      BOOST_CHECK(almost_equal(cur_pop, old_pop, epsilon));
+      BOOST_CHECK(almost_equal(cur_vel, new_vel, epsilon));
+      BOOST_CHECK(almost_equal(cur_laf, ref_laf, epsilon));
+      cur_vel = lbm::accessor::Velocity::get(pdf_field, force_field, it);
+      BOOST_CHECK(almost_equal(cur_vel, new_vel, epsilon));
+    }
+    reset_fields();
+  }
+};
+
+BOOST_AUTO_TEST_SUITE(suite, *boost::unit_test::precondition(has_gpu))
+
+BOOST_AUTO_TEST_CASE(test_custom_predicate) { // self-test of almost_equal() on integer ranges
+  std::vector<int> const val = {0, 1, 2, 3, 4, 5, 99, 2}; // first differs from ref at index 6
+  std::vector<int> const ref = {0, 1, 2, 3, 4, 5, 7, 80};
+  auto const is_true = almost_equal(ref, ref, 0);
+  auto const is_false = almost_equal(val, ref, 1);
+  BOOST_REQUIRE(is_true);
+  BOOST_REQUIRE(not is_false);
+  BOOST_CHECK_EQUAL(is_true.message(), ""); // a passing predicate carries no message
+  BOOST_REQUIRE_EQUAL(
+      is_false.message(),
+      "val{[0, 1, 2, 3, 4, 5, ...]} and ref{[0, 1, 2, 3, 4, 5, ...]} " // only the first 6 elements are printed
+      "mismatch: val[6]{99} != ref[6]{7} (difference{92} > delta{1})"); // only the first mismatch is reported
+}
+
+using test_types = boost::mpl::list<float, double>;
+
+BOOST_AUTO_TEST_CASE_TEMPLATE(macroscopic_accessors, FT, test_types) { // instantiated for each type in test_types (float, double)
+  Fixture<FT, lbmpy::Arch::CPU>().runTest(); // same checks on the CPU implementation
+  Fixture<FT, lbmpy::Arch::GPU>().runTest(); // and on the GPU implementation
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+
+// Entry point: start MPI, derive the rank grid, run Boost.Test, stop MPI.
+int main(int argc, char **argv) {
+  MPI_Init(&argc, &argv);
+  int world_size;
+  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+  // mpi_shape has static storage (all zeros), so MPI chooses every dimension
+  MPI_Dims_create(world_size, 3, mpi_shape.data());
+  walberla::mpi_init();
+  auto const rc = boost::unit_test::unit_test_main(init_unit_test, argc, argv);
+  MPI_Finalize();
+  return rc;
+}
+
+#if defined(__NVCC__)
+#pragma nv_diagnostic pop
+#endif
diff --git a/src/walberla_bridge/tests/LBWalberlaImpl_lees_edwards.cpp b/src/walberla_bridge/tests/LBWalberlaImpl_lees_edwards_tests.cpp
similarity index 100%
rename from src/walberla_bridge/tests/LBWalberlaImpl_lees_edwards.cpp
rename to src/walberla_bridge/tests/LBWalberlaImpl_lees_edwards_tests.cpp
diff --git a/src/walberla_bridge/tests/LBWalberlaImpl_statistical_tests.cpp b/src/walberla_bridge/tests/LBWalberlaImpl_statistical_tests.cpp
index 9b3db0fdd98..9732bc8a716 100644
--- a/src/walberla_bridge/tests/LBWalberlaImpl_statistical_tests.cpp
+++ b/src/walberla_bridge/tests/LBWalberlaImpl_statistical_tests.cpp
@@ -107,9 +107,9 @@ BOOST_DATA_TEST_CASE(velocity_fluctuation, bdata::make(thermalized_lbs()),
 
   // check
   auto const tol_v = 3E-6;
-  BOOST_CHECK_SMALL(std::abs(sum_v[0] / steps), tol_v * 100); // boost oddity
-  BOOST_CHECK_SMALL(std::abs(sum_v[1] / steps), tol_v * 100);
-  BOOST_CHECK_SMALL(std::abs(sum_v[2] / steps), tol_v * 100);
+  BOOST_CHECK_SMALL(std::abs(sum_v[0] / steps), tol_v * 100.); // boost oddity
+  BOOST_CHECK_SMALL(std::abs(sum_v[1] / steps), tol_v * 100.);
+  BOOST_CHECK_SMALL(std::abs(sum_v[2] / steps), tol_v * 100.);
 
   const double tol_kT = 5; // this is in percent ...
   BOOST_CHECK_CLOSE(sum_v_square[0] / steps, params.kT, tol_kT);
diff --git a/src/walberla_bridge/tests/LBWalberlaImpl_unit_tests.cpp b/src/walberla_bridge/tests/LBWalberlaImpl_unit_tests.cpp
index 94e90de8623..5df3715cd95 100644
--- a/src/walberla_bridge/tests/LBWalberlaImpl_unit_tests.cpp
+++ b/src/walberla_bridge/tests/LBWalberlaImpl_unit_tests.cpp
@@ -271,7 +271,7 @@ BOOST_DATA_TEST_CASE(domain_and_halo, bdata::make(all_lbs()), lb_generator) {
 
     // If the cell is in the global physical domain
     // check that only one mpi rank said the node was local
-    auto constexpr origin = Vector3i{0, 0, 0};
+    auto constexpr origin = Vector3i{{0, 0, 0}};
     if (n >= origin and n < params.grid_dimensions) {
       boost::mpi::communicator world;
       auto const is_local_sum =
@@ -282,7 +282,7 @@ BOOST_DATA_TEST_CASE(domain_and_halo, bdata::make(all_lbs()), lb_generator) {
 }
 
 static auto fold_node(Vector3i n) {
-  for (unsigned int i = 0; i < 3; i++) {
+  for (auto i = 0u; i < 3u; ++i) {
     if (n[i] < 0) {
       n[i] += params.grid_dimensions[i];
     } else if (n[i] >= params.grid_dimensions[i]) {
diff --git a/src/walberla_bridge/tests/LatticeWalberla_unit_tests.cpp b/src/walberla_bridge/tests/LatticeWalberla_unit_tests.cpp
index c26ad40a084..977586ad896 100644
--- a/src/walberla_bridge/tests/LatticeWalberla_unit_tests.cpp
+++ b/src/walberla_bridge/tests/LatticeWalberla_unit_tests.cpp
@@ -99,7 +99,7 @@ BOOST_DATA_TEST_CASE(domain_and_halo, bdata::xrange(3u), n_ghost_layers) {
 }
 
 BOOST_AUTO_TEST_CASE(exceptions) {
-  for (int i : {0, 1, 2}) {
+  for (auto i : {0u, 1u, 2u}) {
     auto node_grid = Vector3i::broadcast(1);
     auto grid_dims = Vector3i::broadcast(1);
     grid_dims[i] = 3;
diff --git a/src/walberla_bridge/tests/lb_kernels_unit_tests.cpp b/src/walberla_bridge/tests/lb_kernels_unit_tests.cpp
index 3045e9974ce..f45cfb542fe 100644
--- a/src/walberla_bridge/tests/lb_kernels_unit_tests.cpp
+++ b/src/walberla_bridge/tests/lb_kernels_unit_tests.cpp
@@ -19,10 +19,6 @@
 #define BOOST_TEST_MODULE waLBerla LB kernels
 #define BOOST_TEST_DYN_LINK
 
-#include "config/config.hpp"
-
-#ifdef WALBERLA
-
 #include <boost/test/unit_test.hpp>
 
 #include "../src/lattice_boltzmann/generated_kernels/Dynamic_UBB_double_precision.h"
@@ -189,5 +185,3 @@ BOOST_AUTO_TEST_CASE(macroscopic_accessor_equilibrium_distribution) {
     }
   }
 }
-
-#endif
diff --git a/testsuite/CMakeLists.txt b/testsuite/CMakeLists.txt
index 027cc3f8529..d46e6094db2 100644
--- a/testsuite/CMakeLists.txt
+++ b/testsuite/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2015-2022 The ESPResSo project
+# Copyright (C) 2015-2024 The ESPResSo project
 #
 # This file is part of ESPResSo.
 #
@@ -17,8 +17,6 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-include(unit_test)
-
 if(ESPRESSO_BUILD_WITH_COVERAGE_PYTHON)
   set(PYPRESSO_OPTIONS --coverage)
 endif()
diff --git a/testsuite/cmake/CMakeLists.txt b/testsuite/cmake/CMakeLists.txt
index 5578f17edf7..f39018100a7 100644
--- a/testsuite/cmake/CMakeLists.txt
+++ b/testsuite/cmake/CMakeLists.txt
@@ -19,7 +19,7 @@
 
 function(CMAKE_TEST)
   cmake_parse_arguments(TEST "" "FILE" "DEPENDENCIES" ${ARGN})
-  get_filename_component(TEST_NAME ${TEST_FILE} NAME_WE)
+  cmake_path(GET TEST_FILE STEM TEST_NAME)
   configure_file(${TEST_FILE} ${CMAKE_CURRENT_BINARY_DIR}/${TEST_FILE} @ONLY)
   foreach(dependency IN LISTS TEST_DEPENDENCIES)
     configure_file(${dependency} ${CMAKE_CURRENT_BINARY_DIR}/${dependency}
diff --git a/testsuite/cmake/test_install.sh b/testsuite/cmake/test_install.sh
index 5f5c906ba66..845abcadf58 100755
--- a/testsuite/cmake/test_install.sh
+++ b/testsuite/cmake/test_install.sh
@@ -1,5 +1,6 @@
 #!/usr/bin/env bash
-# Copyright (C) 2018-2022 The ESPResSo project
+#
+# Copyright (C) 2018-2024 The ESPResSo project
 #
 # This file is part of ESPResSo.
 #
@@ -15,6 +16,7 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
 
 # load bash unit testing library
 source BashUnitTests.sh
@@ -25,14 +27,18 @@ function test_install() {
   local -r filepaths=("@ESPRESSO_INSTALL_BINDIR@/pypresso" \
                       "@ESPRESSO_INSTALL_PYTHON@/espressomd/espresso_core.so" \
                       "@ESPRESSO_INSTALL_PYTHON@/espressomd/_init.so" \
-                      "@ESPRESSO_INSTALL_PYTHON@/espressomd/__init__.py"
+                      "@ESPRESSO_INSTALL_PYTHON@/espressomd/__init__.py" \
+                      "@ESPRESSO_INSTALL_PYTHON@/espressomd/io/__init__.py" \
+                      "@ESPRESSO_INSTALL_PYTHON@/espressomd/io/writer/__init__.py" \
+                      "@ESPRESSO_INSTALL_PYTHON@/object_in_fluid/__init__.py" \
+                      "@ESPRESSO_INSTALL_PYTHON@/object_in_fluid/oif_classes.py" \
                      )
   for filepath in "${filepaths[@]}"; do
     assert_file_exists "${filepath}"
   done
 
   # check no Python file was installed outside espressomd
-  paths=$(find "@CMAKE_INSTALL_PREFIX@" -path "@ESPRESSO_INSTALL_PYTHON@/espressomd" -prune -o \( -name '*.py' -o -name '*.so' \) -print)
+  paths=$(find "@CMAKE_INSTALL_PREFIX@" -path "@ESPRESSO_INSTALL_PYTHON@/espressomd" -prune -o -path "@ESPRESSO_INSTALL_PYTHON@/object_in_fluid" -prune -o \( -name '*.py' -o -name '*.so' \) -print)
   count=$(echo "${paths}" | wc -l)
   assert_string_equal "${paths}" "" "${count} files were installed in the wrong directories:"$'\n'"${paths}"
 
diff --git a/testsuite/python/CMakeLists.txt b/testsuite/python/CMakeLists.txt
index 0389f71ad7c..73618834b1c 100644
--- a/testsuite/python/CMakeLists.txt
+++ b/testsuite/python/CMakeLists.txt
@@ -56,7 +56,7 @@
 function(python_test)
   cmake_parse_arguments(TEST "NO_MPI" "FILE;MAX_NUM_PROC;GPU_SLOTS;SUFFIX"
                         "DEPENDS;DEPENDENCIES;LABELS;ARGUMENTS" ${ARGN})
-  get_filename_component(TEST_NAME ${TEST_FILE} NAME_WE)
+  cmake_path(GET TEST_FILE STEM TEST_NAME)
   set(TEST_FILE_CONFIGURED "${CMAKE_CURRENT_BINARY_DIR}/${TEST_NAME}.py")
   if(TEST_SUFFIX)
     set(TEST_NAME "${TEST_NAME}_${TEST_SUFFIX}")
@@ -194,14 +194,15 @@ checkpoint_test(MODES therm_lb__p3m_cpu__lj__lb_walberla_cpu_ascii SUFFIX
                 1_core MAX_NUM_PROC 1)
 checkpoint_test(MODES therm_lb__p3m_cpu__lj__lb_walberla_cpu_ascii)
 checkpoint_test(MODES therm_lb__elc_cpu__lj__lb_walberla_cpu_binary)
-checkpoint_test(MODES therm_lb__elc_gpu__lj__lb_walberla_cpu_ascii GPU_SLOTS 3)
-checkpoint_test(MODES therm_lb__p3m_gpu__lj__lb_walberla_cpu_binary GPU_SLOTS 3)
+checkpoint_test(MODES therm_lb__elc_gpu__lj__lb_walberla_gpu_ascii GPU_SLOTS 3)
+checkpoint_test(MODES therm_lb__p3m_gpu__lj__lb_walberla_gpu_binary GPU_SLOTS 3)
 checkpoint_test(MODES therm_npt__int_npt)
 checkpoint_test(MODES int_sd__lj)
 checkpoint_test(MODES dp3m_cpu__therm_langevin__int_nvt)
 checkpoint_test(MODES therm_dpd__int_nvt)
 checkpoint_test(MODES scafacos__therm_bd__int_bd)
 checkpoint_test(MODES therm_sdm__int_sdm)
+checkpoint_test(MODES lj__ase SUFFIX 1_core MAX_NUM_PROC 1)
 
 python_test(FILE bond_breakage.py MAX_NUM_PROC 4)
 python_test(FILE cell_system.py MAX_NUM_PROC 4)
@@ -219,8 +220,6 @@ python_test(FILE accumulator_correlator.py MAX_NUM_PROC 4)
 python_test(FILE accumulator_mean_variance.py MAX_NUM_PROC 4)
 python_test(FILE accumulator_time_series.py MAX_NUM_PROC 1)
 python_test(FILE dawaanr-and-dds-gpu.py MAX_NUM_PROC 1 GPU_SLOTS 1)
-python_test(FILE dawaanr-and-bh-gpu.py MAX_NUM_PROC 1 GPU_SLOTS 1)
-python_test(FILE dds-and-bh-gpu.py MAX_NUM_PROC 4 GPU_SLOTS 3)
 python_test(FILE electrostatic_interactions.py MAX_NUM_PROC 2)
 python_test(FILE engine_langevin.py MAX_NUM_PROC 4)
 python_test(FILE engine_lb.py MAX_NUM_PROC 2 GPU_SLOTS 1)
@@ -241,7 +240,6 @@ python_test(FILE pressure.py MAX_NUM_PROC 4)
 python_test(FILE scafacos_dipoles_1d_2d.py MAX_NUM_PROC 4)
 python_test(FILE scafacos_interface.py MAX_NUM_PROC 2)
 python_test(FILE long_range_actors.py MAX_NUM_PROC 4 GPU_SLOTS 2)
-python_test(FILE long_range_actors.py MAX_NUM_PROC 1 GPU_SLOTS 1 SUFFIX 1_core)
 python_test(FILE tabulated.py MAX_NUM_PROC 2)
 python_test(FILE particle_slice.py MAX_NUM_PROC 4)
 python_test(FILE rigid_bond.py MAX_NUM_PROC 4)
@@ -331,28 +329,33 @@ python_test(FILE drude.py MAX_NUM_PROC 2)
 python_test(FILE thermostats_anisotropic.py MAX_NUM_PROC 4)
 python_test(FILE thermalized_bond.py MAX_NUM_PROC 4)
 python_test(FILE thole.py MAX_NUM_PROC 4)
-python_test(FILE lb_slice.py MAX_NUM_PROC 2)
+python_test(FILE lb_slice.py MAX_NUM_PROC 2 GPU_SLOTS 1)
 python_test(FILE lb_boundary_velocity.py MAX_NUM_PROC 1)
 # python_test(FILE lb_boundary_volume_force.py MAX_NUM_PROC 2) # TODO
-python_test(FILE lb_boundary_ghost_layer.py MAX_NUM_PROC 2)
+python_test(FILE lb_boundary_ghost_layer.py MAX_NUM_PROC 2 GPU_SLOTS 1)
 python_test(FILE lb_circular_couette.py MAX_NUM_PROC 2 GPU_SLOTS 1)
 python_test(FILE lb_poiseuille.py MAX_NUM_PROC 4 GPU_SLOTS 1)
 python_test(FILE lb_poiseuille_cylinder.py MAX_NUM_PROC 2 GPU_SLOTS 1)
-python_test(FILE lb_interpolation.py MAX_NUM_PROC 4 GPU_SLOTS 1)
+python_test(FILE lb_interpolation.py MAX_NUM_PROC 4 GPU_SLOTS 0)
+python_test(FILE lb_interpolation.py MAX_NUM_PROC 1 GPU_SLOTS 1 SUFFIX 1_core)
 python_test(FILE analyze_gyration_tensor.py MAX_NUM_PROC 2)
 python_test(FILE oif_volume_conservation.py MAX_NUM_PROC 2)
 python_test(FILE simple_pore.py MAX_NUM_PROC 1)
 python_test(FILE field_test.py MAX_NUM_PROC 1)
 python_test(FILE lb_boundary.py MAX_NUM_PROC 2 GPU_SLOTS 1)
-python_test(FILE lb_streaming.py MAX_NUM_PROC 4 GPU_SLOTS 1)
+python_test(FILE lb_streaming.py MAX_NUM_PROC 4 GPU_SLOTS 0)
+python_test(FILE lb_streaming.py MAX_NUM_PROC 1 GPU_SLOTS 1 SUFFIX 1_core)
 python_test(FILE lb_shear.py MAX_NUM_PROC 2 GPU_SLOTS 1)
-python_test(FILE lb_thermostat.py MAX_NUM_PROC 2 GPU_SLOTS 1)
+python_test(FILE lb_thermostat.py MAX_NUM_PROC 2 GPU_SLOTS 0)
+python_test(FILE lb_thermostat.py MAX_NUM_PROC 1 GPU_SLOTS 1 SUFFIX 1_core)
 # python_test(FILE lb_buoyancy_force.py MAX_NUM_PROC 2 GPU_SLOTS 1) # TODO
 python_test(FILE lb_momentum_conservation.py MAX_NUM_PROC 2 GPU_SLOTS 1 LABELS
             long)
 python_test(FILE lb_momentum_conservation.py MAX_NUM_PROC 1 GPU_SLOTS 1 LABELS
             long SUFFIX 1_core)
-python_test(FILE lb_mass_conservation.py MAX_NUM_PROC 2)
+python_test(FILE lb_mass_conservation.py MAX_NUM_PROC 2 GPU_SLOTS 0)
+python_test(FILE lb_mass_conservation.py MAX_NUM_PROC 1 GPU_SLOTS 1 SUFFIX
+            1_core)
 python_test(FILE p3m_electrostatic_pressure.py MAX_NUM_PROC 2 GPU_SLOTS 1)
 python_test(FILE p3m_madelung.py MAX_NUM_PROC 2 GPU_SLOTS 2 LABELS long)
 python_test(FILE sigint.py DEPENDENCIES sigint_child.py NO_MPI)
@@ -366,7 +369,7 @@ python_test(FILE decorators.py MAX_NUM_PROC 1)
 python_test(FILE galilei.py MAX_NUM_PROC 4)
 python_test(FILE linear_momentum.py MAX_NUM_PROC 4)
 python_test(FILE linear_momentum_lb.py MAX_NUM_PROC 2 GPU_SLOTS 1)
-python_test(FILE mmm1d.py MAX_NUM_PROC 2 GPU_SLOTS 1)
+python_test(FILE mmm1d.py MAX_NUM_PROC 2)
 python_test(FILE integrator_stokesian_stats.py MAX_NUM_PROC 2 LABELS long)
 python_test(FILE elc.py MAX_NUM_PROC 2 GPU_SLOTS 1)
 python_test(FILE elc_vs_analytic.py MAX_NUM_PROC 2 GPU_SLOTS 1)
@@ -404,6 +407,7 @@ python_test(FILE propagation_brownian.py MAX_NUM_PROC 1)
 python_test(FILE propagation_lb.py MAX_NUM_PROC 2 GPU_SLOTS 1)
 python_test(FILE propagation_npt.py MAX_NUM_PROC 4 GPU_SLOTS 1)
 python_test(FILE propagation_stokesian.py MAX_NUM_PROC 2)
+python_test(FILE coordinates_folding.py MAX_NUM_PROC 2)
 
 set(ESPRESSO_CTEST_RESOURCE_SPEC_FILE resources.json)
 configure_file(
diff --git a/testsuite/python/analyze_chains.py b/testsuite/python/analyze_chains.py
index 0669485e81b..35b8c092748 100644
--- a/testsuite/python/analyze_chains.py
+++ b/testsuite/python/analyze_chains.py
@@ -25,6 +25,7 @@
 import espressomd.interactions
 import espressomd.lees_edwards
 import espressomd.polymer
+import espressomd.propagation
 
 
 @utx.skipIfMissingFeatures("LENNARD_JONES")
@@ -126,7 +127,8 @@ def test_observables_no_pbc(self):
         # increase PBC to remove interactions with periodic images
         all_partcls = self.system.part.all()
         old_pos = all_partcls.pos.copy()
-        self.system.box_l = self.system.box_l * 2.
+        self.system.change_volume_and_rescale_particles(
+            2. * self.system.box_l[0], "xyz")
         all_partcls.pos = old_pos
         # compare calc_re()
         core_re = self.system.analysis.calc_re(chain_start=0,
@@ -191,6 +193,7 @@ def test_observables_lebc(self):
                                    atol=1e-8)
 
     def test_exceptions(self):
+        Propagation = espressomd.propagation.Propagation
         num_poly = 2
         num_mono = 5
         self.insert_polymers(num_poly, num_mono)
@@ -216,6 +219,8 @@ def test_exceptions(self):
             with self.assertRaisesRegex(RuntimeError, "Center of mass is not well-defined"):
                 p = self.system.part.by_id(0)
                 p.vs_relative = (1, 0.01, (1., 0., 0., 0.))
+                p.propagation = (Propagation.TRANS_VS_RELATIVE |
+                                 Propagation.ROT_VS_RELATIVE)
                 analysis.calc_rg(chain_start=0, number_of_chains=num_poly,
                                  chain_length=num_mono)
         with self.assertRaisesRegex(RuntimeError, "Parameter 'analysis' is read-only"):
diff --git a/testsuite/python/array_properties.py b/testsuite/python/array_properties.py
index 85a036bd3d9..60e0fb4a04c 100644
--- a/testsuite/python/array_properties.py
+++ b/testsuite/python/array_properties.py
@@ -102,14 +102,15 @@ class ArrayPropertyTest(ArrayCommon):
     system.box_l = [12.0, 12.0, 12.0]
     system.time_step = 0.01
     system.cell_system.skin = 0.01
-    partcl = system.part.add(pos=[0, 0, 0])
 
     def setUp(self):
         self.system.box_l = [12.0, 12.0, 12.0]
+        self.partcl = self.system.part.add(pos=[0, 0, 0])
 
     def tearDown(self):
         if espressomd.has_features("WALBERLA"):
             self.system.lb = None
+        self.system.part.clear()
 
     def assert_copy_is_writable(self, array):
         cpy = np.copy(array)
diff --git a/testsuite/python/ase_interface.py b/testsuite/python/ase_interface.py
new file mode 100644
index 00000000000..7313f9f56be
--- /dev/null
+++ b/testsuite/python/ase_interface.py
@@ -0,0 +1,69 @@
+#
+# Copyright (C) 2024 The ESPResSo project
+#
+# This file is part of ESPResSo.
+#
+# ESPResSo is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ESPResSo is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+import unittest as ut
+import unittest_decorators as utx
+import espressomd
+import espressomd.plugins.ase
+import numpy as np
+import ase
+
+
+class ASEInterfaceTest(ut.TestCase):
+
+    system = espressomd.System(
+        box_l=[10., 10., 10.], periodicity=[True, True, False])
+
+    def setUp(self):
+        self.system.part.add(pos=[0., 0., 0.], f=[1., -1., 0.], type=0)
+        self.system.part.add(pos=[0., 0., 1.], f=[0., 12., 0.], type=1)
+        self.system.ase = espressomd.plugins.ase.ASEInterface(
+            type_mapping={0: "H", 1: "O"},
+        )
+
+    def tearDown(self):
+        self.system.part.clear()
+
+    def test_ase_get(self):
+        """Test the ``ASEInterface.get()`` method."""
+        # Create a simple ASE atoms object
+        atoms = self.system.ase.get()
+        self.assertIsInstance(atoms, ase.Atoms)
+        self.assertEqual(set(atoms.get_chemical_symbols()), {"H", "O"})
+        np.testing.assert_equal(atoms.pbc, np.copy(self.system.periodicity))
+        np.testing.assert_allclose(atoms.cell, np.diag(self.system.box_l))
+        np.testing.assert_allclose(atoms.get_positions(),
+                                   [[0., 0., 0.], [0., 0., 1.]])
+        np.testing.assert_allclose(atoms.get_forces(),
+                                   [[1., -1., 0.], [0., 12., 0.]])
+
+    @utx.skipIfMissingFeatures("VIRTUAL_SITES_RELATIVE")
+    def test_exceptions(self):
+        p = self.system.part.add(pos=[0., 0., 0.], type=10)
+        with self.assertRaisesRegex(RuntimeError, r"Particle types '\{10\}' haven't been registered"):
+            self.system.ase.get()
+        p.type = 1
+        vs = self.system.part.add(pos=[0., 0., 0.], type=1)
+        vs.vs_relative = (p.id, 0.01, (1., 0., 0., 0.))
+        with self.assertRaisesRegex(RuntimeError, "ASE doesn't support virtual sites"):
+            self.system.ase.get()
+
+
+if __name__ == "__main__":
+    ut.main()
diff --git a/testsuite/python/caliper.py b/testsuite/python/caliper.py
index e5cf639a9cb..67aedbb11bc 100644
--- a/testsuite/python/caliper.py
+++ b/testsuite/python/caliper.py
@@ -26,16 +26,23 @@
 import os
 
 EXPECTED_LABELS = """
-calc_energies
-  short_range_loop
 integrate
-  Integration loop
+  Initial Force Calculation
     calculate_forces
-      copy_forces_from_GPU
-      short_range_loop
-      calc_long_range_forces
+      copy_particles_to_GPU
       init_forces
+      calc_long_range_forces
+      short_range_loop
+      copy_forces_from_GPU
+  Integration loop
+    calculate_forces
       copy_particles_to_GPU
+      init_forces
+      calc_long_range_forces
+      short_range_loop
+      copy_forces_from_GPU
+calc_energies
+  short_range_loop
 """
 
 
@@ -47,7 +54,7 @@ def test_runtime_report(self):
         has_cuda = espressomd.has_features(["CUDA"])
         script = str(pathlib.Path(__file__).parent / "caliper_child.py")
         my_env = os.environ.copy()
-        my_env["CALI_CONFIG_PROFILE"] = "runtime-report"
+        my_env["CALI_CONFIG"] = "runtime-report"
         process = subprocess.Popen([sys.executable, script],
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
@@ -58,7 +65,7 @@ def test_runtime_report(self):
             if not line.startswith("WARNING:"):
                 lines = lines[i:]
                 break
-        header = "Path\tInclusive time\tExclusive\ttime\tTime %"
+        header = "Path\tMin time/rank\tMax time/rank\tAvg time/rank\tTime %"
         self.assertEqual(lines[0].split(), header.split(),
                          msg=f"Caliper summary should start with '{header}'")
         labels = [line[:30].strip() for line in lines[1:]]
diff --git a/testsuite/python/collision_detection.py b/testsuite/python/collision_detection.py
index 1f67d676f4c..1f0a2ae2411 100644
--- a/testsuite/python/collision_detection.py
+++ b/testsuite/python/collision_detection.py
@@ -684,61 +684,6 @@ def test_glue_to_surface_random(self):
         # Tidy
         system.non_bonded_inter[0, 0].lennard_jones.deactivate()
 
-    def test_bind_three_particles(self):
-        system = self.system
-        # Setup particles
-        system.part.clear()
-        dx = np.array((1, 0, 0))
-        dy = np.array((0, 1, 0))
-        a = np.array((0.499, 0.499, 0.499))
-        b = a + 0.1 * dx
-        c = a + 0.03 * dx + 0.03 * dy
-        d = a + 0.03 * dx - 0.03 * dy
-        e = a - 0.1 * dx
-
-        system.part.add(id=0, pos=a)
-        system.part.add(id=1, pos=b)
-        system.part.add(id=2, pos=c)
-        system.part.add(id=3, pos=d)
-        system.part.add(id=4, pos=e)
-
-        # Setup bonds
-        res = 181
-        for i in range(res):
-            system.bonded_inter[i + 4] = espressomd.interactions.AngleHarmonic(
-                bend=1, phi0=float(i) / (res - 1) * np.pi)
-        cutoff = 0.11
-        system.collision_detection.set_params(
-            mode="bind_three_particles", bond_centers=self.bond_center,
-            bond_three_particles=4, three_particle_binding_angle_resolution=res, distance=cutoff)
-        self.get_state_set_state_consistency()
-
-        system.time_step = 1E-6
-        system.integrator.run(1, recalc_forces=True)
-        self.verify_triangle_binding(cutoff, system.bonded_inter[4], res)
-        # Make sure no extra bonds appear
-        system.integrator.run(1, recalc_forces=True)
-        self.verify_triangle_binding(cutoff, system.bonded_inter[4], res)
-
-        # Place the particles in two steps and make sure, the bonds are the
-        # same
-        system.part.clear()
-        system.part.add(id=0, pos=a)
-        system.part.add(id=2, pos=c)
-        system.part.add(id=3, pos=d)
-        system.integrator.run(1, recalc_forces=True)
-
-        system.part.add(id=4, pos=e)
-        system.part.add(id=1, pos=b)
-        system.cell_system.set_regular_decomposition()
-        system.integrator.run(1, recalc_forces=True)
-        self.verify_triangle_binding(cutoff, system.bonded_inter[4], res)
-        system.cell_system.set_n_square()
-        system.part.all().bonds = ()
-        system.integrator.run(1, recalc_forces=True)
-        self.verify_triangle_binding(cutoff, system.bonded_inter[4], res)
-        system.time_step = self.time_step
-
     def verify_triangle_binding(self, distance, first_bond, angle_res):
         system = self.system
         # Gather pairs
diff --git a/testsuite/python/collision_detection_interface.py b/testsuite/python/collision_detection_interface.py
index c035a413bc2..25df82dad93 100644
--- a/testsuite/python/collision_detection_interface.py
+++ b/testsuite/python/collision_detection_interface.py
@@ -46,11 +46,6 @@ class CollisionDetection(ut.TestCase):
             "bond_vs": bond_harmonic, "bond_centers": bond_harmonic,
             "part_type_vs": 1, "distance": 0.1, "vs_placement": 0.1
         },
-        "bind_three_particles": {
-            "bond_centers": bond_harmonic, "bond_three_particles": 0,
-            "three_particle_binding_angle_resolution": bond_angle_resolution,
-            "distance": 0.1
-        },
         "glue_to_surface": {
             "distance": 0.1, "distance_glued_particle_to_vs": 0.02,
             "bond_centers": bond_harmonic, "bond_vs": bond_harmonic,
@@ -136,7 +131,7 @@ def test_bind_at_point_of_collision(self):
         with self.assertRaisesRegex(RuntimeError, "Bond in parameter 'bond_vs' was not added to the system"):
             bond = espressomd.interactions.HarmonicBond(k=1., r_0=0.1)
             self.set_coldet("bind_at_point_of_collision", bond_vs=bond)
-        with self.assertRaisesRegex(RuntimeError, "bond type to be used for binding virtual sites needs to be a pair or three-particle bond"):
+        with self.assertRaisesRegex(RuntimeError, "bond type to be used for binding virtual sites needs to be a pair bond"):
             self.set_coldet(
                 "bind_at_point_of_collision", bond_vs=self.bond_dihe)
         with self.assertRaisesRegex(ValueError, "type for virtual sites needs to be >=0"):
@@ -151,20 +146,6 @@ def test_bind_at_point_of_collision_norotation(self):
             with self.assertRaisesRegex(RuntimeError, "require the VIRTUAL_SITES_RELATIVE feature"):
                 self.set_coldet("bind_at_point_of_collision")
 
-    def test_bind_three_particles(self):
-        self.set_coldet("bind_three_particles", distance=0.5)
-        with self.assertRaisesRegex(RuntimeError, "Insufficient bonds defined for three particle binding"):
-            self.set_coldet(
-                "bind_three_particles",
-                three_particle_binding_angle_resolution=self.bond_angle_resolution +
-                1000)
-        with self.assertRaisesRegex(RuntimeError, "The bonds for three particle binding need to be angle bonds"):
-            self.set_coldet(
-                "bind_three_particles",
-                three_particle_binding_angle_resolution=self.bond_angle_resolution + 1)
-        # check if original parameters have been preserved
-        self.check_stored_parameters("bind_three_particles", distance=0.5)
-
     @utx.skipIfMissingFeatures("VIRTUAL_SITES_RELATIVE")
     def test_glue_to_surface(self):
         self.set_coldet("glue_to_surface", distance=0.5)
diff --git a/testsuite/python/constant_pH.py b/testsuite/python/constant_pH.py
index afbbf692d6a..177079541d0 100644
--- a/testsuite/python/constant_pH.py
+++ b/testsuite/python/constant_pH.py
@@ -17,6 +17,7 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
+# pylint: disable=cyclic-import
 import unittest as ut
 import numpy as np
 import espressomd
diff --git a/testsuite/python/constraint_shape_based.py b/testsuite/python/constraint_shape_based.py
index 738ef27606c..88adf51a073 100644
--- a/testsuite/python/constraint_shape_based.py
+++ b/testsuite/python/constraint_shape_based.py
@@ -868,7 +868,7 @@ def test_rhomboid(self):
                     dist_vec = np.array([0.0, 0.0, 0.0])
 
                     # check if outside or inside
-                    if(pos[0] < (self.box_l + length[0] - abs(length[0])) / 2.0 or
+                    if (pos[0] < (self.box_l + length[0] - abs(length[0])) / 2.0 or
                        pos[0] > (self.box_l + length[0] + abs(length[0])) / 2.0 or
                        pos[1] < (self.box_l + length[1] - abs(length[1])) / 2.0 or
                        pos[1] > (self.box_l + length[1] + abs(length[1])) / 2.0 or
@@ -1072,14 +1072,26 @@ def test_torus(self):
 
     def test_exceptions(self):
         system = self.system
+        box_l = self.box_l
         wall = espressomd.shapes.Wall(normal=[0., 1., 0.], dist=0.)
         constraint = espressomd.constraints.ShapeBasedConstraint(
             shape=wall, particle_type=1)
         system.constraints.add(constraint)
         with self.assertRaisesRegex(RuntimeError, "there are active constraints"):
             system.box_l = 0.5 * system.box_l
+        np.testing.assert_allclose(np.copy(system.box_l), box_l, atol=1e-7)
+        with self.assertRaisesRegex(RuntimeError, "there are active constraints"):
+            system.change_volume_and_rescale_particles(
+                0.5 * system.box_l[0], "xyz")
+        np.testing.assert_allclose(np.copy(system.box_l), box_l, atol=1e-7)
         system.constraints.remove(constraint)
         system.box_l = 0.75 * system.box_l
+        np.testing.assert_allclose(
+            np.copy(system.box_l), 0.75 * box_l, atol=1e-7)
+        system.change_volume_and_rescale_particles(
+            0.5 * system.box_l[0], "xyz")
+        np.testing.assert_allclose(
+            np.copy(system.box_l), 0.75 * 0.5 * box_l, atol=1e-7)
 
 
 if __name__ == "__main__":
diff --git a/testsuite/python/coordinates_folding.py b/testsuite/python/coordinates_folding.py
new file mode 100644
index 00000000000..8eb5d0b023d
--- /dev/null
+++ b/testsuite/python/coordinates_folding.py
@@ -0,0 +1,140 @@
+#
+# Copyright (C) 2024 The ESPResSo project
+#
+# This file is part of ESPResSo.
+#
+# ESPResSo is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ESPResSo is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+import sys
+import pathlib
+import tempfile
+import contextlib
+import unittest as ut
+import numpy as np
+import espressomd
+import espressomd.io.writer
+import espressomd.accumulators
+import espressomd.observables
+import espressomd.lees_edwards
+with contextlib.suppress(ImportError):
+    import h5py  # h5py has to be imported *after* espressomd (MPI)
+
+
+class Test(ut.TestCase):
+
+    system = espressomd.System(box_l=[10., 10., 10.])
+    system.cell_system.skin = 0.4
+    system.time_step = 0.01
+
+    def tearDown(self):
+        self.system.part.clear()
+        self.system.auto_update_accumulators.clear()
+        self.system.cell_system.skin = 0.4
+        self.system.time_step = 0.01
+        self.system.time = 0.
+
+    def check_trajectory(self, pos_unfolded_ref, **kwargs):
+        tol = {"atol": 1e-9, "rtol": 0.}
+        box_l = np.copy(self.system.box_l)
+        pos_unfolded_ref = np.copy(pos_unfolded_ref)
+        if "image_box" in kwargs:
+            image_box = np.copy(kwargs["image_box"])
+            np.testing.assert_equal(image_box[:, 0], 5 * [+0] + 5 * [+1])
+            np.testing.assert_equal(image_box[:, 1], 5 * [+3] + 5 * [+2])
+            np.testing.assert_equal(image_box[:, 2], 5 * [-2] + 5 * [-1])
+        if "pos_unfolded" in kwargs:
+            pos_unfolded = np.copy(kwargs["pos_unfolded"])
+            np.testing.assert_allclose(pos_unfolded, pos_unfolded_ref, **tol)
+        if "pos_folded" in kwargs:
+            pos_folded = np.copy(kwargs["pos_folded"])
+            image_box = np.copy(kwargs["image_box"])
+            pos_folded_ref = pos_unfolded_ref - image_box * box_l
+            np.testing.assert_allclose(pos_folded, pos_folded_ref, **tol)
+
+    def check_folding(self, description, skin, le_flag, temp_path):
+        """Move a particle across periodic boundaries and check folding."""
+        has_hdf5 = "h5py" in sys.modules and espressomd.has_features(["H5MD"])
+        system = self.system
+        temp_file = temp_path / f"skin_{skin:.5f}_le_{le_flag}.h5"
+        system.part.clear()
+        system.auto_update_accumulators.clear()
+        system.lees_edwards.protocol = None
+        system.cell_system.skin = skin
+        system.time_step = 0.01
+        system.time = 0.
+        v0 = np.array([0.1, -0.1, 0.1])
+        x0 = np.array([9.95 - 1e-9, 30.05 + 1e-9, -10.05 - 1e-9])
+        p = system.part.add(pos=x0, v=v0)
+        if le_flag:
+            protocol = espressomd.lees_edwards.LinearShear(
+                shear_velocity=0., initial_pos_offset=0.5, time_0=0.)
+            system.lees_edwards.set_boundary_conditions(
+                shear_direction="y", shear_plane_normal="x", protocol=protocol)
+        obs = espressomd.observables.ParticlePositions(ids=[0])
+        acc = espressomd.accumulators.TimeSeries(obs=obs, delta_N=10)
+        system.auto_update_accumulators.add(acc)
+        if has_hdf5:
+            h5 = espressomd.io.writer.h5md.H5md(file_path=str(temp_file))
+        pos_unfolded_ref, pos_unfolded, pos_folded, image_box = [], [], [], []
+        for _ in range(10):
+            system.integrator.run(10)
+            pos = x0 + v0 * system.time
+            # the shear plane normal is "x": the position offset along "y"
+            # applies once the particle has left the box along the x-axis
+            if le_flag and pos[0] > system.box_l[0]:
+                pos[1] -= protocol.initial_pos_offset
+            pos_unfolded_ref.append(pos)
+            pos_unfolded.append(p.pos)
+            pos_folded.append(p.pos_folded)
+            image_box.append(p.image_box)
+            if has_hdf5:
+                h5.write()
+        if has_hdf5:
+            h5.flush()
+            h5.close()
+        with self.subTest(msg=f"{description}; trajectory from particle handle"):
+            self.check_trajectory(pos_unfolded_ref, image_box=image_box,
+                                  pos_folded=pos_folded, pos_unfolded=pos_unfolded)
+        with self.subTest(msg=f"{description}; trajectory from observable"):
+            obs_lag = (acc.delta_N - 1) * system.time_step * v0
+            obs_unfolded = acc.time_series().reshape((-1, 3)) + obs_lag
+            self.check_trajectory(pos_unfolded_ref, pos_unfolded=obs_unfolded)
+        if has_hdf5:
+            with self.subTest(msg=f"{description}; trajectory from hdf5 file"):
+                with h5py.File(temp_file, "r") as h5:
+                    prop = "particles/atoms/{}/value"
+                    h5py_pos = h5[prop.format("position")][:].reshape((-1, 3))
+                    h5py_img = h5[prop.format("image")][:].reshape((-1, 3))
+                self.check_trajectory(
+                    pos_unfolded_ref, image_box=h5py_img, pos_folded=h5py_pos)
+
+    def test_folding(self):
+        """Check coordinates folding for several skins, with/without LEbc."""
+        scenarios = [
+            (0.400, "scenario: local cell resort only at first time step"),
+            (0.040, "scenario: local cell resort in 1 cell at every time step"),
+            (0.004, "scenario: local cell resort in 5 cells at every time step"),
+            (0.000, "scenario: local cell resort in 10 cells at every time step"),
+        ]
+        with tempfile.TemporaryDirectory() as temp_dir_name:
+            path = pathlib.Path(temp_dir_name).resolve()
+            for le_flag, le_qualifier in [(False, "without"), (True, "with")]:
+                for skin, description in scenarios:
+                    scenario = f"{description}; {le_qualifier} Lees-Edwards"
+                    self.check_folding(scenario, skin, le_flag, path)
+
+
+if __name__ == "__main__":
+    ut.main()
diff --git a/testsuite/python/coulomb_interface.py b/testsuite/python/coulomb_interface.py
index b2257fe49d8..fc40860e4a2 100644
--- a/testsuite/python/coulomb_interface.py
+++ b/testsuite/python/coulomb_interface.py
@@ -122,25 +122,6 @@ def test_mmm1d_cpu(self):
         self.assertAlmostEqual(
             self.system.analysis.energy()["coulomb"], ref_energy, delta=1e-7)
 
-    @utx.skipIfMissingGPU()
-    @utx.skipIfMissingFeatures(["CUDA", "MMM1D_GPU"])
-    def test_mmm1d_gpu(self):
-        self.system.periodicity = [False, False, True]
-        self.system.cell_system.set_n_square()
-        valid_params = dict(
-            prefactor=1., maxPWerror=1e-3, far_switch_radius=1.,
-            check_neutrality=True, charge_neutrality_tolerance=7e-12,
-            bessel_cutoff=1)
-        tests_common.generate_test_for_actor_class(
-            self.system.electrostatics, espressomd.electrostatics.MMM1DGPU, valid_params)(self)
-
-        for key in ["prefactor", "maxPWerror",
-                    "far_switch_radius", "bessel_cutoff"]:
-            invalid_params = valid_params.copy()
-            invalid_params[key] = -2
-            with self.assertRaisesRegex(ValueError, f"Parameter '{key}' must be > 0"):
-                espressomd.electrostatics.MMM1DGPU(**invalid_params)
-
     def test_charge_neutrality_check(self):
         self.system.part.add(pos=(0.0, 0.0, 0.0), q=1.)
         self.system.periodicity = [False, False, True]
@@ -174,18 +155,6 @@ def test_mmm1d_cpu_tuning_exceptions(self):
         self.assertIsNone(self.system.electrostatics.solver)
         self.assertFalse(actor.is_tuned)
 
-    @utx.skipIfMissingGPU()
-    @utx.skipIfMissingFeatures(["CUDA", "MMM1D_GPU"])
-    def test_mmm1d_gpu_tuning_exceptions(self):
-        self.system.periodicity = [False, False, True]
-        self.system.cell_system.set_n_square()
-        actor = espressomd.electrostatics.MMM1DGPU(
-            prefactor=1., maxPWerror=1e-3, far_switch_radius=0.1)
-        with self.assertRaisesRegex(RuntimeError, "No reasonable Bessel cutoff could be determined"):
-            self.system.electrostatics.solver = actor
-        self.assertIsNone(self.system.electrostatics.solver)
-        self.assertFalse(actor.is_tuned)
-
     @utx.skipIfMissingFeatures(["P3M"])
     def test_elc_p3m_exceptions(self):
         P3M = espressomd.electrostatics.P3M
@@ -253,12 +222,12 @@ def test_elc_p3m_exceptions(self):
             self.assertEqual(
                 list(self.system.cell_system.node_grid),
                 list(self.original_node_grid))
-        with self.assertRaisesRegex(Exception, "while setting parameter 'box_l': ERROR: ELC gap size .+ larger than box length in z-direction"):
-            self.system.box_l = [10., 10., 2.5]
-        self.system.box_l = [10., 10., 10.]
+        with self.assertRaisesRegex(Exception, "ERROR: ELC gap size .+ larger than box length in z-direction"):
+            self.system.change_volume_and_rescale_particles(2.5, "z")
+        self.system.change_volume_and_rescale_particles(10., "z")
         self.system.electrostatics.solver = None
         with self.assertRaisesRegex(RuntimeError, "P3M real-space cutoff too large for ELC w/ dielectric contrast"):
-            self.system.box_l = [10., 10., 5.]
+            self.system.change_volume_and_rescale_particles(5., "z")
             elc = espressomd.electrostatics.ELC(
                 actor=p3m,
                 gap_size=1.,
@@ -271,7 +240,7 @@ def test_elc_p3m_exceptions(self):
             )
             self.system.electrostatics.solver = elc
         self.assertIsNone(self.system.electrostatics.solver)
-        self.system.box_l = [10., 10., 10.]
+        self.system.change_volume_and_rescale_particles(10., "z")
         self.system.periodicity = [True, True, False]
         with self.assertRaisesRegex(RuntimeError, periodicity_err_msg):
             elc = espressomd.electrostatics.ELC(
diff --git a/testsuite/python/coulomb_mixed_periodicity.py b/testsuite/python/coulomb_mixed_periodicity.py
index b7584c62458..d7a918fcc82 100644
--- a/testsuite/python/coulomb_mixed_periodicity.py
+++ b/testsuite/python/coulomb_mixed_periodicity.py
@@ -41,6 +41,7 @@ def setUp(self):
         self.system.time_step = 0.01
         self.system.cell_system.skin = 0.
 
+    def setup_particles(self):
         # Add particles to system and store reference forces in hash
         # Input format: id pos q f
         self.system.part.add(pos=self.data[:, 1:4], q=self.data[:, 4])
@@ -64,17 +65,21 @@ def compare(self, method_name, force_tol, energy_tol):
         # triggering a solver re-initialization via a box resize
         # should not affect the forces nor the energies
         original_box_l = np.copy(self.system.box_l)
-        self.system.box_l = original_box_l * 1.1
-        self.system.box_l = original_box_l
+        for i in range(3):
+            self.system.change_volume_and_rescale_particles(
+                1.05 * original_box_l[i], "xyz"[i])
+        for i in range(3):
+            self.system.change_volume_and_rescale_particles(
+                original_box_l[i], "xyz"[i])
         self.system.integrator.run(0)
         forces_step2 = np.copy(self.system.part.all().f)
         energy_step2 = self.system.analysis.energy()["total"]
 
         err_msg = f"method {method_name} deviates after cells reinitialization"
-        np.testing.assert_allclose(forces_step1, forces_step2, atol=1e-12,
-                                   err_msg=f"Force {err_msg}")
-        np.testing.assert_allclose(energy_step2, energy_step1, rtol=1e-12,
-                                   err_msg=f"Energy {err_msg}")
+        np.testing.assert_allclose(forces_step1, forces_step2, rtol=1e-9,
+                                   atol=0., err_msg=f"Force {err_msg}")
+        np.testing.assert_allclose(energy_step2, energy_step1, rtol=1e-9,
+                                   atol=0., err_msg=f"Energy {err_msg}")
 
     def setup_elc_system(self):
         # Make sure, the data satisfies the gap
@@ -89,6 +94,7 @@ def setup_elc_system(self):
     @utx.skipIfMissingFeatures(["P3M"])
     def test_elc_cpu(self):
         self.system.box_l = [10., 10., 12.]
+        self.setup_particles()
         self.setup_elc_system()
 
         p3m = espressomd.electrostatics.P3M(
@@ -103,6 +109,7 @@ def test_elc_cpu(self):
     @utx.skipIfMissingFeatures(["P3M"])
     def test_elc_gpu(self):
         self.system.box_l = [10., 10., 12.]
+        self.setup_particles()
         self.setup_elc_system()
 
         p3m = espressomd.electrostatics.P3M(
@@ -119,6 +126,7 @@ def test_scafacos_p2nfft(self):
         self.system.box_l = [10., 10., 10.]
         self.system.periodicity = [True, True, False]
         self.system.cell_system.set_regular_decomposition()
+        self.setup_particles()
 
         scafacos = espressomd.electrostatics.Scafacos(
             prefactor=1,
diff --git a/testsuite/python/cutoffs.py b/testsuite/python/cutoffs.py
index 51c4ca0522e..963f86d6a17 100644
--- a/testsuite/python/cutoffs.py
+++ b/testsuite/python/cutoffs.py
@@ -42,7 +42,7 @@ def test(self):
         fene = FeneBond(r_0=1, d_r_max=2, k=1)
         system.bonded_inter.add(fene)
         self.assertEqual(system.cell_system.max_cut_bonded, 3)
-        n_nodes = np.product(system.cell_system.node_grid)
+        n_nodes = np.prod(system.cell_system.node_grid)
         if n_nodes == 1:
             # Bonds don't influence interaction range
             self.assertEqual(system.cell_system.interaction_range, -1)
diff --git a/testsuite/python/dawaanr-and-bh-gpu.py b/testsuite/python/dawaanr-and-bh-gpu.py
deleted file mode 100644
index 264340eb99a..00000000000
--- a/testsuite/python/dawaanr-and-bh-gpu.py
+++ /dev/null
@@ -1,133 +0,0 @@
-#
-# Copyright (C) 2010-2022 The ESPResSo project
-#
-# This file is part of ESPResSo.
-#
-# ESPResSo is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# ESPResSo is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-#
-
-import unittest as ut
-import unittest_decorators as utx
-import numpy as np
-import tests_common
-
-import espressomd
-import espressomd.magnetostatics
-import espressomd.analyze
-import espressomd.cuda_init
-import espressomd.galilei
-
-
-@utx.skipIfMissingGPU()
-@utx.skipIfMissingFeatures(["DIPOLAR_BARNES_HUT", "LENNARD_JONES"])
-class BHGPUTest(ut.TestCase):
-    system = espressomd.System(box_l=[1, 1, 1])
-    np.random.seed(seed=42)
-
-    def vectorsTheSame(self, a, b):
-        tol = 5E-2
-        vec_len = np.linalg.norm(a - b)
-        rel = 2 * vec_len / (np.linalg.norm(a) + np.linalg.norm(b))
-        return rel <= tol
-
-    @ut.skipIf(system.cell_system.get_state()["n_nodes"] > 1,
-               "Skipping test: only runs for n_nodes == 1")
-    def test(self):
-        np.random.seed(1)
-        pf_bh_gpu = 2.34
-        pf_dawaanr = 3.524
-        ratio_dawaanr_bh_gpu = pf_dawaanr / pf_bh_gpu
-        system = self.system
-        system.box_l = [15., 15., 15.]
-        system.periodicity = [False, False, False]
-        system.time_step = 1E-4
-        system.cell_system.skin = 0.1
-
-        for n in [128, 541]:
-            dipole_modulus = 1.3
-            part_dip = dipole_modulus * tests_common.random_dipoles(n)
-            part_pos = np.random.random((n, 3)) * system.box_l[0]
-            system.part.add(pos=part_pos, dip=part_dip)
-
-            system.non_bonded_inter[0, 0].lennard_jones.set_params(
-                epsilon=10.0, sigma=0.5, cutoff=0.55, shift="auto")
-            system.thermostat.set_langevin(kT=0.0, gamma=10.0, seed=42)
-            self.system.galilei.kill_particle_motion(rotation=True)
-            system.integrator.set_vv()
-
-            system.non_bonded_inter[0, 0].lennard_jones.deactivate()
-
-            system.cell_system.skin = 0.0
-            system.time_step = 0.01
-            system.thermostat.turn_off()
-
-            # gamma should be zero in order to avoid the noise term in force
-            # and torque
-            system.thermostat.set_langevin(kT=1.297, gamma=0.0)
-
-            dds_cpu = espressomd.magnetostatics.DipolarDirectSumCpu(
-                prefactor=pf_dawaanr)
-            system.magnetostatics.solver = dds_cpu
-            system.integrator.run(steps=0, recalc_forces=True)
-
-            dawaanr_f = np.copy(system.part.all().f)
-            dawaanr_t = np.copy(system.part.all().torque_lab)
-            dawaanr_e = system.analysis.energy()["total"]
-
-            del dds_cpu
-            system.magnetostatics.clear()
-
-            system.integrator.run(steps=0, recalc_forces=True)
-            bh_gpu = espressomd.magnetostatics.DipolarBarnesHutGpu(
-                prefactor=pf_bh_gpu, epssq=200.0, itolsq=8.0)
-            system.magnetostatics.solver = bh_gpu
-            system.integrator.run(steps=0, recalc_forces=True)
-
-            bhgpu_f = np.copy(system.part.all().f)
-            bhgpu_t = np.copy(system.part.all().torque_lab)
-            bhgpu_e = system.analysis.energy()["total"]
-
-            # compare
-            for i in range(n):
-                self.assertTrue(
-                    self.vectorsTheSame(
-                        np.array(dawaanr_t[i]),
-                        ratio_dawaanr_bh_gpu * np.array(bhgpu_t[i])),
-                    msg='Torques on particle do not match. i={0} dawaanr_t={1} '
-                        'ratio_dawaanr_bh_gpu*bhgpu_t={2}'.format(
-                            i, np.array(dawaanr_t[i]),
-                            ratio_dawaanr_bh_gpu * np.array(bhgpu_t[i])))
-                self.assertTrue(
-                    self.vectorsTheSame(
-                        np.array(dawaanr_f[i]),
-                        ratio_dawaanr_bh_gpu * np.array(bhgpu_f[i])),
-                    msg='Forces on particle do not match: i={0} dawaanr_f={1} '
-                        'ratio_dawaanr_bh_gpu*bhgpu_f={2}'.format(
-                            i, np.array(dawaanr_f[i]),
-                            ratio_dawaanr_bh_gpu * np.array(bhgpu_f[i])))
-            self.assertLessEqual(
-                abs(dawaanr_e - bhgpu_e * ratio_dawaanr_bh_gpu),
-                abs(1E-3 * dawaanr_e),
-                msg='Energies for dawaanr {0} and bh_gpu {1} do not match.'
-                .format(dawaanr_e, ratio_dawaanr_bh_gpu * bhgpu_e))
-
-            system.integrator.run(steps=0, recalc_forces=True)
-
-            del bh_gpu
-            system.magnetostatics.clear()
-            system.part.clear()
-
-
-if __name__ == '__main__':
-    ut.main()
diff --git a/testsuite/python/dawaanr-and-dds-gpu.py b/testsuite/python/dawaanr-and-dds-gpu.py
index 115969f12ea..dbb8f771b80 100644
--- a/testsuite/python/dawaanr-and-dds-gpu.py
+++ b/testsuite/python/dawaanr-and-dds-gpu.py
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2010-2022 The ESPResSo project
+# Copyright (C) 2010-2024 The ESPResSo project
 #
 # This file is part of ESPResSo.
 #
@@ -23,100 +23,83 @@
 import numpy as np
 
 import espressomd
-import espressomd.interactions
 import espressomd.magnetostatics
-import espressomd.analyze
-import espressomd.galilei
 
 
 @utx.skipIfMissingGPU()
-@utx.skipIfMissingFeatures(["DIPOLES", "ROTATION", "LENNARD_JONES"])
+@utx.skipIfMissingFeatures(["DIPOLAR_DIRECT_SUM", "LENNARD_JONES"])
 class DDSGPUTest(ut.TestCase):
-    # Handle for espresso system
     system = espressomd.System(box_l=[1.0, 1.0, 1.0])
+    np.random.seed(seed=42)
 
-    @ut.skipIf(system.cell_system.get_state()["n_nodes"] > 1,
-               "Skipping test: only runs for n_nodes == 1")
     def test(self):
-        pf_dds_gpu = 2.34
-        pf_dawaanr = 3.524
-        ratio_dawaanr_dds_gpu = pf_dawaanr / pf_dds_gpu
-        self.system.box_l = [15., 15., 15.]
-        self.system.periodicity = [False, False, False]
-        self.system.time_step = 1E-4
-        self.system.cell_system.skin = 0.1
+        pf_dds_cpu = 2.34
+        pf_dds_gpu = 3.524
+        ratio = pf_dds_cpu / pf_dds_gpu
+        system = self.system
+        system.box_l = [15., 15., 15.]
+        system.periodicity = [False, False, False]
+        system.time_step = 0.01
+        system.cell_system.skin = 0.1
 
         for n in [128, 541]:
             dipole_modulus = 1.3
             part_dip = dipole_modulus * tests_common.random_dipoles(n)
-            part_pos = np.random.random((n, 3)) * self.system.box_l[0]
-            self.system.part.add(pos=part_pos, dip=part_dip)
+            part_pos = np.random.random((n, 3)) * system.box_l[0]
+            system.part.add(pos=part_pos, dip=part_dip)
 
-            self.system.non_bonded_inter[0, 0].lennard_jones.set_params(
+            # warmup
+            system.non_bonded_inter[0, 0].lennard_jones.set_params(
                 epsilon=10.0, sigma=0.5, cutoff=0.55, shift="auto")
+            system.thermostat.set_langevin(kT=0.0, gamma=10.0, seed=42)
+            system.integrator.set_vv()
+            system.integrator.run(steps=20)
+            system.non_bonded_inter[0, 0].lennard_jones.deactivate()
+            system.thermostat.turn_off()
+            system.galilei.kill_particle_motion(rotation=True)
 
-            self.system.thermostat.turn_off()
-            self.system.integrator.set_steepest_descent(
-                f_max=0.0, gamma=0.1, max_displacement=0.1)
-            self.system.integrator.run(500)
-            self.system.galilei.kill_particle_motion(rotation=True)
-            self.system.integrator.set_vv()
-
-            self.system.non_bonded_inter[0, 0].lennard_jones.deactivate()
-
-            self.system.cell_system.skin = 0.0
-            self.system.time_step = 0.01
-            self.system.thermostat.turn_off()
             # gamma should be zero in order to avoid the noise term in force
             # and torque
-            self.system.thermostat.set_langevin(kT=1.297, gamma=0.0, seed=42)
+            system.thermostat.set_langevin(kT=1.297, gamma=0.0)
 
             dds_cpu = espressomd.magnetostatics.DipolarDirectSumCpu(
-                prefactor=pf_dawaanr)
-            self.system.magnetostatics.solver = dds_cpu
-            self.system.integrator.run(steps=0, recalc_forces=True)
+                prefactor=pf_dds_cpu)
+            system.magnetostatics.solver = dds_cpu
+            system.integrator.run(steps=0, recalc_forces=True)
 
-            dawaanr_f = np.copy(self.system.part.all().f)
-            dawaanr_t = np.copy(self.system.part.all().torque_lab)
-            dawaanr_e = self.system.analysis.energy()["total"]
+            dawaanr_f = np.copy(system.part.all().f)
+            dawaanr_t = np.copy(system.part.all().torque_lab)
+            dawaanr_e = system.analysis.energy()["total"]
 
             del dds_cpu
-            self.system.magnetostatics.clear()
+            system.magnetostatics.clear()
 
-            self.system.integrator.run(steps=0, recalc_forces=True)
+            system.integrator.run(steps=0, recalc_forces=True)
             dds_gpu = espressomd.magnetostatics.DipolarDirectSumGpu(
                 prefactor=pf_dds_gpu)
-            self.system.magnetostatics.solver = dds_gpu
-            self.system.integrator.run(steps=0, recalc_forces=True)
-
-            ddsgpu_f = np.copy(self.system.part.all().f)
-            ddsgpu_t = np.copy(self.system.part.all().torque_lab)
-            ddsgpu_e = self.system.analysis.energy()["total"]
-
-            # compare
-            for i in range(n):
-                np.testing.assert_allclose(
-                    np.array(dawaanr_t[i]),
-                    ratio_dawaanr_dds_gpu * np.array(ddsgpu_t[i]),
-                    err_msg=f'Torques do not match for particle {i}',
-                    atol=3.2e-3)
-                np.testing.assert_allclose(
-                    np.array(dawaanr_f[i]),
-                    ratio_dawaanr_dds_gpu * np.array(ddsgpu_f[i]),
-                    err_msg=f'Forces do not match for particle {i}',
-                    atol=3.2e-3)
-            self.assertAlmostEqual(
-                dawaanr_e,
-                ddsgpu_e * ratio_dawaanr_dds_gpu,
-                places=2,
-                msg='Energies for dawaanr {0} and dds_gpu {1} do not match.'
-                .format(dawaanr_e, ratio_dawaanr_dds_gpu * ddsgpu_e))
-
-            self.system.integrator.run(steps=0, recalc_forces=True)
+            system.magnetostatics.solver = dds_gpu
+            system.integrator.run(steps=0, recalc_forces=True)
+
+            dds_gpu_f = np.copy(system.part.all().f)
+            dds_gpu_t = np.copy(system.part.all().torque_lab)
+            dds_gpu_e = system.analysis.energy()["total"]
+
+            np.testing.assert_allclose(dawaanr_t, ratio * dds_gpu_t,
+                                       atol=1e-10, rtol=1e-2)
+            np.testing.assert_allclose(dawaanr_f, ratio * dds_gpu_f,
+                                       atol=1e-10, rtol=1e-2)
+            np.testing.assert_allclose(dawaanr_e, ratio * dds_gpu_e,
+                                       atol=1e-10, rtol=1e-5)
+
+            # check MD cell reset has no impact
+            system.change_volume_and_rescale_particles(system.box_l[0], "x")
+            system.periodicity = system.periodicity
+            system.cell_system.node_grid = system.cell_system.node_grid
+            system.integrator.run(steps=0, recalc_forces=True)
 
             del dds_gpu
-            self.system.magnetostatics.clear()
-            self.system.part.clear()
+            system.magnetostatics.clear()
+            system.part.clear()
 
 
 if __name__ == '__main__':
diff --git a/testsuite/python/dds-and-bh-gpu.py b/testsuite/python/dds-and-bh-gpu.py
deleted file mode 100644
index 20016f442d8..00000000000
--- a/testsuite/python/dds-and-bh-gpu.py
+++ /dev/null
@@ -1,141 +0,0 @@
-#
-# Copyright (C) 2010-2022 The ESPResSo project
-#
-# This file is part of ESPResSo.
-#
-# ESPResSo is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# ESPResSo is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-#
-
-import unittest as ut
-import unittest_decorators as utx
-import tests_common
-import numpy as np
-
-import espressomd
-import espressomd.magnetostatics
-import espressomd.analyze
-import espressomd.cuda_init
-import espressomd.galilei
-
-
-@utx.skipIfMissingGPU()
-@utx.skipIfMissingFeatures(["DIPOLAR_BARNES_HUT", "LENNARD_JONES"])
-class BH_DDS_gpu_multCPU_test(ut.TestCase):
-    system = espressomd.System(box_l=[1, 1, 1])
-    np.random.seed(71)
-
-    def vectorsTheSame(self, a, b):
-        tol = 5E-2
-        vec_len = np.linalg.norm(a - b)
-        rel = 2 * vec_len / (np.linalg.norm(a) + np.linalg.norm(b))
-        return rel <= tol
-
-    def test(self):
-        pf_bh_gpu = 2.34
-        pf_dds_gpu = 3.524
-        ratio_dawaanr_bh_gpu = pf_dds_gpu / pf_bh_gpu
-        system = self.system
-        system.box_l = [15., 15., 15.]
-        system.periodicity = 3 * [False]
-        system.time_step = 1E-4
-        system.cell_system.skin = 0.1
-
-        part_dip = np.zeros((3))
-
-        for n in [128, 541]:
-            dipole_modulus = 1.3
-            part_pos = np.random.random((n, 3)) * system.box_l[0]
-            part_dip = dipole_modulus * tests_common.random_dipoles(n)
-            system.part.add(pos=part_pos, dip=part_dip,
-                            v=n * [(0, 0, 0)], omega_body=n * [(0, 0, 0)])
-
-            system.non_bonded_inter[0, 0].lennard_jones.set_params(
-                epsilon=10.0, sigma=0.5, cutoff=0.55, shift="auto")
-            system.thermostat.set_langevin(kT=0.0, gamma=10.0, seed=42)
-            self.system.galilei.kill_particle_motion(rotation=True)
-            system.integrator.set_vv()
-
-            system.non_bonded_inter[0, 0].lennard_jones.deactivate()
-
-            system.cell_system.skin = 0.0
-            system.time_step = 0.01
-            system.thermostat.turn_off()
-
-            # gamma should be zero in order to avoid the noise term in force
-            # and torque
-            system.thermostat.set_langevin(kT=1.297, gamma=0.0)
-
-            dds_gpu = espressomd.magnetostatics.DipolarDirectSumGpu(
-                prefactor=pf_dds_gpu)
-            system.magnetostatics.solver = dds_gpu
-            # check MD cell reset has no impact
-            system.box_l = system.box_l
-            system.periodicity = system.periodicity
-            system.cell_system.node_grid = system.cell_system.node_grid
-            system.integrator.run(steps=0, recalc_forces=True)
-
-            dawaanr_f = np.copy(system.part.all().f)
-            dawaanr_t = np.copy(system.part.all().torque_lab)
-            dawaanr_e = system.analysis.energy()["total"]
-
-            del dds_gpu
-            system.magnetostatics.clear()
-
-            system.integrator.run(steps=0, recalc_forces=True)
-            bh_gpu = espressomd.magnetostatics.DipolarBarnesHutGpu(
-                prefactor=pf_bh_gpu, epssq=200.0, itolsq=8.0)
-            system.magnetostatics.solver = bh_gpu
-            # check MD cell reset has no impact
-            system.box_l = system.box_l
-            system.periodicity = system.periodicity
-            system.cell_system.node_grid = system.cell_system.node_grid
-            system.integrator.run(steps=0, recalc_forces=True)
-
-            bhgpu_f = np.copy(system.part.all().f)
-            bhgpu_t = np.copy(system.part.all().torque_lab)
-            bhgpu_e = system.analysis.energy()["total"]
-
-            # compare
-            for i in range(n):
-                self.assertTrue(
-                    self.vectorsTheSame(
-                        np.array(dawaanr_t[i]),
-                        ratio_dawaanr_bh_gpu * np.array(bhgpu_t[i])),
-                    msg='Torques on particle do not match. i={0} dawaanr_t={1} '
-                        'ratio_dawaanr_bh_gpu*bhgpu_t={2}'.format(
-                        i, np.array(dawaanr_t[i]),
-                        ratio_dawaanr_bh_gpu * np.array(bhgpu_t[i])))
-                self.assertTrue(
-                    self.vectorsTheSame(
-                        np.array(dawaanr_f[i]),
-                        ratio_dawaanr_bh_gpu * np.array(bhgpu_f[i])),
-                    msg='Forces on particle do not match: i={0} dawaanr_f={1} '
-                        'ratio_dawaanr_bh_gpu*bhgpu_f={2}'.format(
-                        i, np.array(dawaanr_f[i]),
-                        ratio_dawaanr_bh_gpu * np.array(bhgpu_f[i])))
-            self.assertLessEqual(
-                abs(dawaanr_e - bhgpu_e * ratio_dawaanr_bh_gpu),
-                abs(1E-3 * dawaanr_e),
-                msg='Energies for dawaanr {0} and bh_gpu {1} do not match.'
-                    .format(dawaanr_e, ratio_dawaanr_bh_gpu * bhgpu_e))
-
-            system.integrator.run(steps=0, recalc_forces=True)
-
-            del bh_gpu
-            system.magnetostatics.clear()
-            system.part.clear()
-
-
-if __name__ == '__main__':
-    ut.main()
diff --git a/testsuite/python/dipolar_direct_summation.py b/testsuite/python/dipolar_direct_summation.py
index 6ae5a397a64..61b3af5f48a 100644
--- a/testsuite/python/dipolar_direct_summation.py
+++ b/testsuite/python/dipolar_direct_summation.py
@@ -49,7 +49,8 @@ def dds_gpu_data(self):
         dds_cpu = espressomd.magnetostatics.DipolarDirectSumGpu(prefactor=1.2)
         system.magnetostatics.solver = dds_cpu
         # check MD cell reset has no impact
-        self.system.box_l = self.system.box_l
+        self.system.change_volume_and_rescale_particles(
+            self.system.box_l[0], "x")
         self.system.periodicity = self.system.periodicity
         self.system.cell_system.node_grid = self.system.cell_system.node_grid
 
@@ -68,7 +69,8 @@ def dds_data(self):
         dds_cpu = espressomd.magnetostatics.DipolarDirectSumCpu(prefactor=1.2)
         system.magnetostatics.solver = dds_cpu
         # check MD cell reset has no impact
-        self.system.box_l = self.system.box_l
+        self.system.change_volume_and_rescale_particles(
+            self.system.box_l[0], "x")
         self.system.periodicity = self.system.periodicity
         self.system.cell_system.node_grid = self.system.cell_system.node_grid
 
diff --git a/testsuite/python/dipolar_interface.py b/testsuite/python/dipolar_interface.py
index 9334c764f7f..1f6500f0bb5 100644
--- a/testsuite/python/dipolar_interface.py
+++ b/testsuite/python/dipolar_interface.py
@@ -54,12 +54,6 @@ def tearDown(self):
             system.magnetostatics, espressomd.magnetostatics.DipolarDirectSumGpu,
             dict(prefactor=3.4))
 
-    if espressomd.has_features(
-            "DIPOLAR_BARNES_HUT") and espressomd.gpu_available():
-        test_dds_gpu = tests_common.generate_test_for_actor_class(
-            system.magnetostatics, espressomd.magnetostatics.DipolarBarnesHutGpu,
-            dict(prefactor=3.4, epssq=200.0, itolsq=8.0))
-
     if espressomd.has_features("DP3M"):
         test_dp3m_metallic = tests_common.generate_test_for_actor_class(
             system.magnetostatics, espressomd.magnetostatics.DipolarP3M,
@@ -112,7 +106,7 @@ def test_exceptions_non_p3m(self):
             self.system.magnetostatics.solver = mdlc
         self.assertIsNone(self.system.magnetostatics.solver)
         self.system.periodicity = [True, True, True]
-        self.system.box_l = [10., 10. + 2e-3, 10.]
+        self.system.change_volume_and_rescale_particles(10. + 2e-3, "y")
         with self.assertRaisesRegex(Exception, "box size in x direction is different from y direction"):
             mdlc = MDLC(gap_size=1., maxPWerror=1e-5, actor=ddsr)
             self.system.magnetostatics.solver = mdlc
@@ -128,14 +122,14 @@ def test_exceptions_non_p3m(self):
             self.system.magnetostatics.clear()
         # check it's safe to resize the box, i.e. there are no currently
         # active sanity check in the core
-        self.system.box_l = [10., 10., 10.]
+        self.system.change_volume_and_rescale_particles(10., "y")
         with self.assertRaisesRegex(Exception, "box size in x direction is different from y direction"):
             ddsr = DDSR(prefactor=1., n_replicas=1)
             mdlc = MDLC(gap_size=1., maxPWerror=1e-5, actor=ddsr)
             self.system.magnetostatics.solver = mdlc
-            self.system.box_l = [9., 10., 10.]
+            self.system.change_volume_and_rescale_particles(9., "x")
         self.system.magnetostatics.clear()
-        self.system.box_l = [10., 10., 10.]
+        self.system.change_volume_and_rescale_particles(10., "x")
 
     @utx.skipIfMissingFeatures(["DP3M"])
     def test_exceptions_p3m(self):
@@ -159,16 +153,6 @@ def test_exceptions_p3m(self):
         with self.assertRaisesRegex(RuntimeError, "Parameter 'accuracy' is not a valid parameter"):
             MDLC(gap_size=2., maxPWerror=0.1, actor=dp3m, accuracy=1e-3)
 
-    @utx.skipIfMissingGPU()
-    @utx.skipIfMissingFeatures(["DIPOLAR_BARNES_HUT"])
-    def test_exceptions_barnes_hut(self):
-        valid_params = dict(prefactor=2., epssq=200., itolsq=8.)
-        for key in valid_params.keys():
-            invalid_params = valid_params.copy()
-            invalid_params[key] = -1.
-            with self.assertRaisesRegex(ValueError, f"Parameter '{key}' must be > 0"):
-                espressomd.magnetostatics.DipolarBarnesHutGpu(**invalid_params)
-
 
 if __name__ == "__main__":
     ut.main()
diff --git a/testsuite/python/dipolar_p3m.py b/testsuite/python/dipolar_p3m.py
index 04a1a3870c8..33a5d60e065 100644
--- a/testsuite/python/dipolar_p3m.py
+++ b/testsuite/python/dipolar_p3m.py
@@ -92,7 +92,8 @@ def test_dp3m(self):
         ref_mdlc_torque_metallic = np.copy(partcls.torque_lab)
 
         # solvers should remain in a valid state after a cell system reset
-        self.system.box_l = self.system.box_l
+        self.system.change_volume_and_rescale_particles(
+            self.system.box_l[0], "x")
         self.system.periodicity = self.system.periodicity
         self.system.cell_system.node_grid = self.system.cell_system.node_grid
         self.system.integrator.run(0, recalc_forces=True)
diff --git a/testsuite/python/ek_bulk_reactions.py b/testsuite/python/ek_bulk_reactions.py
index 0c1d5e31e51..8b1f23ad048 100644
--- a/testsuite/python/ek_bulk_reactions.py
+++ b/testsuite/python/ek_bulk_reactions.py
@@ -112,7 +112,7 @@ def detail_test_reaction(self, single_precision: bool):
 
         self.system.integrator.run(self.TIME)
 
-        domain_volume = np.product(ek_species_product.shape)
+        domain_volume = np.prod(ek_species_product.shape)
         analytic_time = (self.TIME + 0.5) * self.system.time_step
 
         measured_educt_densities = np.zeros(len(stoech_coeffs))
diff --git a/testsuite/python/ek_interface.py b/testsuite/python/ek_interface.py
index 136ea769f24..f868b9baa68 100644
--- a/testsuite/python/ek_interface.py
+++ b/testsuite/python/ek_interface.py
@@ -229,8 +229,7 @@ def test_parameter_change_exceptions(self):
             self.system.electrostatics.clear()
         with self.assertRaisesRegex(RuntimeError, "MD cell geometry change not supported by EK"):
             self.system.box_l = [1., 2., 3.]
-        np.testing.assert_allclose(
-            np.copy(self.system.box_l), [1., 2., 3.], atol=1e-7)
+        np.testing.assert_allclose(np.copy(self.system.box_l), 6., atol=1e-7)
         with self.assertRaisesRegex(RuntimeError, "MPI topology change not supported by EK"):
             self.system.cell_system.node_grid = self.system.cell_system.node_grid
 
@@ -282,7 +281,7 @@ def test_grid_index(self):
             for offset in (shape[i] + 1, -(shape[i] + 1)):
                 n = [0, 0, 0]
                 n[i] += offset
-                err_msg = rf"provided index \[{str(n)[1:-1]}\] is out of range for shape \[{str(list(shape))[1:-1]}\]"
+                err_msg = rf"provided index \[{str(n)[1:-1]}\] is out of range for shape \[{str(list(shape))[1:-1]}\]"  # nopep8
                 with self.assertRaisesRegex(IndexError, err_msg):
                     ek_reaction[tuple(n)]
                 with self.assertRaisesRegex(IndexError, err_msg):
diff --git a/testsuite/python/elc_vs_analytic.py b/testsuite/python/elc_vs_analytic.py
index 261b12d453d..d7cb4edb722 100644
--- a/testsuite/python/elc_vs_analytic.py
+++ b/testsuite/python/elc_vs_analytic.py
@@ -53,14 +53,13 @@ def test_elc(self):
         simulation box with dielectric contrast on the bottom of the box,
         which can be calculated analytically with image charges.
         """
-        self.system.part.add(pos=self.system.box_l / 2., q=self.q[0])
-        self.system.part.add(pos=self.system.box_l / 2. + [0, 0, self.distance],
-                             q=-self.q[0])
-
         self.system.box_l = [self.box_l, self.box_l, self.box_l + self.elc_gap]
         self.system.cell_system.set_regular_decomposition(
             use_verlet_lists=True)
         self.system.periodicity = [True, True, True]
+        self.system.part.add(pos=self.system.box_l / 2., q=self.q[0])
+        self.system.part.add(pos=self.system.box_l / 2. + [0, 0, self.distance],
+                             q=-self.q[0])
         prefactor = 2.0
         p3m = self.p3m_class(prefactor=prefactor, accuracy=self.accuracy,
                              mesh=[58, 58, 70], cao=4)
diff --git a/testsuite/python/electrostatic_interactions.py b/testsuite/python/electrostatic_interactions.py
index 8bc8b94cf0c..de2043434a3 100644
--- a/testsuite/python/electrostatic_interactions.py
+++ b/testsuite/python/electrostatic_interactions.py
@@ -133,7 +133,8 @@ def test_dh(self):
 
         self.system.electrostatics.solver = dh
         # actor should remain in a valid state after a cell system reset
-        self.system.box_l = self.system.box_l
+        self.system.change_volume_and_rescale_particles(
+            self.system.box_l[0], "x")
         self.system.periodicity = self.system.periodicity
         self.system.cell_system.node_grid = self.system.cell_system.node_grid
 
@@ -209,7 +210,8 @@ def test_rf(self):
 
         self.system.electrostatics.solver = rf
         # actor should remain in a valid state after a cell system reset
-        self.system.box_l = self.system.box_l
+        self.system.change_volume_and_rescale_particles(
+            self.system.box_l[0], "x")
         self.system.periodicity = self.system.periodicity
         self.system.cell_system.node_grid = self.system.cell_system.node_grid
 
diff --git a/testsuite/python/field_test.py b/testsuite/python/field_test.py
index dd07b44be63..f27063f0c9c 100644
--- a/testsuite/python/field_test.py
+++ b/testsuite/python/field_test.py
@@ -82,14 +82,13 @@ def test_linear_electric_potential(self):
 
         self.system.constraints.add(electric_field)
 
-        p = self.system.part.add(pos=[0.5, 0.5, 0.5])
-        p.q = -3.1
+        p = self.system.part.add(pos=[0.5, -0.5, 10.5], q=-3.1)
 
         self.system.integrator.run(0)
         np.testing.assert_almost_equal(p.q * E, np.copy(p.f))
 
         self.assertAlmostEqual(self.system.analysis.energy()['total'],
-                               p.q * (- np.dot(E, p.pos) + phi0))
+                               p.q * (- np.dot(E, p.pos_folded) + phi0))
         self.assertAlmostEqual(self.system.analysis.energy()['total'],
                                self.system.analysis.energy()['external_fields'])
 
@@ -117,7 +116,7 @@ def test_electric_plane_wave(self):
 
         self.system.constraints.add(electric_wave)
 
-        p = self.system.part.add(pos=[0.4, 0.1, 0.11], q=-14.)
+        p = self.system.part.add(pos=[0.4, -0.1, 10.11], q=-14.)
         self.system.time = 1042.
 
         self.system.integrator.run(0)
@@ -265,7 +264,7 @@ def test_flow_field(self):
             f_val = np.array(F.call_method("_eval_field", x=x))
             np.testing.assert_allclose(f_val, self.force(x))
 
-            p.pos = x
+            p.pos = x + np.multiply([0, -1, 1], self.system.box_l)
             self.system.integrator.run(0)
             np.testing.assert_allclose(
                 -gamma * (p.v - f_val), np.copy(p.f), atol=1e-12)
diff --git a/testsuite/python/lattice_vtk.py b/testsuite/python/lattice_vtk.py
index 90c43f788d7..9b4e1981a18 100644
--- a/testsuite/python/lattice_vtk.py
+++ b/testsuite/python/lattice_vtk.py
@@ -51,7 +51,7 @@ def tearDown(self):
                "this test is slow on more than 4 MPI ranks")
     def test_exceptions(self):
         label_invalid_obs = f"test_vtk_{self.vtk_id}_invalid_obs"
-        error_msg = rf"Only the following VTK observables are supported: \[{repr(sorted(self.valid_obs))[1:-1]}\], got 'dens'"
+        error_msg = rf"Only the following VTK observables are supported: \[{repr(sorted(self.valid_obs))[1:-1]}\], got 'dens'"  # nopep8
         with self.assertRaisesRegex(ValueError, error_msg):
             self.vtk_class(
                 identifier=label_invalid_obs, delta_N=0, observables=["dens"])
diff --git a/testsuite/python/lb.py b/testsuite/python/lb.py
index 704c8669282..86daa7fe518 100644
--- a/testsuite/python/lb.py
+++ b/testsuite/python/lb.py
@@ -52,7 +52,10 @@ class LBTest:
     system.periodicity = [True, True, True]
     system.time_step = params['tau']
     system.cell_system.skin = 1.0
+    if espressomd.gpu_available():
+        system.cuda_init_handle.call_method("set_device_id_per_rank")
     interpolation = False
+    n_nodes = system.cell_system.get_state()["n_nodes"]
 
     def setUp(self):
         self.system.box_l = 3 * [6.0]
@@ -86,13 +89,14 @@ def test_properties(self):
         self.check_properties(lbf)
 
     def check_properties(self, lbf):
+        density = self.params["density"]
         agrid = self.params["agrid"]
-        tau = self.system.time_step
+        tau = self.params["tau"]
         # check LB object
         self.assertAlmostEqual(lbf.tau, tau, delta=self.atol)
         self.assertAlmostEqual(lbf.agrid, agrid, delta=self.atol)
         self.assertAlmostEqual(lbf.kinematic_viscosity, 3., delta=self.atol)
-        self.assertAlmostEqual(lbf.density, 0.85, delta=self.atol)
+        self.assertAlmostEqual(lbf.density, density, delta=self.atol)
         self.assertAlmostEqual(lbf.kT, 1.0, delta=self.atol)
         self.assertEqual(lbf.seed, 42)
         self.assertEqual(
@@ -133,6 +137,57 @@ def check_properties(self, lbf):
             lbf[0, 0, 0].velocity = [1, 2]
         with self.assertRaises(TypeError):
             lbf[0, 1].velocity = [1, 2, 3]
+        # check node setters update cached velocities (with precision loss)
+        lbnode = lbf[0, 0, 0]
+        lbnode.last_applied_force = [0., 0., 0.]
+        lbnode.velocity = [0., 0., 0.]
+        old_pop = np.copy(lbnode.population)
+        old_vel = np.copy(lbnode.velocity)
+        old_rho = np.copy(lbnode.density)
+        lbnode.velocity = [1., 2., 3.]
+        new_pop = np.copy(lbnode.population)
+        new_vel = np.copy(lbnode.velocity)
+        lbnode.population = old_pop
+        np.testing.assert_allclose(
+            np.copy(lbnode.velocity), old_vel, atol=self.atol * 20.)
+        lbnode.population = new_pop
+        np.testing.assert_allclose(
+            np.copy(lbnode.velocity), new_vel, atol=self.atol * 20.)
+        lbnode.population = old_pop
+        lbnode.last_applied_force = [0.4, 0.5, 0.6]
+        np.testing.assert_allclose(
+            np.copy(lbnode.velocity),
+            np.copy([0.4, 0.5, 0.6]) / (agrid / tau * old_rho / 2.),
+            atol=self.atol * 20.)
+        lbnode.last_applied_force = [0., 0., 0.]
+        lbnode.population = old_pop
+        # check slice setters update cached velocities (with precision loss)
+        lbslice = lbf[0:5, 0:5, 0:5]
+        lbslice.last_applied_force = [0., 0., 0.]
+        lbslice.velocity = [0., 0., 0.]
+        old_pop = np.copy(lbslice.population)
+        old_vel = np.copy(lbslice.velocity)
+        old_rho = np.copy(lbslice.density)
+        lbslice.velocity = [1., 2., 3.]
+        new_pop = np.copy(lbslice.population)
+        new_vel = np.copy(lbslice.velocity)
+        lbslice.population = old_pop
+        np.testing.assert_allclose(
+            np.copy(lbslice.velocity), old_vel, atol=self.atol * 100.)
+        lbslice.population = new_pop
+        np.testing.assert_allclose(
+            np.copy(lbslice.velocity), new_vel, atol=self.atol * 100.)
+        lbslice.population = old_pop
+        lbslice.last_applied_force = [0.4, 0.5, 0.6]
+        vel2force = 2. * tau / agrid
+        np.testing.assert_allclose(
+            np.copy(lbslice.velocity),
+            np.tile(np.copy([0.4, 0.5, 0.6]), (5, 5, 5, 1)) * vel2force /
+            np.tile(old_rho.reshape((5, 5, 5, 1)), (3,)),
+            atol=self.atol * 20.)
+        lbslice.last_applied_force = [0., 0., 0.]
+        lbslice.population = old_pop
+        # check node boundary conditions
         node = lbf[0, 0, 0]
         self.assertIsNone(node.boundary)
         self.assertIsNone(node.boundary_force)
@@ -167,6 +222,9 @@ def check_properties(self, lbf):
         node.velocity = velocity_old
         # check slice matches node
         lbslice = lbf[0:5, 0:5, 0:5]
+        np.testing.assert_allclose(
+            np.copy(lbslice.population)[1, 2, 3, :],
+            np.copy(node.population), atol=self.atol)
         np.testing.assert_allclose(
             np.copy(lbslice.velocity)[1, 2, 3, :],
             np.copy(node.velocity), atol=self.atol)
@@ -367,13 +425,13 @@ def test_lb_node_set_get(self):
         self.assertAlmostEqual(lbf[0, 0, 0].density, density, delta=1e-4)
 
         self.assertEqual(lbf[3, 2, 1].index, (3, 2, 1))
-        ext_force_density = [0.1, 0.2, 1.2]
-        last_applied_force = [0.2, 0.4, 0.6]
+        ext_force_density = np.array([0.1, 0.2, 1.2])
+        last_applied_force = np.array([0.2, 0.4, 0.6])
         lbf.ext_force_density = ext_force_density
         node = lbf[1, 2, 3]
         node.velocity = v_fluid
-        node.last_applied_force = last_applied_force
         np.testing.assert_allclose(np.copy(node.velocity), v_fluid, atol=1e-4)
+        node.last_applied_force = last_applied_force
         np.testing.assert_allclose(
             np.copy(node.last_applied_force), last_applied_force, atol=1e-4)
         np.testing.assert_allclose(
@@ -414,8 +472,7 @@ def test_parameter_change_without_seed(self):
             self.system.electrostatics.clear()
         with self.assertRaisesRegex(RuntimeError, "MD cell geometry change not supported by LB"):
             self.system.box_l = [1., 2., 3.]
-        np.testing.assert_allclose(
-            np.copy(self.system.box_l), [1., 2., 3.], atol=1e-7)
+        np.testing.assert_allclose(np.copy(self.system.box_l), 6., atol=1e-7)
         with self.assertRaisesRegex(RuntimeError, "MPI topology change not supported by LB"):
             self.system.cell_system.node_grid = self.system.cell_system.node_grid
 
@@ -432,7 +489,7 @@ def test_grid_index(self):
             for offset in (shape[i] + 1, -(shape[i] + 1)):
                 n = [0, 0, 0]
                 n[i] += offset
-                err_msg = rf"provided index \[{str(n)[1:-1]}\] is out of range for shape \[{str(list(shape))[1:-1]}\]"
+                err_msg = rf"provided index \[{str(n)[1:-1]}\] is out of range for shape \[{str(list(shape))[1:-1]}\]"  # nopep8
                 with self.assertRaisesRegex(IndexError, err_msg):
                     lbf[tuple(n)].velocity
         # node index
@@ -494,6 +551,12 @@ def test_viscous_coupling(self):
         self.system.lb = lbf
         self.system.thermostat.set_lb(LB_fluid=lbf, seed=3, gamma=self.gamma)
 
+        particle_rtol = 1e-10
+        fluid_rtol = 1e-10
+        if self.lb_params["single_precision"]:
+            particle_rtol = 4e-6
+            fluid_rtol = 4e-6
+
         # Random velocities
         lbf[:, :, :].velocity = np.random.random((*lbf.shape, 3))
         # Test several particle positions
@@ -512,12 +575,14 @@ def test_viscous_coupling(self):
             self.system.integrator.run(1)
             # Check friction force
             np.testing.assert_allclose(
-                np.copy(p.f), -self.gamma * (v_part - v_fluid), atol=1E-10)
+                np.copy(p.f), -self.gamma * (v_part - v_fluid),
+                rtol=particle_rtol, atol=0.)
 
             # check particle/fluid force balance
             applied_forces = np.array([n.last_applied_force for n in lb_nodes])
             np.testing.assert_allclose(
-                np.sum(applied_forces, axis=0), -np.copy(p.f), atol=1E-10)
+                np.sum(applied_forces, axis=0), -np.copy(p.f),
+                rtol=fluid_rtol, atol=0.)
 
             # Check that last_applied_force gets cleared
             p.remove()
@@ -531,6 +596,12 @@ def test_viscous_coupling_pairs(self):
         self.system.lb = lbf
         self.system.thermostat.set_lb(LB_fluid=lbf, seed=3, gamma=self.gamma)
 
+        particle_rtol = 1e-10
+        fluid_rtol = 1e-10
+        if self.lb_params["single_precision"]:
+            particle_rtol = 4e-6
+            fluid_rtol = 4e-6
+
         # Random velocities
         lbf[:, :, :].velocity = np.random.random((*lbf.shape, 3))
         # Test several particle positions
@@ -561,15 +632,18 @@ def test_viscous_coupling_pairs(self):
             self.system.integrator.run(1)
             # Check friction force
             np.testing.assert_allclose(
-                np.copy(p1.f), -self.gamma * (v_part1 - v_fluid1), atol=1E-10)
+                np.copy(p1.f), -self.gamma * (v_part1 - v_fluid1),
+                rtol=particle_rtol, atol=0.)
             np.testing.assert_allclose(
-                np.copy(p2.f), -self.gamma * (v_part2 - v_fluid2), atol=1E-10)
+                np.copy(p2.f), -self.gamma * (v_part2 - v_fluid2),
+                rtol=particle_rtol, atol=0.)
 
             # check particle/fluid force balance
             applied_forces = np.array(
                 [n.last_applied_force for n in all_coupling_nodes])
             np.testing.assert_allclose(
-                np.sum(applied_forces, axis=0), -np.copy(p1.f) - np.copy(p2.f), atol=1E-10)
+                np.sum(applied_forces, axis=0), -np.copy(p1.f) - np.copy(p2.f),
+                rtol=fluid_rtol, atol=0.)
 
             # Check that last_applied_force gets cleared
             self.system.part.clear()
@@ -589,6 +663,36 @@ def test_viscous_coupling_rounding(self):
             self.system.integrator.run(1)
             self.assertTrue(np.all(p.f != 0.0))
 
+    def test_tracers_coupling_rounding(self):  # LB tracer in a driven fluid
+        import espressomd.propagation
+        lbf = self.lb_class(**self.params, **self.lb_params)
+        self.system.lb = lbf
+        ext_f = np.array([0.01, 0.02, 0.03])
+        lbf.ext_force_density = ext_f
+        self.system.thermostat.set_lb(LB_fluid=lbf, seed=3, gamma=self.gamma)
+        rtol = self.rtol
+        if lbf.single_precision:
+            rtol *= 100.  # looser relative tolerance in single precision
+        mode_tracer = espressomd.propagation.Propagation.TRANS_LB_TRACER
+        self.system.time = 0.  # vel_ref below is computed from system.time
+        p = self.system.part.add(pos=[-1E-30] * 3, propagation=mode_tracer)
+        for _ in range(10):  # compare after each of 10 single steps
+            self.system.integrator.run(1)
+            vel = np.copy(p.v) * self.params["density"]
+            vel_ref = (self.system.time + lbf.tau * 0.5) * ext_f
+            np.testing.assert_allclose(vel, vel_ref, rtol=rtol, atol=0.)
+
+    @ut.skipIf(n_nodes != 2, "test is designed to run on 2 MPI ranks")
+    def test_rng(self):  # populations at x=0 and x=6 must not be identical
+        system = self.system
+        system.lb = self.lb_class(kT=15., **self.params, **self.lb_params)
+        system.integrator.run(1)  # a single step with kT=15
+        diff = system.lb[0, :, :].population - system.lb[6, :, :].population
+        # if the RNG uses the local cell index instead of the global cell index,
+        # the noise will be identical in all blocks, and the RMS is zero
+        rms = np.sqrt(np.mean(np.square(diff)))
+        self.assertGreater(rms, 0.01, msg="thermal noise might be correlated!")
+
     def test_thermalization_force_balance(self):
         system = self.system
 
@@ -724,10 +828,7 @@ def params_with_tau(tau):
 
 
 @utx.skipIfMissingFeatures("WALBERLA")
-class LBTestWalberlaDoublePrecision(LBTest, ut.TestCase):
-
-    """Test for the Walberla implementation of the LB in double-precision."""
-
+class LBTestWalberlaDoublePrecisionCPU(LBTest, ut.TestCase):
     lb_class = espressomd.lb.LBFluidWalberla
     lb_lattice_class = espressomd.lb.LatticeWalberla
     lb_params = {"single_precision": False}
@@ -736,10 +837,7 @@ class LBTestWalberlaDoublePrecision(LBTest, ut.TestCase):
 
 
 @utx.skipIfMissingFeatures("WALBERLA")
-class LBTestWalberlaSinglePrecision(LBTest, ut.TestCase):
-
-    """Test for the Walberla implementation of the LB in single-precision."""
-
+class LBTestWalberlaSinglePrecisionCPU(LBTest, ut.TestCase):
     lb_class = espressomd.lb.LBFluidWalberla
     lb_lattice_class = espressomd.lb.LatticeWalberla
     lb_params = {"single_precision": True}
@@ -747,5 +845,25 @@ class LBTestWalberlaSinglePrecision(LBTest, ut.TestCase):
     rtol = 5e-5
 
 
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class LBTestWalberlaDoublePrecisionGPU(LBTest, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
+    lb_lattice_class = espressomd.lb.LatticeWalberla
+    lb_params = {"single_precision": False}
+    atol = 1e-10
+    rtol = 1e-7
+
+
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class LBTestWalberlaSinglePrecisionGPU(LBTest, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
+    lb_lattice_class = espressomd.lb.LatticeWalberla
+    lb_params = {"single_precision": True}
+    atol = 1e-6
+    rtol = 2e-4
+
+
 if __name__ == "__main__":
     ut.main()
diff --git a/testsuite/python/lb_boundary.py b/testsuite/python/lb_boundary.py
index e40de2a91c2..6ad5a6c0ad0 100644
--- a/testsuite/python/lb_boundary.py
+++ b/testsuite/python/lb_boundary.py
@@ -100,20 +100,28 @@ def test_exceptions(self):
 
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
-class LBBoundariesWalberlaDoublePrecision(LBBoundariesBase, ut.TestCase):
-
-    """Test for the Walberla implementation of the LB in double-precision."""
-
+class LBBoundariesWalberlaDoublePrecisionCPU(LBBoundariesBase, ut.TestCase):
     lb_class = espressomd.lb.LBFluidWalberla
     lb_params = {"single_precision": False}
 
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
-class LBBoundariesWalberlaSinglePrecision(LBBoundariesBase, ut.TestCase):
+class LBBoundariesWalberlaSinglePrecisionCPU(LBBoundariesBase, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": True}
 
-    """Test for the Walberla implementation of the LB in single-precision."""
 
-    lb_class = espressomd.lb.LBFluidWalberla
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class LBBoundariesWalberlaDoublePrecisionGPU(LBBoundariesBase, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
+    lb_params = {"single_precision": False}
+
+
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class LBBoundariesWalberlaSinglePrecisionGPU(LBBoundariesBase, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
     lb_params = {"single_precision": True}
 
 
diff --git a/testsuite/python/lb_boundary_ghost_layer.py b/testsuite/python/lb_boundary_ghost_layer.py
index 0c226d4e033..84ce9180f01 100644
--- a/testsuite/python/lb_boundary_ghost_layer.py
+++ b/testsuite/python/lb_boundary_ghost_layer.py
@@ -38,6 +38,8 @@ class TestCommon:
     system = espressomd.System(box_l=[16.0, 1.0, 1.0])
     system.time_step = TIME_STEP
     system.cell_system.skin = 0.4 * AGRID
+    if espressomd.gpu_available():
+        system.cuda_init_handle.call_method("set_device_id_per_rank")
     n_nodes = system.cell_system.get_state()["n_nodes"]
 
     def setUp(self):
@@ -64,7 +66,7 @@ def quadratic(x, a, b, c):
         popt_ref = (4e-8, -1e-6, 1e-5)
         popt, _ = scipy.optimize.curve_fit(
             quadratic, xdata, ydata, p0=popt_ref)
-        rtol = 0.3 if self.lbf.single_precision else 0.1
+        rtol = 0.33 if self.lbf.single_precision else 0.1
         np.testing.assert_allclose(popt, popt_ref, rtol=0.5, atol=0.)
         np.testing.assert_allclose(ydata, quadratic(xdata, *popt),
                                    rtol=rtol, atol=0.)
@@ -87,17 +89,33 @@ def test_shape_setter(self):
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
 @ut.skipIf(TestCommon.n_nodes != 2, "only runs for 2 MPI ranks")
-class LBPoiseuilleWalberlaSinglePrecision(TestCommon, ut.TestCase):
+class LBPoiseuilleWalberlaSinglePrecisionCPU(TestCommon, ut.TestCase):
     lb_class = espressomd.lb.LBFluidWalberla
     lb_params = {"single_precision": True}
 
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
 @ut.skipIf(TestCommon.n_nodes != 2, "only runs for 2 MPI ranks")
-class LBPoiseuilleWalberlaDoublePrecision(TestCommon, ut.TestCase):
+class LBPoiseuilleWalberlaDoublePrecisionCPU(TestCommon, ut.TestCase):
     lb_class = espressomd.lb.LBFluidWalberla
     lb_params = {"single_precision": False}
 
 
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+@ut.skipIf(TestCommon.n_nodes != 2, "only runs for 2 MPI ranks")
+class LBPoiseuilleWalberlaSinglePrecisionGPU(TestCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
+    lb_params = {"single_precision": True}
+
+
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+@ut.skipIf(TestCommon.n_nodes != 2, "only runs for 2 MPI ranks")
+class LBPoiseuilleWalberlaDoublePrecisionGPU(TestCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
+    lb_params = {"single_precision": False}
+
+
 if __name__ == "__main__":
     ut.main()
diff --git a/testsuite/python/lb_circular_couette.py b/testsuite/python/lb_circular_couette.py
index e27e52d6cf2..f16b6bbf244 100644
--- a/testsuite/python/lb_circular_couette.py
+++ b/testsuite/python/lb_circular_couette.py
@@ -42,8 +42,7 @@ def taylor_couette(v1, v2, r1, r2):
     return a, b
 
 
-@utx.skipIfMissingFeatures(["WALBERLA"])
-class LBCircularCouetteCommon:
+class LBCouetteTest:
 
     system = espressomd.System(box_l=(GRID_SIZE + [1, 1, 0]) * AGRID)
     system.time_step = TIME_STEP
@@ -61,9 +60,9 @@ def test_taylor_couette_flow(self):
         """
 
         system = self.system
-        lb_fluid = espressomd.lb.LBFluidWalberla(
+        lb_fluid = self.lb_class(
             agrid=AGRID, density=0.5, kinematic_viscosity=3.2,
-            tau=system.time_step)
+            tau=system.time_step, **self.lb_params)
         self.system.lb = lb_fluid
 
         # set up two cylinders
@@ -148,21 +147,28 @@ def test_taylor_couette_flow(self):
 
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
-class LBCircularCouetteWalberla(LBCircularCouetteCommon, ut.TestCase):
-
-    """Test for the Walberla implementation of the LB in double-precision."""
-
+class LBCircularCouetteWalberlaDoublePrecisionCPU(LBCouetteTest, ut.TestCase):
     lb_class = espressomd.lb.LBFluidWalberla
     lb_params = {"single_precision": False}
 
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
-class LBCircularCouetteWalberlaSinglePrecision(
-        LBCircularCouetteCommon, ut.TestCase):
+class LBCircularCouetteWalberlaSinglePrecisionCPU(LBCouetteTest, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": True}
 
-    """Test for the Walberla implementation of the LB in single-precision."""
 
-    lb_class = espressomd.lb.LBFluidWalberla
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class LBCircularCouetteWalberlaDoublePrecisionGPU(LBCouetteTest, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
+    lb_params = {"single_precision": False}
+
+
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class LBCircularCouetteWalberlaSinglePrecisionGPU(LBCouetteTest, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
     lb_params = {"single_precision": True}
 
 
diff --git a/testsuite/python/lb_interpolation.py b/testsuite/python/lb_interpolation.py
index 267886eaafe..96e93523b5a 100644
--- a/testsuite/python/lb_interpolation.py
+++ b/testsuite/python/lb_interpolation.py
@@ -62,6 +62,8 @@ def setUp(self):
 
     def tearDown(self):
         self.system.lb = None
+        self.system.thermostat.turn_off()
+        self.system.part.clear()
 
     def set_boundaries(self, velocity):
         """Place boundaries *not* exactly on a LB node."""
@@ -125,22 +127,56 @@ def test_mach_limit_check(self):
             self.lbf.add_boundary_from_shape(shape, vbb.velocity)
         self.assertIsNone(self.lbf[0, 0, 0].boundary)
 
+    def test_interpolated_force(self):  # particle force vs. node velocity
+        system = self.system
+        system.thermostat.set_lb(LB_fluid=system.lb, seed=42, gamma=1.)
+        system.integrator.run(1)
+        lb_vel = np.zeros([12, 12, 12, 3], dtype=float)  # per-node velocity
+        for i in range(12):
+            for j in range(12):
+                for k in range(12):
+                    lb_vel[i, j, k] = 1e-3 * np.array([i + 1, j + 1, k + 1])
+        p = system.part.add(pos=3 * [1.5])
+        for i in range(3):
+            for j in range(3):
+                for k in range(3):
+                    system.lb[:, :, :].velocity = lb_vel  # re-apply field
+                    p.pos = (
+                        (i + 0.5) * AGRID,
+                        (j + 0.5) * AGRID,
+                        (k + 0.5) * AGRID)
+                    p.v = [0., 0., 0.]  # particle at rest before the step
+                    system.integrator.run(1)
+                    np.testing.assert_allclose(np.copy(p.f), lb_vel[i, j, k])
 
-@utx.skipIfMissingFeatures(["WALBERLA"])
-class LBInterpolationWalberla(LBInterpolation, ut.TestCase):
-
-    """Test for the Walberla implementation of the LB in double-precision."""
 
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBInterpolationWalberlaDoublePrecisionCPU(LBInterpolation, ut.TestCase):
     lb_class = espressomd.lb.LBFluidWalberla
     lb_params = {"single_precision": False}
 
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
-class LBInterpolationWalberlaSinglePrecision(LBInterpolation, ut.TestCase):
+class LBInterpolationWalberlaSinglePrecisionCPU(LBInterpolation, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": True}
 
-    """Test for the Walberla implementation of the LB in single-precision."""
 
-    lb_class = espressomd.lb.LBFluidWalberla
+@utx.skipIfMissingGPU()
+@ut.skipIf(LBInterpolation.system.cell_system.get_state()["n_nodes"] != 1,
+           "only runs for 1 MPI rank")
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class LBInterpolationWalberlaDoublePrecisionGPU(LBInterpolation, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
+    lb_params = {"single_precision": False}
+
+
+@utx.skipIfMissingGPU()
+@ut.skipIf(LBInterpolation.system.cell_system.get_state()["n_nodes"] != 1,
+           "only runs for 1 MPI rank")
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class LBInterpolationWalberlaSinglePrecisionGPU(LBInterpolation, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
     lb_params = {"single_precision": True}
 
 
diff --git a/testsuite/python/lb_lees_edwards_particle_coupling.py b/testsuite/python/lb_lees_edwards_particle_coupling.py
index 103197c721f..713a43e5fea 100644
--- a/testsuite/python/lb_lees_edwards_particle_coupling.py
+++ b/testsuite/python/lb_lees_edwards_particle_coupling.py
@@ -21,22 +21,268 @@
 import espressomd
 import espressomd.lb
 import numpy as np
+import itertools
+import copy
 import unittest_decorators as utx
+from tests_common import fold_index
+
+system = espressomd.System(box_l=[10, 10, 10])
+
+
+def unit_vec(k):
+    res = np.zeros(3)
+    res[k] = 1
+    return res
+
+
+def within_grid(idx, shape):
+    return np.all(idx >= 0) and np.all(idx < shape)
+
+
+def min_image_dist(a, b, l):
+    res = b - a  # displacement from a to b; each axis is wrapped into [-l/2, l/2)
+    for i in range(3):
+        while res[i] < -l[i] / 2: res[i] += l[i]  # shift only component i
+        while res[i] >= l[i] / 2: res[i] -= l[i]  # shift only component i
+    return res
+
+
+def coupling_weight(pos_lb_units, node_idx, lb_shape):
+    # 1. For the coupling weights it does not matter on which side of the
+    #    lb_node the position is
+    # 2. To determine the lb node to position distance, we need
+    #    minimum image convention, node and coupling position can be
+    #    at different sides of a periodic boundary
+    dx = np.abs(min_image_dist(pos_lb_units, node_idx, lb_shape))
+    # If the coupling point is >=1 lattice constant away from the node,
+    # no coupling. Otherwise, distance pos to node via modulo with lattice
+    # constant 1; the weight is the product of (1 - dx) over the three axes
+    weight = 0. if np.any(dx >= 1.) else np.prod(1. - dx)
+
+    return weight
+
+
+def le_aware_lb_nodes_around_pos(
+        folded_pos, lbf, le_pos_offset, shear_direction, shear_plane_normal):
+    """Returns LB node(s) relevant for interpolation around the given position"""
+
+    # helper to get lb node from index with folding
+    def lb_node(idx): return lbf[fold_index(idx, lbf.shape)]
+
+    # center of first lb lattice point is at 0.5 in each Cartesian direction.
+
+    # determine position in lb units for 3 cases;
+    # unshifted, lees-edwards shifted to the left and to the right
+    pos_unshifted_lb_units = folded_pos / lbf.agrid - 0.5  # relative to node centers
+    shear_vec = unit_vec(shear_direction)
+    pos_shifted_left_lb_units = (
+        folded_pos - shear_vec * le_pos_offset) / lbf.agrid - 0.5
+    pos_shifted_right_lb_units = (
+        folded_pos + shear_vec * le_pos_offset) / lbf.agrid - 0.5
+
+    # Particle couples to its 8 neighboring lattice sites
+    # when a particle is at a lattice site in any coordinate,
+    # the right hand side neighbor is included
+    # but the coupling weight for that neighbor is 0
+
+    # find the lower left lb node to which the particle couples
+    lower_idx_unshifted = np.array(np.floor(pos_unshifted_lb_units), dtype=int)
+    lower_idx_shifted_left = np.array(
+        np.floor(pos_shifted_left_lb_units), dtype=int)
+    lower_idx_shifted_right = np.array(
+        np.floor(pos_shifted_right_lb_units), dtype=int)
+
+    ijks = np.array(list(itertools.product([0, 1], repeat=3)))
+    indices_unshifted = [lower_idx_unshifted + ijk for ijk in ijks]
+
+    # Nodes with an index within the primary box in shear_plane_normal direction
+    # do not need Lees-Edwards handling
+    dont_need_shift = [
+        idx for idx in indices_unshifted if within_grid(
+            idx[shear_plane_normal],
+            lbf.shape[shear_plane_normal])]
+    unshifted_nodes = [lb_node(idx) for idx in dont_need_shift]
+    unshifted_weights = [
+        coupling_weight(pos_unshifted_lb_units, idx, lbf.shape)
+        for idx in dont_need_shift]
+
+    # Handle indices which are not in the primary box in the shear plane
+    # normal direction
+    to_be_shifted_left = [
+        (idx, ijk) for idx, ijk in zip(indices_unshifted, ijks)
+        if idx[shear_plane_normal] >= lbf.shape[shear_plane_normal]]
+    to_be_shifted_right = [(idx, ijk) for idx, ijk in zip(
+        indices_unshifted, ijks) if idx[shear_plane_normal] < 0]
+
+    # replace the index in shear direction
+    shifted_left = copy.deepcopy(to_be_shifted_left)
+    for idx, ijk in shifted_left:
+        idx[shear_direction] = lower_idx_shifted_left[shear_direction] + \
+            ijk[shear_direction]
+    shifted_right = copy.deepcopy(to_be_shifted_right)
+    for idx, ijk in shifted_right:
+        idx[shear_direction] = lower_idx_shifted_right[shear_direction] + \
+            ijk[shear_direction]
+
+    weights_shifted_left = [
+        coupling_weight(pos_shifted_left_lb_units, idx, lbf.shape)
+        for idx, _ in shifted_left]
+    weights_shifted_right = [
+        coupling_weight(pos_shifted_right_lb_units, idx, lbf.shape)
+        for idx, _ in shifted_right]
+
+    shifted_nodes = [lb_node(idx) for idx, _ in (shifted_left + shifted_right)]
+    shifted_weights = weights_shifted_left + weights_shifted_right
+
+    np.testing.assert_allclose(sum(unshifted_weights + shifted_weights), 1)
+    assert len(shifted_nodes + unshifted_nodes) == 8
+    assert len(set(shifted_nodes + unshifted_nodes)) == 8  # no duplicates
+
+    return unshifted_nodes, shifted_nodes, unshifted_weights, shifted_weights
 
 
 @utx.skipIfMissingFeatures("WALBERLA")
+@ut.skipIf(np.prod(system.cell_system.node_grid) != 1, "Requires 1 MPI rank")
 class LBLeesEdwardsParticleCoupling(ut.TestCase):
-    def test(self):
-        system = espressomd.System(box_l=[10, 10, 10])
+    """Test LB Lees-Edwards corner cases with a random RNG seed (smoke test)"""
 
+    def test_viscous_coupling_with_offset(self):
+        system.lb = None
         system.time_step = 1
         system.cell_system.skin = 0.1
         system.cell_system.set_n_square()
+        offset = (np.random.random() - 1.) * 6. * system.box_l[1]
+        protocol = lees_edwards.LinearShear(
+            shear_velocity=0, initial_pos_offset=offset, time_0=0.)
+        system.lees_edwards.set_boundary_conditions(
+            shear_direction="x", shear_plane_normal="y", protocol=protocol)
+        lbf = espressomd.lb.LBFluidWalberla(
+            agrid=1., density=1., kinematic_viscosity=1., tau=system.time_step)
+        system.lb = lbf
+        system.thermostat.set_lb(LB_fluid=lbf, seed=123, gamma=1)
+        for _ in range(10):
+            system.part.clear()
+            lbf[:, :, :].velocity = np.zeros(3)
+
+            x = np.random.random() * system.box_l[0]
+            z = np.random.random() * system.box_l[2]
+            # within 0.5 of the lees-edwards boundary
+            y = (np.random.random() - 0.5) % system.box_l[1]
+            pos = np.array((x, y, z))
+            p = system.part.add(pos=pos)
+            v0 = np.random.random(3) - 1 / 2
 
-        offset = 1
-        idx = int(offset)
+            nodes_unshifted, nodes_shifted, weights_unshifted, weights_shifted = \
+                le_aware_lb_nodes_around_pos(pos, lbf, offset, 0, 1)
+            all_nodes = nodes_unshifted + nodes_shifted
+            all_weights = weights_unshifted + weights_shifted
+
+            for n in all_nodes:
+                n.velocity = v0
+
+            system.integrator.run(1)
+
+            # Gather forces applied to the LB by the particle coupling
+            lb_force = np.sum(
+                np.array([n.last_applied_force for n in all_nodes]), axis=0)
+
+            # total force on lb = - force on particle?
+            np.testing.assert_allclose(lb_force, -np.copy(p.f))
+
+            # validate our assumptions about which lb nodes get a force
+            # from the coupling. Exactly the nodes listed in `nodes`
+            # should have received a force during coupling.
+            lb_nodes_with_force_idx = sorted(
+                [n.index for n in lbf[:, :, :] if np.any(n.last_applied_force != 0)])
+            expected_nodes_idx = sorted(
+                [n.index for n, w in zip(all_nodes, all_weights) if w > 0])
+            np.testing.assert_array_equal(
+                lb_nodes_with_force_idx, expected_nodes_idx)
+
+            # force on individual nodes
+            for n, w in zip(all_nodes, all_weights):
+                np.testing.assert_allclose(
+                    np.copy(n.last_applied_force), -w * np.copy(p.f))
+
+    def check_velocity_interpolation(self, pos_offset, shear_vel, test_positions):
+        system.lb = None
+        system.part.clear()
+        system.time_step = 1
+        system.cell_system.skin = 0.1
+        system.cell_system.set_n_square()
+        system.time = 0
+        protocol = lees_edwards.LinearShear(
+            shear_velocity=shear_vel, initial_pos_offset=pos_offset, time_0=0.)
+        system.lees_edwards.set_boundary_conditions(
+            shear_direction="x", shear_plane_normal="y", protocol=protocol)
+        lbf = espressomd.lb.LBFluidWalberla(
+            agrid=1., density=1., kinematic_viscosity=1., tau=system.time_step)
+        system.lb = lbf
+        system.thermostat.set_lb(LB_fluid=lbf, seed=123, gamma=1)
+        system.part.clear()
+
+        def v_x(x): return np.interp(  # periodic, piecewise-linear x-velocity profile
+            x, [0.5, lbf.shape[0] - .5], [0, lbf.shape[0] - 1], period=lbf.shape[0])
+        nodes_at_y_boundary = list(
+            lbf[:, 0, :]) + list(lbf[:, lbf.shape[1] - 1, :])
+        for n in nodes_at_y_boundary:
+            node_x = 0.5 + n.index[0]
+            n.velocity = [v_x(node_x), 0, 0]
+        for pos in test_positions:
+            y = pos[1]  # coordinate along the shear plane normal
+            if y <= 0.5:  # at or below the first layer of node centers in y
+                pref = -1.
+                dist_to_unshifted_lb_nodes = 0.5 - y
+            else:
+                assert y >= system.box_l[1] - 0.5
+                pref = 1.
+                dist_to_unshifted_lb_nodes = y - (system.box_l[1] - 0.5)
+            vel_shift = pref * shear_vel
+            xs = 0.5 + np.arange(lbf.shape[0])
+            ys = [v_x(x - pref * pos_offset) for x in xs]
+            def v_x_shifted(x): return np.interp(
+                x, xs, ys, period=system.box_l[0])
+            unshifted_vel = v_x(pos[0])
+            shifted_vel = v_x_shifted(pos[0]) + vel_shift
+            weight_unshifted = 1 - dist_to_unshifted_lb_nodes
+            weight_shifted = 1 - weight_unshifted
+            expected_vel = np.array(
+                [weight_unshifted * unshifted_vel + weight_shifted * shifted_vel, 0, 0])
+            observed_vel = np.copy(lbf.get_interpolated_velocity(pos=pos))
+            np.testing.assert_allclose(observed_vel, expected_vel)
+
+    def test_vel_interpol_all(self):
+        n = 25
+        xs = np.linspace(0, system.box_l[0], n)
+        y_ls = [0.2] * n
+        y_us = [system.box_l[1] - .2] * n
+        zs = np.random.random(n) * system.box_l[2]
+        pos_lower = np.vstack((xs, y_ls, zs)).T
+        pos_upper = np.vstack((xs, y_us, zs)).T
+        pos_all = np.vstack((pos_lower, pos_upper))
+        # non-integer offset
+        pos_offsets = 100 * system.box_l[0] * (np.random.random(10) - .5)
+        for pos_offset in pos_offsets:
+            self.check_velocity_interpolation(
+                pos_offset, 2 * np.random.random() - 1, pos_all)
+
+    def test_viscous_coupling_with_shear_vel(self):
+        # Place a co-moving particle close to the LE boundary in shear flow.
+        # Check that it remains force-free. This is only the case,
+        # if the periodic images in the halo region calculate
+        # the drag force including the LE shear velocity.
+        box_l = system.box_l
+        system.lb = None
+        system.part.clear()
+        system.time_step = 0.1
+        system.time = 0
+        system.cell_system.skin = 0.1
+        system.cell_system.set_n_square()
+        v_shear = 2. * (np.random.random() - 0.5)
         protocol = lees_edwards.LinearShear(
-            shear_velocity=0., initial_pos_offset=offset, time_0=0.)
+            shear_velocity=v_shear,
+            initial_pos_offset=(np.random.random() - 0.5) * 5. * box_l[0],
+            time_0=np.random.random())
         system.lees_edwards.set_boundary_conditions(
             shear_direction="x", shear_plane_normal="y", protocol=protocol)
 
@@ -44,50 +290,43 @@ def test(self):
             agrid=1., density=1., kinematic_viscosity=1., tau=system.time_step)
         system.lb = lbf
         system.thermostat.set_lb(LB_fluid=lbf, seed=123, gamma=1)
+        system.integrator.run(5000)
+        for n in lbf[:, :, :]:
+            np.testing.assert_allclose(n.velocity[1:], [0, 0], atol=1E-8)
+        pos = np.random.random(3) * box_l
+        p = system.part.add(pos=pos, v=lbf.get_interpolated_velocity(pos=pos))
+        np.testing.assert_allclose(p.v[1:], [0, 0], atol=1E-8)
+        for _ in range(1000):
+            system.integrator.run(1, reuse_forces=True)
+            np.testing.assert_allclose(np.copy(p.f), np.zeros(3), atol=2E-6)
 
-        pos = [system.box_l[0] / 2., 0., system.box_l[0] / 2.]
-        p = system.part.add(pos=pos)
-        v0 = np.array([1, 2, 3])
-        mid_x = lbf.shape[0] // 2
-        mid_z = lbf.shape[2] // 2
-
-        upper_y = lbf.shape[1] - 1
-        nodes = [lbf[mid_x - 1, 0, mid_z],
-                 lbf[mid_x, 0, mid_z - 1],
-                 lbf[mid_x - 1, 0, mid_z],
-                 lbf[mid_x, 0, mid_z],
-                 lbf[mid_x - 1 + idx, upper_y, mid_z],
-                 lbf[mid_x + idx, upper_y, mid_z - 1],
-                 lbf[mid_x - 1 + idx, upper_y, mid_z],
-                 lbf[mid_x + idx, upper_y, mid_z]]
-        for n in nodes:
-            n.velocity = v0
+    def test_momentum_conservation(self):
+        system.lb = None
+        system.part.clear()
+        system.time_step = 0.01
+        system.cell_system.skin = 0.1
+        system.cell_system.set_n_square()
+        v_shear = np.random.random() - 0.5
+        protocol = lees_edwards.LinearShear(
+            shear_velocity=v_shear, initial_pos_offset=13.7, time_0=0.)
+        system.lees_edwards.set_boundary_conditions(
+            shear_direction="x", shear_plane_normal="y", protocol=protocol)
 
+        lbf = espressomd.lb.LBFluidWalberla(
+            agrid=1., density=1., kinematic_viscosity=1., tau=system.time_step)
+        system.lb = lbf
+        system.thermostat.set_lb(LB_fluid=lbf, seed=123, gamma=1)
+        pos = (0, 0, 0)
+        p = system.part.add(pos=pos, v=(0, 0, 0))
         system.integrator.run(1)
-        lb_forces = np.array([n.last_applied_force for n in nodes])
-        lb_force = np.sum(lb_forces, axis=0)
-        np.testing.assert_allclose(lb_force, -np.copy(p.f))
-        for f in lb_forces:
-            np.testing.assert_allclose(f, lb_forces[0])
-
-        lbf[:, :, :].velocity = [0, 0, 0]
-
-        lower_nodes = nodes[:4]
-        upper_nodes = nodes[4:]
-        for n in lower_nodes:
-            n.velocity = v0
-        for n in upper_nodes:
-            n.velocity = - v0
-        p.update(dict(pos=pos, v=np.zeros(3)))
-        np.testing.assert_allclose(
-            np.copy(lbf.get_interpolated_velocity(pos=pos)),
-            np.zeros(3))
-        system.integrator.run(1)
-        np.testing.assert_allclose(np.copy(p.pos), pos)
-        np.testing.assert_allclose(np.copy(p.f), np.zeros(3))
-        for n in nodes:
+        initial_mom = np.copy(system.analysis.linear_momentum())
+        for _ in range(100):
+            system.integrator.run(1)
+            np.testing.assert_allclose(-np.copy(p.f), np.copy(
+                np.sum(lbf[:, :, :].last_applied_force, axis=(0, 1, 2))), atol=1E-9)
+            current_mom = np.copy(system.analysis.linear_momentum())
             np.testing.assert_allclose(
-                np.copy(n.last_applied_force), np.zeros(3))
+                initial_mom[1:], current_mom[1:], atol=2E-7)
 
 
 if __name__ == '__main__':
diff --git a/testsuite/python/lb_mass_conservation.py b/testsuite/python/lb_mass_conservation.py
index cf8ee65c500..fcbbab66b63 100644
--- a/testsuite/python/lb_mass_conservation.py
+++ b/testsuite/python/lb_mass_conservation.py
@@ -67,21 +67,31 @@ def test_mass_conservation(self):
 
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
-class LBMassWalberlaDoublePrecision(LBMassCommon, ut.TestCase):
-
-    """Test for the Walberla implementation of the LB in double-precision."""
-
+class LBMassWalberlaDoublePrecisionCPU(LBMassCommon, ut.TestCase):
     lb_class = espressomd.lb.LBFluidWalberla
     lb_params = {"single_precision": False}
     atol = 1e-10
 
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
-class LBMassWalberlaSinglePrecision(LBMassCommon, ut.TestCase):
+class LBMassWalberlaSinglePrecisionCPU(LBMassCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": True}
+    atol = 5e-7
 
-    """Test for the Walberla implementation of the LB in single-precision."""
 
-    lb_class = espressomd.lb.LBFluidWalberla
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class LBMassWalberlaDoublePrecisionGPU(LBMassCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
+    lb_params = {"single_precision": False}
+    atol = 1e-10
+
+
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class LBMassWalberlaSinglePrecisionGPU(LBMassCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
     lb_params = {"single_precision": True}
     atol = 5e-7
 
diff --git a/testsuite/python/lb_momentum_conservation.py b/testsuite/python/lb_momentum_conservation.py
index 558185a911b..8b3c058efc3 100644
--- a/testsuite/python/lb_momentum_conservation.py
+++ b/testsuite/python/lb_momentum_conservation.py
@@ -102,7 +102,7 @@ def test(self):
 @ut.skipIf(TestLBMomentumConservation.n_nodes == 1,
            "LB with regular decomposition already tested with 2 MPI ranks")
 @utx.skipIfMissingFeatures(["WALBERLA", "EXTERNAL_FORCES"])
-class TestLBMomentumConservationRegularWalberla(
+class TestLBMomentumConservationRegularDoublePrecisionWalberlaCPU(
         TestLBMomentumConservation, ut.TestCase):
 
     lb_class = espressomd.lb.LBFluidWalberla
@@ -116,7 +116,7 @@ def set_cellsystem(self):
 @ut.skipIf(TestLBMomentumConservation.n_nodes == 1,
            "LB with regular decomposition already tested with 2 MPI ranks")
 @utx.skipIfMissingFeatures(["WALBERLA", "EXTERNAL_FORCES"])
-class TestLBMomentumConservationRegularWalberlaSinglePrecision(
+class TestLBMomentumConservationRegularSinglePrecisionWalberlaCPU(
         TestLBMomentumConservation, ut.TestCase):
 
     lb_class = espressomd.lb.LBFluidWalberla
@@ -127,6 +127,21 @@ def set_cellsystem(self):
         self.system.cell_system.set_regular_decomposition()
 
 
+@utx.skipIfMissingGPU()
+@ut.skipIf(TestLBMomentumConservation.n_nodes != 1,
+           "only runs for 1 MPI rank")
+@utx.skipIfMissingFeatures(["WALBERLA", "EXTERNAL_FORCES", "CUDA"])
+class TestLBMomentumConservationRegularSinglePrecisionWalberlaGPU(
+        TestLBMomentumConservation, ut.TestCase):
+
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
+    lb_params = {"single_precision": True}
+    atol = 6.5e-4
+
+    def set_cellsystem(self):
+        self.system.cell_system.set_regular_decomposition()
+
+
 @utx.skipIfMissingFeatures(["WALBERLA", "EXTERNAL_FORCES"])
 class TestLBCPUMomentumConservationHybridNSquareWalberla(
         TestLBMomentumConservation, ut.TestCase):
diff --git a/testsuite/python/lb_poiseuille.py b/testsuite/python/lb_poiseuille.py
index 2176263ed8b..9a4178d7af6 100644
--- a/testsuite/python/lb_poiseuille.py
+++ b/testsuite/python/lb_poiseuille.py
@@ -120,20 +120,28 @@ def test_profile(self):
 
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
-class LBPoiseuilleWalberla(LBPoiseuilleCommon, ut.TestCase):
-
-    """Test for the Walberla implementation of the LB in double-precision."""
-
+class LBPoiseuilleWalberlaDoublePrecisionCPU(LBPoiseuilleCommon, ut.TestCase):
     lb_class = espressomd.lb.LBFluidWalberla
     lb_params = {"single_precision": False}
 
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
-class LBPoiseuilleWalberlaSinglePrecision(LBPoiseuilleCommon, ut.TestCase):
+class LBPoiseuilleWalberlaSinglePrecisionCPU(LBPoiseuilleCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": True}
 
-    """Test for the Walberla implementation of the LB in single-precision."""
 
-    lb_class = espressomd.lb.LBFluidWalberla
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class LBPoiseuilleWalberlaDoublePrecisionGPU(LBPoiseuilleCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
+    lb_params = {"single_precision": False}
+
+
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class LBPoiseuilleWalberlaSinglePrecisionGPU(LBPoiseuilleCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
     lb_params = {"single_precision": True}
 
 
diff --git a/testsuite/python/lb_poiseuille_cylinder.py b/testsuite/python/lb_poiseuille_cylinder.py
index ccc1fa0519c..4499f8661d9 100644
--- a/testsuite/python/lb_poiseuille_cylinder.py
+++ b/testsuite/python/lb_poiseuille_cylinder.py
@@ -197,20 +197,28 @@ def test_z(self):
 
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
-class LBPoiseuilleWalberla(LBPoiseuilleCommon, ut.TestCase):
-
-    """Test for the Walberla implementation of the LB in double-precision."""
-
+class LBPoiseuilleWalberlaDoublePrecisionCPU(LBPoiseuilleCommon, ut.TestCase):
     lb_class = espressomd.lb.LBFluidWalberla
     lb_params = {"single_precision": False}
 
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
-class LBPoiseuilleWalberlaSinglePrecision(LBPoiseuilleCommon, ut.TestCase):
+class LBPoiseuilleWalberlaSinglePrecisionCPU(LBPoiseuilleCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": True}
 
-    """Test for the Walberla implementation of the LB in single-precision."""
 
-    lb_class = espressomd.lb.LBFluidWalberla
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class LBPoiseuilleWalberlaDoublePrecisionGPU(LBPoiseuilleCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
+    lb_params = {"single_precision": False}  # double precision, as the class name states
+
+
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class LBPoiseuilleWalberlaSinglePrecisionGPU(LBPoiseuilleCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
     lb_params = {"single_precision": True}
 
 
diff --git a/testsuite/python/lb_pressure_tensor.py b/testsuite/python/lb_pressure_tensor.py
index 21f43797567..59ff0f2b5db 100644
--- a/testsuite/python/lb_pressure_tensor.py
+++ b/testsuite/python/lb_pressure_tensor.py
@@ -22,7 +22,7 @@
 
 import espressomd
 import espressomd.lb
-#import scipy.optimize
+# import scipy.optimize
 
 N_CELLS = 12
 
diff --git a/testsuite/python/lb_slice.py b/testsuite/python/lb_slice.py
index 840d2501189..09a49dc4bd1 100644
--- a/testsuite/python/lb_slice.py
+++ b/testsuite/python/lb_slice.py
@@ -172,9 +172,6 @@ def test_iterator(self):
 
 @utx.skipIfMissingFeatures("WALBERLA")
 class LBTestWalberlaDoublePrecisionCPU(LBTest, ut.TestCase):
-
-    """Test for the Walberla implementation of the LB in single-precision."""
-
     lb_class = espressomd.lb.LBFluidWalberla
     lb_lattice_class = espressomd.lb.LatticeWalberla
     lb_params = {"single_precision": False}
@@ -182,10 +179,23 @@ class LBTestWalberlaDoublePrecisionCPU(LBTest, ut.TestCase):
 
 @utx.skipIfMissingFeatures("WALBERLA")
 class LBTestWalberlaSinglePrecisionCPU(LBTest, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_lattice_class = espressomd.lb.LatticeWalberla
+    lb_params = {"single_precision": True}
 
-    """Test for the Walberla implementation of the LB in single-precision."""
 
-    lb_class = espressomd.lb.LBFluidWalberla
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class LBTestWalberlaDoublePrecisionGPU(LBTest, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
+    lb_lattice_class = espressomd.lb.LatticeWalberla
+    lb_params = {"single_precision": False}
+
+
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class LBTestWalberlaSinglePrecisionGPU(LBTest, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
     lb_lattice_class = espressomd.lb.LatticeWalberla
     lb_params = {"single_precision": True}
 
diff --git a/testsuite/python/lb_stats.py b/testsuite/python/lb_stats.py
index 11c2c51b88b..7b61d9f309c 100644
--- a/testsuite/python/lb_stats.py
+++ b/testsuite/python/lb_stats.py
@@ -106,9 +106,9 @@ def test_mass_momentum_thermostat(self):
             fluid_temp += np.sum(np.multiply(nodes_dens, nodes_vel))
 
             # Normalize
-            fluid_mass /= np.product(self.lbf.shape)
+            fluid_mass /= np.prod(self.lbf.shape)
             fluid_temp *= self.system.volume() / (
-                3. * np.product(self.lbf.shape)**2)
+                3. * np.prod(self.lbf.shape)**2)
 
             # check mass conversation
             self.assertAlmostEqual(fluid_mass, self.params["dens"], delta=1E-9)
diff --git a/testsuite/python/lb_streaming.py b/testsuite/python/lb_streaming.py
index 1a5740cee74..ad9fefa3502 100644
--- a/testsuite/python/lb_streaming.py
+++ b/testsuite/python/lb_streaming.py
@@ -95,6 +95,7 @@ class LBStreamingCommon:
     system.time_step = TAU
 
     def setUp(self):
+        self.system.box_l = self.box_l
         self.lbf = self.lb_class(**LB_PARAMETERS, **self.lb_params)
         self.system.lb = self.lbf
 
@@ -107,9 +108,7 @@ def test_population_streaming(self):
         grid = np.array(self.system.box_l / AGRID, dtype=int)
 
         # reset fluid populations
-        for i in itertools.product(
-                range(grid[0]), range(grid[1]), range(grid[2])):
-            self.lbf[i].population = pop_default
+        self.lbf[:, :, :].population = pop_default
 
         # check streaming
         for grid_index in itertools.product(
@@ -117,8 +116,8 @@ def test_population_streaming(self):
             self.lbf[grid_index].population = pop_source
             self.system.integrator.run(1)
             for n_v in range(19):
-                target_node_index = np.mod(
-                    grid_index + VELOCITY_VECTORS[n_v], grid)
+                dst_vec = np.array(VELOCITY_VECTORS[n_v])
+                target_node_index = np.mod(grid_index + dst_vec, grid)
                 np.testing.assert_allclose(
                     self.lbf[target_node_index].population[n_v],
                     REFERENCE_POPULATIONS[n_v], rtol=self.rtol,
@@ -127,35 +126,41 @@ def test_population_streaming(self):
 
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
-class LBStreamingWalberla(LBStreamingCommon, ut.TestCase):
-
-    """Test for the Walberla implementation of the LB in double-precision."""
-
+class LBStreamingWalberlaDoublePrecisionCPU(LBStreamingCommon, ut.TestCase):
     lb_class = espressomd.lb.LBFluidWalberla
     lb_params = {"single_precision": False}
+    box_l = [3., 2., 2.]
     rtol = 1e-10
 
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
-class LBStreamingWalberlaSinglePrecision(LBStreamingCommon, ut.TestCase):
-
-    """Test for the Walberla implementation of the LB in single-precision."""
-
+class LBStreamingWalberlaSinglePrecisionCPU(LBStreamingCommon, ut.TestCase):
     lb_class = espressomd.lb.LBFluidWalberla
     lb_params = {"single_precision": True}
+    box_l = [3., 2., 2.]
     rtol = 1e-5
 
 
-# TODO WALBERLA
-# @utx.skipIfMissingGPU()
-# @utx.skipIfMissingFeatures(["WALBERLA"])
-# class LBGPU(LBStreamingCommon, ut.TestCase):
+@utx.skipIfMissingGPU()
+@ut.skipIf(LBStreamingCommon.system.cell_system.get_state()["n_nodes"] > 2,
+           "only runs for 2 or less MPI ranks")
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class LBStreamingWalberlaDoublePrecisionGPU(LBStreamingCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
+    lb_params = {"single_precision": False}
+    box_l = [2., 1.5, 1.5]
+    rtol = 1e-10
 
-#    """Test for the Walberla implementation of the LB on the GPU."""
 
-#    lb_class = espressomd.lb.LBFluidWalberlaGPU
-#    lb_params = {}
-#    rtol = 1e-7
+@utx.skipIfMissingGPU()
+@ut.skipIf(LBStreamingCommon.system.cell_system.get_state()["n_nodes"] > 2,
+           "only runs for 2 or less MPI ranks")
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class LBStreamingWalberlaSinglePrecisionGPU(LBStreamingCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
+    lb_params = {"single_precision": True}
+    box_l = [2., 1.5, 1.5]
+    rtol = 1e-5
 
 
 if __name__ == "__main__":
diff --git a/testsuite/python/lb_thermostat.py b/testsuite/python/lb_thermostat.py
index 8a56d94080b..c67709dd98d 100644
--- a/testsuite/python/lb_thermostat.py
+++ b/testsuite/python/lb_thermostat.py
@@ -204,37 +204,42 @@ def test_friction(self):
     @utx.skipIfMissingFeatures(["PARTICLE_ANISOTROPY",
                                "THERMOSTAT_PER_PARTICLE"])
     def test_exceptions(self):
+        with self.assertRaisesRegex(RuntimeError, r"set_lb\(\) got an unexpected keyword argument 'act_on_virtual'"):
+            self.system.thermostat.set_lb(
+                LB_fluid=self.lbf, act_on_virtual=False)
         self.system.part.add(pos=[0., 0., 0.], gamma=[1., 2., 3.], id=2)
         with self.assertRaisesRegex(Exception, r"ERROR: anisotropic particle \(id 2\) coupled to LB"):
             self.system.integrator.run(1)
 
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
-class LBWalberlaThermostat(LBThermostatCommon, ut.TestCase):
-
-    """Test for the CPU implementation of the LB."""
-
+class LBThermostatWalberlaDoublePrecisionCPU(LBThermostatCommon, ut.TestCase):
     lb_class = espressomd.lb.LBFluidWalberla
     lb_params = {"single_precision": False}
 
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
-class LBWalberlaThermostatSinglePrecision(LBThermostatCommon, ut.TestCase):
-
-    """Test for the CPU implementation of the LB in single-precision."""
-
+class LBThermostatWalberlaSinglePrecisionCPU(LBThermostatCommon, ut.TestCase):
     lb_class = espressomd.lb.LBFluidWalberla
     lb_params = {"single_precision": True}
 
 
-# @utx.skipIfMissingGPU()
-# @utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
-# class LBWalberlaGPUThermostat(LBThermostatCommon, ut.TestCase):
+@utx.skipIfMissingGPU()
+@ut.skipIf(LBThermostatCommon.system.cell_system.get_state()["n_nodes"] != 1,
+           "only runs for 1 MPI rank")
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class LBThermostatWalberlaDoublePrecisionGPU(LBThermostatCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
+    lb_params = {"single_precision": False}
 
-#    """Test for the GPU implementation of the LB."""
 
-#    lb_class = espressomd.lb.LBFluidWalberlaGPU
-#    lb_params = {"single_precision": True}
+@utx.skipIfMissingGPU()
+@ut.skipIf(LBThermostatCommon.system.cell_system.get_state()["n_nodes"] != 1,
+           "only runs for 1 MPI rank")
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class LBThermostatWalberlaSinglePrecisionGPU(LBThermostatCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
+    lb_params = {"single_precision": True}
 
 
 if __name__ == '__main__':
diff --git a/testsuite/python/lees_edwards.py b/testsuite/python/lees_edwards.py
index 60e72e87d57..e0af2956a55 100644
--- a/testsuite/python/lees_edwards.py
+++ b/testsuite/python/lees_edwards.py
@@ -27,16 +27,18 @@
 import numpy as np
 import itertools
 
-def deriv(f,x,h=1E-5):
-   # central difference quotient
-   return 1/(2*h) *(f(x+h) -f(x-h))
+
+def deriv(f, x, h=1E-5):
+    # central difference quotient
+    return 1 / (2 * h) * (f(x + h) - f(x - h))
+
 
 np.random.seed(42)
 params_lin = {'initial_pos_offset': 0.1, 'time_0': 0.1, 'shear_velocity': 1.2}
 params_osc = {'initial_pos_offset': 0.1, 'time_0': -2.1, 'amplitude': 2.3,
-              'omega': 2.51,"decay_rate":0}
+              'omega': 2.51, "decay_rate": 0}
 params_osc_decay = {'initial_pos_offset': 0.1, 'time_0': -2.1, 'amplitude': 2.3,
-              'omega': 2.51,"decay_rate":0.1}
+                    'omega': 2.51, "decay_rate": 0.1}
 
 lin_protocol = espressomd.lees_edwards.LinearShear(**params_lin)
 
@@ -47,8 +49,11 @@ def get_lin_pos_offset(time, initial_pos_offset=None,
 
 
 osc_protocol = espressomd.lees_edwards.OscillatoryShear(**params_osc)
-osc_decay_protocol = espressomd.lees_edwards.OscillatoryShear(**params_osc_decay)
+osc_decay_protocol = espressomd.lees_edwards.OscillatoryShear(
+    **params_osc_decay)
 off_protocol = espressomd.lees_edwards.Off()
+const_offset_protocol = espressomd.lees_edwards.LinearShear(
+    initial_pos_offset=2.2, shear_velocity=0)
 
 
 def axis(coord):
@@ -60,21 +65,27 @@ def axis(coord):
 
 
 class LeesEdwards(ut.TestCase):
+    box_l = [5, 5, 5]
+    system = espressomd.System(box_l=box_l)
+    node_grid = np.copy(system.cell_system.node_grid)
+    n_nodes = np.prod(node_grid)
 
-    system = espressomd.System(box_l=[5.0, 5.0, 5.0])
-    system.cell_system.skin = 0.0
-    system.cell_system.set_n_square(use_verlet_lists=True)
-
-    time_step = 0.5
-    system.time_step = time_step
     direction_permutations = list(itertools.permutations(["x", "y", "z"], 2))
 
     def setUp(self):
-        self.system.time = 0.0
+        system = self.system
+        system.box_l = self.box_l
+        system.cell_system.skin = 0.
+        system.cell_system.set_n_square(use_verlet_lists=True)
+        system.time = 0.0
+        system.time_step = 0.5
+        system.min_global_cut = 0.
+        system.cell_system.node_grid = self.node_grid
 
     def tearDown(self):
         system = self.system
         system.part.clear()
+        system.bonded_inter.clear()
         system.lees_edwards.protocol = None
         if espressomd.has_features("COLLISION_DETECTION"):
             system.collision_detection.set_params(mode="off")
@@ -134,7 +145,6 @@ def test_protocols(self):
             self.assertAlmostEqual(
                 system.lees_edwards.pos_offset, expected_pos)
 
-
         system.time = 0.0
 
         # Check that time change during integration updates offsets
@@ -157,32 +167,31 @@ def test_protocols(self):
             shear_direction="y", shear_plane_normal="z", protocol=lin_protocol)
         self.assertEqual(system.lees_edwards.shear_direction, "y")
         self.assertEqual(system.lees_edwards.shear_plane_normal, "z")
-        
+
         # Oscillatory shear
         # Oscillatory shear with exponential decay
         system.lees_edwards.protocol = osc_decay_protocol
 
         # check parameter setter/getter consistency
-        self.assertEqual(system.lees_edwards.protocol.get_params(), params_osc_decay)
-        osc_decay_pos = lambda t: params_osc_decay["initial_pos_offset"] +\
-            params_osc_decay["amplitude"]*np.exp(-(t-params_osc_decay["time_0"]) *params_osc_decay["decay_rate"]) *\
-                np.sin(params_osc_decay["omega"]*(t-params_osc_decay["time_0"]))
+        self.assertEqual(
+            system.lees_edwards.protocol.get_params(), params_osc_decay)
 
+        def osc_decay_pos(t): return params_osc_decay["initial_pos_offset"] +\
+            params_osc_decay["amplitude"] * np.exp(-(t - params_osc_decay["time_0"]) * params_osc_decay["decay_rate"]) *\
+            np.sin(params_osc_decay["omega"] *
+                   (t - params_osc_decay["time_0"]))
 
         # check pos offset and shear velocity at different times,
         # check that LE offsets are recalculated on simulation time change
         for time in [0., 2.3]:
             system.time = time
             expected_pos = osc_decay_pos(time)
-            expected_vel = deriv(osc_decay_pos,time)
+            expected_vel = deriv(osc_decay_pos, time)
             self.assertAlmostEqual(
                 system.lees_edwards.pos_offset, expected_pos)
             self.assertAlmostEqual(
                 system.lees_edwards.shear_velocity, expected_vel)
 
-
-
-
         # Check that LE is disabled correctly via parameter
         system.lees_edwards.protocol = None
         self.assertIsNone(system.lees_edwards.shear_direction)
@@ -222,6 +231,21 @@ def test_protocols(self):
                     shear_direction=valid, shear_plane_normal=valid,
                     protocol=lin_protocol)
 
+        with self.assertRaisesRegex(ValueError, "fully_connected_boundary normal and connection coordinates need to differ"):
+            system.cell_system.set_regular_decomposition(
+                fully_connected_boundary={"boundary": "z", "direction": "z"})
+        self.assertEqual(system.cell_system.decomposition_type, "n_square")
+        with self.assertRaisesRegex(ValueError, "Invalid Cartesian coordinate: 't'"):
+            system.cell_system.set_regular_decomposition(
+                fully_connected_boundary={"boundary": "z", "direction": "t"})
+        self.assertEqual(system.cell_system.decomposition_type, "n_square")
+        if self.n_nodes > 1:
+            with self.assertRaisesRegex(RuntimeError, "The MPI nodegrid must be 1 in the fully connected direction"):
+                system.cell_system.node_grid = [1, self.n_nodes, 1]
+                system.cell_system.set_regular_decomposition(
+                    fully_connected_boundary={"boundary": "z", "direction": "y"})
+            self.assertEqual(system.cell_system.decomposition_type, "n_square")
+
     def test_boundary_crossing_lin(self):
         """
         A particle crosses the upper and lower boundary with linear shear.
@@ -331,13 +355,13 @@ def test_trajectory_reconstruction(self):
         crossing_time = system.time
         system.integrator.run(1)
         np.testing.assert_almost_equal(
-            p.lees_edwards_offset, 
+            p.lees_edwards_offset,
             get_lin_pos_offset(crossing_time, **params_lin))
         np.testing.assert_almost_equal(p.lees_edwards_flag, -1)
 
         system.integrator.run(1)  # no boundary crossing
         np.testing.assert_almost_equal(
-            p.lees_edwards_offset, 
+            p.lees_edwards_offset,
             get_lin_pos_offset(crossing_time, **params_lin))
 
         np.testing.assert_almost_equal(p.lees_edwards_flag, 0)
@@ -516,6 +540,7 @@ def test_virt_sites_interaction(self):
             shear_velocity=2.0, initial_pos_offset=0.0)
         system.lees_edwards.set_boundary_conditions(
             shear_direction="x", shear_plane_normal="y", protocol=protocol)
+        system.min_global_cut = 2.5
         p1 = system.part.add(pos=[2.5, 2.5, 2.5], type=10,
                              rotation=3 * (True,), v=(0.0, -0.1, -0.25))
         p2 = system.part.add(pos=(2.5, 3.5, 2.5), type=11)
@@ -566,6 +591,42 @@ def test_virt_sites_interaction(self):
             weight_function=0, gamma=0, r_cut=0,
             trans_weight_function=0, trans_gamma=0, trans_r_cut=0)
 
+    @utx.skipIfMissingFeatures(
+        ["EXTERNAL_FORCES", "VIRTUAL_SITES_RELATIVE"])
+    def test__virt_sites_rotation(self):
+        """
+        A particle with virtual sites is placed on the boundary. We check if
+        the forces yield the correct torque and if a rotation frequency is
+        transmitted back to the virtual sites.
+        """
+
+        system = self.system
+        system.part.clear()
+        system.min_global_cut = 2.5
+
+        system.lees_edwards.set_boundary_conditions(
+            shear_direction="x", shear_plane_normal="y", protocol=lin_protocol)
+
+        p1 = system.part.add(
+            id=0, pos=[2.5, 5.0, 2.5], rotation=[True] * 3)
+        # virtual sites at y = 6 and y = 4 with opposite forces along x
+        p2 = system.part.add(pos=(2.5, 6.0, 2.5), ext_force=(1.0, 0., 0.))
+        p2.vs_auto_relate_to(0)
+        p3 = system.part.add(pos=(2.5, 4.0, 2.5), ext_force=(-1.0, 0., 0.))
+        p3.vs_auto_relate_to(0)
+
+        system.integrator.run(0, recalc_forces=True)
+        # expected torque on p1: each site contributes r x F = (0, 0, -1)
+        np.testing.assert_array_almost_equal(
+            np.copy(p1.torque_lab), [0.0, 0.0, -2.0])
+        # with p1 spinning, each site must move with velocity omega x r
+        p1.omega_lab = (0., 0., 2.5)
+        system.integrator.run(0, recalc_forces=True)
+        for vs in p2, p3:
+            np.testing.assert_array_almost_equal(
+                system.velocity_difference(p1, vs),
+                np.cross(p1.omega_lab, system.distance_vec(p1, vs)))
+
     @utx.skipIfMissingFeatures(
         ["EXTERNAL_FORCES", "VIRTUAL_SITES_RELATIVE", "COLLISION_DETECTION"])
     def test_le_colldet(self):
@@ -716,64 +777,93 @@ def test_le_breaking_bonds(self):
             bond_list += p.bonds
         np.testing.assert_array_equal(len(bond_list), 0)
 
-    def setup_lj_liquid(self):
-        system = self.system
-        system.cell_system.set_n_square(use_verlet_lists=False)
-        # Parameters
-        n = 100
-        phi = 0.4
-        sigma = 1.
-        eps = 1
-        cut = sigma * 2**(1 / 6)
-
-        # box
-        l = (n / 6. * np.pi * sigma**3 / phi)**(1. / 3.)
-
-        # Setup
-        system.box_l = [l, l, l]
-        system.lees_edwards.protocol = None
-
-        system.time_step = 0.01
-        system.thermostat.turn_off()
-
-        np.random.seed(42)
-        system.part.add(pos=np.random.random((n, 3)) * l)
-
-        # interactions
-        system.non_bonded_inter[0, 0].lennard_jones.set_params(
-            epsilon=eps, sigma=sigma, cutoff=cut, shift="auto")
-        # Remove overlap
-        system.integrator.set_steepest_descent(
-            f_max=0, gamma=0.05, max_displacement=0.05)
-        while system.analysis.energy()["total"] > 0.5 * n:
-            system.integrator.run(5)
+    const_offset_params = {
+        'shear_velocity': 0.0,
+        'shear_direction': 0,
+        'shear_plane_normal': 1,
+        'initial_pos_offset': 17.2}
 
-        system.integrator.set_vv()
+    shear_params = {
+        'shear_velocity': 0.1,
+        'shear_direction': 0,
+        'shear_plane_normal': 2,
+        'initial_pos_offset': -np.sqrt(0.1)}
 
     @utx.skipIfMissingFeatures("LENNARD_JONES")
-    def test_zz_lj(self):
+    def run_lj_pair_visibility(self, shear_direction, shear_plane_normal):
         """
-        Simulate an LJ liquid under linear shear and verify forces.
+        Simulate LJ particles coming into contact under linear shear and verify forces.
         This is to make sure that no pairs get lost or are outdated
-        in the short range loop. To have deterministic forces, velocity
-        capping is used rather than a thermostat.
+        in the short range loop.
         """
+        shear_axis, normal_axis = axis(
+            shear_direction), axis(shear_plane_normal)
         system = self.system
-        self.setup_lj_liquid()
+        system.part.clear()
+        system.time = 0
+        system.time_step = 0.1
         protocol = espressomd.lees_edwards.LinearShear(
-            shear_velocity=0.3, initial_pos_offset=0.01)
+            shear_velocity=3, initial_pos_offset=5)
         system.lees_edwards.set_boundary_conditions(
-            shear_direction="z", shear_plane_normal="x", protocol=protocol)
-        system.integrator.run(1, recalc_forces=True)
-        tests_common.check_non_bonded_loop_trace(self, system)
+            shear_direction=shear_direction, shear_plane_normal=shear_plane_normal, protocol=protocol)
+        system.cell_system.skin = 0.2
+        system.non_bonded_inter[0, 0].lennard_jones.set_params(
+            epsilon=1E-6, sigma=1, cutoff=1.2, shift="auto")
+        system.part.add(
+            pos=(0.1 * normal_axis, -0.8 * normal_axis),
+            v=(1.0 * shear_axis, -0.3 * shear_axis))
+        assert np.all(system.part.all().f == 0.)
+        tests_common.check_non_bonded_loop_trace(
+            self, system, cutoff=system.non_bonded_inter[0, 0].lennard_jones.get_params()["cutoff"] + system.cell_system.skin)
 
         # Rewind the clock to get back the LE offset applied during force calc
         system.time = system.time - system.time_step
         tests_common.verify_lj_forces(system, 1E-7)
-
-        system.thermostat.set_langevin(kT=.1, gamma=5, seed=2)
-        system.integrator.run(50)
-        tests_common.check_non_bonded_loop_trace(self, system)
+        have_interacted = False
+        for _ in range(50):
+            system.integrator.run(3)
+            if np.any(np.abs(system.part.all().f) > 0):
+                have_interacted = True
+            tests_common.check_non_bonded_loop_trace(
+                self, system, cutoff=system.non_bonded_inter[0, 0].lennard_jones.get_params()["cutoff"] + system.cell_system.skin)
+            system.time = system.time - system.time_step
+            tests_common.verify_lj_forces(system, 1E-7)
+        assert have_interacted
+
+    def test_zz_lj_pair_visibility(self):
+        # Negative control: without a fully connected boundary, the regular
+        # decomposition must fail the pair visibility check (and only that).
+        system = self.system
+        system.box_l = [10, 10, 10]
+        system.cell_system.set_regular_decomposition(
+            fully_connected_boundary=None)
+        self.assertIsNone(system.cell_system.fully_connected_boundary)
+        system.cell_system.node_grid = [1, self.n_nodes, 1]
+        with self.assertRaises(AssertionError):
+            self.run_lj_pair_visibility("x", "y")
+        # the N-square cell system must pass for every shear geometry
+        for verlet in (False, True):
+            for shear_direction, shear_plane_normal in self.direction_permutations:
+                system.cell_system.set_n_square(use_verlet_lists=verlet)
+                self.run_lj_pair_visibility(
+                    shear_direction, shear_plane_normal)
+        # regular decomposition must pass once the boundary is fully connected
+        for verlet in (False, True):
+            for shear_direction, shear_plane_normal in self.direction_permutations:
+                system.cell_system.set_regular_decomposition(
+                    fully_connected_boundary=None)
+                normal_axis = axis(shear_plane_normal)
+                system.cell_system.node_grid = [
+                    self.n_nodes if normal_axis[i] == 1 else 1 for i in range(3)]
+                fully_connected_boundary = {"boundary": shear_plane_normal,
+                                            "direction": shear_direction}
+                system.cell_system.set_regular_decomposition(
+                    use_verlet_lists=verlet,
+                    fully_connected_boundary=fully_connected_boundary)
+                self.assertEqual(system.cell_system.fully_connected_boundary,
+                                 fully_connected_boundary)
+                self.run_lj_pair_visibility(
+                    shear_direction, shear_plane_normal)
 
 
 if __name__ == "__main__":
diff --git a/testsuite/python/long_range_actors.py b/testsuite/python/long_range_actors.py
index ae8249fb004..cc5784b54c0 100644
--- a/testsuite/python/long_range_actors.py
+++ b/testsuite/python/long_range_actors.py
@@ -216,7 +216,6 @@ def test_dh_pressure(self):
         np.testing.assert_allclose(pressure_scalar_far_field, 0., atol=1e-12)
 
     @utx.skipIfMissingFeatures(["DIPOLES"])
-    @ut.skipIf(n_nodes > 1, "only runs for 1 MPI rank")
     def test_mdds_cpu_no_magnetic_particles(self):
         self.system.part.add(pos=2 * [[1., 1., 1.]], dip=2 * [[0., 0., 0.]])
         mdds = espressomd.magnetostatics.DipolarDirectSumCpu(prefactor=2.)
@@ -293,8 +292,10 @@ def test_dp3m_cpu_tuning_errors(self):
             prefactor=1., accuracy=1e-3)
         self.check_p3m_tuning_errors(dp3m)
 
-    def check_mmm1d_exceptions(self, mmm1d_class):
-        mmm1d = mmm1d_class(prefactor=1., maxPWerror=1e-2)
+    @utx.skipIfMissingFeatures(["ELECTROSTATICS"])
+    def test_mmm1d_cpu_exceptions(self):
+        self.system.periodicity = (False, False, True)
+        mmm1d = espressomd.electrostatics.MMM1D(prefactor=1., maxPWerror=1e-2)
 
         # check cell system exceptions
         with self.assertRaisesRegex(Exception, "MMM1D requires the N-square cellsystem"):
@@ -310,32 +311,13 @@ def check_mmm1d_exceptions(self, mmm1d_class):
                 continue
             self.system.periodicity = periodicity
             with self.assertRaisesRegex(Exception, r"MMM1D requires periodicity \(False, False, True\)"):
-                mmm1d = mmm1d_class(prefactor=1., maxPWerror=1e-2)
+                mmm1d = espressomd.electrostatics.MMM1D(
+                    prefactor=1., maxPWerror=1e-2)
                 self.system.electrostatics.solver = mmm1d
             self.assertIsNone(self.system.electrostatics.solver)
             self.assertFalse(mmm1d.is_tuned)
         self.system.periodicity = (False, False, True)
 
-    @utx.skipIfMissingFeatures(["ELECTROSTATICS"])
-    def test_mmm1d_cpu_exceptions(self):
-        self.system.periodicity = (False, False, True)
-        self.check_mmm1d_exceptions(espressomd.electrostatics.MMM1D)
-
-    @utx.skipIfMissingGPU()
-    @utx.skipIfMissingFeatures(["MMM1D_GPU"])
-    @ut.skipIf(n_nodes > 3, "only runs for 3 or less MPI ranks")
-    def test_mmm1d_gpu_exceptions(self):
-        # VRAM peak memory usage: 700 MiB on 4 MPI cores, 500 on 3 MPI cores
-        self.system.periodicity = (False, False, True)
-        self.check_mmm1d_exceptions(espressomd.electrostatics.MMM1DGPU)
-
-        self.system.electrostatics.solver = None
-        with self.assertRaisesRegex(ValueError, "Parameter 'far_switch_radius' must not be larger than box length"):
-            self.system.electrostatics.solver = espressomd.electrostatics.MMM1DGPU(
-                prefactor=1., maxPWerror=1e-2,
-                far_switch_radius=2. * self.system.box_l[2])
-        self.assertIsNone(self.system.electrostatics.solver)
-
     @utx.skipIfMissingFeatures(["P3M"])
     def test_elc_tuning_exceptions(self):
         p3m = espressomd.electrostatics.P3M(**self.valid_p3m_parameters())
diff --git a/testsuite/python/mmm1d.py b/testsuite/python/mmm1d.py
index a2ad6fe3b70..e2f5b27d21d 100644
--- a/testsuite/python/mmm1d.py
+++ b/testsuite/python/mmm1d.py
@@ -23,9 +23,9 @@
 import espressomd.electrostatics
 
 
-class ElectrostaticInteractionsTests:
+@utx.skipIfMissingFeatures(["ELECTROSTATICS"])
+class Test(ut.TestCase):
 
-    # Handle to espresso system
     system = espressomd.System(box_l=[10.0] * 3)
     system.periodicity = [False, False, True]
     system.time_step = 0.01
@@ -38,6 +38,7 @@ class ElectrostaticInteractionsTests:
     p_q = data[:, 4]
     forces_target = data[:, 5:8]
     energy_target = -7.156365298205383
+    allowed_error = 2e-5
 
     def setUp(self):
         self.system.box_l = [10.0] * 3
@@ -50,7 +51,7 @@ def tearDown(self):
 
     def test_forces_and_energy(self):
         self.system.part.add(pos=self.p_pos, q=self.p_q)
-        mmm1d = self.MMM1D(prefactor=1.0, maxPWerror=1e-20)
+        mmm1d = espressomd.electrostatics.MMM1D(prefactor=1., maxPWerror=1e-20)
         self.system.electrostatics.solver = mmm1d
         self.system.integrator.run(steps=0)
         measured_f = np.copy(self.system.part.all().f)
@@ -84,7 +85,7 @@ def check_with_analytical_result(self, prefactor, accuracy):
     def test_with_analytical_result(self):
         self.system.part.add(pos=[0, 0, 0], q=1)
         self.system.part.add(pos=[0, 0, 1], q=-1)
-        mmm1d = self.MMM1D(prefactor=1.0, maxPWerror=1e-20)
+        mmm1d = espressomd.electrostatics.MMM1D(prefactor=1., maxPWerror=1e-20)
         self.system.electrostatics.solver = mmm1d
         self.assertTrue(mmm1d.is_tuned)
         self.system.integrator.run(steps=0, recalc_forces=True)
@@ -93,7 +94,7 @@ def test_with_analytical_result(self):
     def test_bjerrum_length_change(self):
         self.system.part.add(pos=[0, 0, 0], q=1)
         self.system.part.add(pos=[0, 0, 1], q=-1)
-        mmm1d = self.MMM1D(prefactor=2.0, maxPWerror=1e-20)
+        mmm1d = espressomd.electrostatics.MMM1D(prefactor=2., maxPWerror=1e-20)
         self.system.electrostatics.solver = mmm1d
         self.assertTrue(mmm1d.is_tuned)
         self.system.integrator.run(steps=0, recalc_forces=True)
@@ -101,7 +102,8 @@ def test_bjerrum_length_change(self):
 
         # actor should remain in a valid state after a cell system reset
         forces1 = np.copy(self.system.part.all().f)
-        self.system.box_l = self.system.box_l
+        self.system.change_volume_and_rescale_particles(
+            self.system.box_l[0], "x")
         self.system.periodicity = self.system.periodicity
         self.system.cell_system.node_grid = self.system.cell_system.node_grid
         self.system.integrator.run(steps=0, recalc_forces=True)
@@ -122,7 +124,7 @@ def test_infinite_wire(self):
         for i in range(n_pairs):
             self.system.part.add(pos=[0., 0., 2. * i + 0.], q=+1.)
             self.system.part.add(pos=[0., 0., 2. * i + 1.], q=-1.)
-        mmm1d = self.MMM1D(
+        mmm1d = espressomd.electrostatics.MMM1D(
             prefactor=1., maxPWerror=1e-20, far_switch_radius=n_pairs / 2.)
         self.system.electrostatics.solver = mmm1d
         energy = self.system.analysis.energy()["coulomb"]
@@ -134,20 +136,5 @@ def test_infinite_wire(self):
         np.testing.assert_allclose(p_tensor, 0., atol=1e-12)
 
 
-@utx.skipIfMissingFeatures(["ELECTROSTATICS"])
-class MMM1D_Test(ElectrostaticInteractionsTests, ut.TestCase):
-
-    allowed_error = 2e-5
-    MMM1D = espressomd.electrostatics.MMM1D
-
-
-@utx.skipIfMissingFeatures(["ELECTROSTATICS", "MMM1D_GPU"])
-@utx.skipIfMissingGPU()
-class MMM1D_GPU_Test(ElectrostaticInteractionsTests, ut.TestCase):
-
-    allowed_error = 1e-4
-    MMM1D = espressomd.electrostatics.MMM1DGPU
-
-
 if __name__ == "__main__":
     ut.main()
diff --git a/testsuite/python/npt_thermostat.py b/testsuite/python/npt_thermostat.py
index 6a3efebc154..ce97dcfc4e4 100644
--- a/testsuite/python/npt_thermostat.py
+++ b/testsuite/python/npt_thermostat.py
@@ -45,8 +45,8 @@ def tearDown(self):
     def test_01__rng(self):
         """Test for RNG consistency."""
         def reset_particle_and_box():
-            self.system.box_l = [1, 1, 1]
             self.system.part.clear()
+            self.system.box_l = [1, 1, 1]
             p = self.system.part.add(pos=[0, 0, 0])
             return p
 
@@ -123,13 +123,13 @@ def test_02__direction(self):
 
         system = self.system
         system.box_l = 3 * [ref_box_l]
-        system.part.add(pos=data[:, 0:3], type=len(data) * [2])
         system.non_bonded_inter[2, 2].wca.set_params(epsilon=1., sigma=1.)
         system.time_step = 0.01
 
         for n in range(3):
             direction = np.roll([True, False, False], n)
             system.box_l = 3 * [ref_box_l]
+            system.part.add(pos=data[:, 0:3], type=len(data) * [2])
             system.part.all().pos = data[:, 0:3]
             system.part.all().v = data[:, 3:6]
             system.thermostat.set_npt(kT=1.0, gamma0=2, gammav=0.004, seed=42)
@@ -140,6 +140,7 @@ def test_02__direction(self):
             box_l_rel_ref = np.roll([np.max(box_l_rel), 1., 1.], n)
             np.testing.assert_allclose(box_l_rel, box_l_rel_ref, atol=1e-10)
             self.assertGreater(np.max(box_l_rel), 2)
+            system.part.clear()
 
     def test_07__virtual(self):
         system = self.system
diff --git a/testsuite/python/observable_cylindricalLB.py b/testsuite/python/observable_cylindricalLB.py
index 60423580898..56b390ec078 100644
--- a/testsuite/python/observable_cylindricalLB.py
+++ b/testsuite/python/observable_cylindricalLB.py
@@ -134,12 +134,8 @@ def setup_system_get_np_hist(self):
                 self.align_with_observable_frame(pos) +
                 self.cyl_transform_params.center)
             vel_aligned.append(self.align_with_observable_frame(vel))
-        node_aligned = np.array(
-            np.rint(
-                np.array(pos_aligned) -
-                3 *
-                [0.5]),
-            dtype=int)
+        node_aligned = np.array(np.rint(np.array(pos_aligned) - 3 * [0.5]),
+                                dtype=int)
         self.system.part.add(pos=pos_aligned, v=vel_aligned)
         self.params['ids'] = self.system.part.all().id
 
@@ -277,22 +273,32 @@ def test_cylindrical_lb_flux_density_obs(self):
 
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
-class CylindricalLBObservableWalberla(
+class CylindricalLBObservableWalberlaDoubePrecisionCPU(
         CylindricalLBObservableCommon, ut.TestCase):
-
-    """Test for the Walberla implementation of the LB in double-precision."""
-
     lb_class = espressomd.lb.LBFluidWalberla
     lb_params_extra = {"single_precision": False}
 
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
-class CylindricalLBObservableWalberlaSinglePrecision(
-        CylindricalLBObservableWalberla, ut.TestCase):
+class CylindricalLBObservableWalberlaSinglePrecisionCPU(
+        CylindricalLBObservableCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params_extra = {"single_precision": True}
 
-    """Test for the Walberla implementation of the LB in single-precision."""
 
-    lb_class = espressomd.lb.LBFluidWalberla
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class CylindricalLBObservableWalberlaDoubePrecisionGPU(
+        CylindricalLBObservableCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
+    lb_params_extra = {"single_precision": False}
+
+
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class CylindricalLBObservableWalberlaSinglePrecisionGPU(
+        CylindricalLBObservableCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
     lb_params_extra = {"single_precision": True}
 
 
diff --git a/testsuite/python/observable_profileLB.py b/testsuite/python/observable_profileLB.py
index c2dbc8f1efa..7ca99b4a16e 100644
--- a/testsuite/python/observable_profileLB.py
+++ b/testsuite/python/observable_profileLB.py
@@ -76,15 +76,10 @@ def setUp(self):
     def tearDown(self):
         self.system.lb = None
 
-    def set_fluid_velocities(self):
-        """Set an x dependent fluid velocity."""
-        for x in range(int(np.around(self.system.box_l[0] / AGRID))):
-            for y in range(int(np.around(self.system.box_l[1] / AGRID))):
-                for z in range(int(np.around(self.system.box_l[2] / AGRID))):
-                    self.lbf[x, y, z].velocity = [float(x), 0.0, 0.0]
-
     def test_velocity_profile(self):
-        self.set_fluid_velocities()
+        # set an x-dependent fluid velocity
+        for x in range(self.lbf.shape[0]):
+            self.lbf[x, :, :].velocity = [float(x), 0.0, 0.0]
         obs = espressomd.observables.LBVelocityProfile(
             **LB_VELOCITY_PROFILE_PARAMS)
         obs_data = obs.calculate()
@@ -200,21 +195,32 @@ def test_lb_profile_interface(self):
 
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
-class ObservableProfileWalberla(ObservableProfileLBCommon, ut.TestCase):
-
-    """Test for the Walberla implementation of the LB in double-precision."""
-
+class ObservableProfileWalberlaDoublePrecisionCPU(
+        ObservableProfileLBCommon, ut.TestCase):
     lb_class = espressomd.lb.LBFluidWalberla
     lb_params = {"single_precision": False}
 
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
-class ObservableProfileWalberlaSinglePrecision(
+class ObservableProfileWalberlaSinglePrecisionCPU(
+        ObservableProfileLBCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": True}
+
+
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class ObservableProfileWalberlaDoublePrecisionGPU(
         ObservableProfileLBCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
+    lb_params = {"single_precision": False}
 
-    """Test for the Walberla implementation of the LB in single-precision."""
 
-    lb_class = espressomd.lb.LBFluidWalberla
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class ObservableProfileWalberlaSinglePrecisionGPU(
+        ObservableProfileLBCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
     lb_params = {"single_precision": True}
 
 
diff --git a/testsuite/python/observables.py b/testsuite/python/observables.py
index a0d637529c4..1385c7bfd43 100644
--- a/testsuite/python/observables.py
+++ b/testsuite/python/observables.py
@@ -121,7 +121,7 @@ def func(self):
             np.testing.assert_array_almost_equal(
                 obs_data,
                 part_data,
-                err_msg=f"Data did not agree for observable {obs_class.__name__} and particle property {pprop_name}",
+                err_msg=f"Data did not agree for observable {obs_class.__name__} and particle property {pprop_name}",  # nopep8
                 decimal=11)
 
             # Test setters and getters
diff --git a/testsuite/python/p3m_tuning_exceptions.py b/testsuite/python/p3m_tuning_exceptions.py
index 677ccfb59b1..1796c9994cb 100644
--- a/testsuite/python/p3m_tuning_exceptions.py
+++ b/testsuite/python/p3m_tuning_exceptions.py
@@ -166,7 +166,9 @@ def test_03_non_cubic_box_p3m_cpu(self):
         with self.assertRaisesRegex(RuntimeError, 'P3M: non-metallic epsilon requires cubic box'):
             self.system.electrostatics.solver = solver
 
+        self.system.part.clear()
         self.system.box_l = [10., 10., 10.]
+        self.add_charged_particles()
         solver = espressomd.electrostatics.P3M(
             prefactor=2, accuracy=1e-2, epsilon=1, mesh=[4, 8, 8])
         with self.assertRaisesRegex(RuntimeError, 'P3M: non-metallic epsilon requires cubic box'):
@@ -326,11 +328,11 @@ def check_invalid_params_elc_p3m(self, container, solver_p3m):
 
         with self.assertRaisesRegex(RuntimeError, "ELC tuning failed: maxPWerror too small"):
             # reduce box size to make tuning converge in at most 50 steps
-            self.system.box_l = [1., 1., 1.]
+            self.system.change_volume_and_rescale_particles(1., "xyz")
             elc = ELC(actor=solver_p3m, gap_size=0.5, maxPWerror=1e-90)
             self.system.electrostatics.solver = elc
         self.assertIsNone(self.system.electrostatics.solver)
-        self.system.box_l = [10., 10., 10.]
+        self.system.change_volume_and_rescale_particles(10., "xyz")
 
         # r_cut > gap isn't allowed with dielectric contrasts
         p3m = espressomd.electrostatics.P3M(
@@ -453,7 +455,7 @@ def test_09_no_errors_p3m_cpu_rescale_mesh(self):
         np.testing.assert_equal(np.copy(solver.mesh), [8, 12, 16])
 
         # check MD cell reset event
-        self.system.box_l = self.system.box_l
+        self.system.change_volume_and_rescale_particles(10., "x")
         self.system.periodicity = self.system.periodicity
         self.system.cell_system.node_grid = self.system.cell_system.node_grid
 
@@ -471,7 +473,7 @@ def test_09_no_errors_p3m_gpu_rescale_mesh(self):
         np.testing.assert_equal(np.copy(solver.mesh), [20, 20, 40])
 
         # check MD cell reset event
-        self.system.box_l = self.system.box_l
+        self.system.change_volume_and_rescale_particles(10., "x")
         self.system.periodicity = self.system.periodicity
         self.system.cell_system.node_grid = self.system.cell_system.node_grid
 
@@ -487,7 +489,7 @@ def test_09_no_errors_dp3m_cpu_rescale_mesh(self):
         self.system.magnetostatics.solver = solver
 
         # check MD cell reset event
-        self.system.box_l = self.system.box_l
+        self.system.change_volume_and_rescale_particles(10., "x")
         self.system.periodicity = self.system.periodicity
         self.system.cell_system.node_grid = self.system.cell_system.node_grid
 
@@ -495,9 +497,13 @@ def check_tuning_layer_corrections(
             self, container, class_p3m, class_lc, params):
         if class_p3m is espressomd.magnetostatics.DipolarP3M:
             mesh_a = np.array([2., 2., 2.])
+            self.system.change_volume_and_rescale_particles(
+                mesh_a[0] * params["mesh"][0], "xyz")
         else:
             mesh_a = np.array([2., 4., 8.])
-        self.system.box_l = mesh_a * params["mesh"]
+            for i in range(3):
+                self.system.change_volume_and_rescale_particles(
+                    mesh_a[i] * params["mesh"][i], "xyz"[i])
         self.system.time_step = 0.01
         non_metallic_epsilon = 20.
         p3m = class_p3m(epsilon=non_metallic_epsilon, **params)
@@ -520,7 +526,13 @@ def check_tuning_layer_corrections(
         np.testing.assert_allclose(p3m.r_cut_iL, r_cut_iL, atol=1e-12)
         np.testing.assert_allclose(p3m.alpha_L, alpha_L, atol=1e-12)
         mesh_a = np.array([4., 4., 4.])
-        self.system.box_l = mesh_a * params["mesh"]
+        if class_p3m is espressomd.magnetostatics.DipolarP3M:
+            self.system.change_volume_and_rescale_particles(
+                mesh_a[0] * params["mesh"][0], "xyz")
+        else:
+            for i in range(3):
+                self.system.change_volume_and_rescale_particles(
+                    mesh_a[i] * params["mesh"][i], "xyz"[i])
         np.testing.assert_allclose(np.copy(p3m.a), mesh_a, atol=1e-12)
         np.testing.assert_allclose(p3m.r_cut, r_cut * 2., atol=1e-12)
         np.testing.assert_allclose(p3m.r_cut_iL, r_cut_iL, atol=1e-12)
diff --git a/testsuite/python/particle.py b/testsuite/python/particle.py
index 75e4f630eab..cc2e40f11a8 100644
--- a/testsuite/python/particle.py
+++ b/testsuite/python/particle.py
@@ -198,18 +198,21 @@ def test_gamma_rot_single(self):
 
     @utx.skipIfMissingFeatures(["VIRTUAL_SITES_RELATIVE"])
     def test_vs_relative(self):
+        Propagation = espressomd.propagation.Propagation
         self.system.part.add(id=0, pos=(0, 0, 0))
         p1 = self.system.part.add(id=1, pos=(0, 0, 0))
         self.assertFalse(p1.is_virtual())
         p1.vs_relative = (0, 5.0, (0.5, -0.5, -0.5, -0.5))
         p1.vs_quat = [1, 2, 3, 4]
+        p1.propagation = (Propagation.TRANS_VS_RELATIVE |
+                          Propagation.ROT_VS_RELATIVE)
         self.assertTrue(p1.is_virtual())
         np.testing.assert_array_equal(p1.vs_quat, [1, 2, 3, 4])
         res = p1.vs_relative
         self.assertEqual(res[0], 0, f"vs_relative: {res}")
         self.assertEqual(res[1], 5.0, f"vs_relative: {res}")
         np.testing.assert_allclose(
-            res[2], np.array((0.5, -0.5, -0.5, -0.5)),
+            np.copy(res[2]), np.array([0.5, -0.5, -0.5, -0.5]),
             err_msg=f"vs_relative: {res}", atol=self.tol)
         # check exceptions
         error_msg = r"attribute 'vs_relative' of 'ParticleHandle' must take the form \[id, distance, quaternion\]"
@@ -543,7 +546,7 @@ def test_particle_slice(self):
         np.testing.assert_equal(system.part.by_ids(range(3, 6)).id,
                                 [i for i in sorted(ids) if i >= 3 and i < 6])
         np.testing.assert_equal(system.part.by_ids(range(6, 3, -1)).id,
-                                [i for i in sorted(ids, key=lambda i:-i) if i > 3 and i <= 6])
+                                [i for i in sorted(ids, key=lambda i: -i) if i > 3 and i <= 6])
 
         # Setting particle properties on a slice
         system.part.by_ids(range(9, 10)).pos = (0, 0, 0)
diff --git a/testsuite/python/propagation_langevin.py b/testsuite/python/propagation_langevin.py
index 0834265a98f..83fc17e111c 100644
--- a/testsuite/python/propagation_langevin.py
+++ b/testsuite/python/propagation_langevin.py
@@ -233,6 +233,7 @@ def test_03__friction_rot(self):
 
     @utx.skipIfMissingFeatures("VIRTUAL_SITES_RELATIVE")
     def test_07__virtual(self):
+        Propagation = espressomd.propagation.Propagation
         system = self.system
         system.time_step = dt = 0.03
 
@@ -240,6 +241,8 @@ def test_07__virtual(self):
         virtual = system.part.add(pos=[0, 0, 0], v=[1, 0, 0])
         physical = system.part.add(pos=[0, 0, 0], v=v0)
         virtual.vs_relative = (physical.id, 0.01, (1., 0., 0., 0.))
+        virtual.propagation = (Propagation.TRANS_VS_RELATIVE |
+                               Propagation.ROT_VS_RELATIVE)
 
         system.thermostat.set_langevin(
             kT=0, gamma=1, gamma_rotation=1., seed=41)
diff --git a/testsuite/python/propagation_lb.py b/testsuite/python/propagation_lb.py
index 91582099a2d..a355ebf389b 100644
--- a/testsuite/python/propagation_lb.py
+++ b/testsuite/python/propagation_lb.py
@@ -255,8 +255,8 @@ def make_particle(propagation):
         make_particle(Propagation.TRANS_LB_MOMENTUM_EXCHANGE |
                       Propagation.ROT_LANGEVIN)
         system.time = 0.
-        for i in range(10):
-            system.integrator.run(2**i)
+        for power in range(self.lb_geom_progression):
+            system.integrator.run(2**power)
             for p, r0 in zip(system.part.all(), positions):
                 pos = np.copy(p.pos)
                 vel = np.copy(p.v)
@@ -269,20 +269,32 @@ def make_particle(propagation):
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
 class LBWalberlaThermostatDoublePrecisionCPU(LBThermostatCommon, ut.TestCase):
-
-    """Test for the CPU implementation of the LB."""
-
     lb_class = espressomd.lb.LBFluidWalberla
     lb_params = {"single_precision": False}
+    lb_geom_progression = 10
 
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
 class LBWalberlaThermostatSinglePrecisionCPU(LBThermostatCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": True}
+    lb_geom_progression = 10
 
-    """Test for the CPU implementation of the LB in single-precision."""
 
-    lb_class = espressomd.lb.LBFluidWalberla
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class LBWalberlaThermostatDoublePrecisionGPU(LBThermostatCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
+    lb_params = {"single_precision": False}
+    lb_geom_progression = 9
+
+
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class LBWalberlaThermostatSinglePrecisionGPU(LBThermostatCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
     lb_params = {"single_precision": True}
+    lb_geom_progression = 9
 
 
 if __name__ == "__main__":
diff --git a/testsuite/python/propagation_newton.py b/testsuite/python/propagation_newton.py
index b9a5b563bfc..b8b03635134 100644
--- a/testsuite/python/propagation_newton.py
+++ b/testsuite/python/propagation_newton.py
@@ -94,12 +94,15 @@ def test_newton_laws(self):
 
     @utx.skipIfMissingFeatures("VIRTUAL_SITES_RELATIVE")
     def test_07__virtual(self):
+        Propagation = espressomd.propagation.Propagation
         system = self.system
         system.time_step = 0.01
 
         virtual = system.part.add(pos=[0., 0., 0.], v=[-1., 0., 0.])
         physical = system.part.add(pos=[0., 0., 0.], v=[1., 0., 0.])
         virtual.vs_relative = (physical.id, 0.3, (1., 0., 0., 0.))
+        virtual.propagation = (Propagation.TRANS_VS_RELATIVE |
+                               Propagation.ROT_VS_RELATIVE)
 
         system.integrator.set_vv()
         system.integrator.run(1)
diff --git a/testsuite/python/propagation_npt.py b/testsuite/python/propagation_npt.py
index 0df370dd087..af9abe24269 100644
--- a/testsuite/python/propagation_npt.py
+++ b/testsuite/python/propagation_npt.py
@@ -121,12 +121,15 @@ def calc_trajectory(p, x0):
 
     @utx.skipIfMissingFeatures(["VIRTUAL_SITES_RELATIVE"])
     def test_07__virtual(self):
+        Propagation = espressomd.propagation.Propagation
         system = self.system
         system.time_step = 0.01
 
         virtual = system.part.add(pos=[0, 0, 0], v=[1, 0, 0])
         physical = system.part.add(pos=[0, 0, 0], v=[2, 0, 0])
         virtual.vs_relative = (physical.id, 0.1, (1., 0., 0., 0.))
+        virtual.propagation = (Propagation.TRANS_VS_RELATIVE |
+                               Propagation.ROT_VS_RELATIVE)
 
         system.thermostat.set_npt(kT=0., gamma0=2., gammav=1e-6, seed=42)
         system.integrator.set_isotropic_npt(ext_pressure=0.01, piston=1e6)
diff --git a/testsuite/python/reaction_methods_interface.py b/testsuite/python/reaction_methods_interface.py
index 8f2bfcf4276..ff2b1c03029 100644
--- a/testsuite/python/reaction_methods_interface.py
+++ b/testsuite/python/reaction_methods_interface.py
@@ -262,7 +262,7 @@ def test_exceptions(self):
         for i in [-2, -1, 1, 2, 3]:
             with self.assertRaisesRegex(IndexError, f"No reaction with id {i}"):
                 method.delete_reaction(reaction_id=i)
-            with self.assertRaisesRegex(IndexError, f"No reaction with id {2*i}"):
+            with self.assertRaisesRegex(IndexError, f"No reaction with id {2 * i}"):
                 method.get_acceptance_rate_reaction(reaction_id=2 * i)
         with self.assertRaisesRegex(ValueError, "Only forward reactions can be selected"):
             method.change_reaction_constant(reaction_id=1, gamma=1.)
diff --git a/testsuite/python/regular_decomposition.py b/testsuite/python/regular_decomposition.py
index b05746f5423..d94720f11e9 100644
--- a/testsuite/python/regular_decomposition.py
+++ b/testsuite/python/regular_decomposition.py
@@ -17,8 +17,10 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 import unittest as ut
+import unittest_decorators as utx
 import espressomd
 import numpy as np
+import itertools
 
 np.random.seed(42)
 
@@ -105,6 +107,109 @@ def test_position_rounding(self):
         self.system.part.add(pos=[25, 25, 0])
         self.assertEqual(1, len(self.system.part))
 
+    @utx.skipIfMissingFeatures("LENNARD_JONES")
+    def test_fully_connected_boundary(self):
+        system = self.system
+        system.part.clear()
+        if system.cell_system.node_grid[1] != 1:
+            ng = system.cell_system.node_grid
+            system.cell_system.node_grid = [ng[0], 1, ng[2] * ng[1]]
+        system.periodic = [True] * 3
+        # Check that it's initially disabled
+        self.assertEqual(system.cell_system.get_params()[
+                         "fully_connected_boundary"], None)
+
+        # check setting and getting the parameter
+        system.cell_system.set_regular_decomposition(
+            fully_connected_boundary=dict(direction="y", boundary="z"))
+        self.assertEqual(system.cell_system.get_params()[
+                         "fully_connected_boundary"], dict(direction="y", boundary="z"))
+        # Check that the setting survives cell system re-initialization
+        system.cell_system.min_global_cut = system.box_l / 4.1
+        self.assertEqual(system.cell_system.get_params()[
+                         "fully_connected_boundary"], dict(direction="y", boundary="z"))
+
+        # Check particle visibility.
+        # Place particles on a cubic lattice and use the
+        # non_bonded_loop_trace() to check that all pairs are seen as expected
+        fc_normal = np.array((0, 0, 1))  # z
+        fc_normal_coord = 2  # z
+        fc_dir = np.array((0, 1, 0))  # y
+        N = 10
+        system.non_bonded_inter[0, 0].lennard_jones.set_params(
+            sigma=1, epsilon=1, cutoff=system.box_l[0] / N + 0.01, shift="auto")
+        indices = [np.array((i, j, k)) for i in range(N)
+                   for j in range(N) for k in range(N)]
+
+        def id_for_idx(idx): return (
+            idx[0] % N) * N * N + (idx[1] % N) * N + idx[2] % N
+
+        ids = [id_for_idx(idx) for idx in indices]
+        dx = system.box_l / N
+        positions = [idx * dx for idx in indices]
+        system.part.add(id=ids, pos=positions)
+        particles = {i: system.part.by_id(i) for i in ids}
+
+        def distance(id1, id2):
+            return system.distance(
+                particles[id1], particles[id2])
+        distances = {tuple(i): distance(*i)
+                     for i in itertools.combinations(ids, 2)}
+
+        max_range = np.amax(system.box_l) / N
+        two_cells = 2 * np.amax(system.cell_system.get_state()["cell_size"])
+        two_cells_2d = two_cells * np.sqrt(2)
+        two_cells_3d = two_cells * np.sqrt(3)
+        assert np.all(system.box_l / 2 > two_cells)
+
+        # next neighbors
+        must_find_nn = [i for i, d in distances.items() if d <= max_range]
+
+        # Fully connected neighbors
+        indices_lower_boundary = [
+            idx for idx in indices if idx[fc_normal_coord] == 0]
+        must_find_fc = [tuple(sorted((id_for_idx(idx), id_for_idx(idx + i * fc_dir - fc_normal))))
+                        for idx in indices_lower_boundary for i in range(-N + 1, N)]
+
+        # all neighbors that must be found
+        must_find = set(must_find_nn + must_find_fc)
+
+        def assert_can_find(pair):
+            # are the particles within a range that MAY be found by the
+            # pair loop
+            p1 = particles[pair[0]]
+            p2 = particles[pair[1]]
+            d = system.distance_vec(p1, p2)
+            # if not across periodic boundary: particles must be in cells
+            # sharing at least one corner
+            if np.abs(
+                    p1.pos - p2.pos)[fc_normal_coord] < system.box_l[fc_normal_coord] / 2:
+                self.assertLess(np.linalg.norm(d), two_cells_3d)
+            # If across the fully connected boundary,
+            # subtract the distance in the fully connected direction (all
+            # are valid)
+            d_trans = d - d * fc_dir
+            # in the other TWO directions, cells have to share a corner
+            self.assertLess(np.linalg.norm(d_trans), two_cells_2d)
+
+        # Use the cell system trace to get all pairs
+        # as opposed to get_pairs() this does not have a distance check
+        cs_pairs = system.cell_system.non_bonded_loop_trace()
+        found = []
+        for id1, id2, _rest1, _rest2, _rest3, _rest4 in cs_pairs:
+            p = tuple(sorted((id1, id2)))  # Make the pair unique
+            found.append(p)  # to check for double counting
+            if p in must_find:
+                must_find.remove(p)
+            else:
+                assert_can_find(p)  # close enough so that cells share a corner
+
+        # Check for double counting of pairs
+        self.assertEqual(len(found), len(set(found)))
+
+        # check that all required pairs have been seen
+        self.assertEqual(must_find, set([]))
+
 
 if __name__ == "__main__":
     ut.main()
diff --git a/testsuite/python/rescale.py b/testsuite/python/rescale.py
index 5aefc884311..425ec0b6ef1 100644
--- a/testsuite/python/rescale.py
+++ b/testsuite/python/rescale.py
@@ -81,12 +81,20 @@ def test_z(self):
         self.dir_test(2)
 
     def test_exceptions(self):
+        box_l = np.copy(self.system.box_l)
         with self.assertRaisesRegex(ValueError, "Parameter 'd_new' must be > 0"):
             self.system.change_volume_and_rescale_particles(d_new=0.)
         with self.assertRaisesRegex(ValueError, "Parameter 'd_new' must be > 0"):
             self.system.change_volume_and_rescale_particles(d_new=-1.)
         with self.assertRaisesRegex(ValueError, "Usage: change_volume_and_rescale_particles"):
             self.system.change_volume_and_rescale_particles(d_new=1., dir=5)
+        with self.assertRaisesRegex(RuntimeError, "Cannot reset the box length when particles are present"):
+            self.system.box_l = 0.5 * box_l
+        np.testing.assert_allclose(
+            np.copy(self.system.box_l), box_l, atol=1e-7)
+        self.system.change_volume_and_rescale_particles(0.5 * box_l[0], "xyz")
+        np.testing.assert_allclose(
+            np.copy(self.system.box_l), 0.5 * box_l, atol=1e-7)
 
 
 if __name__ == "__main__":
diff --git a/testsuite/python/rotate_system.py b/testsuite/python/rotate_system.py
index ded7eda4e18..eb2fd89bc66 100644
--- a/testsuite/python/rotate_system.py
+++ b/testsuite/python/rotate_system.py
@@ -24,6 +24,7 @@
 import unittest as ut
 import numpy as np
 import espressomd
+import espressomd.propagation
 
 
 class RotateSystemTest(ut.TestCase):
@@ -71,9 +72,12 @@ def test_no_mass(self):
         # Check that virtual sites do not influence the center of mass
         # calculation
         if espressomd.has_features("VIRTUAL_SITES_RELATIVE"):
+            Propagation = espressomd.propagation.Propagation
             vs_dist = 0.01
             p2 = system.part.add(pos=p1.pos)
             p2.vs_relative = (p1.id, vs_dist, (1., 0., 0., 0.))
+            p2.propagation = (Propagation.TRANS_VS_RELATIVE |
+                              Propagation.ROT_VS_RELATIVE)
             system.rotate_system(phi=pi / 2., theta=pi / 2., alpha=-pi / 2.)
             np.testing.assert_allclose(np.copy(p0.pos), [6, 4, 4])
             np.testing.assert_allclose(np.copy(p1.pos), [4, 6, 6])
diff --git a/testsuite/python/save_checkpoint.py b/testsuite/python/save_checkpoint.py
index 04b752e4434..6e13e2a152c 100644
--- a/testsuite/python/save_checkpoint.py
+++ b/testsuite/python/save_checkpoint.py
@@ -20,8 +20,10 @@
 import unittest as ut
 import unittest_generator as utg
 import numpy as np
+import contextlib
 import pathlib
 import tempfile
+import sys
 
 import espressomd
 import espressomd.checkpointing
@@ -41,6 +43,9 @@
 import espressomd.bond_breakage
 import espressomd.reaction_methods
 
+with contextlib.suppress(ImportError):
+    import espressomd.plugins.ase
+
 config = utg.TestGenerator()
 modes = config.get_modes()
 
@@ -66,16 +71,21 @@
     filepath.unlink(missing_ok=True)
 
 # Lees-Edwards boundary conditions
-if 'INT.NPT' not in modes:
+if 'INT.NPT' not in modes and 'LB.GPU' not in modes:
     protocol = espressomd.lees_edwards.LinearShear(
         initial_pos_offset=0.1, time_0=0.2, shear_velocity=1.2)
     system.lees_edwards.set_boundary_conditions(
         shear_direction="x", shear_plane_normal="y", protocol=protocol)
 
+has_ase = "ASE" in modes
+
 lbf_class = None
 lb_lattice = None
 if espressomd.has_features('WALBERLA') and 'LB.WALBERLA' in modes:
-    lbf_class = espressomd.lb.LBFluidWalberla
+    if 'LB.GPU' in modes and espressomd.gpu_available():
+        lbf_class = espressomd.lb.LBFluidWalberlaGPU
+    elif 'LB.CPU' in modes:
+        lbf_class = espressomd.lb.LBFluidWalberla
     lb_lattice = espressomd.lb.LatticeWalberla(agrid=2.0, n_ghost_layers=1)
 if lbf_class:
     lbf_cpt_mode = 0 if 'LB.ASCII' in modes else 1
@@ -158,6 +168,11 @@
         system.electrostatics.solver = p3m
         p3m.charge_neutrality_tolerance = 5e-12
 
+if "ase" in sys.modules:
+    system.ase = espressomd.plugins.ase.ASEInterface(
+        type_mapping={0: "H", 1: "O", 10: "Cl"},
+    )
+
 # accumulators
 obs = espressomd.observables.ParticlePositions(ids=[0, 1])
 acc_mean_variance = espressomd.accumulators.MeanVarianceCalculator(obs=obs)
@@ -237,8 +252,8 @@
             approximation_method='ft', viscosity=0.5, radii={0: 1.5},
             pair_mobility=False, self_mobility=True)
 
-if espressomd.has_features(['VIRTUAL_SITES', 'VIRTUAL_SITES_RELATIVE']):
-    p2.vs_auto_relate_to(p1)
+if espressomd.has_features(['VIRTUAL_SITES_RELATIVE']) and not has_ase:
+    p2.vs_auto_relate_to(p1, couple_to_lb=lbf_class is not None)
 
 # non-bonded interactions
 if espressomd.has_features(['LENNARD_JONES']) and 'LJ' in modes:
@@ -385,20 +400,21 @@
     vtk_suffix = config.test_name
     vtk_root = pathlib.Path("vtk_out")
     # create LB VTK callbacks
-    lb_vtk_auto_id = f"auto_lb_{vtk_suffix}"
-    lb_vtk_manual_id = f"manual_lb_{vtk_suffix}"
-    config.recursive_unlink(vtk_root / lb_vtk_auto_id)
-    config.recursive_unlink(vtk_root / lb_vtk_manual_id)
-    lb_vtk_auto = espressomd.lb.VTKOutput(
-        identifier=lb_vtk_auto_id, delta_N=1,
-        observables=('density', 'velocity_vector'), base_folder=str(vtk_root))
-    lbf.add_vtk_writer(vtk=lb_vtk_auto)
-    lb_vtk_auto.disable()
-    lb_vtk_manual = espressomd.lb.VTKOutput(
-        identifier=lb_vtk_manual_id, delta_N=0,
-        observables=('density',), base_folder=str(vtk_root))
-    lbf.add_vtk_writer(vtk=lb_vtk_manual)
-    lb_vtk_manual.write()
+    if 'LB.GPU' not in modes:  # TODO WALBERLA
+        lb_vtk_auto_id = f"auto_lb_{vtk_suffix}"
+        lb_vtk_manual_id = f"manual_lb_{vtk_suffix}"
+        config.recursive_unlink(vtk_root / lb_vtk_auto_id)
+        config.recursive_unlink(vtk_root / lb_vtk_manual_id)
+        lb_vtk_auto = espressomd.lb.VTKOutput(
+            identifier=lb_vtk_auto_id, delta_N=1,
+            observables=('density', 'velocity_vector'), base_folder=str(vtk_root))
+        lbf.add_vtk_writer(vtk=lb_vtk_auto)
+        lb_vtk_auto.disable()
+        lb_vtk_manual = espressomd.lb.VTKOutput(
+            identifier=lb_vtk_manual_id, delta_N=0,
+            observables=('density',), base_folder=str(vtk_root))
+        lbf.add_vtk_writer(vtk=lb_vtk_manual)
+        lb_vtk_manual.write()
     # create EK VTK callbacks
     ek_vtk_auto_id = f"auto_ek_{vtk_suffix}"
     ek_vtk_manual_id = f"manual_ek_{vtk_suffix}"
@@ -447,6 +463,7 @@
 if espressomd.has_features(["ENGINE"]):
     p3.swimming = {"f_swim": 0.03}
 if espressomd.has_features(["ENGINE", "VIRTUAL_SITES_RELATIVE"]) and lbf_class:
+    assert not has_ase
     p4.swimming = {"v_swim": 0.02, "is_engine_force_on_fluid": True}
 if espressomd.has_features('LB_ELECTROHYDRODYNAMICS') and lbf_class:
     p8.mu_E = [-0.1, 0.2, -0.3]
diff --git a/testsuite/python/scafacos_dipoles_1d_2d.py b/testsuite/python/scafacos_dipoles_1d_2d.py
index e178fee5ee1..90c1cc2bcb5 100644
--- a/testsuite/python/scafacos_dipoles_1d_2d.py
+++ b/testsuite/python/scafacos_dipoles_1d_2d.py
@@ -54,9 +54,9 @@ def test_scafacos(self):
         particle_radius = 0.5
 
         box_l = np.cbrt(4 * n_particle * np.pi / (3 * rho)) * particle_radius
-        system.box_l = 3 * [box_l]
 
         for dim in (2, 1):
+            system.box_l = 3 * [box_l]
             with self.subTest(f"{dim} dimensions"):
                 # Read reference data
                 if dim == 2:
@@ -92,7 +92,9 @@ def test_scafacos(self):
                             "p2nfft_epsB": "0.05"})
                     # change box geometry in x,y direction to ensure that
                     # scafacos survives it
-                    system.box_l = np.array([1., 1., 1.3]) * box_l
+                    system.change_volume_and_rescale_particles(
+                        1.3 * box_l, "z")
+                    system.part.all().pos = data[:, 1:4]
                 else:
                     # 1d periodic in x
                     scafacos = espressomd.magnetostatics.Scafacos(
@@ -112,7 +114,7 @@ def test_scafacos(self):
                             "pnfft_m": "8",
                             "pnfft_diff_ik": "1",
                             "p2nfft_epsB": "0.125"})
-                    system.box_l = np.array([1., 1., 1.]) * box_l
+                    system.change_volume_and_rescale_particles(box_l, "z")
 
                 system.magnetostatics.solver = scafacos
                 system.integrator.run(0)
diff --git a/testsuite/python/scafacos_interface.py b/testsuite/python/scafacos_interface.py
index 7e9063400be..5c0776a7d00 100644
--- a/testsuite/python/scafacos_interface.py
+++ b/testsuite/python/scafacos_interface.py
@@ -119,7 +119,7 @@ def test_actor_coulomb(self):
                           'p3m_grid': 32, 'p3m_alpha': 2.799269})
 
         # check MD cell reset event
-        system.box_l = system.box_l
+        system.change_volume_and_rescale_particles(system.box_l[0], "x")
         system.periodicity = system.periodicity
 
         # force data array update, no-op since there are no particles
@@ -256,7 +256,7 @@ def test_actor_dipoles(self):
                          {k: type(v) for k, v in method_params_ref.items()})
 
         # check MD cell reset event
-        system.box_l = system.box_l
+        system.change_volume_and_rescale_particles(system.box_l[0], "x")
         system.periodicity = system.periodicity
 
         # force data array update, no-op since there are no particles
@@ -340,7 +340,7 @@ def fcs_data(self):
         ref_torques = np.copy(system.part.all().torque_lab)
 
         # check MD cell reset has no impact
-        system.box_l = system.box_l
+        system.change_volume_and_rescale_particles(system.box_l[0], "x")
         system.periodicity = system.periodicity
         system.cell_system.node_grid = system.cell_system.node_grid
         system.integrator.run(0, recalc_forces=True)
diff --git a/testsuite/python/sigint.py b/testsuite/python/sigint.py
index c719bdf4f1f..a8d9b1bbb3b 100644
--- a/testsuite/python/sigint.py
+++ b/testsuite/python/sigint.py
@@ -41,7 +41,9 @@ def check_signal_handling(self, process, sig):
         # capture stderr and return code (negative of signum)
         stdout, stderr = process.communicate(input=None, timeout=16.)
         assert stdout is None
-        traceback = stderr.decode()
+        traceback = (
+            stderr.decode()
+            .replace("\n[WARNING] yaksa: 1 leaked handle pool objects", ""))
         return_code = process.poll()
         signum = -return_code
         self.assertEqual(signum, sig.value)
diff --git a/testsuite/python/sigint_child.py b/testsuite/python/sigint_child.py
index 27a79e189a5..8e94847dd2c 100644
--- a/testsuite/python/sigint_child.py
+++ b/testsuite/python/sigint_child.py
@@ -20,6 +20,8 @@
 import numpy as np
 import espressomd
 
+np.random.seed(42)
+
 system = espressomd.System(box_l=[100, 100, 100])
 system.time_step = 0.01
 system.cell_system.skin = 0.1
diff --git a/testsuite/python/test_checkpoint.py b/testsuite/python/test_checkpoint.py
index 550a8c33a7c..9e58c6df973 100644
--- a/testsuite/python/test_checkpoint.py
+++ b/testsuite/python/test_checkpoint.py
@@ -41,6 +41,10 @@
 with contextlib.suppress(ImportError):
     import espressomd.io.vtk
 
+with contextlib.suppress(ImportError):
+    import ase
+    import espressomd.plugins.ase
+
 with contextlib.suppress(ImportError):
     import h5py  # h5py has to be imported *after* espressomd (MPI)
 
@@ -53,6 +57,7 @@
 has_thermalized_bonds = 'THERM.LB' in modes or 'THERM.LANGEVIN' in modes
 has_drude = (espressomd.has_features(['ELECTROSTATICS' and 'MASS', 'ROTATION'])
              and has_thermalized_bonds)
+has_ase = 'ASE' in modes
 
 
 class CheckpointTest(ut.TestCase):
@@ -103,7 +108,7 @@ def test_lb_fluid(self):
 
         # load the valid LB checkpoint file
         lbf.load_checkpoint(cpt_path.format(""), cpt_mode)
-        precision = 8 if "LB.WALBERLA" in modes else 5
+        precision = 8 if not lbf.single_precision else 5
         m = np.pi / 12
         nx = lbf.shape[0]
         ny = lbf.shape[1]
@@ -133,7 +138,10 @@ def test_lb_fluid(self):
             np.testing.assert_allclose(np.copy(state[key]), reference[key],
                                        atol=1E-7, err_msg=f"{key} differs")
         self.assertTrue(lbf.is_active)
-        self.assertFalse(lbf.single_precision)
+        if "LB.CPU" in modes:
+            self.assertFalse(lbf.single_precision)
+        elif "LB.GPU" in modes:
+            self.assertTrue(lbf.single_precision)
 
         # check boundary objects
         slip_velocity1 = np.array([1e-4, 1e-4, 0.])
@@ -150,8 +158,6 @@ def test_lb_fluid(self):
             np.testing.assert_allclose(np.copy(node.velocity), slip_velocity1)
         for node in lbf[-1, :, :]:
             np.testing.assert_allclose(np.copy(node.velocity), slip_velocity2)
-        for node in lbf[2, :, :]:
-            np.testing.assert_allclose(np.copy(node.velocity), 0.)
         # remove boundaries
         lbf.clear_boundaries()
         np.testing.assert_equal(
@@ -262,6 +268,7 @@ def generator(value, shape):
             np.copy(ek_species[:, :, :].is_boundary), False)
 
     @utx.skipIfMissingFeatures(["WALBERLA"])
+    @ut.skipIf('LB.GPU' in modes, 'VTK not implemented for LB GPU')
     @ut.skipIf(not has_lb_mode, "Skipping test due to missing LB feature.")
     def test_lb_vtk(self):
         lbf = system.lb
@@ -305,6 +312,8 @@ def test_lb_vtk(self):
             lb_density = vtk_data["density"]
             self.assertAlmostEqual(
                 lb_density[0, 0, 0], new_density, delta=1e-5)
+        (vtk_root / filename.format(1)).unlink(missing_ok=True)
+        (vtk_root / filename.format(2)).unlink(missing_ok=True)
 
     @utx.skipIfMissingFeatures(["WALBERLA"])
     @ut.skipIf(not has_lb_mode, "Skipping test due to missing EK feature.")
@@ -348,6 +357,8 @@ def test_ek_vtk(self):
             ek_density = vtk_data["density"]
             self.assertAlmostEqual(
                 ek_density[0, 0, 0], new_density, delta=1e-5)
+        (vtk_root / filename.format(1)).unlink(missing_ok=True)
+        (vtk_root / filename.format(2)).unlink(missing_ok=True)
 
     def test_system_variables(self):
         cell_system_params = system.cell_system.get_state()
@@ -362,6 +373,7 @@ def test_system_variables(self):
         np.testing.assert_array_equal(
             np.copy(system.periodicity), self.ref_periodicity)
 
+    @ut.skipIf('LB.GPU' in modes, 'Lees-Edwards not implemented for LB GPU')
     @ut.skipIf('INT.NPT' in modes, 'Lees-Edwards not compatible with NPT')
     def test_lees_edwards(self):
         lebc = system.lees_edwards
@@ -443,13 +455,13 @@ def test_particle_properties(self):
                 p3.swimming,
                 {"f_swim": 0.03, "is_engine_force_on_fluid": False})
             if espressomd.has_features(
-                    'VIRTUAL_SITES_RELATIVE') and has_lb_mode:
+                    'VIRTUAL_SITES_RELATIVE') and has_lb_mode and not has_ase:
                 self.assertEqual(
                     p4.swimming,
                     {"f_swim": 0., "is_engine_force_on_fluid": True})
         if espressomd.has_features('LB_ELECTROHYDRODYNAMICS') and has_lb_mode:
             np.testing.assert_allclose(np.copy(p8.mu_E), [-0.1, 0.2, -0.3])
-        if espressomd.has_features('VIRTUAL_SITES_RELATIVE'):
+        if espressomd.has_features('VIRTUAL_SITES_RELATIVE') and not has_ase:
             from scipy.spatial.transform import Rotation as R
             q_ind = ([1, 2, 3, 0],)  # convert from scalar-first to scalar-last
             vs_id, vs_dist, vs_quat = p2.vs_relative
@@ -734,13 +746,16 @@ def test_drude_helpers(self):
         self.assertEqual(dh.drude_id_list, [5])
 
     @utx.skipIfMissingFeatures(['VIRTUAL_SITES', 'VIRTUAL_SITES_RELATIVE'])
+    @ut.skipIf("ASE" in modes, "virtual sites not allowed by ASE")
     def test_virtual_sites(self):
         Propagation = espressomd.propagation.Propagation
         p_real = system.part.by_id(0)
         p_virt = system.part.by_id(1)
+        prop_flag = Propagation.TRANS_VS_RELATIVE | Propagation.ROT_VS_RELATIVE
+        if espressomd.has_features("WALBERLA") and system.lb is not None:
+            prop_flag |= Propagation.TRANS_LB_MOMENTUM_EXCHANGE
         self.assertEqual(p_real.propagation, Propagation.SYSTEM_DEFAULT)
-        self.assertEqual(p_virt.propagation, Propagation.TRANS_VS_RELATIVE |
-                         Propagation.ROT_VS_RELATIVE)
+        self.assertEqual(p_virt.propagation, prop_flag)
         self.assertEqual(p_real.vs_relative[0], -1)
         self.assertEqual(p_virt.vs_relative[0], p_real.id)
         self.assertEqual(p_real.vs_relative[1], 0.)
@@ -1018,6 +1033,22 @@ def test_union(self):
         p2.remove()
         system.non_bonded_inter[2, 6].reset()
 
+    @ut.skipIf("ase" not in sys.modules, "missing module 'ase'")
+    @ut.skipIf("ASE" not in modes, "missing combination")
+    def test_ase_plugin(self):
+        atoms = system.ase.get()
+        self.assertIsNotNone(atoms)
+        self.assertIsInstance(atoms, ase.Atoms)
+        self.assertEqual(set(atoms.get_chemical_symbols()), {"H", "O"})
+        np.testing.assert_equal(atoms.pbc, np.copy(system.periodicity))
+        np.testing.assert_allclose(atoms.cell, np.diag(system.box_l))
+        np.testing.assert_allclose(
+            atoms.get_positions(),
+            np.copy(system.part.select(lambda p: p.type in [0, 1]).pos))
+        np.testing.assert_allclose(
+            atoms.get_forces(),
+            np.copy(system.part.select(lambda p: p.type in [0, 1]).f))
+
 
 if __name__ == '__main__':
     config.bind_test_class(CheckpointTest)
diff --git a/testsuite/python/unittest_decorators.py b/testsuite/python/unittest_decorators.py
index a9a0e5200c4..32b988b80cf 100644
--- a/testsuite/python/unittest_decorators.py
+++ b/testsuite/python/unittest_decorators.py
@@ -19,7 +19,7 @@
 
 import sys
 import importlib
-import pkg_resources
+import packaging.specifiers
 import unittest
 
 import espressomd
@@ -74,7 +74,7 @@ def skipIfUnmetModuleVersionRequirement(module, version_requirement):
         _module = importlib.import_module(module)
     except ImportError:
         return skipIfMissingModules(module)
-    if not pkg_resources.packaging.specifiers.SpecifierSet(
+    if not packaging.specifiers.SpecifierSet(
             version_requirement).contains(_module.__version__):
         return unittest.skip(
             "Skipping test: version requirement not met for module {}".format(module))
diff --git a/testsuite/python/unittest_generator.py b/testsuite/python/unittest_generator.py
index e2925b7d87e..50972b01174 100644
--- a/testsuite/python/unittest_generator.py
+++ b/testsuite/python/unittest_generator.py
@@ -108,7 +108,7 @@ def __init__(self):
                 self.test_feat = arg.split('__', 1)[1]
                 self.test_idx = arg.split('_', 1)[1].lstrip('_')
                 break
-        err_msg = f"please provide a test name as argument, like '{prefix}lb_cpu__p3m_cpu' (got {sys.argv})"
+        err_msg = f"please provide a test name as argument, like '{prefix}lb_cpu__p3m_cpu' (got {sys.argv})"  # nopep8
         assert self.test_name is not None, err_msg
 
     def bind_test_class(self, base_class):
diff --git a/testsuite/python/virtual_sites_tracers_common.py b/testsuite/python/virtual_sites_tracers_common.py
index df44fa77fe9..85edaa57ae4 100644
--- a/testsuite/python/virtual_sites_tracers_common.py
+++ b/testsuite/python/virtual_sites_tracers_common.py
@@ -29,8 +29,9 @@
 
 
 class VirtualSitesTracersCommon:
-    box_height = 10.
-    box_lw = 8.
+    agrid = 0.5
+    box_height = 10. * agrid
+    box_lw = 8. * agrid
     system = espressomd.System(box_l=(box_lw, box_lw, box_height))
     system.time_step = 0.08
     system.cell_system.skin = 0.1
@@ -46,7 +47,7 @@ def tearDown(self):
     def set_lb(self, ext_force_density=(0, 0, 0), dir_walls=2):
         self.system.lb = None
         self.lbf = self.LBClass(
-            kT=0.0, agrid=1., density=1., kinematic_viscosity=1.8,
+            kT=0.0, agrid=self.agrid, density=1., kinematic_viscosity=1.8,
             tau=self.system.time_step, ext_force_density=ext_force_density)
         self.system.lb = self.lbf
         self.system.thermostat.set_lb(LB_fluid=self.lbf, gamma=1.)
@@ -54,11 +55,12 @@ def set_lb(self, ext_force_density=(0, 0, 0), dir_walls=2):
         # Setup boundaries
         normal = [0, 0, 0]
         normal[dir_walls] = 1
-        wall_shape = espressomd.shapes.Wall(normal=normal, dist=0.5)
+        wall_shape = espressomd.shapes.Wall(
+            normal=normal, dist=0.5 * self.agrid)
         self.lbf.add_boundary_from_shape(wall_shape)
         normal[dir_walls] = -1
         wall_shape = espressomd.shapes.Wall(
-            normal=normal, dist=-(self.system.box_l[dir_walls] - 0.5))
+            normal=normal, dist=-(self.system.box_l[dir_walls] - 0.5 * self.agrid))
         self.lbf.add_boundary_from_shape(wall_shape)
 
         espressomd.utils.handle_errors("setup")
@@ -72,7 +74,7 @@ def test_ab_single_step(self):
         self.lbf[:, :, :].velocity = np.random.random((*self.lbf.shape, 3))
         force = [1, -2, 3]
         # Test several particle positions
-        for pos in [[3, 2, 1], [0, 0, 0],
+        for pos in [[3 * self.agrid, 2 * self.agrid, 1 * self.agrid], [0, 0, 0],
                     self.system.box_l * 0.49,
                     self.system.box_l,
                     self.system.box_l * 0.99]:
@@ -133,8 +135,8 @@ def test_advection(self):
             system.integrator.run(400)
 
             # Add tracer in the fluid domain
-            pos_initial = [3.5, 3.5, 3.5]
-            pos_initial[direction] = 0.5
+            pos_initial = 3 * [3.5 * self.agrid]
+            pos_initial[direction] = 0.5 * self.agrid
             p = system.part.add(
                 pos=pos_initial,
                 propagation=espressomd.propagation.Propagation.TRANS_LB_TRACER)
@@ -150,6 +152,7 @@ def test_advection(self):
                 self.assertAlmostEqual(tracer_dist / ref_dist, 1., delta=0.01)
 
             system.lb = None
+            system.part.clear()
 
     def test_zz_exceptions_without_lb(self):
         """
diff --git a/testsuite/scripts/CMakeLists.txt b/testsuite/scripts/CMakeLists.txt
index 929d5ef890a..f0e9891a81f 100644
--- a/testsuite/scripts/CMakeLists.txt
+++ b/testsuite/scripts/CMakeLists.txt
@@ -27,7 +27,7 @@ configure_file(test_importlib_wrapper.py
 macro(PYTHON_SCRIPTS_TEST)
   cmake_parse_arguments(TEST "" "FILE;SUFFIX;TYPE" "DEPENDENCIES;LABELS"
                         ${ARGN})
-  get_filename_component(TEST_NAME ${TEST_FILE} NAME_WE)
+  cmake_path(GET TEST_FILE STEM TEST_NAME)
   if(TEST_SUFFIX)
     set(TEST_NAME "${TEST_NAME}_with_${TEST_SUFFIX}")
   endif()
diff --git a/testsuite/scripts/samples/CMakeLists.txt b/testsuite/scripts/samples/CMakeLists.txt
index 2fde3a87cbe..6a5c15e1e52 100644
--- a/testsuite/scripts/samples/CMakeLists.txt
+++ b/testsuite/scripts/samples/CMakeLists.txt
@@ -58,7 +58,8 @@ if(HDF5_FOUND)
   sample_test(FILE test_h5md.py)
   sample_test(FILE test_h5md_trajectory.py)
 endif()
-sample_test(FILE test_lbf.py)
+sample_test(FILE test_lbf.py SUFFIX cpu)
+sample_test(FILE test_lbf.py SUFFIX gpu LABELS "gpu")
 sample_test(FILE test_lb_profile.py)
 sample_test(FILE test_lb_planar_couette.py)
 sample_test(FILE test_lb_circular_couette.py)
diff --git a/testsuite/scripts/tutorials/test_electrodes_2.py b/testsuite/scripts/tutorials/test_electrodes_2.py
index 95a02252779..81a35646f66 100644
--- a/testsuite/scripts/tutorials/test_electrodes_2.py
+++ b/testsuite/scripts/tutorials/test_electrodes_2.py
@@ -22,8 +22,7 @@
 import numpy as np
 from scipy import constants
 
-params = {'N_SAMPLES_EQUIL': 25, 'N_SAMPLES_PROD': 5,
-          'N_SAMPLES_EQUIL_CAP': 0, 'N_SAMPLES_CAP': 5,
+params = {'N_SAMPLES_PROD': 15, 'N_SAMPLES_EQUIL_CAP': 5, 'N_SAMPLES_CAP': 5,
           'MIN_PHI': 5, 'MAX_PHI': 5, 'N_PHI': 1}
 
 tutorial, skipIfMissingFeatures = importlib_wrapper.configure_and_import(
@@ -66,7 +65,7 @@ def test_capacitance(self):
             constants.elementary_charge / (constants.Boltzmann * tutorial.TEMPERATURE))
         msg = 'The capacitance at low potentials should be in line with Grahame/DH.'
         np.testing.assert_allclose(
-            grahame, tutorial.sigma_vs_phi[:, 1], atol=.05, err_msg=msg)
+            grahame, tutorial.sigma_vs_phi[:, 1], atol=.08, err_msg=msg)
 
 
 if __name__ == "__main__":
diff --git a/testsuite/scripts/tutorials/test_ferrofluid_1.py b/testsuite/scripts/tutorials/test_ferrofluid_1.py
index f6ce5e78ca4..7da0aa14ef3 100644
--- a/testsuite/scripts/tutorials/test_ferrofluid_1.py
+++ b/testsuite/scripts/tutorials/test_ferrofluid_1.py
@@ -22,7 +22,7 @@
 
 tutorial, skipIfMissingFeatures = importlib_wrapper.configure_and_import(
     "@TUTORIALS_DIR@/ferrofluid/ferrofluid_part1.py",
-    EQUIL_STEPS=200, EQUIL_ROUNDS=10,
+    EQUIL_STEPS=200, EQUIL_ROUNDS=20,
     CI_DP3M_PARAMS={'cao': 3, 'r_cut': 8.34, 'mesh': [8, 8, 8], 'alpha': 0.2115, 'tune': False})
 
 
@@ -33,7 +33,7 @@ class Tutorial(ut.TestCase):
     def test(self):
         self.assertEqual(
             int(np.sum(tutorial.n_clusters)), len(tutorial.cluster_sizes))
-        for i in range(8):
+        for i in range(7):
             self.assertLess(
                 tutorial.size_dist[0][i + 1],
                 tutorial.size_dist[0][i])
diff --git a/testsuite/scripts/tutorials/test_polymers.py b/testsuite/scripts/tutorials/test_polymers.py
index f938bf0e0c6..abfd4193ad6 100644
--- a/testsuite/scripts/tutorials/test_polymers.py
+++ b/testsuite/scripts/tutorials/test_polymers.py
@@ -22,7 +22,7 @@
 if '@TEST_SUFFIX@' == 'rouse':
     params = {}
 elif '@TEST_SUFFIX@' == 'zimm':
-    params = {'LOOPS': 400, 'POLYMER_MODEL': 'Zimm'}
+    params = {'LOOPS': 480, 'POLYMER_MODEL': 'Zimm'}
 
 tutorial, skipIfMissingFeatures = importlib_wrapper.configure_and_import(
     "@TUTORIALS_DIR@/polymers/polymers.py",