From 5919c489bff18b07c40a945696b6abadcb3bf6db Mon Sep 17 00:00:00 2001
From: Liam Keegan <liam@keegan.ch>
Date: Wed, 10 Jan 2024 16:33:25 +0100
Subject: [PATCH] add CUDA distance implementation (#123)

- python interface changes
  - add `from_fasta_to_lower_triangular`
    - this constructs lower triangular matrix file directly from fasta file
    - for now only works on GPU
    - faster & requires less RAM than doing `from_fasta` followed by `dump_lower_triangular`
    - requires 1.5GB RAM per 100k genomes on gpu + 1GB buffer to store partial distances matrix
  - add `use_gpu` option to `from_fasta`
    - if True and include_x is False then the GPU is used to calcuate distances matrix
  - add `cuda_gpu_available()` utility function
- CUDA implementation
  - each block of threads calculates a single element of the distances matrix
  - a kernel is launched running on a grid of these blocks to calculate a subset of the distances matrix
  - I/O is interleaved with computation: the CPU writes the previous kernel results as the next kernel is running
- print basic timing info to cout
- add libfmt library
- migrate to using v3 of catch2
- build wheels using manylinux2014 image with CUDA 11.8 pre-installed from https://github.com/ameli/manylinux-cuda
- add a couple of performance plots
- bump version to 1.0.0
- resolves #111
---
 .github/workflows/ci.yml              |   2 +-
 .github/workflows/wheels.yml          |   6 +-
 .gitmodules                           |   3 +
 CMakeLists.txt                        |  12 +-
 README.md                             |  35 +++-
 ext/CMakeLists.txt                    |   2 +
 ext/Catch2                            |   2 +-
 ext/fmt                               |   1 +
 include/hamming/distance_avx2.hh      |   5 +-
 include/hamming/distance_avx512.hh    |   5 +-
 include/hamming/distance_cuda.hh      |  27 +++
 include/hamming/distance_sse2.hh      |   5 +-
 include/hamming/hamming.hh            | 111 ++++--------
 include/hamming/hamming_impl.hh       | 126 ++++++++------
 include/hamming/hamming_impl_types.hh |   5 +-
 include/hamming/hamming_types.hh      |   5 +-
 include/hamming/hamming_utils.hh      | 112 +++++++++++++
 plots/overview.plt                    |  10 ++
 plots/overview.png                    | Bin 0 -> 6498 bytes
 plots/overview.txt                    |   8 +
 plots/speed.plt                       |  13 ++
 plots/speed.png                       | Bin 0 -> 8301 bytes
 plots/speed.txt                       |  15 ++
 pyproject.toml                        |  11 +-
 python/hammingdist.cc                 |  23 ++-
 python/tests/test_hammingdist.py      |  47 ++++--
 src/CMakeLists.txt                    |  42 ++++-
 src/bench.cc                          |   7 +-
 src/bench.hh                          |  27 ++-
 src/cuda_mem.hh                       |  32 ++++
 src/distance_cuda.cu                  | 233 ++++++++++++++++++++++++++
 src/distance_cuda_bench.cc            |  19 +++
 src/distance_cuda_t.cc                |  62 +++++++
 src/hamming.cc                        |  41 +++--
 src/hamming_bench.cc                  |  47 +++++-
 src/hamming_impl.cc                   | 102 ++++++++++-
 src/hamming_t.cc                      | 230 +++++++++++++++++++------
 src/hamming_utils.cc                  |   1 +
 src/hamming_utils_bench.cc            |  78 +++++++++
 src/tests.cc                          |   9 +
 src/tests.hh                          |  12 +-
 41 files changed, 1269 insertions(+), 264 deletions(-)
 create mode 160000 ext/fmt
 create mode 100644 include/hamming/distance_cuda.hh
 create mode 100644 include/hamming/hamming_utils.hh
 create mode 100644 plots/overview.plt
 create mode 100644 plots/overview.png
 create mode 100644 plots/overview.txt
 create mode 100644 plots/speed.plt
 create mode 100644 plots/speed.png
 create mode 100644 plots/speed.txt
 create mode 100644 src/cuda_mem.hh
 create mode 100644 src/distance_cuda.cu
 create mode 100644 src/distance_cuda_bench.cc
 create mode 100644 src/distance_cuda_t.cc
 create mode 100644 src/hamming_utils.cc
 create mode 100644 src/hamming_utils_bench.cc

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 54c4dd0..68049b5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -65,4 +65,4 @@ jobs:
       - name: run tests
         shell: bash
         working-directory: ${{runner.workspace}}/build
-        run: ctest
+        run: ctest -j2 --rerun-failed --output-on-failure
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index 58eae2b..7e3ee38 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -27,9 +27,11 @@ jobs:
         with:
           submodules: "recursive"
       - uses: pypa/cibuildwheel@v2.16
+        env:
+          CIBW_MANYLINUX_X86_64_IMAGE: sameli/manylinux2014_x86_64_cuda_11.8
       - uses: actions/upload-artifact@v4
         with:
-          name: wheels-${{ matrix.os }}
+          name: artifacts-${{ matrix.os }}
           path: ./wheelhouse/*.whl
 
   upload_pypi:
@@ -42,7 +44,7 @@ jobs:
     steps:
       - uses: actions/download-artifact@v4
         with:
-          pattern: wheels-*
+          pattern: artifacts-*
           merge-multiple: true
           path: dist
 
diff --git a/.gitmodules b/.gitmodules
index f88f44d..0e6d3ab 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -13,3 +13,6 @@
 [submodule "Catch2"]
 	path = ext/Catch2
 	url = https://github.com/catchorg/Catch2.git
+[submodule "ext/fmt"]
+	path = ext/fmt
+	url = https://github.com/fmtlib/fmt.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8919e23..69984d6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,8 +1,8 @@
-cmake_minimum_required(VERSION 3.11..3.27)
+cmake_minimum_required(VERSION 3.23..3.27)
 
 project(
   hammingdist
-  VERSION 0.21.0
+  VERSION 1.0.0
   LANGUAGES CXX)
 
 include(CTest)
@@ -39,6 +39,14 @@ set(HAMMING_WITH_NEON
     no
     CACHE BOOL "Enable NEON optimized code on Arm64 CPUs")
 
+set(HAMMING_WITH_CUDA
+    no
+    CACHE BOOL "Build with CUDA support (nvidia gpu)")
+
+if(HAMMING_WITH_CUDA)
+  enable_language(CUDA)
+endif()
+
 # Add git submodules
 add_subdirectory(ext)
 
diff --git a/README.md b/README.md
index 4719e67..061a51e 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-A small C++ tool to calculate pairwise distances between gene sequences given in fasta format.
+A small and fast C++ tool to calculate pairwise distances between gene sequences given in fasta format.
 
 [![DOI](https://zenodo.org/badge/308676358.svg)](https://zenodo.org/badge/latestdoi/308676358)
 [![pypi releases](https://img.shields.io/pypi/v/hammingdist.svg)](https://pypi.org/project/hammingdist)
@@ -117,9 +117,34 @@ distance = hammingdist.distance("ACGTX", "AAGTX", include_x=True)
 
 ## OpenMP on linux
 
-The latest versions of hammingdist on linux are now built with OpenMP (multithreading) support.
-If this causes any issues, you can install a previous version of hammingdist without OpenMP support:
+On linux hammingdist is built with OpenMP (multithreading) support, and will automatically make use of all available CPU threads.
 
-```bash
-pip install hammingdist==0.11.0
+## CUDA on linux
+
+On linux hammingdist is also built with CUDA (Nvidia GPU) support.
+To use the GPU instead of the CPU, set `use_gpu=True` when calling `from_fasta`:
+
+```python
+import hammingdist
+
+data = hammingdist.from_fasta("example.fasta", use_gpu=True)
+```
+
+Additionally, the lower triangular matrix file can now be directly constructed from the fasta file
+using the GPU with the `from_fasta_to_lower_triangular` function.
+This avoids storing the entire distances matrix in memory and interleaves computation on the GPU with disk I/O on the CPU,
+which means it requires less RAM and runs faster.
+
+```python
+import hammingdist
+
+hammingdist.from_fasta_to_lower_triangular('input_fasta.txt', 'output_lower_triangular.txt', use_gpu=True)
 ```
+
+![overview](plots/speed.png)
+
+## Performance history
+
+A rough measure of the impact of the different performance improvements in hammingdist:
+
+![overview](plots/overview.png)
diff --git a/ext/CMakeLists.txt b/ext/CMakeLists.txt
index 6a00470..897253e 100644
--- a/ext/CMakeLists.txt
+++ b/ext/CMakeLists.txt
@@ -1,3 +1,5 @@
+add_subdirectory(fmt)
+
 if(HAMMING_BUILD_PYTHON)
   add_subdirectory(pybind11)
 endif()
diff --git a/ext/Catch2 b/ext/Catch2
index 182c910..f981c9c 160000
--- a/ext/Catch2
+++ b/ext/Catch2
@@ -1 +1 @@
-Subproject commit 182c910b4b63ff587a3440e08f84f70497e49a81
+Subproject commit f981c9cbcac07a2690e5a86767eba490b5465463
diff --git a/ext/fmt b/ext/fmt
new file mode 160000
index 0000000..f5e5435
--- /dev/null
+++ b/ext/fmt
@@ -0,0 +1 @@
+Subproject commit f5e54359df4c26b6230fc61d38aa294581393084
diff --git a/include/hamming/distance_avx2.hh b/include/hamming/distance_avx2.hh
index 8fc6cdc..7823a52 100644
--- a/include/hamming/distance_avx2.hh
+++ b/include/hamming/distance_avx2.hh
@@ -1,5 +1,4 @@
-#ifndef _HAMMING_DISTANCE_AVX2_HH
-#define _HAMMING_DISTANCE_AVX2_HH
+#pragma once
 
 #include <cstdint>
 #include <vector>
@@ -12,5 +11,3 @@ int distance_avx2(const std::vector<GeneBlock> &a,
                   const std::vector<GeneBlock> &b);
 
 }
-
-#endif
diff --git a/include/hamming/distance_avx512.hh b/include/hamming/distance_avx512.hh
index 919fd31..a21d414 100644
--- a/include/hamming/distance_avx512.hh
+++ b/include/hamming/distance_avx512.hh
@@ -1,5 +1,4 @@
-#ifndef _HAMMING_DISTANCE_AVX512_HH
-#define _HAMMING_DISTANCE_AVX512_HH
+#pragma once
 
 #include <cstdint>
 #include <vector>
@@ -12,5 +11,3 @@ int distance_avx512(const std::vector<GeneBlock> &a,
                     const std::vector<GeneBlock> &b);
 
 }
-
-#endif
diff --git a/include/hamming/distance_cuda.hh b/include/hamming/distance_cuda.hh
new file mode 100644
index 0000000..c7c7e52
--- /dev/null
+++ b/include/hamming/distance_cuda.hh
@@ -0,0 +1,27 @@
+#pragma once
+
+#include <cstdint>
+#include <iostream>
+#include <vector>
+
+#include "hamming/hamming_impl_types.hh"
+
+namespace hamming {
+
+bool distance_cuda_have_device();
+
+int distance_cuda(const std::vector<GeneBlock> &a,
+                  const std::vector<GeneBlock> &b);
+
+// for now explicit function def for each choice of integer type
+std::vector<uint8_t>
+distances_cuda_8bit(const std::vector<std::vector<GeneBlock>> &data);
+
+std::vector<uint16_t>
+distances_cuda_16bit(const std::vector<std::vector<GeneBlock>> &data);
+
+void distances_cuda_to_lower_triangular(
+    const std::vector<std::vector<GeneBlock>> &data,
+    const std::string &filename);
+
+} // namespace hamming
diff --git a/include/hamming/distance_sse2.hh b/include/hamming/distance_sse2.hh
index 6920226..ac5d2c7 100644
--- a/include/hamming/distance_sse2.hh
+++ b/include/hamming/distance_sse2.hh
@@ -1,5 +1,4 @@
-#ifndef _HAMMING_DISTANCE_SSE2_HH
-#define _HAMMING_DISTANCE_SSE2_HH
+#pragma once
 
 #include <cstdint>
 #include <vector>
@@ -12,5 +11,3 @@ int distance_sse2(const std::vector<GeneBlock> &a,
                   const std::vector<GeneBlock> &b);
 
 }
-
-#endif
diff --git a/include/hamming/hamming.hh b/include/hamming/hamming.hh
index 61af9b7..06f90a0 100644
--- a/include/hamming/hamming.hh
+++ b/include/hamming/hamming.hh
@@ -1,13 +1,12 @@
-#ifndef _HAMMING_HH
-#define _HAMMING_HH
+#pragma once
 
 #include "hamming/hamming_impl.hh"
 #include "hamming/hamming_types.hh"
+#include "hamming/hamming_utils.hh"
 #include <cmath>
 #include <fstream>
 #include <sstream>
 #include <string>
-#include <unordered_map>
 #include <vector>
 
 namespace hamming {
@@ -19,10 +18,11 @@ inline std::size_t uint_sqrt(std::size_t x) {
 template <typename DistIntType> struct DataSet {
   explicit DataSet(std::vector<std::string> &data, bool include_x = false,
                    bool clear_input_data = false,
-                   std::vector<std::size_t> &&indices = {})
+                   std::vector<std::size_t> &&indices = {},
+                   bool use_gpu = false)
       : nsamples(data.size()), sequence_indices(std::move(indices)) {
     validate_data(data);
-    result = distances<DistIntType>(data, include_x, clear_input_data);
+    result = distances<DistIntType>(data, include_x, clear_input_data, use_gpu);
   }
 
   explicit DataSet(const std::string &filename) {
@@ -67,35 +67,7 @@ template <typename DistIntType> struct DataSet {
   }
 
   void dump_lower_triangular(const std::string &filename) {
-    std::ofstream stream(filename);
-#ifdef HAMMING_WITH_OPENMP
-    std::size_t block_size = 200;
-#pragma omp parallel for ordered schedule(static, 1)
-    for (std::size_t i_start = 1; i_start < nsamples; i_start += block_size) {
-      std::stringstream line;
-      std::size_t i_end = i_start + block_size;
-      if (i_end > nsamples) {
-        i_end = nsamples;
-      }
-      for (std::size_t i = i_start; i < i_end; ++i) {
-        std::size_t offset{i * (i - 1) / 2};
-        for (std::size_t j = 0; j + 1 < i; ++j) {
-          line << static_cast<int>(result[offset + j]) << ",";
-        }
-        line << static_cast<int>(result[offset + i - 1]) << "\n";
-      }
-#pragma omp ordered
-      stream << line.str();
-    }
-#else
-    std::size_t k = 0;
-    for (std::size_t i = 1; i < nsamples; ++i) {
-      for (std::size_t j = 0; j + 1 < i; ++j) {
-        stream << static_cast<int>(result[k++]) << ",";
-      }
-      stream << static_cast<int>(result[k++]) << "\n";
-    }
-#endif
+    write_lower_triangular(filename, result);
   }
 
   void dump_sequence_indices(const std::string &filename) {
@@ -127,67 +99,52 @@ template <typename DistIntType> struct DataSet {
   std::vector<std::size_t> sequence_indices{};
 };
 
-DataSet<DefaultDistIntType> from_stringlist(std::vector<std::string> &data);
+DataSet<DefaultDistIntType> from_stringlist(std::vector<std::string> &data,
+                                            bool include_x = false,
+                                            bool use_gpu = false);
 
 DataSet<DefaultDistIntType> from_csv(const std::string &filename);
 
-DataSet<DefaultDistIntType> from_lower_triangular(const std::string &filename);
-
 template <typename DistIntType>
-DataSet<DistIntType>
-from_fasta(const std::string &filename, bool include_x = false,
-           bool remove_duplicates = false, std::size_t n = 0) {
-  std::vector<std::string> data;
-  data.reserve(n);
-  if (n == 0) {
-    n = std::numeric_limits<std::size_t>::max();
-    data.reserve(65536);
-  }
-  std::unordered_map<std::string, std::size_t> map_seq_to_index;
-  std::vector<std::size_t> sequence_indices{};
-  // Initializing the stream
+DataSet<DistIntType> from_lower_triangular(const std::string &filename) {
+  std::vector<DistIntType> distances;
   std::ifstream stream(filename);
-  std::size_t count = 0;
-  std::size_t count_unique = 0;
   std::string line;
-  // skip first header
-  std::getline(stream, line);
-  while (count < n && !stream.eof()) {
-    std::string seq{};
-    while (std::getline(stream, line) && line[0] != '>') {
-      seq.append(line);
-    }
-    if (remove_duplicates) {
-      auto result = map_seq_to_index.emplace(std::move(seq), count_unique);
-      if (result.second) {
-        ++count_unique;
-      }
-      sequence_indices.push_back(result.first->second);
-    } else {
-      data.push_back(std::move(seq));
-    }
-    ++count;
-  }
-  if (remove_duplicates) {
-    // copy each unique sequence to the vector of strings
-    data.resize(count_unique);
-    for (auto &key_value_pair : map_seq_to_index) {
-      data[key_value_pair.second] = key_value_pair.first;
+  while (std::getline(stream, line)) {
+    std::istringstream s(line);
+    std::string d;
+    while (s.good()) {
+      std::getline(s, d, ',');
+      distances.push_back(safe_int_cast<DistIntType>(std::stoi(d)));
     }
   }
+  return DataSet(std::move(distances));
+}
+
+template <typename DistIntType>
+DataSet<DistIntType> from_fasta(const std::string &filename,
+                                bool include_x = false,
+                                bool remove_duplicates = false,
+                                std::size_t n = 0, bool use_gpu = false) {
+  auto [data, sequence_indices] = read_fasta(filename, remove_duplicates, n);
   return DataSet<DistIntType>(data, include_x, true,
-                              std::move(sequence_indices));
+                              std::move(sequence_indices), use_gpu);
 }
 
+void from_fasta_to_lower_triangular(const std::string &input_filename,
+                                    const std::string &output_filename,
+                                    bool remove_duplicates = false,
+                                    std::size_t n = 0, bool use_gpu = false);
+
 ReferenceDistIntType distance(const std::string &seq0, const std::string &seq1,
                               bool include_x = false);
+
 std::vector<ReferenceDistIntType>
 fasta_reference_distances(const std::string &reference_sequence,
                           const std::string &fasta_file,
                           bool include_x = false);
+
 std::vector<std::size_t> fasta_sequence_indices(const std::string &fasta_file,
                                                 std::size_t n = 0);
 
 } // namespace hamming
-
-#endif
diff --git a/include/hamming/hamming_impl.hh b/include/hamming/hamming_impl.hh
index 4e0a97e..6f1814f 100644
--- a/include/hamming/hamming_impl.hh
+++ b/include/hamming/hamming_impl.hh
@@ -1,35 +1,37 @@
-#ifndef _HAMMING_IMPL_HH
-#define _HAMMING_IMPL_HH
+#pragma once
 
 #include <array>
-#if !(defined(__aarch64__) || defined(_M_ARM64))
-#include <cpuinfo_x86.h>
-#endif
+#include <charconv>
+#include <chrono>
 #include <cstdint>
+#include <iostream>
 #include <limits>
 #include <string>
 #include <vector>
 #ifdef HAMMING_WITH_OPENMP
 #include <omp.h>
 #endif
-#ifdef HAMMING_WITH_NEON
-#include "hamming/distance_neon.hh"
-#endif
-#ifdef HAMMING_WITH_SSE2
-#include "hamming/distance_sse2.hh"
-#endif
-#ifdef HAMMING_WITH_AVX2
-#include "hamming/distance_avx2.hh"
-#endif
-#ifdef HAMMING_WITH_AVX512
-#include "hamming/distance_avx512.hh"
+#ifdef HAMMING_WITH_CUDA
+#include "hamming/distance_cuda.hh"
 #endif
-
 #include "hamming/hamming_impl_types.hh"
 #include "hamming/hamming_types.hh"
 
 namespace hamming {
 
+inline bool cuda_gpu_available() {
+#ifdef HAMMING_WITH_CUDA
+  return distance_cuda_have_device();
+#else
+  return false;
+#endif
+}
+
+typedef int (*distance_func_ptr)(const std::vector<GeneBlock> &,
+                                 const std::vector<GeneBlock> &);
+
+distance_func_ptr get_fastest_supported_distance_func();
+
 std::array<GeneBlock, 256> lookupTable(bool include_x = false);
 
 template <typename DistIntType> DistIntType safe_int_cast(int x) {
@@ -52,21 +54,53 @@ std::vector<SparseData> to_sparse_data(const std::vector<std::string> &data,
 std::vector<std::vector<GeneBlock>>
 to_dense_data(const std::vector<std::string> &data);
 
+std::pair<std::vector<std::string>, std::vector<std::size_t>>
+read_fasta(const std::string &filename, bool remove_duplicates = false,
+           std::size_t n = 0);
+
 std::vector<GeneBlock> from_string(const std::string &str);
 
 template <typename DistIntType>
 std::vector<DistIntType> distances(std::vector<std::string> &data,
-                                   bool include_x, bool clear_input_data) {
+                                   bool include_x, bool clear_input_data,
+                                   bool use_gpu) {
   std::vector<DistIntType> result((data.size() - 1) * data.size() / 2, 0);
+  auto start_time = std::chrono::high_resolution_clock::now();
+  auto print_timing = [&start_time](const std::string &event,
+                                    bool final = false) {
+    std::cout << "# hammingdist :: ..." << event << " completed in "
+              << std::chrono::duration_cast<std::chrono::milliseconds>(
+                     std::chrono::high_resolution_clock::now() - start_time)
+                     .count()
+              << " ms.";
+    if (!final) {
+      std::cout << "..";
+    }
+    std::cout << std::endl;
+    start_time = std::chrono::high_resolution_clock::now();
+  };
+
+#ifndef HAMMING_WITH_CUDA
+  if (use_gpu) {
+    throw std::runtime_error("hammingdist was not compiled with GPU support, "
+                             "please set use_gpu=False");
+  }
+#endif
   auto sparse = to_sparse_data(data, include_x);
   std::size_t nsamples{data.size()};
   std::size_t sample_length{data[0].size()};
 
+  if (include_x && use_gpu) {
+    throw std::runtime_error("use_gpu=True cannot be used if include_x=True, "
+                             "please set use_gpu=False");
+  }
+
   // if X is included, we have to use the sparse distance function
   bool use_sparse = include_x;
   // otherwise, use heuristic to choose distance function: if < 0.5% of values
-  // differ from reference genome, use sparse distance function
-  if (!include_x) {
+  // differ from reference genome, and we're using the CPU, use sparse distance
+  // function
+  if (!include_x && !use_gpu) {
     constexpr double sparse_threshold{0.005};
     std::size_t n_diff{0};
     for (const auto &s : sparse) {
@@ -77,11 +111,14 @@ std::vector<DistIntType> distances(std::vector<std::string> &data,
     use_sparse = frac_diff < sparse_threshold;
   }
   if (use_sparse) {
+    std::cout << "# hammingdist :: Using CPU with sparse distance function..."
+              << std::endl;
     if (clear_input_data) {
       data.clear();
     }
+    print_timing("pre-processing");
 #ifdef HAMMING_WITH_OPENMP
-#pragma omp parallel for
+#pragma omp parallel for default(none) shared(result, sparse, nsamples)
 #endif
     for (std::size_t i = 0; i < nsamples; ++i) {
       std::size_t offset{i * (i - 1) / 2};
@@ -90,52 +127,45 @@ std::vector<DistIntType> distances(std::vector<std::string> &data,
             safe_int_cast<DistIntType>(distance_sparse(sparse[i], sparse[j]));
       }
     }
+    print_timing("distance calculation", true);
     return result;
   }
 
-  // otherwise use fastest supported dense distance function
+  // otherwise use the fastest supported dense distance function
   auto dense = to_dense_data(data);
   if (clear_input_data) {
     data.clear();
   }
-  int (*distance_func)(const std::vector<GeneBlock> &a,
-                       const std::vector<GeneBlock> &b) = distance_cpp;
 
-#if defined(__aarch64__) || defined(_M_ARM64)
-#ifdef HAMMING_WITH_NEON
-  distance_func = distance_neon;
-#endif
-#else
-  const auto features = cpu_features::GetX86Info().features;
-#ifdef HAMMING_WITH_SSE2
-  if (features.sse2) {
-    distance_func = distance_sse2;
-  }
-#endif
-#ifdef HAMMING_WITH_AVX2
-  if (features.avx2) {
-    distance_func = distance_avx2;
-  }
-#endif
-#ifdef HAMMING_WITH_AVX512
-  if (features.avx512bw) {
-    distance_func = distance_avx512;
+#ifdef HAMMING_WITH_CUDA
+  if (use_gpu) {
+    std::cout << "# hammingdist :: Using GPU..." << std::endl;
+    print_timing("pre-processing");
+    if constexpr (sizeof(DistIntType) == 1) {
+      return distances_cuda_8bit(dense);
+    } else if constexpr (sizeof(DistIntType) == 2) {
+      return distances_cuda_16bit(dense);
+    } else {
+      throw std::runtime_error("No GPU implementation available");
+    }
   }
 #endif
-#endif
 
+  auto distance_func{get_fastest_supported_distance_func()};
+  print_timing("pre-processing");
 #ifdef HAMMING_WITH_OPENMP
-#pragma omp parallel for schedule(static, 1)
+#pragma omp parallel for schedule(static, 1) default(none)                     \
+    shared(result, dense, nsamples, distance_func)
 #endif
   for (std::size_t i = 0; i < nsamples; ++i) {
     std::size_t offset{i * (i - 1) / 2};
-    for (std::size_t j = 0; j < i; ++j)
+    for (std::size_t j = 0; j < i; ++j) {
       result[offset + j] =
           safe_int_cast<DistIntType>(distance_func(dense[i], dense[j]));
+    }
   }
+  print_timing("distance calculation", true);
   return result;
 }
 
 } // namespace hamming
-
-#endif
diff --git a/include/hamming/hamming_impl_types.hh b/include/hamming/hamming_impl_types.hh
index d75af12..69d4d44 100644
--- a/include/hamming/hamming_impl_types.hh
+++ b/include/hamming/hamming_impl_types.hh
@@ -1,5 +1,4 @@
-#ifndef _HAMMING_IMPL_TYPES_HH
-#define _HAMMING_IMPL_TYPES_HH
+#pragma once
 
 #include <array>
 #include <cstdint>
@@ -14,5 +13,3 @@ constexpr GeneBlock mask_gene0{0x0f};
 constexpr GeneBlock mask_gene1{0xf0};
 
 } // namespace hamming
-
-#endif
diff --git a/include/hamming/hamming_types.hh b/include/hamming/hamming_types.hh
index 8fdf069..6d6b39c 100644
--- a/include/hamming/hamming_types.hh
+++ b/include/hamming/hamming_types.hh
@@ -1,5 +1,4 @@
-#ifndef _HAMMING_TYPES_HH
-#define _HAMMING_TYPES_HH
+#pragma once
 
 #include <algorithm>
 #include <array>
@@ -15,5 +14,3 @@ using ReferenceDistIntType = uint32_t;
 using DefaultDistIntType = uint8_t;
 
 } // namespace hamming
-
-#endif
diff --git a/include/hamming/hamming_utils.hh b/include/hamming/hamming_utils.hh
new file mode 100644
index 0000000..849a296
--- /dev/null
+++ b/include/hamming/hamming_utils.hh
@@ -0,0 +1,112 @@
+#pragma once
+
+#include <fmt/core.h>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+#ifdef HAMMING_WITH_OPENMP
+#include <omp.h>
+#endif
+
+namespace hamming {
+
+template <typename IntDistType>
+void append_int(std::string &str, IntDistType value) {
+  str.append(fmt::to_string(value));
+}
+
+template <typename DistIntType>
+std::string
+lower_triangular_lines(const std::vector<DistIntType> &partial_distances,
+                       std::size_t i_start, std::size_t i_end, std::size_t i0,
+                       std::size_t j0, std::size_t iN, std::size_t jN) {
+  std::string lines{};
+  std::size_t k{[i_start, i0, j0]() {
+    if (i_start == i0) {
+      // first row, no offset required
+      return std::size_t{0};
+    }
+    // offset for elements from first row and any previous full rows
+    return i0 - j0 + i_start * (i_start - 1) / 2 - (i0 + 1) * i0 / 2;
+  }()};
+  for (std::size_t i = i_start; i < i_end; ++i) {
+    // first row starts at j0, other rows are full and start at 0
+    std::size_t j_first{i == i0 ? j0 : 0};
+    std::size_t j_last{i - 1};
+    char final_char{'\n'};
+    if (i == iN && j_last != jN) {
+      // this is an incomplete final row of partial_distances: no newline at end
+      j_last = jN;
+      final_char = ',';
+    }
+    // do all but last element for this line
+    for (std::size_t j = j_first; j < j_last; ++j) {
+      append_int(lines, partial_distances[k++]);
+      lines.push_back(',');
+    }
+    // do last element for this line with ',' or '\n' as appropriate
+    append_int(lines, partial_distances[k++]);
+    lines.push_back(final_char);
+  }
+  return lines;
+}
+
+static std::size_t row_from_index(std::size_t index) {
+  return static_cast<std::size_t>(
+      floor(sqrt(2.0 * static_cast<double>(index) + 0.5) + 0.5));
+}
+
+static std::size_t col_from_index(std::size_t index, std::size_t row) {
+  return index - row * (row - 1) / 2;
+}
+
+template <typename DistIntType>
+void partial_write_lower_triangular(
+    const std::string &filename,
+    const std::vector<DistIntType> &partial_distances,
+    std::size_t distances_offset, std::size_t n_partial_distances) {
+  if (n_partial_distances == 0) {
+    return;
+  }
+  // infer row/col (i0, j0) of first element of partial_distances
+  std::size_t i0{row_from_index(distances_offset)};
+  std::size_t j0{col_from_index(distances_offset, i0)};
+  // infer row/col (iN, jN) of last element of partial_distances
+  std::size_t last_element_index{distances_offset + n_partial_distances - 1};
+  std::size_t iN{row_from_index(last_element_index)};
+  std::size_t jN{col_from_index(last_element_index, iN)};
+  auto flag = [distances_offset]() {
+    if (distances_offset == 0) {
+      // overwrite any existing data
+      return std::ios_base::out;
+    }
+    // append to existing data
+    return std::ios_base::out | std::ios_base::app | std::ios_base::ate;
+  }();
+  std::ofstream output_file_stream(filename, flag);
+#ifdef HAMMING_WITH_OPENMP
+  constexpr std::size_t max_lines_per_thread{200};
+#pragma omp parallel for schedule(static, 1) ordered default(none)             \
+    shared(partial_distances, output_file_stream, i0, j0, iN, jN)
+  for (std::size_t i_start = i0; i_start <= iN;
+       i_start += max_lines_per_thread) {
+    std::size_t i_end{std::min(i_start + max_lines_per_thread, iN + 1)};
+    auto line = lower_triangular_lines(partial_distances, i_start, i_end, i0,
+                                       j0, iN, jN);
+#pragma omp ordered
+    output_file_stream << line;
+  }
+#else
+  output_file_stream << lower_triangular_lines(partial_distances, i0, iN + 1,
+                                               i0, j0, iN, jN);
+#endif
+}
+
+template <typename DistIntType>
+void write_lower_triangular(const std::string &filename,
+                            const std::vector<DistIntType> &distances) {
+  partial_write_lower_triangular(filename, distances, 0, distances.size());
+}
+
+} // namespace hamming
diff --git a/plots/overview.plt b/plots/overview.plt
new file mode 100644
index 0000000..1134bb9
--- /dev/null
+++ b/plots/overview.plt
@@ -0,0 +1,10 @@
+# to generate: gnuplot overview.plt
+set terminal png enhanced crop
+set output 'overview.png'
+set title 'Hammingdist performance history (million genomes / second)'
+set logscale y
+set ylabel "million genomes / second"
+set xrange [-0.3:4.3]
+set yrange [0.005:200]
+set bmargin 5
+p 'overview.txt' u (1e3/$2):xtic(1) w lp t "" ps 2 pt 7 lc 1 lt 0
diff --git a/plots/overview.png b/plots/overview.png
new file mode 100644
index 0000000000000000000000000000000000000000..2ed878a773178340c57e3d378590e44ad3519378
GIT binary patch
literal 6498
zcmZX22{@GB+y7%5jHQGavSrU8%`kRjiNeTM_MI`7EEOf&kO<j|iNP4zzm%A=R!C$U
z#=gXtk}Z468uL!QzyI}rulIef^E~G|_qoq=FXz6``J9tzZf3**761bPz;X48fh7RY
zqXB@fi-mzk8D<}ArZtq!O|36esZ<(;NF@HLS^)q{Z3U>n#zrd==;{J=eZEjzMFA|b
z6-&M7icJOpBo?4zsnOJtk_#7TLy>b7+Bg9L0T>L1LZQ%TbaZrdPEHPqL~3nq?d$7X
zSXiKm*t&EneIxzF#NyWVtvM=nV<whb*H*o;K^+HZ(g07|H~M_?d4*(~zwKlQB#5R8
z0FV><SAjn&Qi1+KD$i4DwD}emOT7#nVsEYB?r$wVepm0;5YP7O*EA6;N-Y64VmC5@
z;Thsgx;gT?pCXnA!1C~5k+g!vy0&6nsn|dIBdJ7MaitQ2sI9G3nmyE3ay?;-x<#xb
zw^7^3t*;`eNFr7A{rmR?1qC7^BD1rzEiElBE-pGcI?2h&L-+b^X=bwqU9r0j0GuEH
zymVP|oPq$rH+<DV*ZN-GdOj<zNR;Pth|rq!s_WXZVxiYh56eV`=X1?VcWm4*Q|_(8
zlG=kL4I4y!tt+(^CR|zuzkUv!%|L%!)j3AU4#?-6GJPxU!Ofy$X1L|ljB-(ed8HMf
z6;z)8Q9Wx*UJh(LA)FB$E%3Q1W~Y0JnzeVQsrPN!y9em#0(aSO4jZelTxt$NeD^$y
z(9uPF+q%?!ZCt@@Z`s_A+@@~CA(FBdu^TUOl=Y4p1E(7_^;K#LQcx8v+jMBYJ1J;W
z9wcQb?o=b4Egw0ZY>T-7OEu&ernXH7AC8u<KWn!<M89tFj^bv^{ZX|%xc__TWV@_g
zIQHn*Yi|Mq^|?W=vi(=c`1j+FR|?fO1z{pxtWRNupE_}~PKS=FulDnqpAC=Q(fi3x
zS{t^R#;C&d?Dp?CJ<fPCxhl?>>&&n4>yBD2V&jsjU?vCi@XM!cA6i_`V^ko@k<R@t
zMu(b$jo<3@q#iG7<5zsH1%%wrt*xpp3O+hIRjHP*c_6z3-GtuUh(%6!FC0g?lh8Zw
zm40nG9dHu{By?2jt|&(;2xm*<&IS7&+G@7%(Ny^DT<K4L^lX!*Y(pWt<`?t2hvROi
z6G-q(X@ls@<y<qZcduHq6vVEce4k#L#I#SHwg;bon;!7=IRA5|_vN(CjzXEM#@=TC
zp4@Lb58u2z6}n_SryBV0HK>o_Zc4@HxZ$vV?<eosn)Hp8qZEX)^;7h(DEgziAHpIA
zqWYT#EpOW)VCvT9Yl)MJ=fm~ESwZjZ<03r@)sE@@$xd&Nj5>Tp&E}BPJ5}>M<7N-3
zCpA9t&Z8Z(+doV{B6EK@qrDd|4R7vjfBb!_HqbPZA17@Z{chrnrKCf9jGE;r)E^}h
zuG6j=xmT9;KDSg9FDqVJUS|&zZu#}|!?AJvVTUDY#bhr|ao~;V4B^7!6R#{0)X-Fv
zW+_Zs1YQ)TI8+y22F_38i1w?SKwvt($+fVnvk&bfoOYHZUYK?MV>gZ0K0C0bb#x%d
z8S_FJO)kDzi-ZgMug{GAe1g9JXwcL?-j38!AW!xUhKIq~R`$OoqW`#Qm|FAd0vCBX
zCn6cWlhMR*TM?sL>oELDn2S7JFEegL()o_wFv+fqD9_z)kn}aYAlA#)qAKvr<q`cs
z0c;T38=kRiZB3pzCz7<>@*Qmevqa1%_wi|bw4PrYim|&_b-nb(ipLlBS<+^CfJ%Oz
zb!YY){DM462ySIn*<HVDx(_2b;A5{rFwY@AmhUc;eq;Qj)Zty0>e>f(B+X6Dby>0+
zdxP;5ANlN2XXniHP+$K?A%{AB)4$W$t0^2dg@2Oly-$ejX&S{XIwI!X;Z{6ce#)ds
zwQ1*?kpj=?4t?+RGap=TWc<qYvBx~&d|vMk)xnQAbX-vvI}qc=hy80ewPJca*|UjX
zIO>$kHB;4jx;F8HW`XJoBDeXs>N>(?mhx$BNugQgh|D?nd~}m^c5PhxJ2Gv4UZ(Zh
z4Z>E%Ydf@XTU`G~k!gKc4oARxv#s2<eQRO9zQr%Z&1pRt!SJzWv#%<RXlkEdNJm?_
z{_oQuCJv~TVQzdAFVV$~d$E$!y&ggfA+=TMl5l+-Z|z%S*p>_tQVzfwc(G_AJ%!yG
zz{h}*FmpN#=mJ1SGSNC)g~jjK5;8wYrtiDUOJG2^>I4hpw(;8;M@HCUz9Vj?NYV^T
z|1r1Hhwm@q-%b?BEay5Is**izRimPUp%xUSBW0^?CH|F;8Ux)=YvX+h)v}o-e4oxR
z2{DjcEu33)V)eMseO)-%uDFc{yk{;kZFH{w8Ya!C#Bp*Qou=PI7VV!uj=J`osOwzK
zOW+89oNguCovF76Rkb2*eZ(ZTNd-Me%OS~YVsD)*HJby187NHSh9uY-racd*i-JVR
zL<%ok2rTH5S=Du1VE`yp$uH!qOsgW%ZXBAg?q|C5#Gnmj>ozJ#VfS}S^I*UOu39>N
zuF)}4Ksh~}!EZvSTa&{6`zz7?EX^rcDPX^6`${~mGL9L#1dv<3lVs5`=sEGr@(<8+
z`L`Z=G2&yGI)nr%LyRV7uh`{LNLd$q5D~(XSCY+yryIz@>B3yqMr-KsF*_GL8UH$3
z7?pAbt=lr`QBlHwNJdH$Ct($5X=2yHM&#tVDQj*cHhuJDLWeR=mn0pW;lTh8=Nwm4
zL6SZNRd_8EBI)_y@-T|lXB|Gd#TaycjYmo|7)c`7zNCbm`UJv^@gCf7uMhUeqO~%r
z2~pC$L_4FZD1L)TqMgq4_$AEPDY=5-5LQZ+$c7&TzRL2S$^Y9(YPOGu2+VBtL`{SH
zD9iC0-<28u1cn{ov<6KZ?S^JA3i}9vR~``3xQP23I%BFFakX^#H)et~Q_UGLV;AME
zPqYSEbY*Zd009$6IQ`-HQ<5YYo50`p)<poXtg9ipm<|yuJ`x+}!{7gDd`k&@qGkca
z_dT*`6!Co_Va#6*5oz>v+w^)qcg`N{rH_u|a=hB{(5_K#>A`s4=puZQ1D}ubz*&eF
zL*xwuCO(6uoJ8m03OIFj3TqCR71T;lf(>6`(L`Oc<i_V4TV+KLD2eBxwNtNRKqql$
z@l*cmgoo%~K8l$f$x>K!Do1z`r3&>F7r9KCc)cfr!gAG9V!A$nYASX&t+s>&?jH^i
z^^53lcGg~dfU3+ePBE3eGcZ<ky|<jpF&h;En<b;=(ER@NvxN`Qjfu$eoP6Yi8813$
z0Ps(`pjbTzeU%AMCUMeyk!X0)W?Wr+h8>?(cC)t@x7gM2wF&E8o=a!^Z(!%lsnl?~
z0HgUDs0b4a$ve%2p4>C^*2^xzFk34b12WcMMaRlk@Nf7x@vVnG<0|KxdIJxMF&Jb7
z;vSyL^&U9M@{fUIixSLLz8UT}K@@XkRQ05BUiwQXA&F6>c3JFuWXoja%JUzaYGt$R
zC;-`IM5i~yKFFRawTd9!&4-PU19Xif7lUrJOm0gcZJ>u_o<kbRrqKUh?wGCT4RN@J
zaUjmiFvu-Xbnw%4eG_i;OCl5H2gNyR&T0=ljg~8aY}(x!FvGi@KP}V$JUsL;!6${N
zx;e-`L$Vq2NK}Ics#z3$%^qV(dLX#GVNg)$Z!1tZvfEJ>B?TTEdf}DB6`-u?OiVa?
zjK)4WaJ6yzHjJC6<9M=89PVjJPyqK#C*AX3zfE^T65RS~4Pmv6XnsV{pL{j)m~}cN
ziZ$N>$A|CP%?akg6XAB;t~_<q-z2%2<d(H?@@2|X(M<RIiIY-?_3Q)c%*v`pusZ`U
zOIJ5)0h}u6Iu`xx2iAP?kzpo(PlO52?>SkVr1(dSZN?-7ZUJa^vvl{_8Y^S!l0JQ`
zeaPi&iNZLmSZpz$s$C<qbSrN#0VTR7I|2lXli}aBbq!45?EzszS<^^s#SUhv(sWJq
z^?4(UZnRibtIu<d2Z?%W&*8X(&NeI*Ur2*0F;Cy3{|D<}cHsxbTxAaT6t6jcsj8U6
zH>lruD#2<wt?b5$mhDwRYe3G&=F4EAf2;9S$h{^n+p9;`!qqm6ay|u}y$q9}s^UoD
z>W`yFw-{bM*}eLf2q}EbrJfc;VaG3clDxCL{RE8pF?Ec1i$`1$?Ea_+J9K#Yd++8T
zF`*cYq)_;i7&MzJ#A{HiSp$Np1OBiXogKiUA&MBBe#%9>#;ybdnZ*ohYuIH-7pg)B
z!Wdq$Z>xn&F}fbfd9#s2d#1Fjs{;$VKY-wLoD45cuqt%Bsc7R8L&NY8kqSuCCq>GJ
zom;ktpFpS0%A)JOCYd;K7c&FBr8!tO+UGiHv$dZ{PkjBG{1v2r{TpW!vk$1_MvPBN
z0@iD_OWNmlM+n^j!2+T0)`;t44)5*0NORS7WDV-=-GQ&5CS@cEMZqj;*amBCDVo~{
z#E+2~tt^$Mk$b!{b<DnTq4MN;6U@(PMy!{H?Y=0l;!yBdrDu@8B0rNY+c^zDNzd2H
z*K{WcyvKkOictuL{(KJ+m_4&j50_8VJ0kaJ#Wj^l0X!O}+F*0^m&w9v45-%_c|-=Y
z<69)BRAl7YEZle3>J0=zNm=uGlcWTz2D(>OqRWUQMsV+kyq*2TQU|<w6}<7(z29*R
z_$=nMf-a;_80Wp;jOBo^P6kp!<CJF92ZjzarnCZpw>1(kBQF=!KQBGCf_UK0+bSSH
z1dDXSQ?vz!1P0kNsstuE?&$ex9eW!(Lk9)74}Wmxv5ofKrTRrRT$_)_yS@8j5>}uk
z*YQPo>c|IIP2imI-HB4f3Y<zq%$UJ6bywVLHF6s=pRkE{v<>cP_=Wx|)a1TjQ0nr}
z#1$XTex_Ko=u%z>N5Gg?@}(Vn!jKkQuL9`5F++?uCccc*xlbG(?#46cPI$F7B6mcm
z0I*$)0q2pntSjQcSpHDvS6e*g$XT{)Vwp2>@A(L^XawCKjyuzV@#di-{bByAWHbir
z*ByE9`+Do#oj-&xJe&{C?A(~Ec|oLi(yrqzTno5!FmVGDirAXCVIFf^`15_u#2Ht!
zNuf3m2VEw0K_;Y3v-BW~6!TeG945qd5|wDkl3I8>NI-}eU;cacnKkCjpZ!Og7a!-I
zIL27G*8kHI^P$#(G|r;Xd2uAb&U6)7$a*VujJ)3^nS6Iz*6rmedlt2sL<(30X}{SH
zR24Z*Zbj@u#Aw^hPQQ&WP4(#A6nYs10!7}da4CmC{jiZmfOcxP0|q4ETEJC5NE@#*
z;r%fmjG_P|EESq9dxxWp{FT!ciC4xL*#!6bZEGsSkq?z_TiO`FFk|dLaojJ8Jvss^
z!f`1I;MB(;T^_vX(lMwh<YSvuWlpp@c=DZLMxyx`XU-`+pGlpyo(mQ&ulpw@lcd0{
z?i?w<P}tE#0)w8N2So~yd$SKcOQ6r}OF*Hpo836$WpNg-b=_Pb1scpwo+{duOl?D9
zZYFxf)k>)?;ke;+Kw(W8gLvi}V-tK^E!yWQi~=QN;=^j?A4mbJ`Di@A;&pR49{s8+
zc#6UfH)FsT{+QL~z<{QfcI8oVwY!bcCfvA?wi#sxryxJF`N8Hx2S3pLFP(U?%wO%1
z$`8DRf?dc+qv_YP?1fd*0iOdw<ioF}_?Y(%MQ9eUnP4L_^6s=5EUs2KhK5b+huMF2
zy@Tt^tY+qh4#W}~uJ1wh>{@7mBSz2%bhsdecO&@EP8+}|h<Xa`hj%xK5a=pF0|RPW
zx<!*7P5;afL}<N^e^)C)ysa>}wh-f$_61CKuKotO2jc6a(AK<3QiEqa2p%p>1H#1j
z=xRRA@N<v;KQ1QDvRlthSpZw76@-Tw2LKrSFF^iZ=-D40n{2VQhq&*_e1>U;L}t;x
z)ou9ZUQrwkBnp0he0ogk=2=?&5Fz`WQsTvIo)HnzG$4#nffO39oY({g{&z>j@((xY
z-;O`{9E_qxIUW!n8v<pltFX}0yYEQQ0JfEqKST1D2rRY~xAF~qPK*xaRIwuu3C@3b
zUir^M0}D_34=;pu=nk3w1L`#e*z-8Qqlw`E0rcT?tt&)U;Gh4X{XfPaxq&=RJU|57
z_=LtJcKvAjpJ#?RbXTn&&if6Vj+j>n`G04QAmM-o2DcG+lB<Z0lYq9D;_KhXx}IT2
z&=&VN{Kfe-NWC2E=%8tuy{v*kM>T_;b1c_J!bP9tm!&jdrvrMHlQf&CzTHbZrJRPt
zI;-PKQ{*p?7DKp;Y7&%>U-z>LBQhT}+?=ggNajZ3#?zle?#fxAqCuUPVQs~=Y}-DC
z5+ajt3(sA0nXc}vkAW(F@R6D<e#Y@Sw9URc=Qx}qbxuieI6k)P^b|vg0#N&wv89vX
z6fmIfNzJiFc$|l3XkKn=C=~tn+1!oMri@P#riUx@K42Rk$~<}W%{8PKJ|C>L!3{5x
z`)Vx9RB)2(lE+Z1DcdWOJ$bGzGyk-uJbv(Pf^m(Ioil^ld9<U8Vd<nJLPHlOeDd9~
zF62FjXBdY8Os$FH)yHG^^m;`}SS-4ZFXOYb!QDe>=j~9ms}b%Qa=tS9#o8eGMn@D0
z1{ZAL`jz08zVOCcpo{UE;rF2$Fjn&IPxCWCn%?J)9$VI|`Id{co4M@90n3C%yYj~>
z&q~Bm0Ql7EEv+|EwKo31a>QeRM=y0(W9f^Jo`2?hSo);TB}2Jctrv0I;}K2Zq!ebw
zLnb>;?GjZwnc=?Bw%cE%!w>WsI=+0+a)3J;>4=D?b`^_$2;0YgzIA77Hx}xYw8Kbr
zgtc5x*JmhEnsT}_`ytq^Bh_br71?{?%(c|rX~*;ullwP*JwX5aBM3vhxO?zsSK#oL
zoO9SZwb<yU<(o&2w{QE$9mGN->m(ZQk{zv65<uW~gL~GsqQkKs3l7<h_HY-8&{wvN
zq9$zlz|X6&W_ck2ndIgkl^gr}U#m-r+@_OXug<Jt9M)o?@_ulW1W?i$Y5Rj}P3zZh
zh|eS*ovv=wKKvA=iz8fnSlR<7A5f4MuQRxNj<2t_x%fDEpD*DVc&Vll(XH4uqz@`@
zPv?}z%PvKz!_CwA^&)g|ElrliVaprte)h53?azM4LK!^i&94kY7L}jsdD4_U(|l!S
zVDN6|r&5uE_n|K$A{{R%Us~{8uiN|C71AQw%aQ|lmcDLmp{meeuLDV+Vc}79{mYuo
zN@Pt}wy(*9T5j~3gASu-^J;r?X{~8GD>9U^w_0kBIk)Td$d7-e|6~+4D)?z|6uJ4-
zQ`i~@f3!mHPaz#o__bBmB6iJ@--~?T%3u3&^)-_78`y6S)Y;Yxq(P2D7R*TXUO7?o
z8ux}@6I3f7r8zbQH2jPe&f1>A$RaM?z-GjhvOEp;R~$t=-j?rOSt|R>Z9MdcTLyKZ
z$D6N2R!_Y*1rxubi{f&snK=<3b{`3LlGILrnDsccQm;FMjK8@8=eR-f(R!(<F#TSu
zr7Nvx9kzr^i8!D4rd>1W>N5*>E7nDN<0<#ol@7t}dzWFev$?3%8hk|q|K<B!sYySd
zMy8^(g-6a@xSk-DlZQ}oY^?MrUtyDjtQ)<t8!`>Rmwf5%^S4hUe@v+MvAVCcUm)T0
zL~xY71g+m?3iErGbeDdDi*U#0m$d<^r1YF*nmoD6X`fr^+Gc9W%r)}<GVY2*qXqdQ
z+Y7P9ud**$z8Jq!%`;G*cXvh4)s!^K<5UBG_-MI+Q6gp~mZ1ju-weyxy5-w=9NBNu
zH#{L5!8E9boH^*>4&?~Z5E5!~KVBk#Z07~*McOs%30@%%Gj^tCwe`EoIQibb{HXsS
z;$hx9{7OXU;OX-?*k{z!Q3jvpe@=Uby@cc|#5LJjrEg3&mdG|(k5!q%wDE4$X#6y@
zZ*-^K<>-)sONh%?($oi1O!@RoNb=L3=MWPTwNbElGVsXs&lNcilG&!KiKL{HbN;t>
z(tf*glmBqG*cIi4>3dFICtILZ`f~7DWAtrRc4qJ1HTye(VQ_o*rrn(+9gVYxZKI%;
z?X*o6xsx^fWo9qF{X|M}@$?(YDc`SieEk`3^Gub?Re5D_#j&4r&^{!5kG)PI?-%xn
zPbo7L-e|Pn?+<$y<gq#|MQS->a9`OOr<G+%CLNY~+|vto_5-4i*-Aczu}6j8%oF>@
z%yW!E7^Akb)E+(LjUM8uRlOc-$~tu5(DpS?=&|dA>OL!<2va=Yp@rwZQs2K@PvgGw
zXFSe=_kE(%$u7znPJHYb{Rr)*V4p1ppJ0Z$YIEAg$gvD3>D(zSQ2m@SLY|K2Zz@jO
zG)FM#yvp~a**B@DqYjr{!j%p~`%haAR}T&`Rt9Cryxb?*E=%t|D~Tc<baRKspEhcN
v*!FHus>#6GuBDp9x`Lu^zeb0wv!_ri+;<nmz637(Nw}+*%?w`ZxyJq%?5ZIW

literal 0
HcmV?d00001

diff --git a/plots/overview.txt b/plots/overview.txt
new file mode 100644
index 0000000..f36efe5
--- /dev/null
+++ b/plots/overview.txt
@@ -0,0 +1,8 @@
+# time in ns to calculate distance between two genomes of length 32k:
+# note: fairly approximate numbers - see comment for origin of each one
+
+"Python"  54000  # (very rough guess: original dataset was I think ~40k genomes, they said it took many hours to run, so assuming 40k*20k distances in 12 hours we get (12*60*60*1e9)/(40000*20000) ns per distance calc)
+"C++"  3266  # c++ (bench_distance_cpp/32768 benchmark result on hgscomp01)
+"C++\nAVX512"  835  # c++/SIMD-AVX512 (bench_distance_avx512/32768 benchmark result on hgscomp01)
+"C++\nAVX512\nOpenMP\n(52 cores)"  42  # c++/SIMD-AVX512/OpenMP 52-cores (here I just divided above value by 19.7x which was the relative speed-up of a full from_fasta calc using 5000 genomes from the real data with 52 vs 1 core on hgscomp01)
+"C++\nCUDA\n(A100 GPU)" 11 # for 100k genomes from_fasta_to_lower_triangular took 60s, divided by 100000*(100000-1)/2 number of distance calcs, multiplied by 1e9 for s -> ns
diff --git a/plots/speed.plt b/plots/speed.plt
new file mode 100644
index 0000000..121b945
--- /dev/null
+++ b/plots/speed.plt
@@ -0,0 +1,13 @@
+# to generate: gnuplot speed.plt
+set terminal png enhanced crop
+set output 'speed.png'
+set title 'Hammingdist lower triangular output speed'
+set xrange [0:330000]
+set yrange [0:100]
+set ylabel "Million distance matrix coefficients per second"
+set xlabel "Sample count"
+a = 0.5 * 1e-6
+p \
+'speed.txt' u 1:(a*($1**2)/$4) w lp t "from-fasta-to-lower-triangular GPU (A100)", \
+'speed.txt' u 1:(a*($1**2)/$3) w lp t "from-fasta/dump-lower-triangular GPU (A100)", \
+'speed.txt' u 1:(a*($1**2)/$2) w lp t "from-fasta/dump-lower-triangular CPU (52-core Xeon)"
diff --git a/plots/speed.png b/plots/speed.png
new file mode 100644
index 0000000000000000000000000000000000000000..2e6df313adba320449f5c5c268ad8a89c4b3b5fd
GIT binary patch
literal 8301
zcmXw81zZ%*+dqyvTJnw#1!;~hDG31q0g0oJ6r@{_I$G&cQc>wf>HvYGOG@HMK}zrd
zX{5aO{Jo$5es=eH=6U9s=leW6Gdr{UN?-301t}9L005xS(o{190Dy1+0PiaZA16_d
z+WdtpsOjq%sbjHNoTRX@@E<_|01;Re01G%cKq&);h5#xqKd>kcK!h?X0t<ylBme-)
z5ddriHXJ)s%+HTARbHFK*)cIOiHnQt>gqZ<IfaLZ=j7xxG&G=4sIOnYZf$MhTpWst
zr5>bK&u<?ZAFg4s2P=`-=I+{q11uVV^9FFh-NU&OxP%i-|0^djGJ4^l003ig;~2m{
zh*-e*6qYUn8?JvC5rI_)Tt)coz6n0uPV8uLMaGhyoh=teaA1o82ayMvfa#UOl~jFY
zV^@&~IzR*+U4%03jEI1vBH-AFfB2QLg}4)rE%d^oP*_|%Skzd{yF=_@Ve?owwtEa!
z7lKtT#By|YcD{fAo}Hb2b#=9?tINj5MoCGjx3{;nv@|6p1@O+-q;WSPAwi`UWsl1O
ziI=9CF91N<_wT~X5};%T0N5t9)KrXK<nI@_e5~T7iE(5-D4pw`pACA6#papiQ+q4g
z%(hRQKG$@52abji=$^pn8p27qK2tv1U-rD0ce#JVr;c9mY-btP>sS_;=Y@{^46|3f
zIc+hzN}}M3|0!r$`lx1q+-v;6jH7Pfjc)!yFO>}gNXO%*r@?&!@;z7q<R<?Lg>;BE
zxQ(28vs0_LXxahZW}cOY7%cXB6^BoWby2rzegGMKoKos=8q*7ny&S#yp~fS>vPI|Z
zF8UK`dUfIAO9Z~BVQNT0b>g(XweDRbs@9UGskrRE+Ue>SsOe@oAOD-$O;yyk@G8UP
z^~qewdSX+ss(~=c@2)!Fv^+t^xzX*uSj&`{2=w-LXCvqn4HeHBo>6$U2JH;X(3zw8
z!!x}KD*g1UfuTd~_cN|~*vN8JTkppluhs*ZrEKTYuBAIntW>X($e525BHgpY`x#c%
z#jaUjKC-_t{~bf0{oS^c%|kK#Hp`EB?gLEqix%O)zVu>x>~rwlF|gJr?QHpJX^mLZ
z55LLAcCG5}*B#Wsd8|Enp^~lbE^QiAl)L5&R?Kepkf?><^8q3=p@$t{6ZkpC{)%oo
zOLKdI!m20$$OZYlgFXjnBb)BiNp?S3c9~7Hv~0^D8q*SyVSokNg@_CsE5oaAx4c?9
z@;&`z?&X4*u>%*N<}{EFZ;66gGs^i-=SFNoeaK`w=qoFHg>$ROrTDK!m*3$%!xSGe
zrE6SE^RzH5R5h?w3qcXc{P0sRXT7)WM)=VpoVfrc^Ylw3@$`<Du1+v?zqRan^nn+$
z<Ku@M>iI%hUp6<d>~N8>BI1m(d}6XB%DBFP7z#Wp6V3b3>)WFcDk<8ig_;4iep=D&
z<c_aH08!5Drd_J|BrO6dXFYjja>84UIL5Zo{*5b9Sa^%<U5U1Y?uQ#UM0RzCG*lX=
zteJ-VrtsT#&E2jSAAYwjTNSN))Je1^%t*QYc41Yoyna08t8kvb(|v_Yu>K;2q<1#A
zwJ_T=W?E0`>-PQj(%$x=Nn{;$Ek$!$A3T$~_!crspUuehv843X#?x)n87e-K0gVm#
zc8PKLr*A6;*#)1`>bbsl<a3_mdxY0<vOTzN*DNYX?sN7pFJ_)Bf-CVqBc|)HSJTk#
zZSYsfxzgnV4@_<0o%8RzB`3llghJw<)NM|1G;U7&-!ZOCBR%l27e=WNL`U+4u{)C%
z!ad{oXMh55u3A3&(8s=ClyTY(pid|OJ;?acMgjPx{80PF+wZ2fa$Ep50qD=qjn}W4
z-!_wc0fER_NZT~McwJQ#@!+u`h}KsI0m%>N<GX5$H#Y<o3t!cSOf%m1TDjl+P>`bW
z`&v@I#~}%W=VV_ZZ3qdd^RR4($aB*#>Pg*;D4sc&9*XZnL{XOgt>q_G>=H|TpZ8vb
zn#|?BYw#$k6?JK&$53Yr?QU*BzITA7Lv;sOeJ663c8Mixspr#nzK3>zew?b%`V$Ve
zH-p@KZTn;~>6)C4Lu$OPTT*dA-@AYZyq-T~eT%h1ZRlVoIkCG@Jikbj1Kp|Tw|nBn
z=gQxvrh7m9oQHLnmPfS8DQ3!k5K?${{%CJ23Mj6iiFr;M>oRj|7=Kjg@!8D$x=smy
zEQ={|x{D%A?g9y2FBAT#wA9)Bh{06DK`UjOu!M(T+xU|sBu`$Rz<4kwR3BcGom|cb
zAyYYh%sCL%MicF_-;137;9zLb)3fOG8M>}!)u+Fx)+P0f-c)1r-jORA+t*7+iz3k%
zI}0kOa{WY#0?;`D$7z8W2$j#y@1T$8w&p=K;+tV;sf-xvVWhAfZ|cH5D;qOMwZ#r4
z0cf?d97DEiz-apY(1sK|^wm%}LpbxFDwKnHVx<i?H`}IHzbJ7Nl`((acxpexFGjFN
zenmpI5%x7upgeJhPVLpc_X;w7U-c0X1FSiubRuj%uVVz}-JQvMTmdctif7>!)4~K*
zvgZ`Z<yX9&!25ELhS8a^9Nt8kvV@)F{a@f*_WmsD(0kmL>JGl(-99!#hGqIQ(*w5k
zuK{TawK8}FQje<475BT5x4~aktVnuFDnlWGZ5zRCU-z{YV1^KHeee|P5-lvqvF}dc
zf^;G5clBaywA}FcYk?5W{h9?1r878oQ|zj)OR!Eww%Z%b`1npq6IFX#bn+HNP$*Hx
z09-FpcRUh*Wu#=i2+lS&S+>8LS<Cqnm=`*p2&u}1r8l)F<TLz#6gN+*9`8U~{7wSX
zTx|PZwZ(Ua*&f2TvZ$M~x#^8)7K(t=M=(sB%}TqWRh9E!>)`P(0bmlu*wpMz(gs{y
zCG5XkB+9ck&#djqz-poExJOcEvoo=1z&J(Z^w+?kg4|b^{#wo!n}&4XaD4ktyf!IH
z1@2iGz^C!Xo%HVIdmJ|qvan{hdB4p1wP;{S7*C#^_-}exaZ&!jgCpPYpEZv9`85p~
zWVv6#QSU*=u3TPFEvkZh>u|D?g%etY$#*Umo=x{z@7lUc5bIY!TWwTqZJ7K#LsFp~
z?`#Iv0x84Wl_RO8hjkROtYVNj@$Vqyae4e*fF@e&MJtN`-WDjm=KgHxL~quUuB@7q
z$6><9J_uvY5f|e+(eYf80oYH%+>|2BO|D>QhZX!f!^aIIRUj!&Ghh16Kg{gvDgt%R
zvxWR_*B^D+uWXZ7b2939D*43*E}!AE2Q11k6Mt2I3E9ITUtmk*-pm(u-wHF*9`VLE
ze)^xPeF$BG*&Ku$_GaH2oW}Q>wX$cW`z*s?EXluw;JJNp-wn8@E_xvVywDafVoI{7
zGlF%gIZF0p-PSoW^-p`|laKnIR<p4h!8=Dn5RkbQScKu#AVeEHsvqkmI3LSr?tJI7
z5qW<>;?`}S1^Jr?{BqR%%IDi!O=|^mT0sZ?{%fnPoP*ipRs%!il#y1M^Ds{gN%&y>
z{G7S>`?s;@HjJyAxmi~s5<g3<bX#}dS;pecQy{w;+g|7tL$KTU)vIHzIO|;j36iRZ
zCv6h~A;A)Cz3v(s!G2dqZBnKb$E`-vtj@P?<74I{6lwe<9de?Y;pl}>>w#dm#z%K9
zcKdHTQqZ9w|IhvpQ$;LBg#{iRF68GmWipqfPgd@?iHGYS2bsynY79=y7BHP(C|RR7
zgfG&<e_$oQ*}nEkrk`)|@wgG0GxQ+;YooeUaDP4i>Ec(FGj{u^^3$^J!KJpJwFwn}
z8nD$Fl#OqEt5BU;z8kQ|t@SUq7HvQ_N2M>v=R>WnK5Yy>-(tF;QvBZh5~Vb;YLna<
zobq}^))Kxt(z@%ZSF-%aYMJtzL_Iex+t<+G&MDoS>RpqKzs6MOZ%22S8^0CmtjH2w
z>x}D`z5giEobEFXtJh558K*g*<x#BN<Y4djzb1~eKvX9^bvuGz=%O!S<4KLO(mdUE
z+*`4E)Mm_^;_~bKa%2acb=aVOT(qO0_<-xUIO)p3v^=}I6&8`Yl+9)Rt?JsMW#lG{
z9JO@}&xK;`ymO%989Vdu^9#zBMRPXdC0tL*77lq*h<zQma$7%|Yc$?E{G@e3WhL!q
z?ymqZHiENGo&V?Z%+&$7{7TR;a9e8SJWS<p4$DqAPHe>FVDm+7){PXqy?YxOa_G0L
z**W1XgzH;GPF`$ZQ^vca5Q|7iVpC*7^8*k9g(BBu660Ars0X4zA{1fMi6)1B`4X1o
z8iD3`@aUKG`vk{F-cT<B2r+(KB#z<Tm4Q5flqIthi;K5Q>#eew;larVx=|ncyrBr3
z9w7R|%k(w%KX2a(zusFJu}f2d!-?K|@s_`L>*^&1*sm*B=J3SXlIrKlOg`79v@fKe
z;W>jtoC#-1G874@{Um=t^=ONtw8-y3vTa8jCB<*iHM~>wnBO>0YN;Ax*CBuBVN3!u
zhY%dXh>TdF?+xatW&u+Mu+A@BmFTS49DW`54^!&qJZ`p84#O=oIHy$-nbqGuE9RND
zDGQl3iZuaI*-%Ozv4L*4R8IkHj_Do95C`b1(8iY+WtE>wW@ChD95&iDZOJTz0@HfX
zz%5#pu$G4+00b|Y3TUGqy1+6O#a;Q4=y6;V^@IFWFXz8iNzr`FIM9ueAyQiSDX&o`
z9@d}eAF%#gZx9es5(4q#)``qrC<FkB-uTroADgJ+vSQlFCkSaB#o_blG@*VW2RiQl
zQ0NDcq06KM#6^C?ThrH<1F(>46Gy13>;k-a^R$TtEpYiZ`9c^+Q*`t@99s=1{8A9!
z{BEs8=8AtBLfrmlZ1wwPCW%hH325p~trHc(_(v1)gf^NWmanF`6ZKa%Qdh{UmoBWD
zi~OnO(3$zr^Q}4BDv6@@aJMOS>saGn*Y=?|<;_`gGy-F?(Wn4w%Q06%4am}4Ul4*m
zBfx>`&b(8q*9-1PYFg6x!1mXwc8Aw^2&*3HKF0`@-p{)=qxgt|(b9jV@ePZb(S~BZ
zzq#~vC~&{yyy5-ue$woX44kxF@9$@N=y2Tsq>v*=^wxs@{qQl;9ei&_shMG$2B6$p
z8<6?eW~~_>Tf9Iw$=BIoEI2T&9f4s|UZ-+sJRn+U`c%xxoyI^{xP)hkmM~{BqS;ag
zn}PZy6WSB1?6y9k{DmUpR@x(wbBE&X^sfP)L^T4$N)1cKqL2Iw+&F4<zU8dQSBDUn
z3B31ny3i9?1+Z>NE%z-QkHBALQ31T&2ZXroodWI>SrFA!f&{rU2>Xml2f9rVm___I
z>1IUhizIK-bBJpyfiP_+qWDTm904M0g5i8QoOeu(>oRpnElvG#gkW6`D9CM(z{tJk
zpx``rFnVACzk^1Egx+JX!N4`Ak4q>rErRadK_A`mCA;j8Tk*2`c{nCavo1$*p8~u#
z-4qqW<-r>z*7NNXJ4c;I6Mh*+$Fk=G?a-nO;j{FmZZU0M&uO*4gCO#CxjdQlRf)>G
zm)~?IC3_j?l7=1H4(<{Vq5TPu%p4l5BZL!AeKB90eNH<F1D4kV;PgMcL&%%=x@A73
zXB&IR&=+Z54PBGCH9B+6_)~<*D)A$oP3@pH_%HR&RK^33a4HvWJ<uLgzQoPP!_hdO
zm&%V1xa*eBtT^jjK}|oS@3@joFTa_ex!qv2+WZVlwv>PvIgWnfT8eq@uS;+lTd|+J
zv%yhO`okd&Tc(+VAbI+GS?%h;xX!4<-Fu$2B3R0zh%=s!XGkfekp5#Zm7D|G%3-_v
zrviUu^LL<VQ(ge(Q#0ysx9+M1N7A|WrpK<_oq1Od3Qtu=-=MdzTJ+Hhm)uIe^`fx)
zpTDYR2(H=$x|}LI4<J!0HWynh#x{Y8-5hny$7F5_M`5CnQ-9(x{GJ0K)!edcG1U%+
z#)&}kK?PrYH%lO&GI3}@kN%UO3cmIPiRDrND?tEAJ-DK0jC4}P#viGtuX<n{L3zc$
zsTj%RQwj8YKZkw;C>4xui<YM^yl(7d@SJ-0sgyd9U-`lt!5yVT#W0h~jMIGSwB{`1
zyx>#5a150-c4y}a2Y46y`x_c0XR(GD+Y}7)LvW&&T`ncwx-d_en9g6o!_+5)SzDw_
zxVOkI#yxHBfkg$T3bxQ2cY4O^Zq{ssHmm;}m9m4eU;JD&>tFNI=rjxwYGA=&V^u0S
z>o3gg9NN;USSd6b`%A9UL_4Uqv$I<0aaE1#1$p#7&Aj{>p}VY<agKGp;Sh#^$er7*
zT&%;cIMsBQY>~<5QfA)ul+az*=S*f^0LP3qd>A>+&7xte6>)Zrp)f8%S7jbLAXF@5
zE#)cCVWBD(t!`yv9UhJXDKlUeG*6A?_;2s4$mV8z#1@`Az0U{KyN@_aDm2L+2~Hfa
z!A5@x{z>t4xobqE(U@2|-4pW*6!2a1Z#5y2yK$ekDzAp_tAdje=@tJEgwB;x)=ZhA
z&Gy^-Qiv7b0*Eu_&PUWx5U{-fKdL#oH!y8;M)+8R8q(+w6&^Ab9#XoW*MqE;2azwz
zMV-X))lANLnJYH>GnG;`T5q(s^>9tjwYr%Z1`TTqo1P^#%g{9XD_XoUvi_Vt>{t(O
z^r!wAkoNYnc2yvuT>&o;tmO(b`}y(DXeAkGJFvyXAn3E7onw-#hH1D~NO=EQS2A;n
zz78Anrj5jD3<~43+}P9`b@O=O;6n|wmtktS$^|HB1$ZI&&HZG3aAg$O&hZi$x?#m}
z6vr@cYqdE{N&c&i%A83|usp<)JfH-%_QfOS4N;bwEafk*3G$;4HF@46xhv0x$;L5Z
zr$x9OM^JiaVU7IyW9LPP4y#1v4mqMbXob$^VFuqvtUM2qyo!xJ+THu^W%a_^AZX7k
z6sOJc81oRnuz#DyNXY=1@%PLp-Z-D&lr&G71#kKX;@$>geDYt$kO_WK^-fIOtmZk}
z%7_o1ApjF5%Y_j-#-KPMQQ)WO<2U!Hm%gBqr@=F4AUT&!!HTc1B!CT-PQ&-?<gMUd
z)=#>%dNZBr88JPTbAF$e<wSDJo}0i2cgTJPQ--m8ZSbixc}tyLM<yL#l#(xh1j0d6
zAPofb5w-}jyFS3FPONTd5GqzwcdfLYD8!{5GEmPkxVVFmwxqefypp|+d2K^&z^Jx&
z@foq{GoN3Sog&QTsW+bxc}boT{ybCuD7aX~v8v%x&3Yz%!aMs5s;+=*E~Q^-Z(LXb
zy-n3<G7a85gE!+QB`K_d0qegYrG$q>-hC@DXRK4TS<wkE!|WmvCM6YU;-#%FO9c{{
zTCf*vHzJhv-pm<8GCymbG$87Wb&n`;NuHYKY~8w71-Mi#VssKrB25R2?<}m7_KH9S
zFiazAtosc_VWN}hcF;H2DB)-W4R4^$BzhFLOiH}--N=Hx5*Z%-#X4@HB-Mzl>dXA8
zKHSk{t<yT5dARf((NZRvZo>l|!X)_><MWU22~$Is(ehdfkdDbH6>E|HL3lMs<Y(;#
zx=C=nj0-Y<_qeLA0H}KL$2w9%ti=Br%-f9~vUz3~LZW+}anMLJ&jUr)nr=KMM()dr
z&TE8bS<^m<d;wmNXgc6(0g)eI_KO-`^Cv~Q-@#c0=M$fZ&M&Z$Uj-|9up}CEtG_fM
z-##IkQV7GlPhlGHF~oMu2+l}JF3#4a3LAbBpGcbPY9c7But{}wMS3P~-tZ8ITmWhA
zV>p}lB5$`_TZHgkjKz4$aBwlck6j&m$h@i72K?naRSkh;olq59c1~gLhSk@1Pk3lT
z2&5@Z>O*J=hzu*<dYXeFr)S(aW`t#snEk`K4CItB_>{n;8@DYjjGJ(-ZsxtoMHSYj
z{17*<6e8{7g6?33Z(^|0YnV;6IUO72-t5f+R%&W&ujVuF1D5&r%j>1G+Qoz(5i(qG
zEa9|nVrvX`LpF3_-LLV0FCkBX=69ZcARfgDhP4|^Ammr(AMZt4OSc&)SGo%(bh~*?
z^oIu!Uk1*iIc==nNjp16uFO1be^@yT<$YgK7be*=#2O!N6-+p!_<ancRC%nbZ8vDP
zHU2D3V79n+pM&a;l*!6q*mLPiV53CYy%F4S-aRI(rJEP;yba`i9`QJ*#;!PLpDP^w
z3&VM^M!oXKDPLlb>v!XWT7My;cCYlfr*rt7Umh*TyA7_W=U(MRXO?c)5?!Kq1g@&z
zh6O!A`KMJ|?R2bMmpNP8fKOAYcIdj4Uoh}JH_Cap&pGj~<UBeRbC&>-GpP26WDlL+
z_JF<5|9%LVPiKHqp(5eI#LI5~6$<uETBl&Gw$YI-U=-3r^M$s<-^_{M))=l7;u=he
zQGu~v7K)?h`eClgqdy2Afj0vGRvVu2e4<va>-#;QhEyYv2NNV#n<_YSHWgh&L8%C`
zW>C59uXiAB`ofvsS7Cbl+&O%Y{P;ehiK&g3)swr`R6I{;U@ji`eP_zwNaJ4m9+whG
z?V?CarQO`=k$5giCiM(QEs02v%wH68jp-tF)t#XAVByY@Ml<c$&slLR>H00-$+<II
zT!Po75Y<jirJ=l*=!PR&&wWF&=iJX0gGI#29(bHq>5#b1A#P*3?l&=9>BnP^`idQ#
zw++RFT#^3P61(~>BzvA*M9j|noF@_}{ybvytj2{}9y`p{kSt<s*Y!yF7#U={7Sc>w
z)pVlzm4Tg9a00QlOLQDe#!NVuTh!-<&`m)&9!8isaiq8dGfa%ctsIh3K4&pzPM)?I
zsGG??wNy_S7;@F&Y3pb!CHD5@Q};8`2B{h*%BP70Qr}F2o_0T7_?*w4(F2v8lruKJ
z5BhrXViFBrsLOxPL8*9B=DgD_N9<%Du!G4kvK;rchC}@S2-CkkVFn3$!y(eQ|M)z5
z$b<Q~@|OJ7^iHV9mDw-dt88#~D3T&3G=l^BE=Nby(@_O3FBlcKK`^&?6nCs2E(qNr
zgDqR#b15(UY-#%#7}cqmM0Q-hQsN&OV(Ko;PY%46@Rsdz@{H*nk2<S(qiVA{5Cm1K
zESrP&Q(Xbsz)|%kUQ-8o3k3u4N$?@k+F-F=vvpAWi4AVF+Pr^uQuR?*po2~94-~X_
zV#GZB#QsLViRB77tL>~8jk72-e%SDKf;1E{glBb+fJjagtz?4XqF#&AY@*3kMX1=I
z^qWQ+d$+?lW9VU&cjl=`syGaVh%kRR3E9=2zYz_*fxZIy&HUyH&sHOls}PQL^j5e>
zM<i$r?iIVa5yG+Lv+YCdjf6BDxNU6{VXPqQWR%3c56jvBO7c@E@Tt&QrUi*IypY&M
z#@O8Mnm@W2mjmU<%BklP8W&re@@pvN_pqBHNv0*o{)wGfl=4G#D@m*LP>Xom2a(iu
zNpADzUId|llOhhiVWI@m82$YUPP9Wt&Y|0m<q<oA-+}Dd4t6^fF2DXkOhpu%s==LU
z;qvuQA61YXTzTwSTDXz}QzV`-dw{A9TvHvv31&|$-|e1MjiffVhxYw}YmNyY$61Cz
zi{7C2G7h?48LxG>ly7%KM@UP`(CX`=)Y*l0<U|OB=plSf|2fIt6AXrm24A&eMV-n2
zhcFF!Ia=Kg!mguVozNGh303Dt+#Yh!=opF+_2P;uv%)l__^SLQ<il0)TyxRvIq(24
zD%+fm{2~>e{#q19|9H)D<`@oJ#@8`)zsHWN^;!&JpaEHNCk%<;a?yVn^tew?V2zAU
zD=LRu#lg@S0K^y1)QqzDsh;ta6AD#*B|-4x{2BVu92n-GK;^tHu<EBpoBw<L&iua6
z_rh0q=JWoM9j;ZkBO=`@x=nnjDgwpi^xXXoT9p8vGo>s5FlJQb2mbe$ptM2!-qZ5{
zlag2lDU-CK{g&q~#-8=o@)n``77a<=Ye{*?s1E5YZBmLS<U(#$oY_7;L!xgSCC#hm
z^l%@|lE%#46NxaAX$C5;Qr=Xb_6pKZUgDaXD{S=suVRyDy@tHBq|l%pc8yebbB<HX
z<j7v|*@L>Sw8L^WRWfE-nCvEGw0RK>D+p+=@?cKt9?><b@Sz=rDCI7L(niiNrY3td
z-gGn%i@$N7C8_d|&#GF|u&;BXjY-up@~~sv*nFJ?-QtROcDB7c_%2H|x!$evd(^Dk
zv&ycl3U>)2S1E&4k;xHudrtqomd$fu<-&OLdCM~j6Zemu?P)D{V60T-)yo|TRr7i-
zDP{8-IV}47pNfKFlMlEseJ`%YmKBC-(R$%eY-*D_R3e#lCDKK;J5*D<<X;YUGHTJ_
zLSXz8&({m9F@9EWF7J}6S}mm!)s(xlY>?-&vvXQqu;G@knWl7re6IU)&%}X}+{uiB
z0n?PR@MH3|J2@_Q1`7z2pcKSh#}m@%ia!!P3WA~rD%H<ertGQ%NDW()3c8=ObH9}i
zHX``nf1UrQUaaSr3FhH51B<h&00?g9XIXX@*UAn2^tniDO*gw$a$BS=0R9SfkKjD0
z?Gh%{AMuMq!y?0#V_)Fgua)$dh$a_1l^4`b_31KG3BRmfFX<%VPg|mB3r!S=ijku)
z?W$&(U;AhwLv6pIt5N0;Whr)#(n&mg@K#pc`RlxrBZ={Omb5gK?2C)s-W84BNQhV-
zKg&$T7%_`Tcm(F&l|%4#;UVynoat@5lY(p<x9q=7nPuC5l#YJ#<brQsFKcC74_+f3
zgHn!gEwvvIaC1rHLy+9B%dTSLp&;zYxyu2CCODYehm1rqCi$@?GVd`S!a48{epc~0
zDbTRLtg|N97<?8Wnyrwc0_RZAl~sSZyQKQj^ET5^6>)aJ#W*`S<W<sMoEVtqq<T(R
z@}(VrHHJT&x!m)9)}3ey`??=8Fvwi|hRqL85ABRTei<iKXskE14#cJsi5oGf56bX_
Z8+O`cXpa1%hR(m7YpLt0)jWhp{vW%5w;%uj

literal 0
HcmV?d00001

diff --git a/plots/speed.txt b/plots/speed.txt
new file mode 100644
index 0000000..4ccaf80
--- /dev/null
+++ b/plots/speed.txt
@@ -0,0 +1,15 @@
+# Time in seconds to go from n 32k-length random genomes to a lower-triangular distances matrix file
+# CPU: 52 core Xeon: from-fasta / dump-lower-triangular
+# GPU: A100: from-fasta / dump-lower-triangular
+# GPU-stream: A100: from-fasta-to-lower-triangular (interleaves I/O and computation)
+# n    CPU        GPU                  GPU-stream
+2000   0.653878   0.5127140621189028   0.13557552301790565
+3000   0.964759   0.7554650978418067   0.23105294106062502
+5000   1.63097    1.3445364689687267   0.46112991601694375
+10000  3.88273    3.1115588791435584   1.34703719697427
+20000  10.779     7.625964550068602    4.461474176961929
+30000  20.7542    13.989239579997957   9.265670311986469
+50000  59.7482    32.871332183945924   20.969019818934612
+100000 185.941    107.00290298496839   81.37370045098942
+200000 735.223    361.97915480996016   271.80367124499753
+300000 1575.75    780.524687159108     610.4017879960593
diff --git a/pyproject.toml b/pyproject.toml
index a1c2119..6bcd607 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build"
 
 [project]
 name = "hammingdist"
-version = "0.21.0"
+version = "1.0.0"
 description = "A fast tool to calculate Hamming distances"
 readme = "README.md"
 license = {text = "MIT"}
@@ -37,21 +37,24 @@ Issues = "https://github.com/ssciwr/hammingdist/issues"
 [project.optional-dependencies]
 test = ["pytest", "numpy"]
 
+[tool.scikit-build]
+cmake.verbose = true
+
 [tool.scikit-build.cmake.define]
 BUILD_TESTING = "OFF"
 HAMMING_BUILD_BENCHMARKS = "OFF"
 HAMMING_BUILD_PYTHON = "ON"
 
 [tool.cibuildwheel]
-skip = "*-manylinux_i686"
-test-skip = "pp* *-musllinux*"
+skip = "*-manylinux_i686 *-musllinux*"
+test-skip = "pp*"
 test-extras = "test"
 test-command = "pytest {project}/python/tests -v"
 environment = { BLAS="None", LAPACK="None", ATLAS="None" }
 build-verbosity = 3
 
 [tool.cibuildwheel.linux]
-environment = { CMAKE_ARGS="-DHAMMING_WITH_OPENMP=ON" }
+environment = { CMAKE_ARGS="-DHAMMING_WITH_OPENMP=ON -DHAMMING_WITH_CUDA=ON" }
 
 [[tool.cibuildwheel.overrides]]
 select = "*-macosx_arm64*"
diff --git a/python/hammingdist.cc b/python/hammingdist.cc
index 6c82f19..b78205f 100644
--- a/python/hammingdist.cc
+++ b/python/hammingdist.cc
@@ -59,7 +59,7 @@ PYBIND11_MODULE(hammingdist, m) {
         "(full matrix expected)");
   m.def("from_fasta", &from_fasta<uint8_t>, py::arg("filename"),
         py::arg("include_x") = false, py::arg("remove_duplicates") = false,
-        py::arg("n") = 0,
+        py::arg("n") = 0, py::arg("use_gpu") = false,
         "Creates a dataset by reading from a fasta file (assuming all "
         "sequences have equal length). Maximum value of an element in the "
         "distances matrix: 255. Distances that would have been larger than "
@@ -67,13 +67,26 @@ PYBIND11_MODULE(hammingdist, m) {
         "than this see `from_fasta_large` instead.");
   m.def("from_fasta_large", &from_fasta<uint16_t>, py::arg("filename"),
         py::arg("include_x") = false, py::arg("remove_duplicates") = false,
-        py::arg("n") = 0,
+        py::arg("n") = 0, py::arg("use_gpu") = false,
         "Creates a dataset by reading from a fasta file (assuming all "
         "sequences have equal length). Maximum value of an element in the "
         "distances matrix: 65535");
-  m.def("from_lower_triangular", &from_lower_triangular,
+  m.def("from_fasta_to_lower_triangular", &from_fasta_to_lower_triangular,
+        py::arg("fasta_filename"), py::arg("output_filename"),
+        py::arg("remove_duplicates") = false, py::arg("n") = 0,
+        py::arg("use_gpu") = true,
+        "Construct lower triangular distances matrix output file from the "
+        "fasta file,"
+        "requires an NVIDIA GPU. Maximum value of an element in "
+        "the distances matrix: 65535");
+  m.def("from_lower_triangular", &from_lower_triangular<uint8_t>,
         "Creates a dataset by reading already computed distances from lower "
-        "triangular format");
+        "triangular format. Maximum value of an element in the distances "
+        "matrix: 255.");
+  m.def("from_lower_triangular_large", &from_lower_triangular<uint16_t>,
+        "Creates a dataset by reading already computed distances from lower "
+        "triangular format. Maximum value of an element in the distances "
+        "matrix: 65535.");
   m.def("distance", &distance, py::arg("seq0"), py::arg("seq1"),
         py::arg("include_x") = false,
         "Calculate the distance between seq0 and seq1");
@@ -98,6 +111,8 @@ PYBIND11_MODULE(hammingdist, m) {
       "constructing the distances matrix."
       "For each genome in the input fasta file it gives the index of the "
       "corresponding row in the distances matrix which excludes duplicates");
+  m.def("cuda_gpu_available", &cuda_gpu_available,
+        "True if a GPU that supports CUDA is available");
 }
 
 } // namespace hamming
diff --git a/python/tests/test_hammingdist.py b/python/tests/test_hammingdist.py
index 2f3dfd9..581ba1f 100644
--- a/python/tests/test_hammingdist.py
+++ b/python/tests/test_hammingdist.py
@@ -3,6 +3,8 @@
 import random
 import pytest
 
+gpu_options = [False, True] if hammingdist.cuda_gpu_available() else [False]
+
 
 def write_fasta_file(filename, sequences):
     with open(filename, "w") as f:
@@ -35,7 +37,8 @@ def check_output_sizes(dat, n_in, n_out, tmp_out_file, fasta_sequence_indices=No
 @pytest.mark.parametrize(
     "from_fasta_func", [hammingdist.from_fasta, hammingdist.from_fasta_large]
 )
-def test_from_fasta(from_fasta_func, tmp_path):
+@pytest.mark.parametrize("use_gpu", gpu_options)
+def test_from_fasta(from_fasta_func, use_gpu, tmp_path):
     sequences = [
         "ACGTGTCGTGTCGACGTGTCG",
         "ACGTGTCGTTTCGACGAGTCG",
@@ -48,34 +51,36 @@ def test_from_fasta(from_fasta_func, tmp_path):
     output_file = str(tmp_path / "out.txt")
     write_fasta_file(fasta_file, sequences)
 
-    data = hammingdist.from_fasta(fasta_file)
+    data = from_fasta_func(fasta_file, use_gpu=use_gpu)
     check_output_sizes(data, 6, 6, output_file)
 
-    data = hammingdist.from_fasta(fasta_file, n=5)
+    data = from_fasta_func(fasta_file, n=5, use_gpu=use_gpu)
     check_output_sizes(data, 5, 5, output_file)
 
-    data = hammingdist.from_fasta(fasta_file, include_x=True)
+    data = from_fasta_func(fasta_file, include_x=True, use_gpu=False)
     check_output_sizes(data, 6, 6, output_file)
 
     fasta_sequence_indices = hammingdist.fasta_sequence_indices(fasta_file)
-    data = hammingdist.from_fasta(fasta_file, remove_duplicates=True)
+    data = from_fasta_func(fasta_file, remove_duplicates=True, use_gpu=use_gpu)
     check_output_sizes(data, 6, 4, output_file, fasta_sequence_indices)
 
     fasta_sequence_indices = hammingdist.fasta_sequence_indices(fasta_file)
-    data = hammingdist.from_fasta(fasta_file, remove_duplicates=True, include_x=True)
+    data = from_fasta_func(
+        fasta_file, remove_duplicates=True, include_x=True, use_gpu=False
+    )
     check_output_sizes(data, 6, 4, output_file, fasta_sequence_indices)
 
     fasta_sequence_indices = hammingdist.fasta_sequence_indices(fasta_file, n=2)
-    data = hammingdist.from_fasta(fasta_file, include_x=True, n=2)
+    data = from_fasta_func(fasta_file, include_x=True, n=2, use_gpu=False)
     check_output_sizes(data, 2, 2, output_file, fasta_sequence_indices)
 
     fasta_sequence_indices = hammingdist.fasta_sequence_indices(fasta_file, n=3)
-    data = hammingdist.from_fasta(fasta_file, remove_duplicates=True, n=3)
+    data = from_fasta_func(fasta_file, remove_duplicates=True, n=3, use_gpu=use_gpu)
     check_output_sizes(data, 3, 2, output_file, fasta_sequence_indices)
 
     fasta_sequence_indices = hammingdist.fasta_sequence_indices(fasta_file, n=5)
-    data = hammingdist.from_fasta(
-        fasta_file, remove_duplicates=True, n=5, include_x=True
+    data = from_fasta_func(
+        fasta_file, remove_duplicates=True, n=5, include_x=True, use_gpu=False
     )
     check_output_sizes(data, 5, 3, output_file, fasta_sequence_indices)
 
@@ -125,3 +130,25 @@ def test_distance():
     assert hammingdist.distance("ACGTX", "ACCTX", include_x=True) == 1
     with pytest.raises(RuntimeError):
         hammingdist.distance("ACGT", "ACC")
+
+
+@pytest.mark.skipif(
+    not hammingdist.cuda_gpu_available(),
+    reason="No CUDA GPU available or hammingdist was compiled without CUDA support",
+)
+def test_from_fasta_to_lower_triangular(tmp_path):
+    sequences = [
+        "ACGTGTCGTGTCGACGTGTCGCAGGTGTCGACGTGTCGCAGGTGTCGACGTGTCGCAG",
+        "CCGTGTCGTGTCGACGTGTCGC-GGTGTCGACGTGTCGCAGGTGTCGACGTGTCGCAG",
+        "CAGTGT-GTGTCGACGTGTCGCAGGTGTCGACGTGTCGCAGGTGTCGACGTG--GCAG",
+    ]
+    lower_triangular_dist = [[1], [2, 1]]
+    fasta_file = str(tmp_path / "fasta.txt")
+    output_file = str(tmp_path / "out.txt")
+    write_fasta_file(fasta_file, sequences)
+    hammingdist.from_fasta_to_lower_triangular(fasta_file, output_file)
+    with open(output_file) as f:
+        data = f.read().splitlines()
+    assert len(data) == 2
+    assert np.allclose(np.fromstring(data[0], sep=","), lower_triangular_dist[0])
+    assert np.allclose(np.fromstring(data[1], sep=","), lower_triangular_dist[1])
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 6f2d66c..5215b04 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,15 +1,17 @@
 # Build hamming library
-add_library(hamming STATIC hamming.cc hamming_impl.cc)
+add_library(hamming STATIC hamming.cc hamming_impl.cc hamming_utils.cc)
 target_include_directories(hamming PUBLIC ../include)
+target_include_directories(hamming PRIVATE .)
 target_link_libraries(hamming PUBLIC CpuFeatures::cpu_features)
+target_link_libraries(hamming PUBLIC fmt::fmt-header-only)
 if(HAMMING_WITH_OPENMP)
   find_package(OpenMP REQUIRED)
   target_compile_definitions(hamming PUBLIC HAMMING_WITH_OPENMP)
   target_link_libraries(hamming PUBLIC OpenMP::OpenMP_CXX)
 endif()
 
-# compile optional SIMD code as separate libraries which can be used at runtime
-# if there is CPU support
+# compile optional SIMD/CUDA code as separate libraries which can be used at
+# runtime if there is hardware support
 if(HAMMING_WITH_SSE2)
   target_compile_definitions(hamming PUBLIC HAMMING_WITH_SSE2)
   add_library(distance_sse2 STATIC distance_sse2.cc)
@@ -41,9 +43,27 @@ if(HAMMING_WITH_NEON)
   target_link_libraries(hamming PRIVATE distance_neon)
 endif()
 
+if(HAMMING_WITH_CUDA)
+  target_compile_definitions(hamming PUBLIC HAMMING_WITH_CUDA)
+  add_library(distance_cuda STATIC distance_cuda.cu)
+  target_link_libraries(distance_cuda PUBLIC fmt::fmt-header-only)
+  target_include_directories(distance_cuda PRIVATE .)
+  target_include_directories(distance_cuda PUBLIC ../include)
+  target_link_libraries(hamming PRIVATE distance_cuda)
+  set_target_properties(distance_cuda PROPERTIES CUDA_ARCHITECTURES "all")
+  if(HAMMING_WITH_OPENMP)
+    target_compile_definitions(distance_cuda PUBLIC HAMMING_WITH_OPENMP)
+    target_link_libraries(distance_cuda PUBLIC OpenMP::OpenMP_CXX)
+    # This is required to make nvcc pass the -fopenmp option to gcc:
+    target_compile_options(distance_cuda PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
+                                                 -Xcompiler=-fopenmp>)
+  endif()
+endif()
+
 # Build library benchmarks
 if(HAMMING_BUILD_BENCHMARKS)
-  add_executable(bench bench.cc hamming_bench.cc hamming_impl_bench.cc)
+  add_executable(bench bench.cc hamming_bench.cc hamming_impl_bench.cc
+                       hamming_utils_bench.cc)
   if(HAMMING_WITH_SSE2)
     target_sources(bench PRIVATE distance_sse2_bench.cc)
     target_link_libraries(bench PRIVATE distance_sse2)
@@ -60,13 +80,17 @@ if(HAMMING_BUILD_BENCHMARKS)
     target_sources(bench PRIVATE distance_neon_bench.cc)
     target_link_libraries(bench PRIVATE distance_neon)
   endif()
+  if(HAMMING_WITH_CUDA)
+    target_sources(bench PRIVATE distance_cuda_bench.cc)
+    target_link_libraries(bench PRIVATE distance_cuda)
+  endif()
   target_link_libraries(bench PRIVATE hamming benchmark::benchmark
                                       CpuFeatures::cpu_features)
 endif()
 
 # Build tests
 if(BUILD_TESTING)
-  include(../ext/Catch2/contrib/Catch.cmake)
+  include(../ext/Catch2/extras/Catch.cmake)
   add_executable(tests tests.cc hamming_t.cc hamming_impl_t.cc)
   if(HAMMING_WITH_SSE2)
     target_sources(tests PRIVATE distance_sse2_t.cc)
@@ -84,7 +108,11 @@ if(BUILD_TESTING)
     target_sources(tests PRIVATE distance_neon_t.cc)
     target_link_libraries(tests PRIVATE distance_neon)
   endif()
-  target_link_libraries(tests PRIVATE hamming Catch2::Catch2
+  if(HAMMING_WITH_CUDA)
+    target_sources(tests PRIVATE distance_cuda_t.cc)
+    target_link_libraries(tests PRIVATE distance_cuda)
+  endif()
+  target_link_libraries(tests PRIVATE hamming Catch2::Catch2WithMain
                                       CpuFeatures::cpu_features)
-  catch_discover_tests(tests)
+  catch_discover_tests(tests EXTRA_ARGS "--allow-running-no-tests")
 endif()
diff --git a/src/bench.cc b/src/bench.cc
index f3fede7..c35a30c 100644
--- a/src/bench.cc
+++ b/src/bench.cc
@@ -29,10 +29,11 @@ void randomize_n(std::string &str, std::size_t n, std::mt19937 &gen) {
   }
 }
 
-std::vector<std::string> make_stringlist(int64_t n, std::mt19937 &gen) {
+std::vector<std::string> make_stringlist(int64_t n, int64_t n_samples,
+                                         std::mt19937 &gen) {
   std::vector<std::string> v;
-  v.reserve(n);
-  for (int64_t row = 0; row < n; ++row) {
+  v.reserve(n_samples);
+  for (int64_t row = 0; row < n_samples; ++row) {
     v.push_back(make_string(n, gen));
   }
   return v;
diff --git a/src/bench.hh b/src/bench.hh
index 510ec42..52498cb 100644
--- a/src/bench.hh
+++ b/src/bench.hh
@@ -1,5 +1,4 @@
-#ifndef HAMMING_BENCH_HH
-#define HAMMING_BENCH_HH
+#pragma once
 
 #include <benchmark/benchmark.h>
 #include <random>
@@ -9,11 +8,29 @@
 namespace hamming {
 
 std::string make_string(int64_t n, std::mt19937 &gen, bool include_dash = true);
+
 void randomize_n(std::string &str, std::size_t n, std::mt19937 &gen);
-std::vector<std::string> make_stringlist(int64_t n, std::mt19937 &gen);
+
+std::vector<std::string> make_stringlist(int64_t n, int64_t n_samples,
+                                         std::mt19937 &gen);
+
 void write_fasta(const std::string &filename, const std::string &seq,
                  std::size_t n_seq, std::mt19937 &gen);
 
-} // namespace hamming
+template <typename DistIntType>
+std::vector<DistIntType> make_distances(int64_t n, std::mt19937 &gen) {
+  std::vector<DistIntType> v{};
+  auto n_elements{n * (n - 1) / 2};
+  v.reserve(n_elements);
+  std::uniform_int_distribution<> distrib(
+      0, std::numeric_limits<DistIntType>::max());
+  for (int64_t i = 0; i < n_elements; ++i) {
+    v.push_back(distrib(gen));
+  }
+  return v;
+}
+
+const char *const benchmark_tmp_output_file{"tmp.bench.output"};
+const char *const benchmark_tmp_input_file{"tmp.bench.input"};
 
-#endif
+} // namespace hamming
diff --git a/src/cuda_mem.hh b/src/cuda_mem.hh
new file mode 100644
index 0000000..7f25cf5
--- /dev/null
+++ b/src/cuda_mem.hh
@@ -0,0 +1,32 @@
+#pragma once
+
+#include <stdexcept>
+#include <vector>
+
+template <typename T> T *CheckedCudaMalloc(std::size_t n) {
+  T *a{nullptr};
+  std::size_t sz{sizeof(T) * n};
+  if (cudaError err{cudaMalloc(&a, sz)}; err != cudaSuccess) {
+    throw std::runtime_error(cudaGetErrorString(err));
+  }
+  return a;
+}
+
+template <typename T>
+void CheckedCopyToDevice(T *dest, const std::vector<T> &src) {
+  std::size_t count{sizeof(T) * src.size()}; // count in bytes
+  if (cudaError err{
+          cudaMemcpy(dest, src.data(), count, cudaMemcpyHostToDevice)};
+      err != cudaSuccess) {
+    throw std::runtime_error(cudaGetErrorString(err));
+  }
+}
+
+template <typename T>
+void CheckedCopyToHost(T *dest, const T *src, std::size_t n_elements) {
+  std::size_t count{sizeof(T) * n_elements}; // count in bytes
+  if (cudaError err{cudaMemcpy(dest, src, count, cudaMemcpyDeviceToHost)};
+      err != cudaSuccess) {
+    throw std::runtime_error(cudaGetErrorString(err));
+  }
+}
diff --git a/src/distance_cuda.cu b/src/distance_cuda.cu
new file mode 100644
index 0000000..9ddfcb5
--- /dev/null
+++ b/src/distance_cuda.cu
@@ -0,0 +1,233 @@
+#include "cuda_mem.hh"
+#include "hamming/distance_cuda.hh"
+#include "hamming/hamming_utils.hh"
+#include <chrono>
+#include <cuda/std/limits>
+
+namespace hamming {
+
+template <typename DistIntType>
+__global__ void Dist(DistIntType *partial_distances, const std::uint8_t *genes,
+                     std::uint64_t distances_offset,
+                     unsigned int geneBlocksPerSample) {
+  // Calculates all gridDim.x entries of the partial_distances array.
+  //
+  // The full distances array is a flat nsamples * (nsamples - 1) / 2 element
+  // array that contains the lower-triangular elements of the nSamples x
+  // nSamples partial_distances matrix.
+  //
+  // This kernel is provided with partial_distances, which should have
+  // gridDim.x entries, and which is filled with values corresponding to
+  // entries in the full distances array with an offset of distances_offset.
+  //
+
+  // this array is shared between the threads in this block
+  // and must be large enough to store one int per thread
+  extern __shared__ int s[];
+
+  // index in the partial_distances array where we'll put the result from this
+  // block of threads
+  uint64_t distancesIndex{static_cast<uint64_t>(blockIdx.x)};
+  // index of this value in the full distances array
+  uint64_t trueDistancesIndex{distancesIndex + distances_offset};
+  // infer indices of the two genes corresponding to this distances index
+  uint64_t distancesRowIndex{static_cast<std::size_t>(
+      floor(sqrt(2.0 * static_cast<double>(trueDistancesIndex) + 0.5) + 0.5))};
+  uint64_t distancesColIndex{trueDistancesIndex -
+                             distancesRowIndex * (distancesRowIndex - 1) / 2};
+  uint64_t uint32sPerSample{geneBlocksPerSample / 4};
+  uint64_t geneAIndex{distancesRowIndex * uint32sPerSample};
+  uint64_t geneBIndex{distancesColIndex * uint32sPerSample};
+
+  unsigned int threadIndex{threadIdx.x};
+  // calculate partial sum for each thread and store in shared memory s
+  int r0{0};
+  int r1{0};
+  int r2{0};
+  int r3{0};
+  // NOTE: this cast is only safe if genes is 32-bit aligned AND we each sample
+  // in genes is padded such that the first element of each sample is also
+  // 32-bit aligned!
+  const uint32_t *genes_as_uint32{reinterpret_cast<const uint32_t *>(genes)};
+  uint mask_lower{0x0f0f0f0f};
+  uint mask_upper{0xf0f0f0f0};
+  // NOTE: this loop is also only correct if the length of each sample in genes
+  // is a multiple of 8, which we do by padding the samples with '-'.
+  for (int j = 2 * threadIndex; j < uint32sPerSample; j += 2 * blockDim.x) {
+    auto c0{genes_as_uint32[geneAIndex + j] & genes_as_uint32[geneBIndex + j]};
+    auto c1{genes_as_uint32[geneAIndex + j + 1] &
+            genes_as_uint32[geneBIndex + j + 1]};
+    r0 += __popc(__vseteq4(c0 & mask_lower, 0u));
+    r1 += __popc(__vseteq4(c0 & mask_upper, 0u));
+    r2 += __popc(__vseteq4(c1 & mask_lower, 0u));
+    r3 += __popc(__vseteq4(c1 & mask_upper, 0u));
+  }
+  s[threadIndex] = r0 + r1 + r2 + r3;
+  // synchronise shared memory s between all threads in this block
+  __syncthreads();
+  // sum elements of s using reduction until partial sums are stored in
+  // the first 64 elements of s
+  for (int offset = blockDim.x / 2; offset > 32; offset >>= 1) {
+    if (threadIndex < offset) {
+      s[threadIndex] += s[threadIndex + offset];
+    }
+    __syncthreads();
+  }
+  if (threadIndex < 32) {
+    // one more reduction in each of the 32 threads in this warp
+    int sum{s[threadIndex] + s[threadIndex + 32]};
+    // now sum the values of sum within this warp
+    constexpr unsigned int FULL_MASK{0xffffffff};
+    for (int offset = 16; offset > 0; offset /= 2) {
+      sum += __shfl_down_sync(FULL_MASK, sum, offset);
+    }
+    if (threadIndex == 0) {
+      auto maxDist{cuda::std::numeric_limits<DistIntType>::max()};
+      partial_distances[distancesIndex] = sum > maxDist ? maxDist : sum;
+    }
+  }
+}
+
+template <typename DistIntType>
+std::vector<DistIntType>
+distances_cuda(const std::vector<std::vector<GeneBlock>> &data,
+               const std::string &filename = {}) {
+  std::vector<DistIntType> distances{};
+  std::size_t timing_gpu_ms = 0;
+  std::size_t timing_io_ms = 0;
+  auto timing0{std::chrono::high_resolution_clock::now()};
+  bool output_to_vector{false};
+  if (filename.empty()) {
+    output_to_vector = true;
+  }
+  std::size_t nSamples{data.size()};
+  std::size_t nDistances{nSamples * (nSamples - 1) / 2};
+  std::size_t geneBlocksPerSample{data[0].size()};
+  // 2^31-1 is limit on number of CUDA blocks in x-dim, which corresponds to
+  // 0.5/1GB of distances data for each chunk. For large datasets I/O becomes
+  // the bottleneck and the larger the chunk the faster the I/O tends to be.
+  std::size_t nPartialDistances{std::min(nDistances, 2147483647ul)};
+
+  if (output_to_vector) {
+    // need to store all distances on host
+    distances.resize(nDistances);
+  } else {
+    // only need to store a single block of partial distances on host
+    distances.resize(nPartialDistances);
+  }
+  // allocate memory for genes on device
+  // one gene is 30k chars -> 15k bytes in dense format
+  // so 1 million samples -> 15GB
+  auto *genes{CheckedCudaMalloc<GeneBlock>(nSamples * geneBlocksPerSample)};
+  // copy genes to device
+  for (std::size_t i = 0; i < data.size(); ++i) {
+    CheckedCopyToDevice(genes + i * geneBlocksPerSample, data[i]);
+  }
+
+  // allocate memory for partial distances matrix on device
+  auto *partial_distances{CheckedCudaMalloc<DistIntType>(nPartialDistances)};
+  // keep track of how many distance elements are available to write to disk
+  std::size_t available_distance_elements{0};
+  // keep track of where in the full distances array these elements should go
+  std::size_t distances_offset{0};
+  // GPU timing
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  // I/O timing
+  auto timing_io = std::chrono::high_resolution_clock::now();
+  while (distances_offset + available_distance_elements < nDistances) {
+    // use nThreadsPerBlock in x dim of block
+    uint nThreadsPerBlock{128};
+    dim3 threadsPerBlock{nThreadsPerBlock, 1, 1};
+    // use up to nPartialDistances blocks, one block per distance element
+    dim3 numBlocks{static_cast<uint>(std::min(nPartialDistances,
+                                              nDistances - distances_offset)),
+                   1, 1};
+    cudaEventRecord(start);
+    timing_io = std::chrono::high_resolution_clock::now();
+    // launch a kernel with shared memory of size int[nThreadsPerBlock] -
+    // this call returns immediately and the kernel runs asynchronously on the
+    // GPU
+    Dist<<<numBlocks, threadsPerBlock, nThreadsPerBlock * sizeof(int)>>>(
+        partial_distances, genes, distances_offset, geneBlocksPerSample);
+    cudaEventRecord(stop);
+    if (auto err = cudaGetLastError(); err != cudaSuccess) {
+      throw std::runtime_error(cudaGetErrorString(err));
+    }
+
+    if (!output_to_vector) {
+      // write previous kernel's output (if any) to disk using CPU while the new
+      // kernel is running on the GPU to interleave I/O with computation
+      partial_write_lower_triangular(filename, distances, distances_offset,
+                                     available_distance_elements);
+    }
+    timing_io_ms += std::chrono::duration_cast<std::chrono::milliseconds>(
+                        std::chrono::high_resolution_clock::now() - timing_io)
+                        .count();
+    // copy partial_distances from GPU to distances vector on HOST - this call
+    // waits until the kernel has completed before copying the memory.
+    CheckedCopyToHost(distances.data() +
+                          (output_to_vector ? distances_offset : 0),
+                      partial_distances, numBlocks.x);
+    cudaEventSynchronize(stop);
+    float milliseconds = 0;
+    cudaEventElapsedTime(&milliseconds, start, stop);
+    timing_gpu_ms += static_cast<std::size_t>(milliseconds);
+    distances_offset += available_distance_elements;
+    available_distance_elements = numBlocks.x;
+  }
+  if (!output_to_vector) {
+    // write final kernel's output to disk
+    timing_io = std::chrono::high_resolution_clock::now();
+    partial_write_lower_triangular(filename, distances, distances_offset,
+                                   available_distance_elements);
+    timing_io_ms += std::chrono::duration_cast<std::chrono::milliseconds>(
+                        std::chrono::high_resolution_clock::now() - timing_io)
+                        .count();
+    distances.clear();
+  }
+  // free data on gpu
+  cudaFree(genes);
+  cudaFree(partial_distances);
+  std::cout << "# hammingdist :: ...distance calculation completed in "
+            << std::chrono::duration_cast<std::chrono::milliseconds>(
+                   std::chrono::high_resolution_clock::now() - timing0)
+                   .count()
+            << " ms (GPU: " << timing_gpu_ms << " / IO: " << timing_io_ms
+            << ")." << std::endl;
+  return distances;
+}
+
+std::vector<uint8_t>
+distances_cuda_8bit(const std::vector<std::vector<GeneBlock>> &data) {
+  return distances_cuda<uint8_t>(data, {});
+}
+
+std::vector<uint16_t>
+distances_cuda_16bit(const std::vector<std::vector<GeneBlock>> &data) {
+  return distances_cuda<uint16_t>(data, {});
+}
+
+void distances_cuda_to_lower_triangular(
+    const std::vector<std::vector<GeneBlock>> &data,
+    const std::string &filename) {
+  distances_cuda<uint16_t>(data, filename);
+}
+
+int distance_cuda(const std::vector<GeneBlock> &a,
+                  const std::vector<GeneBlock> &b) {
+  // wrapper for testing cuda kernel with existing distance API
+  std::vector<std::vector<GeneBlock>> data{a, b};
+  return distances_cuda<int>(data, {})[0];
+}
+
+bool distance_cuda_have_device() {
+  int nDevices = 0;
+  if (cudaError_t err{cudaGetDeviceCount(&nDevices)}; err != cudaSuccess) {
+    return false;
+  }
+  return nDevices > 0;
+}
+
+} // namespace hamming
diff --git a/src/distance_cuda_bench.cc b/src/distance_cuda_bench.cc
new file mode 100644
index 0000000..ae6ab5c
--- /dev/null
+++ b/src/distance_cuda_bench.cc
@@ -0,0 +1,19 @@
+#include "bench.hh"
+#include "hamming/distance_cuda.hh"
+#include "hamming/hamming.hh"
+
+using namespace hamming;
+
+static void bench_distance_cuda(benchmark::State &state) {
+  std::mt19937 gen(12345);
+  int64_t n{state.range(0)};
+  auto s1{from_string(make_string(n, gen))};
+  auto s2{from_string(make_string(n, gen))};
+  int d{0};
+  for (auto _ : state) {
+    d += distance_cuda(s1, s2);
+  }
+  state.SetComplexityN(n);
+}
+
+BENCHMARK(bench_distance_cuda)->Range(4096, 4194304)->Complexity();
diff --git a/src/distance_cuda_t.cc b/src/distance_cuda_t.cc
new file mode 100644
index 0000000..dbb5b55
--- /dev/null
+++ b/src/distance_cuda_t.cc
@@ -0,0 +1,62 @@
+#include "hamming/distance_cuda.hh"
+#include "tests.hh"
+
+using namespace hamming;
+
+TEST_CASE("distance_cuda() returns all return zero for identical vectors",
+          "[impl][distance][gpu]") {
+  std::mt19937 gen(12345);
+  for (int n :
+       {1,       2,      3,      4,      5,      6,      7,      8,
+        9,       10,     11,     12,     13,     14,     15,     16,
+        17,      18,     19,     20,     31,     32,     33,     63,
+        64,      65,     127,    128,    129,    254,    255,    256,
+        256,     511,    512,    513,    1023,   1024,   1025,   2047,
+        2048,    2049,   4095,   4096,   4097,   8191,   8192,   8193,
+        32767,   32768,  32769,  65535,  65536,  65537,  131071, 131072,
+        131073,  262143, 262144, 262145, 524287, 524288, 524289, 1048575,
+        1048576, 1048577}) {
+    CAPTURE(n);
+    auto g1{make_gene_vector(n, gen)};
+    REQUIRE(distance_cuda(g1, g1) == 0);
+  }
+}
+
+TEST_CASE("distance_cuda() all return n for n A's and n G's",
+          "[impl][distance][gpu]") {
+  for (int n :
+       {1,       2,      3,      4,      5,      6,      7,      8,
+        9,       10,     11,     12,     13,     14,     15,     16,
+        17,      18,     19,     20,     31,     32,     33,     63,
+        64,      65,     127,    128,    129,    254,    255,    256,
+        256,     511,    512,    513,    1023,   1024,   1025,   2047,
+        2048,    2049,   4095,   4096,   4097,   8191,   8192,   8193,
+        32767,   32768,  32769,  65535,  65536,  65537,  131071, 131072,
+        131073,  262143, 262144, 262145, 524287, 524288, 524289, 1048575,
+        1048576, 1048577}) {
+    CAPTURE(n);
+    auto g1 = from_string(std::string(n, 'A'));
+    auto g2 = from_string(std::string(n, 'G'));
+    REQUIRE(distance_cuda(g1, g2) == n);
+  }
+}
+
+TEST_CASE("distance_cuda() returns same as distance_cpp() for random vectors",
+          "[impl][distance][gpu]") {
+  std::mt19937 gen(123451);
+  for (int n :
+       {1,       2,      3,      4,      5,      6,      7,      8,
+        9,       10,     11,     12,     13,     14,     15,     16,
+        17,      18,     19,     20,     31,     32,     33,     63,
+        64,      65,     127,    128,    129,    254,    255,    256,
+        256,     511,    512,    513,    1023,   1024,   1025,   2047,
+        2048,    2049,   4095,   4096,   4097,   8191,   8192,   8193,
+        32767,   32768,  32769,  65535,  65536,  65537,  131071, 131072,
+        131073,  262143, 262144, 262145, 524287, 524288, 524289, 1048575,
+        1048576, 1048577}) {
+    CAPTURE(n);
+    auto g1{make_gene_vector(n, gen)};
+    auto g2{make_gene_vector(n, gen)};
+    REQUIRE(distance_cuda(g1, g2) == distance_cpp(g1, g2));
+  }
+}
diff --git a/src/hamming.cc b/src/hamming.cc
index ff3c6aa..75b26a7 100644
--- a/src/hamming.cc
+++ b/src/hamming.cc
@@ -7,34 +7,45 @@
 #include <fstream>
 #include <iostream>
 #include <limits>
-#include <sstream>
 #include <string>
 #include <unordered_map>
 #include <vector>
 
 namespace hamming {
 
-DataSet<DefaultDistIntType> from_stringlist(std::vector<std::string> &data) {
-  return DataSet<DefaultDistIntType>(data);
+DataSet<DefaultDistIntType> from_stringlist(std::vector<std::string> &data,
+                                            bool include_x, bool use_gpu) {
+  return DataSet<DefaultDistIntType>(data, include_x, false, {}, use_gpu);
 }
 
 DataSet<DefaultDistIntType> from_csv(const std::string &filename) {
   return DataSet<DefaultDistIntType>(filename);
 }
 
-DataSet<DefaultDistIntType> from_lower_triangular(const std::string &filename) {
-  std::vector<DefaultDistIntType> distances;
-  std::ifstream stream(filename);
-  std::string line;
-  while (std::getline(stream, line)) {
-    std::istringstream s(line);
-    std::string d;
-    while (s.good()) {
-      std::getline(s, d, ',');
-      distances.push_back(safe_int_cast<DefaultDistIntType>(std::stoi(d)));
-    }
+void from_fasta_to_lower_triangular(const std::string &input_filename,
+                                    const std::string &output_filename,
+                                    bool remove_duplicates, std::size_t n,
+                                    bool use_gpu) {
+  if (use_gpu) {
+    std::cout << "# hammingdist :: Using GPU..." << std::endl;
+  }
+  auto start_time = std::chrono::high_resolution_clock::now();
+  auto [data, sequence_indices] =
+      read_fasta(input_filename, remove_duplicates, n);
+  auto dense_data = to_dense_data(data);
+  std::cout << "# hammingdist :: ...pre-processing completed in "
+            << std::chrono::duration_cast<std::chrono::milliseconds>(
+                   std::chrono::high_resolution_clock::now() - start_time)
+                   .count()
+            << " ms..." << std::endl;
+#ifdef HAMMING_WITH_CUDA
+  if (use_gpu) {
+    distances_cuda_to_lower_triangular(dense_data, output_filename);
+    return;
   }
-  return DataSet(std::move(distances));
+#endif
+  throw std::runtime_error(
+      "from_fasta_to_lower_triangular is currently only available on GPU");
 }
 
 ReferenceDistIntType distance(const std::string &seq0, const std::string &seq1,
diff --git a/src/hamming_bench.cc b/src/hamming_bench.cc
index 2f4ff28..148b845 100644
--- a/src/hamming_bench.cc
+++ b/src/hamming_bench.cc
@@ -7,13 +7,15 @@
 
 using namespace hamming;
 
+constexpr int64_t sampleLength{30000};
+
 static void bench_from_stringlist(benchmark::State &state) {
 #ifdef HAMMING_WITH_OPENMP
   omp_set_num_threads(1);
 #endif
   std::mt19937 gen(12345);
   int64_t n{state.range(0)};
-  auto v{make_stringlist(n, gen)};
+  auto v{make_stringlist(sampleLength, n, gen)};
   for (auto _ : state) {
     from_stringlist(v);
   }
@@ -24,17 +26,40 @@ static void bench_from_stringlist(benchmark::State &state) {
 static void bench_from_stringlist_omp(benchmark::State &state) {
   omp_set_num_threads(state.range(0));
   std::mt19937 gen(12345);
-  auto v{make_stringlist(8192, gen)};
+  auto v{make_stringlist(sampleLength, 1024, gen)};
   for (auto _ : state) {
     from_stringlist(v);
   }
 }
 #endif
 
+static void bench_from_stringlist_gpu(benchmark::State &state) {
+  std::mt19937 gen(12345);
+  int64_t n{state.range(0)};
+  auto v{make_stringlist(sampleLength, n, gen)};
+  for (auto _ : state) {
+    from_stringlist(v, false, true);
+  }
+  state.SetComplexityN(n);
+}
+
+static void bench_from_fasta_to_lower_triangular_gpu(benchmark::State &state) {
+  std::mt19937 gen(12345);
+  int64_t n{state.range(0)};
+  std::string fasta_file{benchmark_tmp_input_file};
+  std::string lt_file{benchmark_tmp_output_file};
+  auto reference_seq{make_string(sampleLength, gen, true)};
+  write_fasta(fasta_file, reference_seq, state.range(0), gen);
+  for (auto _ : state) {
+    from_fasta_to_lower_triangular(fasta_file, lt_file, false);
+  }
+  state.SetComplexityN(n);
+}
+
 static void bench_fasta_reference_distances(benchmark::State &state) {
   std::mt19937 gen(12345);
-  std::string fasta_file{"fasta.txt"};
-  auto reference_seq{make_string(30000, gen, true)};
+  std::string fasta_file{benchmark_tmp_input_file};
+  auto reference_seq{make_string(sampleLength, gen, true)};
   write_fasta(fasta_file, reference_seq, state.range(0), gen);
   std::vector<ReferenceDistIntType> distances;
   for (auto _ : state) {
@@ -44,7 +69,7 @@ static void bench_fasta_reference_distances(benchmark::State &state) {
 
 BENCHMARK(bench_from_stringlist)
     ->RangeMultiplier(2)
-    ->Range(128, 8192)
+    ->Range(16, 1024)
     ->Complexity();
 #ifdef HAMMING_WITH_OPENMP
 BENCHMARK(bench_from_stringlist_omp)
@@ -55,7 +80,17 @@ BENCHMARK(bench_from_stringlist_omp)
     ->Arg(12)
     ->Arg(24);
 #endif
+#ifdef HAMMING_WITH_CUDA
+BENCHMARK(bench_from_stringlist_gpu)
+    ->RangeMultiplier(2)
+    ->Range(16, 8192)
+    ->Complexity();
+BENCHMARK(bench_from_fasta_to_lower_triangular_gpu)
+    ->RangeMultiplier(2)
+    ->Range(16, 16384)
+    ->Complexity();
+#endif
 BENCHMARK(bench_fasta_reference_distances)
     ->RangeMultiplier(2)
-    ->Range(16, 32384)
+    ->Range(16, 1024)
     ->Complexity();
diff --git a/src/hamming_impl.cc b/src/hamming_impl.cc
index 2ba768f..e2109c9 100644
--- a/src/hamming_impl.cc
+++ b/src/hamming_impl.cc
@@ -1,6 +1,23 @@
 #include "hamming/hamming_impl.hh"
 #include <algorithm>
+#if !(defined(__aarch64__) || defined(_M_ARM64))
+#include <cpuinfo_x86.h>
+#endif
 #include <stdexcept>
+#include <unordered_map>
+#ifdef HAMMING_WITH_SSE2
+#include "hamming/distance_sse2.hh"
+#endif
+#ifdef HAMMING_WITH_AVX2
+#include "hamming/distance_avx2.hh"
+#endif
+#ifdef HAMMING_WITH_AVX512
+#include "hamming/distance_avx512.hh"
+#endif
+#ifdef HAMMING_WITH_NEON
+#include "hamming/distance_neon.hh"
+#endif
+
 namespace hamming {
 
 // bit meaning:
@@ -26,6 +43,40 @@ std::array<GeneBlock, 256> lookupTable(bool include_x) {
   return lookup;
 }
 
+distance_func_ptr get_fastest_supported_distance_func() {
+  std::string simd_str = "no";
+  distance_func_ptr distance_func{distance_cpp};
+#if defined(__aarch64__) || defined(_M_ARM64)
+#ifdef HAMMING_WITH_NEON
+  distance_func = distance_neon;
+  simd_str = "NEON";
+#endif
+#else
+  const auto features = cpu_features::GetX86Info().features;
+#ifdef HAMMING_WITH_SSE2
+  if (features.sse2) {
+    distance_func = distance_sse2;
+    simd_str = "SSE2";
+  }
+#endif
+#ifdef HAMMING_WITH_AVX2
+  if (features.avx2) {
+    distance_func = distance_avx2;
+    simd_str = "AVX2";
+  }
+#endif
+#ifdef HAMMING_WITH_AVX512
+  if (features.avx512bw) {
+    distance_func = distance_avx512;
+    simd_str = "AVX512";
+  }
+#endif
+#endif
+  std::cout << "# hammingdist :: Using CPU with " << simd_str
+            << " SIMD extensions..." << std::endl;
+  return distance_func;
+}
+
 void validate_data(const std::vector<std::string> &data) {
   if (data.empty() || data[0].empty()) {
     throw std::runtime_error("Error: Empty sequence");
@@ -143,11 +194,55 @@ to_dense_data(const std::vector<std::string> &data) {
   return dense;
 }
 
+std::pair<std::vector<std::string>, std::vector<std::size_t>>
+read_fasta(const std::string &filename, bool remove_duplicates, std::size_t n) {
+  std::pair<std::vector<std::string>, std::vector<std::size_t>>
+      data_and_sequence_indices;
+  auto &[data, sequence_indices] = data_and_sequence_indices;
+  data.reserve(n);
+  if (n == 0) {
+    n = std::numeric_limits<std::size_t>::max();
+    data.reserve(65536);
+  }
+  std::unordered_map<std::string, std::size_t> map_seq_to_index;
+  // Initializing the stream
+  std::ifstream stream(filename);
+  std::size_t count = 0;
+  std::size_t count_unique = 0;
+  std::string line;
+  // skip first header
+  std::getline(stream, line);
+  while (count < n && !stream.eof()) {
+    std::string seq{};
+    while (std::getline(stream, line) && line[0] != '>') {
+      seq.append(line);
+    }
+    if (remove_duplicates) {
+      auto result = map_seq_to_index.emplace(std::move(seq), count_unique);
+      if (result.second) {
+        ++count_unique;
+      }
+      sequence_indices.push_back(result.first->second);
+    } else {
+      data.push_back(std::move(seq));
+    }
+    ++count;
+  }
+  if (remove_duplicates) {
+    // copy each unique sequence to the vector of strings
+    data.resize(count_unique);
+    for (auto &key_value_pair : map_seq_to_index) {
+      data[key_value_pair.second] = key_value_pair.first;
+    }
+  }
+  return data_and_sequence_indices;
+}
+
 std::vector<GeneBlock> from_string(const std::string &str) {
   alignas(16) std::vector<GeneBlock> r;
   auto lookup = lookupTable();
   std::size_t n_full_blocks{str.size() / 2};
-  r.reserve(1 + n_full_blocks);
+  r.reserve(1 + n_full_blocks + 3);
   auto iter_str = str.cbegin();
   for (std::size_t i_block = 0; i_block < n_full_blocks; ++i_block) {
     r.push_back(lookup[*iter_str] & mask_gene0);
@@ -160,6 +255,11 @@ std::vector<GeneBlock> from_string(const std::string &str) {
     r.push_back(lookup[*iter_str] & mask_gene0);
     r.back() |= (lookup['-'] & mask_gene1);
   }
+  // pad to ensure 64-bit alignment
+  while (8 * (r.size() / 8) != r.size()) {
+    r.push_back(lookup['-']);
+  }
+
   return r;
 }
 
diff --git a/src/hamming_t.cc b/src/hamming_t.cc
index a88d6d9..0b94e61 100644
--- a/src/hamming_t.cc
+++ b/src/hamming_t.cc
@@ -9,6 +9,12 @@ using namespace hamming;
 static constexpr std::array<char, 4> valid_chars{'A', 'C', 'G', 'T'};
 static constexpr std::array<char, 6> invalid_chars{' ', 'N', '*',
                                                    '?', 'a', '.'};
+static std::vector<bool> valid_use_gpu_values() {
+  if (cuda_gpu_available()) {
+    return {false, true};
+  }
+  return {false};
+}
 
 static int dist1(char c1, char c2) {
   std::vector<std::string> v{std::string{c1}, std::string{c2}};
@@ -16,7 +22,7 @@ static int dist1(char c1, char c2) {
 }
 
 static int dist2(char c1, char c2) {
-  return distance(std::string{c1}, std::string{c2});
+  return static_cast<int>(distance(std::string{c1}, std::string{c2}));
 }
 
 TEST_CASE("distance between two equal valid characters is 0", "[distance]") {
@@ -130,17 +136,26 @@ TEST_CASE("from_fasta single line sequences", "[hamming]") {
   of << ">seq1\n";
   of << "ACGTGTCGTTTCGACGAGTCG\n";
   of.close();
-  for (bool remove_duplicates : {false, true}) {
-    for (bool include_x : {false, true}) {
-      CAPTURE(include_x);
-      CAPTURE(remove_duplicates);
-      for (int n : {0, 2, 3, 8}) {
-        auto d =
-            from_fasta<uint8_t>(tmp_file_name, include_x, remove_duplicates, n);
-        REQUIRE(d[{0, 0}] == 0);
-        REQUIRE(d[{0, 1}] == 2);
-        REQUIRE(d[{1, 0}] == 2);
-        REQUIRE(d[{1, 1}] == 0);
+  for (auto use_gpu : valid_use_gpu_values()) {
+    for (bool remove_duplicates : {false, true}) {
+      for (bool include_x : {false, true}) {
+        CAPTURE(include_x);
+        CAPTURE(remove_duplicates);
+        CAPTURE(use_gpu);
+        if (use_gpu && include_x) {
+          // include_x cannot be used with use_gpu
+          REQUIRE_THROWS(from_fasta<uint8_t>(tmp_file_name, include_x,
+                                             remove_duplicates, 1, use_gpu));
+        } else {
+          for (int n : {0, 2, 3, 8}) {
+            auto d = from_fasta<uint8_t>(tmp_file_name, include_x,
+                                         remove_duplicates, n, use_gpu);
+            REQUIRE(d[{0, 0}] == 0);
+            REQUIRE(d[{0, 1}] == 2);
+            REQUIRE(d[{1, 0}] == 2);
+            REQUIRE(d[{1, 1}] == 0);
+          }
+        }
       }
     }
   }
@@ -166,21 +181,31 @@ TEST_CASE("from_fasta single line sequences with duplicates", "[hamming]") {
   of << "ACGTGTCGTATCGACGTGTCG\n";
   of.close();
   std::vector<std::size_t> sequence_indices{0, 1, 1, 1, 0, 2};
-  for (int n : {0, 6, 22}) {
-    for (bool include_x : {false, true}) {
-      CAPTURE(include_x);
-      auto d = from_fasta<uint8_t>(tmp_file_name, include_x, true, n);
-      REQUIRE(d.nsamples == 3);
-      REQUIRE(d.sequence_indices == sequence_indices);
-      REQUIRE(d[{0, 0}] == 0);
-      REQUIRE(d[{0, 1}] == 2);
-      REQUIRE(d[{0, 2}] == 1);
-      REQUIRE(d[{1, 0}] == 2);
-      REQUIRE(d[{1, 1}] == 0);
-      REQUIRE(d[{1, 2}] == 2);
-      REQUIRE(d[{2, 0}] == 1);
-      REQUIRE(d[{2, 1}] == 2);
-      REQUIRE(d[{2, 2}] == 0);
+  for (auto use_gpu : valid_use_gpu_values()) {
+    for (int n : {0, 6, 22}) {
+      for (bool include_x : {false, true}) {
+        CAPTURE(include_x);
+        CAPTURE(use_gpu);
+        if (use_gpu && include_x) {
+          // include_x cannot be used with use_gpu
+          REQUIRE_THROWS(
+              from_fasta<uint8_t>(tmp_file_name, include_x, true, n, use_gpu));
+        } else {
+          auto d =
+              from_fasta<uint8_t>(tmp_file_name, include_x, true, n, use_gpu);
+          REQUIRE(d.nsamples == 3);
+          REQUIRE(d.sequence_indices == sequence_indices);
+          REQUIRE(d[{0, 0}] == 0);
+          REQUIRE(d[{0, 1}] == 2);
+          REQUIRE(d[{0, 2}] == 1);
+          REQUIRE(d[{1, 0}] == 2);
+          REQUIRE(d[{1, 1}] == 0);
+          REQUIRE(d[{1, 2}] == 2);
+          REQUIRE(d[{2, 0}] == 1);
+          REQUIRE(d[{2, 1}] == 2);
+          REQUIRE(d[{2, 2}] == 0);
+        }
+      }
     }
   }
   std::remove(tmp_file_name);
@@ -200,16 +225,23 @@ TEST_CASE("from_fasta multi-line sequences", "[hamming]") {
   of << "ACGTGTCGTGTCGACGTGTCG\n";
   of << "ACGTGTCGTGTCG\n";
   of.close();
-  for (bool remove_duplicates : {false, true}) {
-    for (bool include_x : {false, true}) {
-      CAPTURE(include_x);
-      for (int n : {0, 2, 3, 8}) {
-        auto d =
-            from_fasta<uint8_t>(tmp_file_name, include_x, remove_duplicates, 2);
-        REQUIRE(d[{0, 0}] == 0);
-        REQUIRE(d[{0, 1}] == 2);
-        REQUIRE(d[{1, 0}] == 2);
-        REQUIRE(d[{1, 1}] == 0);
+  for (auto use_gpu : valid_use_gpu_values()) {
+    for (bool remove_duplicates : {false, true}) {
+      for (bool include_x : {false, true}) {
+        CAPTURE(include_x);
+        CAPTURE(use_gpu);
+        if (use_gpu && include_x) {
+          // include_x cannot be used with use_gpu
+          REQUIRE_THROWS(from_fasta<uint8_t>(tmp_file_name, include_x,
+                                             remove_duplicates, 2, use_gpu));
+        } else {
+          auto d = from_fasta<uint8_t>(tmp_file_name, include_x,
+                                       remove_duplicates, 2, use_gpu);
+          REQUIRE(d[{0, 0}] == 0);
+          REQUIRE(d[{0, 1}] == 2);
+          REQUIRE(d[{1, 0}] == 2);
+          REQUIRE(d[{1, 1}] == 0);
+        }
       }
     }
   }
@@ -242,7 +274,7 @@ TEST_CASE("invalid input data: inconsistent sequence lengths", "[invalid]") {
 
 TEST_CASE("from_csv reproduces correct data", "[hamming]") {
   std::mt19937 gen(12345);
-  std::vector<std::string> data(10);
+  std::vector<std::string> data(107);
   for (auto &d : data)
     d = make_test_string(201, gen);
 
@@ -266,17 +298,21 @@ TEMPLATE_TEST_CASE("distance integer saturates instead of overflowing",
   auto n_max{static_cast<std::size_t>(std::numeric_limits<TestType>::max())};
   std::mt19937 gen(12345);
   std::vector<std::string> data(2);
-  for (auto n : {n_max, n_max + 1, n_max + 99}) {
-    CAPTURE(n);
-    data[0] = std::string(n, 'A');
-    data[1] = std::string(n, 'T');
-    DataSet<TestType> dataSet(data);
-    REQUIRE(dataSet[{0, 1}] == n_max);
-    REQUIRE(dataSet[{1, 0}] == n_max);
+  for (auto use_gpu : valid_use_gpu_values()) {
+    for (auto n : {n_max, n_max + 1, n_max + 99}) {
+      CAPTURE(use_gpu);
+      CAPTURE(n);
+      data[0] = std::string(n, 'A');
+      data[1] = std::string(n, 'T');
+      DataSet<TestType> dataSet(data, false, false, {}, use_gpu);
+      REQUIRE(dataSet[{0, 1}] == n_max);
+      REQUIRE(dataSet[{1, 0}] == n_max);
+    }
   }
 }
 
-TEST_CASE("from_lower_triangular reproduces correct data", "[hamming]") {
+TEMPLATE_TEST_CASE("from_lower_triangular reproduces correct data", "[hamming]",
+                   uint8_t, uint16_t) {
   std::mt19937 gen(12345);
   char tmp_file_name[L_tmpnam];
   REQUIRE(std::tmpnam(tmp_file_name) != nullptr);
@@ -286,11 +322,11 @@ TEST_CASE("from_lower_triangular reproduces correct data", "[hamming]") {
     for (auto &d : data)
       d = make_test_string(24, gen);
 
-    DataSet<uint8_t> ref(data);
+    DataSet<TestType> ref(data);
     REQUIRE(ref.nsamples == n);
     ref.dump_lower_triangular(std::string(tmp_file_name));
 
-    auto restore = from_lower_triangular(std::string(tmp_file_name));
+    auto restore = from_lower_triangular<TestType>(std::string(tmp_file_name));
     REQUIRE(ref.nsamples == restore.nsamples);
     for (std::size_t i = 0; i < ref.nsamples; ++i) {
       for (std::size_t j = 0; j < ref.nsamples; ++j) {
@@ -300,3 +336,101 @@ TEST_CASE("from_lower_triangular reproduces correct data", "[hamming]") {
   }
   std::remove(tmp_file_name);
 }
+
+TEST_CASE("from_stringlist GPU and CPU implementations give consistent results",
+          "[hamming][gpu]") {
+  if (!cuda_gpu_available()) {
+    SKIP("No CUDA gpu available");
+  }
+  std::mt19937 gen(12345);
+  for (bool include_x : {false}) {
+    for (int n : {1, 5, 13, 32, 89, 185, 497, 1092}) {
+      for (int n_samples : {2, 3, 4, 7, 11, 32, 33, 257, 689}) {
+        std::vector<std::string> stringlist;
+        stringlist.reserve(n_samples);
+        for (std::size_t i = 0; i < n_samples; ++i) {
+          stringlist.push_back(make_test_string(n, gen, include_x));
+        }
+        CAPTURE(include_x);
+        auto d_cpu{from_stringlist(stringlist, include_x, false)};
+        auto d_gpu{from_stringlist(stringlist, include_x, true)};
+        for (std::size_t i = 0; i < n_samples; ++i) {
+          for (std::size_t j = 0; j < n_samples; ++j) {
+            REQUIRE(d_cpu[{i, j}] == d_gpu[{i, j}]);
+          }
+        }
+      }
+    }
+  }
+}
+
+TEMPLATE_TEST_CASE(
+    "from_fasta GPU and CPU implementations give consistent results",
+    "[hamming][gpu]", uint8_t, uint16_t) {
+  if (!cuda_gpu_available()) {
+    SKIP("No CUDA gpu available");
+  }
+  char tmp_file_name[L_tmpnam];
+  std::mt19937 gen(12345);
+  REQUIRE(std::tmpnam(tmp_file_name) != nullptr);
+  CAPTURE(tmp_file_name);
+  for (bool remove_duplicates : {false, true}) {
+    for (bool include_x : {false}) {
+      for (int n : {17, 88, 381, 1023}) {
+        for (int n_samples : {2, 3, 4, 7, 11, 127, 128, 255, 256, 257, 703}) {
+          write_test_fasta(tmp_file_name, n, n_samples, gen, include_x);
+          CAPTURE(include_x);
+          auto d_cpu{from_fasta<TestType>(tmp_file_name, include_x,
+                                          remove_duplicates, 0, false)};
+          auto d_gpu{from_fasta<TestType>(tmp_file_name, include_x,
+                                          remove_duplicates, 0, true)};
+          for (std::size_t i = 0; i < n_samples; ++i) {
+            for (std::size_t j = 0; j < n_samples; ++j) {
+              REQUIRE(d_cpu[{i, j}] == d_gpu[{i, j}]);
+            }
+          }
+        }
+      }
+    }
+  }
+  std::remove(tmp_file_name);
+}
+
+TEST_CASE("from_fasta_to_lower_triangular GPU consistent with CPU from_fasta",
+          "[hamming][gpu]") {
+  if (!cuda_gpu_available()) {
+    SKIP("No CUDA gpu available");
+  }
+  std::mt19937 gen(12345);
+  char tmp_fasta_file_name[L_tmpnam];
+  REQUIRE(std::tmpnam(tmp_fasta_file_name) != nullptr);
+  CAPTURE(tmp_fasta_file_name);
+  char tmp_lt_file_name[L_tmpnam];
+  REQUIRE(std::tmpnam(tmp_lt_file_name) != nullptr);
+  CAPTURE(tmp_lt_file_name);
+  for (bool remove_duplicates : {false, true}) {
+    for (int n : {17, 88, 381, 1023}) {
+      for (int n_samples :
+           {2, 3, 4, 7, 11, 127, 128, 255, 256, 257, 703, 1012}) {
+        CAPTURE(remove_duplicates);
+        CAPTURE(n);
+        CAPTURE(n_samples);
+        write_test_fasta(tmp_fasta_file_name, n, n_samples, gen, false);
+        auto d_cpu{from_fasta<uint16_t>(tmp_fasta_file_name, false,
+                                        remove_duplicates, 0, false)};
+        from_fasta_to_lower_triangular(tmp_fasta_file_name, tmp_lt_file_name,
+                                       remove_duplicates, 0, true);
+        auto d_gpu{from_lower_triangular<uint16_t>(tmp_lt_file_name)};
+        for (std::size_t i = 0; i < n_samples; ++i) {
+          for (std::size_t j = 0; j < n_samples; ++j) {
+            CAPTURE(i);
+            CAPTURE(j);
+            REQUIRE(d_cpu[{i, j}] == d_gpu[{i, j}]);
+          }
+        }
+      }
+    }
+  }
+  std::remove(tmp_fasta_file_name);
+  std::remove(tmp_lt_file_name);
+}
diff --git a/src/hamming_utils.cc b/src/hamming_utils.cc
new file mode 100644
index 0000000..312c032
--- /dev/null
+++ b/src/hamming_utils.cc
@@ -0,0 +1 @@
+#include "hamming/hamming_utils.hh"
diff --git a/src/hamming_utils_bench.cc b/src/hamming_utils_bench.cc
new file mode 100644
index 0000000..7fae605
--- /dev/null
+++ b/src/hamming_utils_bench.cc
@@ -0,0 +1,78 @@
+#include "bench.hh"
+#include "hamming/hamming.hh"
+#include "hamming/hamming_utils.hh"
+#ifdef HAMMING_WITH_OPENMP
+#include <omp.h>
+#endif
+
+using namespace hamming;
+
+static void bench_distance_cpp(benchmark::State &state) {
+#ifdef HAMMING_WITH_OPENMP
+  omp_set_num_threads(1);
+#endif
+  std::mt19937 gen(12345);
+  int64_t n{state.range(0)};
+  auto s1{from_string(make_string(n, gen))};
+  auto s2{from_string(make_string(n, gen))};
+  int d{0};
+  for (auto _ : state) {
+    d += distance_cpp(s1, s2);
+  }
+  state.SetComplexityN(n);
+}
+
+static void bench_distance_sparse(benchmark::State &state) {
+#ifdef HAMMING_WITH_OPENMP
+  omp_set_num_threads(1);
+#endif
+  std::mt19937 gen(12345);
+  int64_t n{state.range(0)};
+  auto s1{make_string(n, gen, false)};
+  auto s2{s1};
+  // make ~0.5% of s2 elements differ from s1
+  randomize_n(s2, n / 200, gen);
+  auto sparse = to_sparse_data({s1, s2}, false);
+  int d{0};
+  for (auto _ : state) {
+    d += distance_sparse(sparse[0], sparse[1]);
+  }
+  state.SetComplexityN(n);
+}
+
+static void bench_partial_write_lower_triangular(benchmark::State &state) {
+  int64_t n{state.range(0)};
+  std::mt19937 gen(12345);
+  auto v{make_distances<uint16_t>(n, gen)};
+  for (auto _ : state) {
+    partial_write_lower_triangular(benchmark_tmp_output_file, v, 0, v.size());
+  }
+  state.SetComplexityN(n);
+}
+
+#ifdef HAMMING_WITH_OPENMP
+static void bench_partial_write_lower_triangular_omp(benchmark::State &state) {
+  omp_set_num_threads(state.range(0));
+  std::mt19937 gen(12345);
+  auto v{make_distances<uint16_t>(16384, gen)};
+  for (auto _ : state) {
+    partial_write_lower_triangular(benchmark_tmp_output_file, v, 0, v.size());
+  }
+}
+#endif
+
+BENCHMARK(bench_distance_sparse)->Range(4096, 4194304)->Complexity();
+
+BENCHMARK(bench_distance_cpp)->Range(4096, 4194304)->Complexity();
+
+BENCHMARK(bench_partial_write_lower_triangular)->Range(2, 32768)->Complexity();
+
+#ifdef HAMMING_WITH_OPENMP
+BENCHMARK(bench_partial_write_lower_triangular_omp)
+    ->Arg(1)
+    ->Arg(2)
+    ->Arg(4)
+    ->Arg(8)
+    ->Arg(12)
+    ->Arg(24);
+#endif
diff --git a/src/tests.cc b/src/tests.cc
index 66022f4..f933e6e 100644
--- a/src/tests.cc
+++ b/src/tests.cc
@@ -24,4 +24,13 @@ std::vector<GeneBlock> make_gene_vector(int n, std::mt19937 &gen,
   return from_string(make_test_string(n, gen, include_x));
 }
 
+void write_test_fasta(const std::string &filename, int n, std::size_t n_seq,
+                      std::mt19937 &gen, bool include_x) {
+  std::ofstream fs;
+  fs.open(filename);
+  for (std::size_t i = 0; i < n_seq; ++i) {
+    fs << ">seq" << i << "\n" << make_test_string(n, gen, include_x) << "\n";
+  }
+  fs.close();
+}
 } // namespace hamming
diff --git a/src/tests.hh b/src/tests.hh
index 9999496..88284c9 100644
--- a/src/tests.hh
+++ b/src/tests.hh
@@ -1,9 +1,10 @@
-#ifndef HAMMING_TESTS_HH
-#define HAMMING_TESTS_HH
+#pragma once
 
 #include "hamming/hamming.hh"
 #include "hamming/hamming_impl.hh"
-#include <catch2/catch.hpp>
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/matchers/catch_matchers.hpp>
 #include <random>
 #include <vector>
 
@@ -14,6 +15,7 @@ std::string make_test_string(int n, std::mt19937 &gen, bool include_x = false);
 std::vector<GeneBlock> make_gene_vector(int n, std::mt19937 &gen,
                                         bool include_x = false);
 
-} // namespace hamming
+void write_test_fasta(const std::string &filename, int n, std::size_t n_seq,
+                      std::mt19937 &gen, bool include_x = false);
 
-#endif
+} // namespace hamming