add CUDA distance implementation (#123)

- python interface changes - add `from_fasta_to_lower_triangular` - this constructs lower triangular matrix file directly from fasta file - for now only works on GPU - faster & requires less RAM than doing `from_fasta` followed by `dump_lower_triangular` - requires 1.5GB RAM per 100k genomes on gpu + 1GB buffer to store partial distances matrix - add `use_gpu` option to `from_fasta` - if True and include_x is False then the GPU is used to calcuate distances matrix - add `cuda_gpu_available()` utility function - CUDA implementation - each block of threads calculates a single element of the distances matrix - a kernel is launched running on a grid of these blocks to calculate a subset of the distances matrix - I/O is interleaved with computation: the CPU writes the previous kernel results as the next kernel is running - print basic timing info to cout - add libfmt library - migrate to using v3 of catch2 - build wheels using manylinux2014 image with CUDA 11.8 pre-installed from https://github.com/ameli/manylinux-cuda - add a couple of performance plots - bump version to 1.0.0 - resolves #111
ssciwr · Jan 10, 2024 · 5919c48 · 5919c48
1 parent a749996
commit 5919c48
Show file tree

Hide file tree

Showing 41 changed files with 1,269 additions and 264 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -65,4 +65,4 @@ jobs:
       - name: run tests
         shell: bash
         working-directory: ${{runner.workspace}}/build
-        run: ctest
+        run: ctest -j2 --rerun-failed --output-on-failure
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
@@ -27,9 +27,11 @@ jobs:
         with:
           submodules: "recursive"
       - uses: pypa/cibuildwheel@v2.16
+        env:
+          CIBW_MANYLINUX_X86_64_IMAGE: sameli/manylinux2014_x86_64_cuda_11.8
       - uses: actions/upload-artifact@v4
         with:
-          name: wheels-${{ matrix.os }}
+          name: artifacts-${{ matrix.os }}
           path: ./wheelhouse/*.whl
 
   upload_pypi:
@@ -42,7 +44,7 @@ jobs:
     steps:
       - uses: actions/download-artifact@v4
         with:
-          pattern: wheels-*
+          pattern: artifacts-*
           merge-multiple: true
           path: dist
 

diff --git a/.gitmodules b/.gitmodules
@@ -13,3 +13,6 @@
 [submodule "Catch2"]
 	path = ext/Catch2
 	url = https://github.com/catchorg/Catch2.git
+[submodule "ext/fmt"]
+	path = ext/fmt
+	url = https://github.com/fmtlib/fmt.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,8 +1,8 @@
-cmake_minimum_required(VERSION 3.11..3.27)
+cmake_minimum_required(VERSION 3.23..3.27)
 
 project(
   hammingdist
-  VERSION 0.21.0
+  VERSION 1.0.0
   LANGUAGES CXX)
 
 include(CTest)
@@ -39,6 +39,14 @@ set(HAMMING_WITH_NEON
     no
     CACHE BOOL "Enable NEON optimized code on Arm64 CPUs")
 
+set(HAMMING_WITH_CUDA
+    no
+    CACHE BOOL "Build with CUDA support (nvidia gpu)")
+
+if(HAMMING_WITH_CUDA)
+  enable_language(CUDA)
+endif()
+
 # Add git submodules
 add_subdirectory(ext)
 

diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-A small C++ tool to calculate pairwise distances between gene sequences given in fasta format.
+A small and fast C++ tool to calculate pairwise distances between gene sequences given in fasta format.
 
 [![DOI](https://zenodo.org/badge/308676358.svg)](https://zenodo.org/badge/latestdoi/308676358)
 [![pypi releases](https://img.shields.io/pypi/v/hammingdist.svg)](https://pypi.org/project/hammingdist)
@@ -117,9 +117,34 @@ distance = hammingdist.distance("ACGTX", "AAGTX", include_x=True)
 
 ## OpenMP on linux
 
-The latest versions of hammingdist on linux are now built with OpenMP (multithreading) support.
-If this causes any issues, you can install a previous version of hammingdist without OpenMP support:
+On linux hammingdist is built with OpenMP (multithreading) support, and will automatically make use of all available CPU threads.
 
-```bash
-pip install hammingdist==0.11.0
+## CUDA on linux
+
+On linux hammingdist is also built with CUDA (Nvidia GPU) support.
+To use the GPU instead of the CPU, set `use_gpu=True` when calling `from_fasta`:
+
+```python
+import hammingdist
+
+data = hammingdist.from_fasta("example.fasta", use_gpu=True)
+```
+
+Additionally, the lower triangular matrix file can now be directly constructed from the fasta file
+using the GPU with the `from_fasta_to_lower_triangular` function.
+This avoids storing the entire distances matrix in memory and interleaves computation on the GPU with disk I/O on the CPU,
+which means it requires less RAM and runs faster.
+
+```python
+import hammingdist
+
+hammingdist.from_fasta_to_lower_triangular('input_fasta.txt', 'output_lower_triangular.txt', use_gpu=True)
 ```
+
+![overview](plots/speed.png)
+
+## Performance history
+
+A rough measure of the impact of the different performance improvements in hammingdist:
+
+![overview](plots/overview.png)
diff --git a/ext/CMakeLists.txt b/ext/CMakeLists.txt
@@ -1,3 +1,5 @@
+add_subdirectory(fmt)
+
 if(HAMMING_BUILD_PYTHON)
   add_subdirectory(pybind11)
 endif()

diff --git a/ext/Catch2 b/ext/Catch2
diff --git a/ext/fmt b/ext/fmt
diff --git a/include/hamming/distance_avx2.hh b/include/hamming/distance_avx2.hh
@@ -1,5 +1,4 @@
-#ifndef _HAMMING_DISTANCE_AVX2_HH
-#define _HAMMING_DISTANCE_AVX2_HH
+#pragma once
 
 #include <cstdint>
 #include <vector>
@@ -12,5 +11,3 @@ int distance_avx2(const std::vector<GeneBlock> &a,
                   const std::vector<GeneBlock> &b);
 
 }
-
-#endif
diff --git a/include/hamming/distance_avx512.hh b/include/hamming/distance_avx512.hh
@@ -1,5 +1,4 @@
-#ifndef _HAMMING_DISTANCE_AVX512_HH
-#define _HAMMING_DISTANCE_AVX512_HH
+#pragma once
 
 #include <cstdint>
 #include <vector>
@@ -12,5 +11,3 @@ int distance_avx512(const std::vector<GeneBlock> &a,
                     const std::vector<GeneBlock> &b);
 
 }
-
-#endif
diff --git a/include/hamming/distance_cuda.hh b/include/hamming/distance_cuda.hh
@@ -0,0 +1,27 @@
+#pragma once
+
+#include <cstdint>
+#include <iostream>
+#include <vector>
+
+#include "hamming/hamming_impl_types.hh"
+
+namespace hamming {
+
+bool distance_cuda_have_device();
+
+int distance_cuda(const std::vector<GeneBlock> &a,
+                  const std::vector<GeneBlock> &b);
+
+// for now explicit function def for each choice of integer type
+std::vector<uint8_t>
+distances_cuda_8bit(const std::vector<std::vector<GeneBlock>> &data);
+
+std::vector<uint16_t>
+distances_cuda_16bit(const std::vector<std::vector<GeneBlock>> &data);
+
+void distances_cuda_to_lower_triangular(
+    const std::vector<std::vector<GeneBlock>> &data,
+    const std::string &filename);
+
+} // namespace hamming
diff --git a/include/hamming/distance_sse2.hh b/include/hamming/distance_sse2.hh
@@ -1,5 +1,4 @@
-#ifndef _HAMMING_DISTANCE_SSE2_HH
-#define _HAMMING_DISTANCE_SSE2_HH
+#pragma once
 
 #include <cstdint>
 #include <vector>
@@ -12,5 +11,3 @@ int distance_sse2(const std::vector<GeneBlock> &a,
                   const std::vector<GeneBlock> &b);
 
 }
-
-#endif
diff --git a/include/hamming/hamming.hh b/include/hamming/hamming.hh
@@ -1,13 +1,12 @@
-#ifndef _HAMMING_HH
-#define _HAMMING_HH
+#pragma once
 
 #include "hamming/hamming_impl.hh"
 #include "hamming/hamming_types.hh"
+#include "hamming/hamming_utils.hh"
 #include <cmath>
 #include <fstream>
 #include <sstream>
 #include <string>
-#include <unordered_map>
 #include <vector>
 
 namespace hamming {
@@ -19,10 +18,11 @@ inline std::size_t uint_sqrt(std::size_t x) {
 template <typename DistIntType> struct DataSet {
   explicit DataSet(std::vector<std::string> &data, bool include_x = false,
                    bool clear_input_data = false,
-                   std::vector<std::size_t> &&indices = {})
+                   std::vector<std::size_t> &&indices = {},
+                   bool use_gpu = false)
       : nsamples(data.size()), sequence_indices(std::move(indices)) {
     validate_data(data);
-    result = distances<DistIntType>(data, include_x, clear_input_data);
+    result = distances<DistIntType>(data, include_x, clear_input_data, use_gpu);
   }
 
   explicit DataSet(const std::string &filename) {
@@ -67,35 +67,7 @@ template <typename DistIntType> struct DataSet {
   }
 
   void dump_lower_triangular(const std::string &filename) {
-    std::ofstream stream(filename);
-#ifdef HAMMING_WITH_OPENMP
-    std::size_t block_size = 200;
-#pragma omp parallel for ordered schedule(static, 1)
-    for (std::size_t i_start = 1; i_start < nsamples; i_start += block_size) {
-      std::stringstream line;
-      std::size_t i_end = i_start + block_size;
-      if (i_end > nsamples) {
-        i_end = nsamples;
-      }
-      for (std::size_t i = i_start; i < i_end; ++i) {
-        std::size_t offset{i * (i - 1) / 2};
-        for (std::size_t j = 0; j + 1 < i; ++j) {
-          line << static_cast<int>(result[offset + j]) << ",";
-        }
-        line << static_cast<int>(result[offset + i - 1]) << "\n";
-      }
-#pragma omp ordered
-      stream << line.str();
-    }
-#else
-    std::size_t k = 0;
-    for (std::size_t i = 1; i < nsamples; ++i) {
-      for (std::size_t j = 0; j + 1 < i; ++j) {
-        stream << static_cast<int>(result[k++]) << ",";
-      }
-      stream << static_cast<int>(result[k++]) << "\n";
-    }
-#endif
+    write_lower_triangular(filename, result);
   }
 
   void dump_sequence_indices(const std::string &filename) {
@@ -127,67 +99,52 @@ template <typename DistIntType> struct DataSet {
   std::vector<std::size_t> sequence_indices{};
 };
 
-DataSet<DefaultDistIntType> from_stringlist(std::vector<std::string> &data);
+DataSet<DefaultDistIntType> from_stringlist(std::vector<std::string> &data,
+                                            bool include_x = false,
+                                            bool use_gpu = false);
 
 DataSet<DefaultDistIntType> from_csv(const std::string &filename);
 
-DataSet<DefaultDistIntType> from_lower_triangular(const std::string &filename);
-
 template <typename DistIntType>
-DataSet<DistIntType>
-from_fasta(const std::string &filename, bool include_x = false,
-           bool remove_duplicates = false, std::size_t n = 0) {
-  std::vector<std::string> data;
-  data.reserve(n);
-  if (n == 0) {
-    n = std::numeric_limits<std::size_t>::max();
-    data.reserve(65536);
-  }
-  std::unordered_map<std::string, std::size_t> map_seq_to_index;
-  std::vector<std::size_t> sequence_indices{};
-  // Initializing the stream
+DataSet<DistIntType> from_lower_triangular(const std::string &filename) {
+  std::vector<DistIntType> distances;
   std::ifstream stream(filename);
-  std::size_t count = 0;
-  std::size_t count_unique = 0;
   std::string line;
-  // skip first header
-  std::getline(stream, line);
-  while (count < n && !stream.eof()) {
-    std::string seq{};
-    while (std::getline(stream, line) && line[0] != '>') {
-      seq.append(line);
-    }
-    if (remove_duplicates) {
-      auto result = map_seq_to_index.emplace(std::move(seq), count_unique);
-      if (result.second) {
-        ++count_unique;
-      }
-      sequence_indices.push_back(result.first->second);
-    } else {
-      data.push_back(std::move(seq));
-    }
-    ++count;
-  }
-  if (remove_duplicates) {
-    // copy each unique sequence to the vector of strings
-    data.resize(count_unique);
-    for (auto &key_value_pair : map_seq_to_index) {
-      data[key_value_pair.second] = key_value_pair.first;
+  while (std::getline(stream, line)) {
+    std::istringstream s(line);
+    std::string d;
+    while (s.good()) {
+      std::getline(s, d, ',');
+      distances.push_back(safe_int_cast<DistIntType>(std::stoi(d)));
     }
   }
+  return DataSet(std::move(distances));
+}
+
+template <typename DistIntType>
+DataSet<DistIntType> from_fasta(const std::string &filename,
+                                bool include_x = false,
+                                bool remove_duplicates = false,
+                                std::size_t n = 0, bool use_gpu = false) {
+  auto [data, sequence_indices] = read_fasta(filename, remove_duplicates, n);
   return DataSet<DistIntType>(data, include_x, true,
-                              std::move(sequence_indices));
+                              std::move(sequence_indices), use_gpu);
 }
 
+void from_fasta_to_lower_triangular(const std::string &input_filename,
+                                    const std::string &output_filename,
+                                    bool remove_duplicates = false,
+                                    std::size_t n = 0, bool use_gpu = false);
+
 ReferenceDistIntType distance(const std::string &seq0, const std::string &seq1,
                               bool include_x = false);
+
 std::vector<ReferenceDistIntType>
 fasta_reference_distances(const std::string &reference_sequence,
                           const std::string &fasta_file,
                           bool include_x = false);
+
 std::vector<std::size_t> fasta_sequence_indices(const std::string &fasta_file,
                                                 std::size_t n = 0);
 
 } // namespace hamming
-
-#endif