diff --git a/BENCHMARK.md b/BENCHMARK.md new file mode 100644 index 0000000..9cd667f --- /dev/null +++ b/BENCHMARK.md @@ -0,0 +1,104 @@ +# Benchmark + +## Build + +```bash +cd small_gicp +mkdir build && cd build + +cmake .. -DBUILD_WITH_TBB=ON -DBUILD_WITH_PCL=ON -DBUILD_BENCHMARKS=ON + +# [optional] Build with Iridescence (visualization) +git clone https://github.com/koide3/iridescence --recursive +mkdir iridescence/build && cd iridescence/build +cmake .. && make -j +sudo make install + +cmake .. -DBUILD_WITH_IRIDESCENCE=ON + +# [optional] Build with fast_gicp +export FAST_GICP_INCLUDE_DIR=/your/fast_gicp/include +cmake .. -DBUILD_WITH_FAST_GICP=ON + +# Build +make -j + +# Test +# Show options +./odometry_benchmark + +# USAGE: odometry_benchmark [options] +# OPTIONS: +# --visualize +# --num_threads (default: 4) +# --num_neighbors (default: 20) +# --downsampling_resolution (default: 0.25) +# --voxel_resolution (default: 2.0) +# --engine (default: small_gicp) + +# Run odometry benchmark +./odometry_benchmark /your/kitti/dataset/velodyne /tmp/traj_lidar.txt --visualize --num_threads 16 --engine small_gicp_tbb +``` + +## Results + +All benchmarks were conducted on the KITTI 00 sequence. + +### Downsampling + +```bash +cd small_gicp/scripts +./run_downsampling_benchmark.sh +python3 plot_downsampling.py +``` + +- Single-threaded `small_gicp::voxelgrid_sampling` is about **1.3x faster** than `pcl::VoxelGrid`. +- Multi-threaded `small_gicp::voxelgrid_sampling_tbb` (6 threads) is about **3.2x faster** than `pcl::VoxelGrid`. +- `small_gicp::voxelgrid_sampling` gives accurate downsampling results (almost identical to those of `pcl::VoxelGrid`) while `pcl::ApproximateVoxelGrid` yields spurious points (up to 2x points). +- `small_gicp::voxelgrid_sampling` can process a larger point cloud with a fine voxel resolution compared to `pcl::VoxelGrid` (for a point cloud of 1000m width, minimum voxel resolution can be 0.5 mm). + +![downsampling_comp](docs/assets/downsampling_comp.png) + +- While TBB shows slightly better scalability, both the parallelism backends do not obtain a speed gain for the cases with threads more than 16. + +![downsampling_threads](docs/assets/downsampling_threads.png) + +### KdTree construction + +```bash +cd small_gicp/scripts +./run_kdtree_benchmark.sh +python3 plot_kdtree.py +``` + +- Multi-threaded implementation (TBB and OMP) can be up to **4x faster** than the single-threaded one (All the implementations are based on nanoflann). +- Basically the processing speed get faster as the number of threads increases, but the speed gain is not monotonic sometimes (because of the scheduling algorithm or some CPU(AMD 5995WX)-specific issues?). +- This benchmark only compares the construction time (query time is not included). + +![kdtree_time](docs/assets/kdtree_time.png) + +### Odometry estimation + +```bash +cd small_gicp/scripts +./run_odometry_benchmark.sh +python3 plot_odometry.py +``` + +- Single-thread `small_gicp::GICP` is about **2.4x and 1.9x faster** than `pcl::GICP` and `fast_gicp::GICP`, respectively. +- `small_gicp::(GICP|VGICP)` shows a better multi-thread scalability compared to `fast_gicp::(GICP|VGICP)`. +- `small_gicp::GICP` parallelized with [TBB flow graph](src/odometry_benchmark_small_gicp_tbb_flow.cpp) shows an excellent scalablity to many-threads situations (**~128 threads**) but with latency degradation. + +![odometry_time](docs/assets/odometry_time.png) + +**SIMD intrinsics (-march=native)** (We recommend keeping this feature disabled unless you are 100% sure what it is) + +- `BUILD_WITH_MARCH_NATIVE=ON` enables platform-specific intrinsics and squeezing the performance (**1.1x speedup for free**). +- However, you must ensure that all involved libraries are built with `-march=native`, otherwise the program will crash. +- Generally, it is difficult to properly set `-march=native` for all libraries, and we recommend keeping `BUILD_WITH_MARCH_NATIVE=OFF`. + +Results: +- `BUILD_WITH_MARCH_NATIVE=OFF` : `Eigen::SimdInstructionSetsInUse()=SSE, SSE2` +- `BUILD_WITH_MARCH_NATIVE=ON` : `Eigen::SimdInstructionSetsInUse()=AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2` + +![odometry_native](docs/assets/odometry_native.png) \ No newline at end of file diff --git a/README.md b/README.md index b357563..984c792 100644 --- a/README.md +++ b/README.md @@ -335,7 +335,7 @@ open3d.visualization.draw_geometries([target_o3d, source_o3d]) -## Benchmark +## [BENCHMARK](Benchmark.md) ### Downsampling diff --git a/docs/assets/odometry_native.png b/docs/assets/odometry_native.png new file mode 100644 index 0000000..4868ede Binary files /dev/null and b/docs/assets/odometry_native.png differ diff --git a/scripts/plot_kdtree.py b/scripts/plot_kdtree.py index 4022869..a30d9c1 100644 --- a/scripts/plot_kdtree.py +++ b/scripts/plot_kdtree.py @@ -21,16 +21,17 @@ def parse_result(filename): num_points.append(int(matched.group(1))) continue - matched = re.match(r'(\S+)_times=(\S+) \+\- (\S+)', line) + matched = re.match(r'(\S+)_times=(\S+) \+\- (\S+) \(median=(\S+)\)', line) if matched: method = matched.group(1) mean = float(matched.group(2)) std = float(matched.group(3)) + med = float(matched.group(4)) if method not in results: results[method] = [] - results[method].append(mean) + results[method].append(med) continue return (num_threads, num_points, results) diff --git a/scripts/plot_odometry_native.py b/scripts/plot_odometry_native.py new file mode 100644 index 0000000..5c44602 --- /dev/null +++ b/scripts/plot_odometry_native.py @@ -0,0 +1,66 @@ +#!/usr/bin/python3 +import os +import re +import numpy +from collections import namedtuple +from matplotlib import pyplot + +Result = namedtuple('Result', ['reg_mean', 'reg_std', 'tp_mean', 'tp_std']) + +def parse_result(filename): + reg_mean = None + reg_std = None + throughput_mean = None + throughput_std = None + with open(filename, 'r') as f: + for line in f.readlines(): + found = re.findall(r'([^=]+)\s*\+\-\s*(\S+)', line) + if not found or len(found) != 2: + found = re.findall(r'total_throughput=(\S+)', line) + if found: + throughput_mean = float(found[0]) + continue + + reg_mean = float(found[0][0].strip()) + reg_std = float(found[0][1].strip()) + throughput_mean = float(found[1][0].strip()) + throughput_std = float(found[1][1].strip()) + + return Result(reg_mean, reg_std, throughput_mean, throughput_std) + +def main(): + results_path = os.path.dirname(__file__) + '/results' + + results = {} + for filename in os.listdir(results_path): + found = re.findall(r'odometry_benchmark_(\S+)_(native|nonnative)_(\d+).txt', filename) + if not found: + continue + + rets = parse_result(results_path + '/' + filename) + results['{}_{}_{}'.format(found[0][0], found[0][1], found[0][2])] = rets + + fig, axes = pyplot.subplots(1, 1, figsize=(12, 2)) + axes = [axes] + + num_threads = [1, 2, 4, 8, 16, 32, 64, 128] + + print(results['small_gicp_native_1'], results['small_gicp_tbb_native_1']) + print(results['small_gicp_nonnative_1'], results['small_gicp_tbb_nonnative_1']) + + native = [results['small_gicp_tbb_native_{}'.format(N)].reg_mean for N in num_threads] + nonnative = [results['small_gicp_tbb_nonnative_{}'.format(N)].reg_mean for N in num_threads] + + axes[0].plot(num_threads, native, label='small_gicp_tbb (-march=native)', marker='o') + axes[0].plot(num_threads, nonnative, label='small_gicp_tbb (nonnative)', marker='o') + axes[0].set_xlabel('Number of threads [1, 2, ..., 128]') + axes[0].set_ylabel('Time [msec/scan]') + axes[0].set_xscale('log') + axes[0].grid() + axes[0].legend() + + pyplot.show() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/run_odometry_benchmark_native.sh b/scripts/run_odometry_benchmark_native.sh new file mode 100755 index 0000000..b9b2242 --- /dev/null +++ b/scripts/run_odometry_benchmark_native.sh @@ -0,0 +1,20 @@ +#!/bin/bash +dataset_path=$1 +exe_path=../build/odometry_benchmark + +mkdir results + +engines=(small_gicp small_gicp_tbb) +for engine in ${engines[@]}; do + N=1 + $exe_path $dataset_path $(printf "results/traj_lidar_%s_nonnative_%d.txt" $engine $N) --num_threads $N --engine $engine | tee $(printf "results/odometry_benchmark_%s_nonnative_%d.txt" $engine $N) +done + +engines=(small_gicp_tbb) +num_threads=(128 96 64 32 16 8 4 2) + +for N in ${num_threads[@]}; do + for engine in ${engines[@]}; do + $exe_path $dataset_path $(printf "results/traj_lidar_%s_nonnative_%d.txt" $engine $N) --num_threads $N --engine $engine | tee $(printf "results/odometry_benchmark_%s_nonnative_%d.txt" $engine $N) + done +done diff --git a/src/odometry_benchmark.cpp b/src/odometry_benchmark.cpp index c9b2799..088f2d2 100644 --- a/src/odometry_benchmark.cpp +++ b/src/odometry_benchmark.cpp @@ -54,6 +54,7 @@ int main(int argc, char** argv) { } } + std::cout << "SIMD in use=" << Eigen::SimdInstructionSetsInUse() << std::endl; std::cout << "dataset_path=" << dataset_path << std::endl; std::cout << "output_path=" << output_path << std::endl; std::cout << "registration_engine=" << engine << std::endl;