From 5da02a35be781585587d3aa69d68f00aa4d6ec6e Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Thu, 8 Jan 2026 18:46:41 -0500
Subject: [PATCH 01/20] add top level interpn function

---
 Cargo.lock |  54 +++++++++++++++++++++---
 Cargo.toml |  12 +++---
 src/lib.rs | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 175 insertions(+), 10 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 461a64f..25a0e27 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -11,6 +11,15 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "alloca"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5a7d05ea6aea7e9e64d25b9156ba2fee3fdd659e34e41063cd2fc7cd020d7f4"
+dependencies = [
+ "cc",
+]
+
 [[package]]
 name = "anes"
 version = "0.1.6"
@@ -111,10 +120,11 @@ checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675"
 
 [[package]]
 name = "criterion"
-version = "0.7.0"
+version = "0.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e1c047a62b0cc3e145fa84415a3191f628e980b194c2755aa12300a4e6cbd928"
+checksum = "4d883447757bb0ee46f233e9dc22eb84d93a9508c9b868687b274fc431d886bf"
 dependencies = [
+ "alloca",
  "anes",
  "cast",
  "ciborium",
@@ -123,6 +133,7 @@ dependencies = [
  "itertools 0.13.0",
  "num-traits",
  "oorandom",
+ "page_size",
  "plotters",
  "rayon",
  "regex",
@@ -134,9 +145,9 @@ dependencies = [
 
 [[package]]
 name = "criterion-plot"
-version = "0.6.0"
+version = "0.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b1bcc0dc7dfae599d84ad0b1a55f80cde8af3725da8313b528da95ef783e338"
+checksum = "ed943f81ea2faa8dcecbbfa50164acf95d555afec96a27871663b300e387b2e4"
 dependencies = [
  "cast",
  "itertools 0.13.0",
@@ -221,7 +232,7 @@ checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5"
 
 [[package]]
 name = "interpn"
-version = "0.9.1"
+version = "0.9.2"
 dependencies = [
  "criterion",
  "crunchy",
@@ -231,6 +242,7 @@ dependencies = [
  "numpy",
  "pyo3",
  "rand",
+ "rayon",
 ]
 
 [[package]]
@@ -381,6 +393,16 @@ version = "11.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
 
+[[package]]
+name = "page_size"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "30d5b2194ed13191c1999ae0704b7839fb18384fa22e49b57eeaa97d79ce40da"
+dependencies = [
+ "libc",
+ "winapi",
+]
+
 [[package]]
 name = "plotters"
 version = "0.3.7"
@@ -824,6 +846,22 @@ dependencies = [
  "wasm-bindgen",
 ]
 
+[[package]]
+name = "winapi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
+dependencies = [
+ "winapi-i686-pc-windows-gnu",
+ "winapi-x86_64-pc-windows-gnu",
+]
+
+[[package]]
+name = "winapi-i686-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+
 [[package]]
 name = "winapi-util"
 version = "0.1.11"
@@ -833,6 +871,12 @@ dependencies = [
  "windows-sys",
 ]
 
+[[package]]
+name = "winapi-x86_64-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+
 [[package]]
 name = "windows-link"
 version = "0.2.0"
diff --git a/Cargo.toml b/Cargo.toml
index 5ddb9a4..125f39f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "interpn"
-version = "0.9.1"
+version = "0.9.2"
 edition = "2024"
 rust-version = "1.87" # 2025-05-15
 authors = ["James Logan <jlogan03@gmail.com>"]
@@ -17,8 +17,9 @@ crate-type = ["cdylib", "rlib"]
 
 [dependencies]
 # Rust lib deps
-num-traits = { version = "0.2.19", default-features = false, features = ["libm"] }
-crunchy = { version = "0.2.4", default-features = false }
+num-traits = { version = "^0.2.19", default-features = false, features = ["libm"] }
+crunchy = { version = "^0.2.4", default-features = false }
+rayon = { version = "^1.11.0", optional = true }
 
 # Python bindings
 pyo3 = { version = "0.27.2", features = ["extension-module", "abi3-py310", "generate-import-lib"], optional = true }
@@ -29,15 +30,16 @@ itertools = { version = "0.14.0", optional = true }
 
 [dev-dependencies]
 rand = "0.9.2"
-criterion = "0.7.0"
+criterion = "0.8.1"
 ndarray = "0.17.1"
 
 [features]
-default = ["std", "crunchy/limit_64"]
+default = ["std", "crunchy/limit_64", "par"]
 python = ["numpy", "pyo3", "std"]
 std = ["itertools"]
 deep-unroll = ["crunchy/limit_256"]
 fma = []
+par = ["std", "rayon"]
 
 [profile.release]
 opt-level = 3
diff --git a/src/lib.rs b/src/lib.rs
index 88d8cab..da481a0 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -88,12 +88,24 @@
 // expanded code that is entirely in const.
 #![allow(clippy::absurd_extreme_comparisons)]
 
+use num_traits::Float;
+
 pub mod multilinear;
 pub use multilinear::{MultilinearRectilinear, MultilinearRegular};
 
 pub mod multicubic;
 pub use multicubic::{MulticubicRectilinear, MulticubicRegular};
 
+pub mod linear {
+    pub use crate::multilinear::rectilinear;
+    pub use crate::multilinear::regular;
+}
+
+pub mod cubic {
+    pub use crate::multicubic::rectilinear;
+    pub use crate::multicubic::regular;
+}
+
 pub mod nearest;
 pub use nearest::{NearestRectilinear, NearestRegular};
 
@@ -112,6 +124,113 @@ pub(crate) mod testing;
 #[cfg(feature = "python")]
 pub mod python;
 
+pub enum GridInterpMethod {
+    Linear,
+    Cubic,
+}
+
+pub enum GridKind {
+    Regular,
+    Rectilinear,
+}
+
+const MAXDIMS: usize = 8;
+const MAXDIMS_ERR: &str =
+    "Dimension exceeds maximum (8). Use interpolator struct directly for higher dimensions.";
+
+pub fn interpn<T: Float>(
+    grids: &[&[T]],
+    vals: &[T],
+    obs: &[&[T]],
+    out: &mut [T],
+    method: GridInterpMethod,
+    assume_grid_kind: Option<GridKind>,
+    linearize_extrapolation: bool,
+) -> Result<(), &'static str> {
+    let ndims = grids.len();
+    if ndims > MAXDIMS {
+        return Err(MAXDIMS_ERR);
+    }
+
+    let kind = match assume_grid_kind {
+        Some(GridKind::Regular) => GridKind::Regular,
+        Some(GridKind::Rectilinear) => GridKind::Rectilinear,
+        None => {
+            // Check whether grid is regular
+            let mut is_regular = true;
+
+            for grid in grids.iter() {
+                if grid.len() < 2 {
+                    return Err("All grids must have at least two entries");
+                }
+                let step = grid[1] - grid[0];
+
+                if !grid.windows(2).all(|pair| pair[1] - pair[0] == step) {
+                    is_regular = false;
+                    break;
+                }
+            }
+
+            if is_regular {
+                GridKind::Regular
+            } else {
+                GridKind::Rectilinear
+            }
+        }
+    };
+
+    // Extract regular grid params
+    let get_regular_grid = || {
+        let mut dims = [0_usize; MAXDIMS];
+        let mut starts = [T::zero(); MAXDIMS];
+        let mut steps = [T::zero(); MAXDIMS];
+
+        for (i, grid) in grids.iter().enumerate() {
+            if grid.len() < 2 {
+                return Err("All grids must have at least two entries");
+            }
+            dims[i] = grid.len();
+            starts[i] = grid[0];
+            steps[i] = grid[1] - grid[0];
+        }
+
+        Ok((dims, starts, steps))
+    };
+
+    // Select lower-level method
+    match (method, kind) {
+        (GridInterpMethod::Linear, GridKind::Regular) => {
+            let (dims, starts, steps) = get_regular_grid()?;
+            linear::regular::interpn(
+                &dims[..ndims],
+                &starts[..ndims],
+                &steps[..ndims],
+                vals,
+                obs,
+                out,
+            )
+        }
+        (GridInterpMethod::Linear, GridKind::Rectilinear) => {
+            linear::rectilinear::interpn(grids, vals, obs, out)
+        }
+        (GridInterpMethod::Cubic, GridKind::Regular) => {
+            let (dims, starts, steps) = get_regular_grid()?;
+            cubic::regular::interpn(
+                &dims[..ndims],
+                &starts[..ndims],
+                &steps[..ndims],
+                vals,
+                linearize_extrapolation,
+                obs,
+                out,
+            )
+        }
+        (GridInterpMethod::Cubic, GridKind::Rectilinear) => {
+            cubic::rectilinear::interpn(grids, vals, linearize_extrapolation, obs, out)
+        }
+    }
+}
+
 /// Index a single value from an array
 #[inline]
 pub(crate) fn index_arr<T: Copy>(loc: &[usize], dimprod: &[usize], data: &[T]) -> T {

From 1438e5770f6d09d632582d51e1a3574ce5cbef66 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Thu, 8 Jan 2026 19:48:02 -0500
Subject: [PATCH 02/20] first cut at parallel implementation

---
 src/lib.rs | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 88 insertions(+), 4 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index da481a0..aacc2ee 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -115,6 +115,15 @@ pub use one_dim::{
     linear::Linear1D, linear::LinearHoldLast1D,
 };
 
+#[cfg(feature = "par")]
+use rayon::{
+    iter::{IndexedParallelIterator, ParallelIterator},
+    slice::ParallelSliceMut,
+};
+
+#[cfg(feature = "par")]
+use std::sync::Mutex;
+
 #[cfg(feature = "std")]
 pub mod utils;
 
@@ -124,13 +133,19 @@ pub(crate) mod testing;
 #[cfg(feature = "python")]
 pub mod python;
 
+/// Interpolant function for multi-dimensional methods.
+#[derive(Clone, Copy)]
 pub enum GridInterpMethod {
     Linear,
     Cubic,
 }
 
+/// Grid spacing category for multi-dimensional methods.
+#[derive(Clone, Copy)]
 pub enum GridKind {
+    /// Evenly-spaced points along each axis.
     Regular,
+    /// Un-evenly spaced points along each axis.
     Rectilinear,
 }
 
@@ -138,7 +153,79 @@ const MAXDIMS: usize = 8;
 const MAXDIMS_ERR: &str =
     "Dimension exceeds maximum (8). Use interpolator struct directly for higher dimensions.";
 
-pub fn interpn<T: Float>(
+#[cfg(feature = "par")]
+pub fn interpn<T: Float + Send + Sync>(
+    grids: &[&[T]],
+    vals: &[T],
+    obs: &[&[T]],
+    out: &mut [T],
+    method: GridInterpMethod,
+    assume_grid_kind: Option<GridKind>,
+    linearize_extrapolation: bool,
+    max_threads: Option<usize>,
+) -> Result<(), &'static str> {
+    let ndims = grids.len();
+    if ndims > MAXDIMS {
+        return Err(MAXDIMS_ERR);
+    }
+
+    // Chunk for parallelism
+    let num_cores = rayon::current_num_threads()
+        .max(1)
+        .min(max_threads.unwrap_or(usize::MAX));
+    let n = out.len();
+    let chunk = 1024.max(n / num_cores);
+
+    // Make a shared error indicator
+    let result: Mutex<Option<&'static str>> = Mutex::new(None);
+    let write_err = |msg: &'static str| {
+        let mut guard = result.lock().unwrap();
+        if guard.is_none() {
+            *guard = Some(msg);
+        }
+    };
+
+    // Run threaded
+    out.par_chunks_mut(chunk).enumerate().for_each(|(i, outc)| {
+        // Calculate the start and end of observation point chunks
+        let start = chunk * i;
+        let end = start + outc.len();
+
+        // Chunk observation points
+        let mut obs_slices: [&[T]; 8] = [&[]; 8];
+        for (j, o) in obs.iter().enumerate() {
+            let s = &o.get(start..end);
+            match s {
+                Some(s) => obs_slices[j] = s,
+                None => write_err("Dimension mismatch"),
+            };
+        }
+
+        // Do interpolations
+        let res_inner = interpn_serial(
+            grids,
+            vals,
+            &obs_slices[..ndims],
+            outc,
+            method,
+            assume_grid_kind,
+            linearize_extrapolation,
+        );
+
+        match res_inner {
+            Ok(()) => {}
+            Err(msg) => write_err(msg),
+        }
+    });
+
+    // Handle errors from threads
+    match *result.lock().unwrap() {
+        Some(msg) => Err(msg),
+        None => Ok(()),
+    }
+}
+
+pub fn interpn_serial<T: Float>(
     grids: &[&[T]],
     vals: &[T],
     obs: &[&[T]],
@@ -251,11 +338,8 @@ pub(crate) fn index_arr_fixed_dims<T: Copy, const N: usize>(
 ) -> T {
     let mut i = 0;
 
-    // unroll! {
-    //     for j < 7 in 0..N {
     for j in 0..N {
         i += loc[j] * dimprod[j];
-        // }
     }
 
     data[i]

From 86a2c3b85e6761a5b1193538d3f13f3a14b24c8e Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Thu, 8 Jan 2026 20:27:04 -0500
Subject: [PATCH 03/20] raw python bindings for new method

---
 src/python.rs | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/src/python.rs b/src/python.rs
index 165d684..d27d86b 100644
--- a/src/python.rs
+++ b/src/python.rs
@@ -5,6 +5,7 @@ use pyo3::prelude::*;
 use crate::multicubic;
 use crate::multilinear;
 use crate::nearest;
+use crate::{GridInterpMethod, GridKind};
 
 /// Maximum number of dimensions for linear interpn convenience methods
 const MAXDIMS: usize = 8;
@@ -35,6 +36,9 @@ fn interpn<'py>(_py: Python, m: &Bound<'py, PyModule>) -> PyResult<()> {
     // Multicubic with rectilinear grid
     m.add_function(wrap_pyfunction!(interpn_cubic_rectilinear_f64, m)?)?;
     m.add_function(wrap_pyfunction!(interpn_cubic_rectilinear_f32, m)?)?;
+    // Top-level interpn dispatch
+    m.add_function(wrap_pyfunction!(interpn_f64, m)?)?;
+    m.add_function(wrap_pyfunction!(interpn_f32, m)?)?;
     Ok(())
 }
 
@@ -290,3 +294,66 @@ macro_rules! interpn_cubic_rectilinear_impl {
 
 interpn_cubic_rectilinear_impl!(interpn_cubic_rectilinear_f64, f64);
 interpn_cubic_rectilinear_impl!(interpn_cubic_rectilinear_f32, f32);
+
+fn parse_grid_interp_method(method: &str) -> Result<GridInterpMethod, PyErr> {
+    match method.to_ascii_lowercase().as_str() {
+        "linear" => Ok(GridInterpMethod::Linear),
+        "cubic" => Ok(GridInterpMethod::Cubic),
+        _ => Err(exceptions::PyValueError::new_err(
+            "`method` must be 'linear' or 'cubic'",
+        )),
+    }
+}
+
+fn parse_grid_kind(grid_kind: Option<&str>) -> Result<Option<GridKind>, PyErr> {
+    match grid_kind.map(|kind| kind.to_ascii_lowercase()) {
+        None => Ok(None),
+        Some(kind) => match kind.as_str() {
+            "regular" => Ok(Some(GridKind::Regular)),
+            "rectilinear" => Ok(Some(GridKind::Rectilinear)),
+            _ => Err(exceptions::PyValueError::new_err(
+                "`grid_kind` must be 'regular', 'rectilinear', or None",
+            )),
+        },
+    }
+}
+
+macro_rules! interpn_top_impl {
+    ($funcname:ident, $T:ty) => {
+        #[pyfunction]
+        #[pyo3(signature = (grids, vals, obs, out, method="linear", grid_kind=None, linearize_extrapolation=true, max_threads=None))]
+        fn $funcname(
+            grids: Vec<PyReadonlyArray1<$T>>,
+            vals: PyReadonlyArray1<$T>,
+            obs: Vec<PyReadonlyArray1<$T>>,
+            mut out: PyReadwriteArray1<$T>,
+            method: &str,
+            grid_kind: Option<&str>,
+            linearize_extrapolation: bool,
+            max_threads: Option<usize>,
+        ) -> PyResult<()> {
+            unpack_vec_of_arr!(grids, grids, $T);
+            unpack_vec_of_arr!(obs, obs, $T);
+
+            let method = parse_grid_interp_method(method)?;
+            let grid_kind = parse_grid_kind(grid_kind)?;
+
+            match crate::interpn(
+                grids,
+                vals.as_slice()?,
+                obs,
+                out.as_slice_mut()?,
+                method,
+                grid_kind,
+                linearize_extrapolation,
+                max_threads,
+            ) {
+                Ok(()) => Ok(()),
+                Err(msg) => Err(exceptions::PyAssertionError::new_err(msg)),
+            }
+        }
+    };
+}
+
+interpn_top_impl!(interpn_f64, f64);
+interpn_top_impl!(interpn_f32, f32);

From 1987ff154f9bd9917b69d36f68927377e9b84d89 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Thu, 8 Jan 2026 21:11:07 -0500
Subject: [PATCH 04/20] add bounds checks

---
 src/lib.rs    | 33 +++++++++++++++++++++++++++++++++
 src/python.rs |  4 +++-
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/src/lib.rs b/src/lib.rs
index aacc2ee..e632773 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -162,6 +162,7 @@ pub fn interpn<T: Float + Send + Sync>(
     method: GridInterpMethod,
     assume_grid_kind: Option<GridKind>,
     linearize_extrapolation: bool,
+    check_bounds_with_atol: Option<T>,
     max_threads: Option<usize>,
 ) -> Result<(), &'static str> {
     let ndims = grids.len();
@@ -210,6 +211,7 @@ pub fn interpn<T: Float + Send + Sync>(
             method,
             assume_grid_kind,
             linearize_extrapolation,
+            check_bounds_with_atol,
         );
 
         match res_inner {
@@ -233,6 +235,7 @@ pub fn interpn_serial<T: Float>(
     method: GridInterpMethod,
     assume_grid_kind: Option<GridKind>,
     linearize_extrapolation: bool,
+    check_bounds_with_atol: Option<T>,
 ) -> Result<(), &'static str> {
     let ndims = grids.len();
     if ndims > MAXDIMS {
@@ -284,10 +287,37 @@ pub fn interpn_serial<T: Float>(
         Ok((dims, starts, steps))
     };
 
+    // Bounds checks for regular grid, if requested
+    let maybe_check_bounds_regular = |dims: &[usize], starts: &[T], steps: &[T], obs: &[&[T]]| {
+        if let Some(atol) = check_bounds_with_atol {
+            let mut bounds = [false; MAXDIMS];
+            let out = &mut bounds[..ndims];
+            multilinear::regular::check_bounds(
+                &dims[..ndims],
+                &starts[..ndims],
+                &steps[..ndims],
+                obs,
+                atol,
+                out,
+            )?
+        }
+        Ok(())
+    };
+
+    let maybe_check_bounds_rectilinear = |grids, obs| {
+        if let Some(atol) = check_bounds_with_atol {
+            let mut bounds = [false; MAXDIMS];
+            let out = &mut bounds[..ndims];
+            multilinear::rectilinear::check_bounds(grids, obs, atol, out)?
+        }
+        Ok(())
+    };
+
     // Select lower-level method
     match (method, kind) {
         (GridInterpMethod::Linear, GridKind::Regular) => {
             let (dims, starts, steps) = get_regular_grid()?;
+            maybe_check_bounds_regular(&dims, &starts, &steps, obs)?;
             linear::regular::interpn(
                 &dims[..ndims],
                 &starts[..ndims],
@@ -298,10 +328,12 @@ pub fn interpn_serial<T: Float>(
             )
         }
         (GridInterpMethod::Linear, GridKind::Rectilinear) => {
+            maybe_check_bounds_rectilinear(grids, obs)?;
             linear::rectilinear::interpn(grids, vals, obs, out)
         }
         (GridInterpMethod::Cubic, GridKind::Regular) => {
             let (dims, starts, steps) = get_regular_grid()?;
+            maybe_check_bounds_regular(&dims, &starts, &steps, obs)?;
             cubic::regular::interpn(
                 &dims[..ndims],
                 &starts[..ndims],
@@ -313,6 +345,7 @@ pub fn interpn_serial<T: Float>(
             )
         }
         (GridInterpMethod::Cubic, GridKind::Rectilinear) => {
+            maybe_check_bounds_rectilinear(grids, obs)?;
             cubic::rectilinear::interpn(grids, vals, linearize_extrapolation, obs, out)
         }
     }
diff --git a/src/python.rs b/src/python.rs
index d27d86b..dcd7beb 100644
--- a/src/python.rs
+++ b/src/python.rs
@@ -321,7 +321,7 @@ fn parse_grid_kind(grid_kind: Option<&str>) -> Result<Option<GridKind>, PyErr> {
 macro_rules! interpn_top_impl {
     ($funcname:ident, $T:ty) => {
         #[pyfunction]
-        #[pyo3(signature = (grids, vals, obs, out, method="linear", grid_kind=None, linearize_extrapolation=true, max_threads=None))]
+        #[pyo3(signature = (grids, vals, obs, out, method="linear", grid_kind=None, linearize_extrapolation=true, check_bounds_with_atol=None, max_threads=None))]
         fn $funcname(
             grids: Vec<PyReadonlyArray1<$T>>,
             vals: PyReadonlyArray1<$T>,
@@ -330,6 +330,7 @@ macro_rules! interpn_top_impl {
             method: &str,
             grid_kind: Option<&str>,
             linearize_extrapolation: bool,
+            check_bounds_with_atol: Option<$T>,
             max_threads: Option<usize>,
         ) -> PyResult<()> {
             unpack_vec_of_arr!(grids, grids, $T);
@@ -346,6 +347,7 @@ macro_rules! interpn_top_impl {
                 method,
                 grid_kind,
                 linearize_extrapolation,
+                check_bounds_with_atol,
                 max_threads,
             ) {
                 Ok(()) => Ok(()),

From adf07fbbfce5e7923798d29ce021caf9abcef191 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Thu, 8 Jan 2026 21:31:37 -0500
Subject: [PATCH 05/20] add nearest methods

---
 src/lib.rs    | 23 +++++++++++++++++++++++
 src/python.rs |  3 ++-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/src/lib.rs b/src/lib.rs
index e632773..5dd259f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -136,8 +136,12 @@ pub mod python;
 /// Interpolant function for multi-dimensional methods.
 #[derive(Clone, Copy)]
 pub enum GridInterpMethod {
+    /// Multi-linear interpolation.
     Linear,
+    /// Cubic Hermite spline interpolation.
     Cubic,
+    /// Nearest-neighbor interpolation.
+    Nearest,
 }
 
 /// Grid spacing category for multi-dimensional methods.
@@ -242,6 +246,8 @@ pub fn interpn_serial<T: Float>(
         return Err(MAXDIMS_ERR);
     }
 
+    // Resolve grid kind, checking the grid if
+    // the kind is not provided by the user.
     let kind = match assume_grid_kind {
         Some(GridKind::Regular) => GridKind::Regular,
         Some(GridKind::Rectilinear) => GridKind::Rectilinear,
@@ -304,6 +310,7 @@ pub fn interpn_serial<T: Float>(
         Ok(())
     };
 
+    // Bounds checks for rectilinear grid, if requested
     let maybe_check_bounds_rectilinear = |grids, obs| {
         if let Some(atol) = check_bounds_with_atol {
             let mut bounds = [false; MAXDIMS];
@@ -348,6 +355,22 @@ pub fn interpn_serial<T: Float>(
             maybe_check_bounds_rectilinear(grids, obs)?;
             cubic::rectilinear::interpn(grids, vals, linearize_extrapolation, obs, out)
         }
+        (GridInterpMethod::Nearest, GridKind::Regular) => {
+            let (dims, starts, steps) = get_regular_grid()?;
+            maybe_check_bounds_regular(&dims, &starts, &steps, obs)?;
+            nearest::regular::interpn(
+                &dims[..ndims],
+                &starts[..ndims],
+                &steps[..ndims],
+                vals,
+                obs,
+                out,
+            )
+        }
+        (GridInterpMethod::Nearest, GridKind::Rectilinear) => {
+            maybe_check_bounds_rectilinear(grids, obs)?;
+            nearest::rectilinear::interpn(grids, vals, obs, out)
+        }
     }
 }
 
diff --git a/src/python.rs b/src/python.rs
index dcd7beb..2a3d3c9 100644
--- a/src/python.rs
+++ b/src/python.rs
@@ -299,8 +299,9 @@ fn parse_grid_interp_method(method: &str) -> Result<GridInterpMethod, PyErr> {
     match method.to_ascii_lowercase().as_str() {
         "linear" => Ok(GridInterpMethod::Linear),
         "cubic" => Ok(GridInterpMethod::Cubic),
+        "nearest" => Ok(GridInterpMethod::Nearest),
         _ => Err(exceptions::PyValueError::new_err(
-            "`method` must be 'linear' or 'cubic'",
+            "`method` must be 'linear', 'cubic', or 'nearest'",
         )),
     }
 }

From d23c0a2a5c7f1ddacefd35ab0bb5df272803f8a5 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Thu, 8 Jan 2026 21:50:24 -0500
Subject: [PATCH 06/20] use rust top level interpn function for python top
 level interpn function. update tests.

---
 src/interpn/__init__.py | 122 +++++++++-------------------------------
 src/interpn/raw.py      |   4 ++
 src/interpn/raw.pyi     |  25 ++++++++
 src/lib.rs              |  10 +++-
 4 files changed, 64 insertions(+), 97 deletions(-)

diff --git a/src/interpn/__init__.py b/src/interpn/__init__.py
index fd2db38..54c4b89 100644
--- a/src/interpn/__init__.py
+++ b/src/interpn/__init__.py
@@ -56,17 +56,16 @@ def interpn(
     method: Literal["linear", "cubic", "nearest"] = "linear",
     out: NDArray | None = None,
     linearize_extrapolation: bool = True,
-    assume_regular: bool = False,
-    check_bounds: bool = False,
-    bounds_atol: float = 1e-8,
+    grid_kind: Literal["regular", "rectilinear"] | None = None,
+    check_bounds_with_atol: float | None = None,
+    max_threads: int | None = None,
 ) -> NDArray:
     """
     Evaluate an N-dimensional grid at the supplied observation points.
 
-    Performs some small allocations to prepare the inputs and
-    performs O(gridsize) checks to determine grid regularity
-    unless `assume_regular` is set. To avoid this overhead entirely,
-    use the persistent wrapper classes or raw bindings instead.
+    Performs some small allocations to prepare the inputs.
+    Use the persistent wrapper classes or raw bindings instead
+    to avoid this overhead entirely.
 
     Reallocates input arrays if and only if they are not contiguous yet.
 
@@ -78,9 +77,11 @@ def interpn(
         out: Optional preallocated array that receives the result.
         linearize_extrapolation: Whether cubic extrapolation should fall back to
             linear behaviour outside the grid bounds.
-        assume_regular: Treat the grid as regular without checking spacing.
-        check_bounds: When True, raise if any observation lies outside the grid.
-        bounds_atol: Absolute tolerance for bounds checks, to avoid spurious errors
+        grid_kind: Optional ``"regular"`` or ``"rectilinear"`` to skip
+            grid-shape autodetection.
+        check_bounds_with_atol: When set, raise if any observation lies outside
+            the grid by more than this absolute tolerance.
+        max_threads: Optional upper bound for parallel execution threads.
 
     Returns:
         Interpolated values
@@ -101,106 +102,37 @@ def interpn(
         "`interpn` defined only for float32 and float64 data"
     )
 
-    # Check regularity
-    is_regular = assume_regular or _check_regular(grids)
-
-    if is_regular:
-        dims = np.array([len(grid) for grid in grids], dtype=int)
-        starts = np.array([grid[0] for grid in grids], dtype=dtype)
-        steps = np.array([grid[1] - grid[0] for grid in grids], dtype=dtype)
-    else:
-        # Pyright doesn't understand match-case
-        dims = np.empty((0,), dtype=int)
-        starts = np.empty((0,), dtype=dtype)
-        steps = starts
-
-    # Check bounds
-    if check_bounds:
-        outb = np.zeros_like(out.shape, dtype=bool)
-        match (dtype, is_regular):
-            case (np.float32, True):
-                raw.check_bounds_regular_f32(
-                    dims, starts, steps, obs, atol=bounds_atol, out=outb
-                )
-            case (np.float64, True):
-                raw.check_bounds_regular_f64(
-                    dims, starts, steps, obs, atol=bounds_atol, out=outb
-                )
-            case (np.float32, False):
-                raw.check_bounds_rectilinear_f32(grids, obs, atol=bounds_atol, out=outb)
-            case (np.float64, False):
-                raw.check_bounds_rectilinear_f64(grids, obs, atol=bounds_atol, out=outb)
-
-        if any(outb):
-            raise ValueError("Observation points violate interpolator bounds")
-
     # Do interpolation
-    match (dtype, is_regular, method):
-        case (np.float32, True, "linear"):
-            raw.interpn_linear_regular_f32(dims, starts, steps, vals, obs, out)
-        case (np.float64, True, "linear"):
-            raw.interpn_linear_regular_f64(dims, starts, steps, vals, obs, out)
-        case (np.float32, False, "linear"):
-            raw.interpn_linear_rectilinear_f32(grids, vals, obs, out)
-        case (np.float64, False, "linear"):
-            raw.interpn_linear_rectilinear_f64(grids, vals, obs, out)
-        case (np.float32, True, "nearest"):
-            raw.interpn_nearest_regular_f32(dims, starts, steps, vals, obs, out)
-        case (np.float64, True, "nearest"):
-            raw.interpn_nearest_regular_f64(dims, starts, steps, vals, obs, out)
-        case (np.float32, False, "nearest"):
-            raw.interpn_nearest_rectilinear_f32(grids, vals, obs, out)
-        case (np.float64, False, "nearest"):
-            raw.interpn_nearest_rectilinear_f64(grids, vals, obs, out)
-        case (np.float32, True, "cubic"):
-            raw.interpn_cubic_regular_f32(
-                dims,
-                starts,
-                steps,
-                vals,
-                linearize_extrapolation,
-                obs,
-                out,
-            )
-        case (np.float64, True, "cubic"):
-            raw.interpn_cubic_regular_f64(
-                dims,
-                starts,
-                steps,
-                vals,
-                linearize_extrapolation,
-                obs,
-                out,
-            )
-        case (np.float32, False, "cubic"):
-            raw.interpn_cubic_rectilinear_f32(
+    match dtype:
+        case np.float32:
+            raw.interpn_f32(
                 grids,
                 vals,
-                linearize_extrapolation,
                 obs,
                 out,
+                method=method,
+                grid_kind=grid_kind,
+                linearize_extrapolation=linearize_extrapolation,
+                check_bounds_with_atol=check_bounds_with_atol,
+                max_threads=max_threads,
             )
-        case (np.float64, False, "cubic"):
-            raw.interpn_cubic_rectilinear_f64(
+        case np.float64:
+            raw.interpn_f64(
                 grids,
                 vals,
-                linearize_extrapolation,
                 obs,
                 out,
+                method=method,
+                grid_kind=grid_kind,
+                linearize_extrapolation=linearize_extrapolation,
+                check_bounds_with_atol=check_bounds_with_atol,
+                max_threads=max_threads,
             )
         case _:
             raise ValueError(
                 "Unsupported interpolation configuration:"
-                f" {dtype}, {is_regular}, {method}"
+                f" {dtype}, {method}"
             )
 
     return out.reshape(outshape)
 
-
-def _check_regular(grids: Sequence[NDArray]) -> bool:
-    """Check if grids are all regularly spaced"""
-    is_regular = True
-    for grid in grids:
-        dgrid = np.diff(grid)
-        is_regular = is_regular and np.all(dgrid == dgrid[0])
-    return bool(is_regular)
diff --git a/src/interpn/raw.py b/src/interpn/raw.py
index 1d72221..3b80dac 100644
--- a/src/interpn/raw.py
+++ b/src/interpn/raw.py
@@ -4,6 +4,8 @@
 """
 
 from .interpn import (
+    interpn_f64,
+    interpn_f32,
     interpn_linear_regular_f64,
     interpn_linear_regular_f32,
     interpn_linear_rectilinear_f64,
@@ -23,6 +25,8 @@
 )
 
 __all__ = [
+    "interpn_f64",
+    "interpn_f32",
     "interpn_linear_regular_f64",
     "interpn_linear_regular_f32",
     "interpn_linear_rectilinear_f64",
diff --git a/src/interpn/raw.pyi b/src/interpn/raw.pyi
index af7f424..256ab30 100644
--- a/src/interpn/raw.pyi
+++ b/src/interpn/raw.pyi
@@ -11,6 +11,8 @@ BoolArray = NDArray[np.bool_]
 IntArray = NDArray[np.intp]
 
 __all__ = [
+    "interpn_f64",
+    "interpn_f32",
     "interpn_linear_regular_f64",
     "interpn_linear_regular_f32",
     "interpn_linear_rectilinear_f64",
@@ -29,6 +31,29 @@ __all__ = [
     "check_bounds_rectilinear_f32",
 ]
 
+def interpn_f64(
+    grids: Sequence[NDArrayF64],
+    vals: NDArrayF64,
+    obs: Sequence[NDArrayF64],
+    out: NDArrayF64,
+    method: str = "linear",
+    grid_kind: str | None = None,
+    linearize_extrapolation: bool = True,
+    check_bounds_with_atol: float | None = None,
+    max_threads: int | None = None,
+) -> None: ...
+def interpn_f32(
+    grids: Sequence[NDArrayF32],
+    vals: NDArrayF32,
+    obs: Sequence[NDArrayF32],
+    out: NDArrayF32,
+    method: str = "linear",
+    grid_kind: str | None = None,
+    linearize_extrapolation: bool = True,
+    check_bounds_with_atol: float | None = None,
+    max_threads: int | None = None,
+) -> None: ...
+
 def interpn_linear_regular_f64(
     dims: IntArray,
     starts: NDArrayF64,
diff --git a/src/lib.rs b/src/lib.rs
index 5dd259f..35994b8 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -305,7 +305,10 @@ pub fn interpn_serial<T: Float>(
                 obs,
                 atol,
                 out,
-            )?
+            )?;
+            if bounds.iter().any(|x| *x) {
+                return Err("At least one observation point is outside the grid.")
+            }
         }
         Ok(())
     };
@@ -315,7 +318,10 @@ pub fn interpn_serial<T: Float>(
         if let Some(atol) = check_bounds_with_atol {
             let mut bounds = [false; MAXDIMS];
             let out = &mut bounds[..ndims];
-            multilinear::rectilinear::check_bounds(grids, obs, atol, out)?
+            multilinear::rectilinear::check_bounds(grids, obs, atol, out)?;
+            if bounds.iter().any(|x| *x) {
+                return Err("At least one observation point is outside the grid.")
+            }
         }
         Ok(())
     };

From ce69fdff3270bc79ab24af5165c5bee8dc27e2e3 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Thu, 8 Jan 2026 21:55:57 -0500
Subject: [PATCH 07/20] update clamping order for number of cores

---
 src/lib.rs           |  4 ++--
 test/test_interpn.py | 12 ++++++------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 35994b8..1282de2 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -176,8 +176,8 @@ pub fn interpn<T: Float + Send + Sync>(
 
     // Chunk for parallelism
     let num_cores = rayon::current_num_threads()
-        .max(1)
-        .min(max_threads.unwrap_or(usize::MAX));
+        .min(max_threads.unwrap_or(usize::MAX))
+        .max(1);
     let n = out.len();
     let chunk = 1024.max(n / num_cores);
 
diff --git a/test/test_interpn.py b/test/test_interpn.py
index 989731b..cee796c 100644
--- a/test/test_interpn.py
+++ b/test/test_interpn.py
@@ -17,17 +17,17 @@ def test_interpn_check_bounds_regular(dtype):
         grids=[grid],
         vals=vals,
         method="linear",
-        check_bounds=True,
+        check_bounds_with_atol=1e-8,
     )
     assert inside.shape == obs_inside[0].shape
 
-    with pytest.raises(ValueError):
+    with pytest.raises(AssertionError):
         interpn(
             obs=obs_outside,
             grids=[grid],
             vals=vals,
             method="linear",
-            check_bounds=True,
+            check_bounds_with_atol=1e-8,
         )
 
 
@@ -44,15 +44,15 @@ def test_interpn_check_bounds_rectilinear(dtype):
         grids=[grid],
         vals=vals,
         method="linear",
-        check_bounds=True,
+        check_bounds_with_atol=1e-8,
     )
     assert inside.shape == obs_inside[0].shape
 
-    with pytest.raises(ValueError):
+    with pytest.raises(AssertionError):
         interpn(
             obs=obs_outside,
             grids=[grid],
             vals=vals,
             method="linear",
-            check_bounds=True,
+            check_bounds_with_atol=1e-8,
         )

From afbe98a00b8565ab131d44d88f1ece291461ed73 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Thu, 8 Jan 2026 22:08:48 -0500
Subject: [PATCH 08/20] parametrize parallelism in tests

---
 test/test_interpn.py                 | 10 ++++++++--
 test/test_multicubic_rectilinear.py  |  5 ++++-
 test/test_multicubic_regular.py      |  5 ++++-
 test/test_multilinear_rectilinear.py |  5 ++++-
 test/test_multilinear_regular.py     |  5 ++++-
 test/test_nearest_rectilinear.py     |  5 ++++-
 test/test_nearest_regular.py         |  5 ++++-
 7 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/test/test_interpn.py b/test/test_interpn.py
index cee796c..0f185f8 100644
--- a/test/test_interpn.py
+++ b/test/test_interpn.py
@@ -5,7 +5,8 @@
 
 
 @pytest.mark.parametrize("dtype", [np.float64, np.float32])
-def test_interpn_check_bounds_regular(dtype):
+@pytest.mark.parametrize("max_threads", [None, 1], ids=["parallel", "serial"])
+def test_interpn_check_bounds_regular(dtype, max_threads):
     grid = np.linspace(-1.0, 1.0, 5).astype(dtype)
     vals = np.linspace(0.0, 10.0, grid.size).astype(dtype)
 
@@ -18,6 +19,7 @@ def test_interpn_check_bounds_regular(dtype):
         vals=vals,
         method="linear",
         check_bounds_with_atol=1e-8,
+        max_threads=max_threads,
     )
     assert inside.shape == obs_inside[0].shape
 
@@ -28,11 +30,13 @@ def test_interpn_check_bounds_regular(dtype):
             vals=vals,
             method="linear",
             check_bounds_with_atol=1e-8,
+            max_threads=max_threads,
         )
 
 
 @pytest.mark.parametrize("dtype", [np.float64, np.float32])
-def test_interpn_check_bounds_rectilinear(dtype):
+@pytest.mark.parametrize("max_threads", [None, 1], ids=["parallel", "serial"])
+def test_interpn_check_bounds_rectilinear(dtype, max_threads):
     grid = np.array([-1.0, -0.25, 0.5, 2.0], dtype=dtype)
     vals = np.linspace(0.0, 10.0, grid.size).astype(dtype)
 
@@ -45,6 +49,7 @@ def test_interpn_check_bounds_rectilinear(dtype):
         vals=vals,
         method="linear",
         check_bounds_with_atol=1e-8,
+        max_threads=max_threads,
     )
     assert inside.shape == obs_inside[0].shape
 
@@ -55,4 +60,5 @@ def test_interpn_check_bounds_rectilinear(dtype):
             vals=vals,
             method="linear",
             check_bounds_with_atol=1e-8,
+            max_threads=max_threads,
         )
diff --git a/test/test_multicubic_rectilinear.py b/test/test_multicubic_rectilinear.py
index 65d5b7f..61313dc 100644
--- a/test/test_multicubic_rectilinear.py
+++ b/test/test_multicubic_rectilinear.py
@@ -1,8 +1,10 @@
 import numpy as np
+import pytest
 import interpn
 
 
-def test_multilinear_rectilinear():
+@pytest.mark.parametrize("max_threads", [None, 1], ids=["parallel", "serial"])
+def test_multilinear_rectilinear(max_threads):
     for dtype in [np.float64, np.float32]:
         x = np.linspace(0.0, 10.0, 5).astype(dtype)
         y = np.linspace(20.0, 30.0, 4).astype(dtype)
@@ -48,6 +50,7 @@ def test_multilinear_rectilinear():
             vals=zgrid.flatten(),
             method="cubic",
             linearize_extrapolation=False,
+            max_threads=max_threads,
         )
         for i in range(out_helper.size):
             assert out_helper[i] == zf[i]
diff --git a/test/test_multicubic_regular.py b/test/test_multicubic_regular.py
index 54cfa4a..8989f42 100644
--- a/test/test_multicubic_regular.py
+++ b/test/test_multicubic_regular.py
@@ -1,8 +1,10 @@
 import numpy as np
+import pytest
 import interpn
 
 
-def test_multicubic_regular():
+@pytest.mark.parametrize("max_threads", [None, 1], ids=["parallel", "serial"])
+def test_multicubic_regular(max_threads):
     for dtype, tol in [(np.float64, 1e-12), (np.float32, 1e-6)]:
         x = np.linspace(0.0, 10.0, 7).astype(dtype)
         y = np.linspace(20.0, 30.0, 5).astype(dtype)
@@ -55,6 +57,7 @@ def test_multicubic_regular():
             vals=zgrid.flatten(),
             method="cubic",
             linearize_extrapolation=False,
+            max_threads=max_threads,
         )
         for i in range(out_helper.size):
             assert approx(out_helper[i], zf[i], dtype(tol))
diff --git a/test/test_multilinear_rectilinear.py b/test/test_multilinear_rectilinear.py
index e6b9528..aa1fbe0 100644
--- a/test/test_multilinear_rectilinear.py
+++ b/test/test_multilinear_rectilinear.py
@@ -1,8 +1,10 @@
 import numpy as np
+import pytest
 import interpn
 
 
-def test_multilinear_rectilinear():
+@pytest.mark.parametrize("max_threads", [None, 1], ids=["parallel", "serial"])
+def test_multilinear_rectilinear(max_threads):
     for dtype in [np.float64, np.float32]:
         x = np.linspace(0.0, 10.0, 5).astype(dtype)
         y = np.linspace(20.0, 30.0, 3).astype(dtype)
@@ -46,6 +48,7 @@ def test_multilinear_rectilinear():
             grids=grids,
             vals=zgrid.flatten(),
             method="linear",
+            max_threads=max_threads,
         )
 
         for i in range(out_helper.size):
diff --git a/test/test_multilinear_regular.py b/test/test_multilinear_regular.py
index 4280869..b39d7df 100644
--- a/test/test_multilinear_regular.py
+++ b/test/test_multilinear_regular.py
@@ -1,8 +1,10 @@
 import numpy as np
+import pytest
 import interpn
 
 
-def test_multilinear_regular():
+@pytest.mark.parametrize("max_threads", [None, 1], ids=["parallel", "serial"])
+def test_multilinear_regular(max_threads):
     for dtype in [np.float64, np.float32]:
         x = np.linspace(0.0, 10.0, 5).astype(dtype)
         y = np.linspace(20.0, 30.0, 3).astype(dtype)
@@ -53,6 +55,7 @@ def test_multilinear_regular():
             grids=grids,
             vals=zgrid.flatten(),
             method="linear",
+            max_threads=max_threads,
         )
 
         for i in range(out_helper.size):
diff --git a/test/test_nearest_rectilinear.py b/test/test_nearest_rectilinear.py
index f47182a..9ece6f4 100644
--- a/test/test_nearest_rectilinear.py
+++ b/test/test_nearest_rectilinear.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pytest
 import interpn
 
 
@@ -11,7 +12,8 @@ def _nearest_rectilinear_index(value: float, grid: np.ndarray) -> int:
     return idx if dt <= 0.5 else idx + 1
 
 
-def test_nearest_rectilinear():
+@pytest.mark.parametrize("max_threads", [None, 1], ids=["parallel", "serial"])
+def test_nearest_rectilinear(max_threads):
     for dtype in [np.float64, np.float32]:
         x = np.array([0.0, 1.0, 3.5, 4.0], dtype=dtype)
         y = np.array([-2.0, -0.5, 0.1], dtype=dtype)
@@ -54,6 +56,7 @@ def test_nearest_rectilinear():
             grids=grids,
             vals=zgrid.flatten(),
             method="nearest",
+            max_threads=max_threads,
         )
         np.testing.assert_array_equal(out_helper, expected)
 
diff --git a/test/test_nearest_regular.py b/test/test_nearest_regular.py
index f1d64ca..9f22a8d 100644
--- a/test/test_nearest_regular.py
+++ b/test/test_nearest_regular.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pytest
 import interpn
 
 
@@ -10,7 +11,8 @@ def _nearest_regular_index(value: float, start: float, step: float, size: int) -
     return min(loc if dt <= 0.5 else loc + 1, size - 1)
 
 
-def test_nearest_regular():
+@pytest.mark.parametrize("max_threads", [None, 1], ids=["parallel", "serial"])
+def test_nearest_regular(max_threads):
     for dtype in [np.float64, np.float32]:
         x = np.linspace(0.0, 6.0, 4).astype(dtype)
         y = np.linspace(-3.0, 3.0, 3).astype(dtype)
@@ -65,6 +67,7 @@ def test_nearest_regular():
             grids=grids,
             vals=zgrid.flatten(),
             method="nearest",
+            max_threads=max_threads,
         )
         np.testing.assert_array_equal(out_helper, expected)
 

From f28b5b5dc6c93bc668087b67c67fe1cc96bd8c58 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Thu, 8 Jan 2026 22:49:32 -0500
Subject: [PATCH 09/20] add parallel methods to pgo profile and benchmarks

---
 benches/bench_cpu.py        | 177 ++++++++++++++++++++++++++++++++++++
 scripts/profile_workload.py | 103 +++++++++++----------
 src/interpn/__init__.py     |   6 +-
 3 files changed, 234 insertions(+), 52 deletions(-)

diff --git a/benches/bench_cpu.py b/benches/bench_cpu.py
index ab06eab..d2a858d 100644
--- a/benches/bench_cpu.py
+++ b/benches/bench_cpu.py
@@ -18,6 +18,7 @@
     MulticubicRectilinear,
     NearestRegular,
     NearestRectilinear,
+    interpn as interpn_fn,
 )
 
 # Toggle SciPy/NumPy baselines via environment for PGO workloads.
@@ -50,6 +51,11 @@ def average_call_time(
 
 
 DASH_STYLES = ["solid", "dash", "dot", "dashdot", "longdash", "longdashdot"]
+THREAD_SPEEDUP_COLORS = {
+    "linear": "#1f77b4",
+    "cubic": "#ff7f0e",
+    "nearest": "#2ca02c",
+}
 
 
 def _normalized_line_style(index: int) -> str:
@@ -466,6 +472,176 @@ def _plot_speedup_vs_dims(
         fig.show()
 
 
+def _thread_counts() -> list[int]:
+    max_threads = os.cpu_count() or 1
+    counts = []
+    threads = 1
+    while threads < max_threads:
+        counts.append(threads)
+        threads *= 2
+    counts.append(max_threads)
+    return sorted(set(counts))
+
+
+def _plot_speedup_vs_threads(
+    *,
+    thread_counts: list[int],
+    speedups: dict[str, dict[str, list[float]]],
+    nobs: int,
+    output_path: Path,
+) -> None:
+    fig = make_subplots(
+        rows=1,
+        cols=2,
+        subplot_titles=("Regular Grid", "Rectilinear Grid"),
+        horizontal_spacing=0.08,
+    )
+    for col, grid_kind in enumerate(["regular", "rectilinear"], start=1):
+        for method in ["linear", "cubic", "nearest"]:
+            values = speedups[grid_kind].get(method)
+            if not values:
+                continue
+            fig.add_trace(
+                go.Scatter(
+                    x=thread_counts,
+                    y=values,
+                    mode="lines+markers",
+                    name=method.title(),
+                    line=dict(color=THREAD_SPEEDUP_COLORS[method], width=2),
+                    marker=dict(size=7),
+                    showlegend=col == 1,
+                ),
+                row=1,
+                col=col,
+            )
+        fig.add_hline(
+            y=1.0,
+            line=dict(color="black", dash="dot", width=1),
+            row=1,
+            col=col,
+        )
+        fig.update_xaxes(
+            title_text="Threads",
+            row=1,
+            col=col,
+            showline=True,
+            linecolor="black",
+            linewidth=1,
+            mirror=True,
+            ticks="outside",
+            tickcolor="black",
+            showgrid=False,
+            zeroline=False,
+        )
+        fig.update_yaxes(
+            title_text="Speedup vs. 1 Thread",
+            row=1,
+            col=col,
+            showline=True,
+            linecolor="black",
+            linewidth=1,
+            mirror=True,
+            ticks="outside",
+            tickcolor="black",
+            showgrid=False,
+            zeroline=False,
+        )
+
+    fig.update_layout(
+        title=dict(
+            text=f"InterpN Thread Speedup ({nobs} Observation Points)",
+            y=0.98,
+            yanchor="top",
+        ),
+        height=430,
+        margin=dict(t=70, l=60, r=40, b=80),
+        legend=dict(
+            orientation="h",
+            yanchor="top",
+            y=1.05,
+            x=1.0,
+            xanchor="right",
+        ),
+        plot_bgcolor="rgba(0,0,0,0)",
+        paper_bgcolor="rgba(0,0,0,0)",
+        font=dict(color="black"),
+    )
+    fig.write_image(str(output_path))
+    fig.write_html(
+        str(output_path.with_suffix(".html")),
+        include_plotlyjs="cdn",
+        full_html=False,
+    )
+    fig.show()
+
+
+def bench_thread_speedup_vs_threads():
+    nobs = 10_000
+    ndims = 3
+    ngrid = 20
+    rng = np.random.default_rng(17)
+
+    thread_counts = _thread_counts()
+    speedups: dict[str, dict[str, list[float]]] = {
+        "regular": {},
+        "rectilinear": {},
+    }
+
+    def make_grids(kind: str) -> list[NDArray]:
+        base = np.linspace(-1.0, 1.0, ngrid)
+        if kind == "regular":
+            return [base for _ in range(ndims)]
+        warped = base**3
+        return [warped for _ in range(ndims)]
+
+    for grid_kind in ["regular", "rectilinear"]:
+        grids = make_grids(grid_kind)
+        mesh = np.meshgrid(*grids, indexing="ij")
+        vals = (mesh[0] + 2.0 * mesh[1] - 0.5 * mesh[2]).astype(np.float64)
+        vals_flat = vals.ravel()
+        obs = []
+        for grid in grids:
+            lo = grid[0]
+            hi = grid[-1]
+            span = hi - lo
+            obs.append(
+                rng.uniform(lo + 0.05 * span, hi - 0.05 * span, size=nobs).astype(
+                    np.float64
+                )
+            )
+        out = np.zeros_like(obs[0])
+
+        for method in ["linear", "cubic", "nearest"]:
+            timings = []
+            for threads in thread_counts:
+                timed = average_call_time(
+                    lambda points, threads=threads: interpn_fn(
+                        obs=points,
+                        grids=grids,
+                        vals=vals_flat,
+                        method=method,
+                        out=out,
+                        linearize_extrapolation=False,
+                        grid_kind=grid_kind,
+                        max_threads=threads,
+                    ),
+                    obs,
+                )
+                timings.append(timed)
+            baseline = timings[0]
+            speedups[grid_kind][method] = [
+                baseline / t if t else 0.0 for t in timings
+            ]
+
+    output_path = Path(__file__).parent / "../docs/speedup_vs_threads_10000_obs.svg"
+    _plot_speedup_vs_threads(
+        thread_counts=thread_counts,
+        speedups=speedups,
+        nobs=nobs,
+        output_path=output_path,
+    )
+
+
 def bench_4_dims_1_obs():
     nbench = 30  # Bench iterations
     preallocate = False  # Whether to preallocate output array for InterpN
@@ -1076,6 +1252,7 @@ def bench_throughput_vs_dims():
 
 
 def main():
+    bench_thread_speedup_vs_threads()
     bench_throughput_vs_dims()
     bench_4_dims_1_obs()
     bench_4_dims_n_obs_unordered()
diff --git a/scripts/profile_workload.py b/scripts/profile_workload.py
index 87debc1..4b68e7e 100644
--- a/scripts/profile_workload.py
+++ b/scripts/profile_workload.py
@@ -5,14 +5,7 @@
 
 import numpy as np
 
-from interpn import (
-    MulticubicRectilinear,
-    MulticubicRegular,
-    MultilinearRectilinear,
-    MultilinearRegular,
-    NearestRectilinear,
-    NearestRegular,
-)
+from interpn import interpn as interpn_fn
 
 _TARGET_COUNT = int(1e4)
 _OBSERVATION_COUNTS = (1, 3, 571, 2017)
@@ -34,12 +27,35 @@ def _observation_points(
     return points
 
 
-def _evaluate(interpolator, points: list[np.ndarray]) -> None:
-    # Without preallocated output
-    interpolator.eval(points)
-    # With preallocated output
+def _evaluate(
+    *,
+    grids: list[np.ndarray],
+    vals: np.ndarray,
+    points: list[np.ndarray],
+    method: str,
+    grid_kind: str,
+    max_threads: int | None,
+) -> None:
+    interpn_fn(
+        obs=points,
+        grids=grids,
+        vals=vals,
+        method=method,
+        grid_kind=grid_kind,
+        linearize_extrapolation=True,
+        max_threads=max_threads,
+    )
     out = np.empty_like(points[0])
-    interpolator.eval(points, out)
+    interpn_fn(
+        obs=points,
+        grids=grids,
+        vals=vals,
+        method=method,
+        grid_kind=grid_kind,
+        linearize_extrapolation=True,
+        out=out,
+        max_threads=max_threads,
+    )
 
 
 def main() -> None:
@@ -55,46 +71,37 @@ def main() -> None:
             ]
             mesh = np.meshgrid(*grids, indexing="ij")
             zgrid = rng.uniform(-1.0, 1.0, mesh[0].size).astype(dtype)
-            dims = [grid.size for grid in grids]
-            starts = np.array([grid[0] for grid in grids], dtype=dtype)
-            steps = np.array([grid[1] - grid[0] for grid in grids], dtype=dtype)
-
-            linear_regular = MultilinearRegular.new(dims, starts, steps, zgrid)
-            linear_rect = MultilinearRectilinear.new(grids_rect, zgrid)
-            cubic_regular = MulticubicRegular.new(
-                dims,
-                starts,
-                steps,
-                zgrid,
-                linearize_extrapolation=True,
+            cases = (
+                ("linear", "regular", grids),
+                ("linear", "rectilinear", grids_rect),
+                ("cubic", "regular", grids),
+                ("cubic", "rectilinear", grids_rect),
+                ("nearest", "regular", grids),
+                ("nearest", "rectilinear", grids_rect),
             )
-            cubic_rect = MulticubicRectilinear.new(
-                grids_rect,
-                zgrid,
-                linearize_extrapolation=True,
-            )
-            nearest_regular = NearestRegular.new(dims, starts, steps, zgrid)
-            nearest_rect = NearestRectilinear.new(grids_rect, zgrid)
 
             for nobs in _OBSERVATION_COUNTS:
                 nreps = max(int(_TARGET_COUNT / nobs), 1)
 
-                for interpolator in (
-                    linear_regular,
-                    linear_rect,
-                    cubic_regular,
-                    cubic_rect,
-                    nearest_regular,
-                    nearest_rect,
-                ):
-                    for _ in range(nreps):
-                        points = _observation_points(rng, ndims, nobs, dtype)
-                        _evaluate(interpolator, points)
-
-                    print(
-                        f"Completed {type(interpolator).__name__} "
-                        f"dtype={np.dtype(dtype).name} ndims={ndims} nobs={nobs}"
-                    )
+                for max_threads in (None, 1):
+                    for method, grid_kind, grids_in in cases:
+                        for _ in range(nreps):
+                            points = _observation_points(rng, ndims, nobs, dtype)
+                            _evaluate(
+                                grids=grids_in,
+                                vals=zgrid,
+                                points=points,
+                                method=method,
+                                grid_kind=grid_kind,
+                                max_threads=max_threads,
+                            )
+
+                        mode = "parallel" if max_threads is None else "serial"
+                        print(
+                            f"Completed interpn method={method} grid={grid_kind} "
+                            f"dtype={np.dtype(dtype).name} ndims={ndims} nobs={nobs} "
+                            f"mode={mode}"
+                        )
 
 
 if __name__ == "__main__":
diff --git a/src/interpn/__init__.py b/src/interpn/__init__.py
index 54c4b89..087f0af 100644
--- a/src/interpn/__init__.py
+++ b/src/interpn/__init__.py
@@ -87,7 +87,7 @@ def interpn(
         Interpolated values
     """
     # Allocate for the output if it is not supplied
-    out = out or np.zeros_like(obs[0])
+    out = out if out is not None else np.zeros_like(obs[0])
     outshape = out.shape
     out = out.ravel()  # Flat view without reallocating
 
@@ -130,9 +130,7 @@ def interpn(
             )
         case _:
             raise ValueError(
-                "Unsupported interpolation configuration:"
-                f" {dtype}, {method}"
+                f"Unsupported interpolation configuration: {dtype}, {method}"
             )
 
     return out.reshape(outshape)
-

From 08b6f1cd23e372fee1859c808e872453f7cfb8a0 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Thu, 8 Jan 2026 23:36:27 -0500
Subject: [PATCH 10/20] add serial workflow back in. name pgo profiles per
 process.

---
 benches/bench_cpu.py            |   4 +-
 scripts/distr_pgo_profile.sh    |  19 +++---
 scripts/profile_workload.py     |   8 ++-
 scripts/profile_workload_ser.py | 101 ++++++++++++++++++++++++++++++++
 4 files changed, 121 insertions(+), 11 deletions(-)
 create mode 100644 scripts/profile_workload_ser.py

diff --git a/benches/bench_cpu.py b/benches/bench_cpu.py
index d2a858d..8c4a3ac 100644
--- a/benches/bench_cpu.py
+++ b/benches/bench_cpu.py
@@ -576,7 +576,7 @@ def _plot_speedup_vs_threads(
 
 
 def bench_thread_speedup_vs_threads():
-    nobs = 10_000
+    nobs = 1_000_000
     ndims = 3
     ngrid = 20
     rng = np.random.default_rng(17)
@@ -633,7 +633,7 @@ def make_grids(kind: str) -> list[NDArray]:
                 baseline / t if t else 0.0 for t in timings
             ]
 
-    output_path = Path(__file__).parent / "../docs/speedup_vs_threads_10000_obs.svg"
+    output_path = Path(__file__).parent / "../docs/speedup_vs_threads_1000000_obs.svg"
     _plot_speedup_vs_threads(
         thread_counts=thread_counts,
         speedups=speedups,
diff --git a/scripts/distr_pgo_profile.sh b/scripts/distr_pgo_profile.sh
index 46323e3..68755fa 100644
--- a/scripts/distr_pgo_profile.sh
+++ b/scripts/distr_pgo_profile.sh
@@ -1,23 +1,26 @@
 #!/bin/bash
 
-# Get llvm-profdata from `apt install llvm-20`
-# Must match rust's llvm version or it will crash
+# Get llvm-profdata from `apt install llvm-20`.
+# Must match rust's llvm version or it will crash.
 
-# Build instrumented wheel
+# Build instrumented wheel.
 cargo clean
 uv cache clean
 uv pip install maturin
 rm -rf dist/
 UV_NO_BUILD_CACHE=1 uv run --no-sync maturin build --compatibility pypi --out dist --verbose -- "-Cprofile-generate=${PWD}/scripts/pgo-profiles/pgo.profraw"
 
-# Install instrumented wheel
+# Install instrumented wheel.
 uv pip install $(find dist/ -name '*.whl')[pydantic] --group test --group bench --reinstall
 
-# Clear existing profiles
+# Clear existing profiles.
 rm -rf ./scripts/pgo-profiles; mkdir ./scripts/pgo-profiles; mkdir ./scripts/pgo-profiles/pgo.profraw
 
-# Run reference workload to generate profile
-uv run --no-sync ./scripts/profile_workload.py
+# Run reference workload to generate profile.
+# Name each profile file per-binary (%m), per-process (%p)
+# so that they don't overwrite each other's results.
+# Unfortunately, per-thread (%t) is not always available.
+LLVM_PROFILE_FILE="${PWD}/scripts/pgo-profiles/pgo.profraw/%m-%p.profraw" uv run --no-sync ./scripts/profile_workload.py
 
-# Merge profiles
+# Merge profiles.
 /usr/lib/llvm-21/bin/llvm-profdata merge -o scripts/pgo-profiles/pgo.profdata $(find scripts/pgo-profiles/pgo.profraw -name '*.profraw')
diff --git a/scripts/profile_workload.py b/scripts/profile_workload.py
index 4b68e7e..9dbeccb 100644
--- a/scripts/profile_workload.py
+++ b/scripts/profile_workload.py
@@ -3,12 +3,16 @@
 
 from __future__ import annotations
 
+import subprocess
+import sys
+from pathlib import Path
+
 import numpy as np
 
 from interpn import interpn as interpn_fn
 
 _TARGET_COUNT = int(1e4)
-_OBSERVATION_COUNTS = (1, 3, 571, 2017)
+_OBSERVATION_COUNTS = (1, 3, 571, 2017, int(1e6))
 _MAX_DIMS = 4
 _GRID_SIZE = 30
 
@@ -106,3 +110,5 @@ def main() -> None:
 
 if __name__ == "__main__":
     main()
+    script = Path(__file__).with_name("profile_workload_ser.py")
+    subprocess.run([sys.executable, str(script)], check=True)
diff --git a/scripts/profile_workload_ser.py b/scripts/profile_workload_ser.py
new file mode 100644
index 0000000..87debc1
--- /dev/null
+++ b/scripts/profile_workload_ser.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+"""Lightweight workload used to gather PGO profiles for interpn."""
+
+from __future__ import annotations
+
+import numpy as np
+
+from interpn import (
+    MulticubicRectilinear,
+    MulticubicRegular,
+    MultilinearRectilinear,
+    MultilinearRegular,
+    NearestRectilinear,
+    NearestRegular,
+)
+
+_TARGET_COUNT = int(1e4)
+_OBSERVATION_COUNTS = (1, 3, 571, 2017)
+_MAX_DIMS = 4
+_GRID_SIZE = 30
+
+
+def _observation_points(
+    rng: np.random.Generator, ndims: int, nobs: int, dtype: np.dtype
+) -> tuple[list[np.ndarray], list[np.ndarray]]:
+    """Generate observation points inside and outside the grid domain.
+    The fraction of points outside the domain here will set the relative weight of
+    extrapolation branches.
+    """
+    m = max(int(float(nobs) ** (1.0 / ndims) + 2.0), 2)
+    axes = [rng.uniform(-1.05, 1.05, m).astype(dtype) for _ in range(ndims)]
+    mesh = np.meshgrid(*axes, indexing="ij")
+    points = [axis.flatten()[:nobs].copy() for axis in mesh]
+    return points
+
+
+def _evaluate(interpolator, points: list[np.ndarray]) -> None:
+    # Without preallocated output
+    interpolator.eval(points)
+    # With preallocated output
+    out = np.empty_like(points[0])
+    interpolator.eval(points, out)
+
+
+def main() -> None:
+    rng = np.random.default_rng(2394587)
+
+    for dtype in (np.float64, np.float32):
+        for ndims in range(1, _MAX_DIMS + 1):
+            ngrid = _GRID_SIZE if ndims < 5 else 6
+            grids = [np.linspace(-1.0, 1.0, ngrid, dtype=dtype) for _ in range(ndims)]
+            grids_rect = [
+                np.array(sorted(np.random.uniform(-1.0, 1.0, ngrid).astype(dtype)))
+                for _ in range(ndims)
+            ]
+            mesh = np.meshgrid(*grids, indexing="ij")
+            zgrid = rng.uniform(-1.0, 1.0, mesh[0].size).astype(dtype)
+            dims = [grid.size for grid in grids]
+            starts = np.array([grid[0] for grid in grids], dtype=dtype)
+            steps = np.array([grid[1] - grid[0] for grid in grids], dtype=dtype)
+
+            linear_regular = MultilinearRegular.new(dims, starts, steps, zgrid)
+            linear_rect = MultilinearRectilinear.new(grids_rect, zgrid)
+            cubic_regular = MulticubicRegular.new(
+                dims,
+                starts,
+                steps,
+                zgrid,
+                linearize_extrapolation=True,
+            )
+            cubic_rect = MulticubicRectilinear.new(
+                grids_rect,
+                zgrid,
+                linearize_extrapolation=True,
+            )
+            nearest_regular = NearestRegular.new(dims, starts, steps, zgrid)
+            nearest_rect = NearestRectilinear.new(grids_rect, zgrid)
+
+            for nobs in _OBSERVATION_COUNTS:
+                nreps = max(int(_TARGET_COUNT / nobs), 1)
+
+                for interpolator in (
+                    linear_regular,
+                    linear_rect,
+                    cubic_regular,
+                    cubic_rect,
+                    nearest_regular,
+                    nearest_rect,
+                ):
+                    for _ in range(nreps):
+                        points = _observation_points(rng, ndims, nobs, dtype)
+                        _evaluate(interpolator, points)
+
+                    print(
+                        f"Completed {type(interpolator).__name__} "
+                        f"dtype={np.dtype(dtype).name} ndims={ndims} nobs={nobs}"
+                    )
+
+
+if __name__ == "__main__":
+    main()

From 294f774c1b6f703dffccd31545cb4a679d5a3cfb Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Fri, 9 Jan 2026 18:07:34 -0500
Subject: [PATCH 11/20] use physical core count

---
 Cargo.lock           |  17 ++++++
 Cargo.toml           |   6 +-
 benches/bench_cpu.py | 131 +++++++++++++++++++++++++------------------
 src/lib.rs           |  24 ++++++--
 4 files changed, 116 insertions(+), 62 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 25a0e27..5aeb1ee 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -224,6 +224,12 @@ version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
 
+[[package]]
+name = "hermit-abi"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
+
 [[package]]
 name = "indoc"
 version = "2.0.5"
@@ -239,6 +245,7 @@ dependencies = [
  "itertools 0.14.0",
  "ndarray",
  "num-traits",
+ "num_cpus",
  "numpy",
  "pyo3",
  "rand",
@@ -365,6 +372,16 @@ dependencies = [
  "libm",
 ]
 
+[[package]]
+name = "num_cpus"
+version = "1.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b"
+dependencies = [
+ "hermit-abi",
+ "libc",
+]
+
 [[package]]
 name = "numpy"
 version = "0.27.1"
diff --git a/Cargo.toml b/Cargo.toml
index 125f39f..6c19cf1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -19,7 +19,10 @@ crate-type = ["cdylib", "rlib"]
 # Rust lib deps
 num-traits = { version = "^0.2.19", default-features = false, features = ["libm"] }
 crunchy = { version = "^0.2.4", default-features = false }
+
+# Parallelism
 rayon = { version = "^1.11.0", optional = true }
+num_cpus = { version = "1.17.0", optional = true }
 
 # Python bindings
 pyo3 = { version = "0.27.2", features = ["extension-module", "abi3-py310", "generate-import-lib"], optional = true }
@@ -28,6 +31,7 @@ numpy = { version = "0.27.1", optional = true }
 # Test-only utils
 itertools = { version = "0.14.0", optional = true }
 
+
 [dev-dependencies]
 rand = "0.9.2"
 criterion = "0.8.1"
@@ -39,7 +43,7 @@ python = ["numpy", "pyo3", "std"]
 std = ["itertools"]
 deep-unroll = ["crunchy/limit_256"]
 fma = []
-par = ["std", "rayon"]
+par = ["std", "rayon", "num_cpus"]
 
 [profile.release]
 opt-level = 3
diff --git a/benches/bench_cpu.py b/benches/bench_cpu.py
index 8c4a3ac..5356ff9 100644
--- a/benches/bench_cpu.py
+++ b/benches/bench_cpu.py
@@ -474,6 +474,7 @@ def _plot_speedup_vs_dims(
 
 def _thread_counts() -> list[int]:
     max_threads = os.cpu_count() or 1
+    max_threads = max(int(max_threads / 2), 1)  # Real threads, not hyperthreads
     counts = []
     threads = 1
     while threads < max_threads:
@@ -490,62 +491,88 @@ def _plot_speedup_vs_threads(
     nobs: int,
     output_path: Path,
 ) -> None:
-    fig = make_subplots(
-        rows=1,
-        cols=2,
-        subplot_titles=("Regular Grid", "Rectilinear Grid"),
-        horizontal_spacing=0.08,
-    )
-    for col, grid_kind in enumerate(["regular", "rectilinear"], start=1):
+    fig = make_subplots(rows=1, cols=1)
+    dash_styles = [
+        _normalized_line_style(i) for i in range(len(["linear", "cubic", "nearest"]) * 2)
+    ]
+    all_values = []
+    thread_arr = np.array(thread_counts)
+    series: list[tuple[str, np.ndarray]] = []
+    for grid_kind in ["regular", "rectilinear"]:
         for method in ["linear", "cubic", "nearest"]:
             values = speedups[grid_kind].get(method)
             if not values:
                 continue
-            fig.add_trace(
-                go.Scatter(
-                    x=thread_counts,
-                    y=values,
-                    mode="lines+markers",
-                    name=method.title(),
-                    line=dict(color=THREAD_SPEEDUP_COLORS[method], width=2),
-                    marker=dict(size=7),
-                    showlegend=col == 1,
-                ),
-                row=1,
-                col=col,
-            )
-        fig.add_hline(
-            y=1.0,
-            line=dict(color="black", dash="dot", width=1),
-            row=1,
-            col=col,
-        )
-        fig.update_xaxes(
-            title_text="Threads",
+            values_arr = np.array(values, dtype=float)
+            all_values.append(values_arr)
+            series.append((f"{method.title()} {grid_kind}", values_arr))
+    for _, values_arr in series:
+        ones = np.ones_like(values_arr)
+        fill_between(
+            fig,
+            x=thread_arr,
+            upper=np.maximum(values_arr, ones),
+            lower=np.minimum(values_arr, ones),
             row=1,
-            col=col,
-            showline=True,
-            linecolor="black",
-            linewidth=1,
-            mirror=True,
-            ticks="outside",
-            tickcolor="black",
-            showgrid=False,
-            zeroline=False,
+            col=1,
+            fillcolor="rgba(139, 196, 59, 0.25)",
         )
-        fig.update_yaxes(
-            title_text="Speedup vs. 1 Thread",
+    for idx, (label, values_arr) in enumerate(series):
+        fig.add_trace(
+            go.Scatter(
+                x=thread_arr,
+                y=values_arr,
+                mode="lines+markers",
+                name=label,
+                line=dict(
+                    color="black",
+                    width=2,
+                    dash=dash_styles[idx],
+                ),
+                marker=dict(size=7, color="black"),
+                showlegend=False,
+            ),
             row=1,
-            col=col,
-            showline=True,
-            linecolor="black",
-            linewidth=1,
-            mirror=True,
-            ticks="outside",
-            tickcolor="black",
-            showgrid=False,
-            zeroline=False,
+            col=1,
         )
+    fig.add_hline(
+        y=1.0,
+        line=dict(color="black", dash="dot", width=1),
+        row=1,
+        col=1,
+    )
+    y_min = 1.0
+    if all_values:
+        y_min = min(1.0, min(values.min() for values in all_values))
+    y_max = max(thread_counts) if thread_counts else 1
+    fig.update_xaxes(
+        title_text="Threads",
+        row=1,
+        col=1,
+        range=[min(thread_counts), max(thread_counts)] if thread_counts else None,
+        showline=True,
+        linecolor="black",
+        linewidth=1,
+        mirror=True,
+        ticks="outside",
+        tickcolor="black",
+        showgrid=False,
+        zeroline=False,
+    )
+    fig.update_yaxes(
+        title_text="Speedup vs. 1 Thread",
+        row=1,
+        col=1,
+        range=[y_min, y_max],
+        showline=True,
+        linecolor="black",
+        linewidth=1,
+        mirror=True,
+        ticks="outside",
+        tickcolor="black",
+        showgrid=False,
+        zeroline=False,
+    )
 
     fig.update_layout(
         title=dict(
@@ -555,13 +582,7 @@ def _plot_speedup_vs_threads(
         ),
         height=430,
         margin=dict(t=70, l=60, r=40, b=80),
-        legend=dict(
-            orientation="h",
-            yanchor="top",
-            y=1.05,
-            x=1.0,
-            xanchor="right",
-        ),
+        showlegend=False,
         plot_bgcolor="rgba(0,0,0,0)",
         paper_bgcolor="rgba(0,0,0,0)",
         font=dict(color="black"),
diff --git a/src/lib.rs b/src/lib.rs
index 1282de2..de38158 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -174,10 +174,22 @@ pub fn interpn<T: Float + Send + Sync>(
         return Err(MAXDIMS_ERR);
     }
 
-    // Chunk for parallelism
-    let num_cores = rayon::current_num_threads()
-        .min(max_threads.unwrap_or(usize::MAX))
-        .max(1);
+    // Chunk for parallelism.
+    //
+    // By default, use only physical cores, because on most machines as of
+    // 2026, only half the available cores represent real compute capability due to
+    // the widespread adoption of hyperthreading.
+    //
+    // We also use a minimum chunk size of 1024 as a heuristic, because below that limit,
+    // single-threaded performance is usually faster due to a combination of threading overhead
+    // and memory page sizing.
+    let num_cores_physical = num_cpus::get_physical(); // Real cores
+    let num_cores_pool = rayon::current_num_threads(); // Available cores from rayon thread pool
+    let num_cores_available = num_cores_physical.min(num_cores_pool).max(1); // Real max
+    let num_cores = match max_threads {
+        Some(num_cores_requested) => num_cores_requested.min(num_cores_available),
+        None => num_cores_available,
+    };
     let n = out.len();
     let chunk = 1024.max(n / num_cores);
 
@@ -307,7 +319,7 @@ pub fn interpn_serial<T: Float>(
                 out,
             )?;
             if bounds.iter().any(|x| *x) {
-                return Err("At least one observation point is outside the grid.")
+                return Err("At least one observation point is outside the grid.");
             }
         }
         Ok(())
@@ -320,7 +332,7 @@ pub fn interpn_serial<T: Float>(
             let out = &mut bounds[..ndims];
             multilinear::rectilinear::check_bounds(grids, obs, atol, out)?;
             if bounds.iter().any(|x| *x) {
-                return Err("At least one observation point is outside the grid.")
+                return Err("At least one observation point is outside the grid.");
             }
         }
         Ok(())

From a1376e2d689e072a06815efe9dab52e6a430f7fe Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Sat, 10 Jan 2026 12:42:10 -0500
Subject: [PATCH 12/20] docstring for rust top level function

---
 Cargo.toml                  |   4 +-
 scripts/profile_workload.py |   2 +-
 src/lib.rs                  | 124 ++++++++++++++++++++++++++----------
 3 files changed, 95 insertions(+), 35 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 6c19cf1..847926c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "interpn"
-version = "0.9.2"
+version = "0.10.0"
 edition = "2024"
 rust-version = "1.87" # 2025-05-15
 authors = ["James Logan <jlogan03@gmail.com>"]
@@ -22,7 +22,7 @@ crunchy = { version = "^0.2.4", default-features = false }
 
 # Parallelism
 rayon = { version = "^1.11.0", optional = true }
-num_cpus = { version = "1.17.0", optional = true }
+num_cpus = { version = "^1.17.0", optional = true }
 
 # Python bindings
 pyo3 = { version = "0.27.2", features = ["extension-module", "abi3-py310", "generate-import-lib"], optional = true }
diff --git a/scripts/profile_workload.py b/scripts/profile_workload.py
index 9dbeccb..267789a 100644
--- a/scripts/profile_workload.py
+++ b/scripts/profile_workload.py
@@ -12,7 +12,7 @@
 from interpn import interpn as interpn_fn
 
 _TARGET_COUNT = int(1e4)
-_OBSERVATION_COUNTS = (1, 3, 571, 2017, int(1e6))
+_OBSERVATION_COUNTS = (1, 3, 571, 2017, int(1e4))
 _MAX_DIMS = 4
 _GRID_SIZE = 30
 
diff --git a/src/lib.rs b/src/lib.rs
index de38158..fec1f82 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -157,6 +157,51 @@ const MAXDIMS: usize = 8;
 const MAXDIMS_ERR: &str =
     "Dimension exceeds maximum (8). Use interpolator struct directly for higher dimensions.";
 
+
+/// Evaluate multidimensional interpolation on a regular grid in up to 8 dimensions.
+/// Assumes C-style ordering of vals (z(x0, y0), z(x0, y1), ..., z(x0, yn), z(x1, y0), ...).
+///
+/// For lower dimensions, a fast flattened method is used. For higher dimensions, where that flattening
+/// becomes impractical due to compile times and instruction size, evaluation defers to a bounded
+/// recursion.
+/// The linear method uses the flattening for 1-6 dimensions, while
+/// flattened cubic methods are available up to 3 dimensions by default and up to 4 dimensions
+/// with the `deep_unroll` feature enabled.
+///
+/// This is a convenience function; best performance will be achieved by using the exact right
+/// number for the N parameter, as this will slightly reduce compute and storage overhead,
+/// and the underlying method can be extended to more than this function's limit of 8 dimensions.
+/// The limit of 8 dimensions was chosen for no more specific reason than to reduce unit test times.
+///
+/// While this method initializes the interpolator struct on every call, the overhead of doing this
+/// is minimal even when using it to evaluate one observation point at a time.
+/// 
+/// Like most grid search algorithms (including in the standard library), the uniqueness and
+/// monotonicity of the grid is the responsibility of the user, because checking it is often much
+/// more expensive than the algorithm that we will perform on it. Behavior with ill-posed grids
+/// is undefined.
+/// 
+/// #### Args:
+/// 
+/// * `grids`: `N` slices of each axis' grid coordinates. Must be unique and monotonically increasing.
+/// * `vals`:  Flattened `N`-dimensional array of data values at each grid point in C-style order.
+///          Must be the same length as the cartesian product of the grids, (n_x * n_y * ...).
+/// * `obs`:   `N` slices of Observation points where the interpolant should be evaluated.
+///          Must be of equal length.
+/// * `out`:   Pre-allocated output buffer to place the resulting values.
+///          Must  be the same length as each of the `obs` slices.
+/// * `method`: Choice of interpolant function.
+/// * `assume_grid_kind`: Whether to assume the grid is regular (evenly-spaced),
+///                     rectilinear (un-evenly spaced), or make no assumption.
+///                     If an assumption is provided, this bypasses a check of each
+///                     grid, which can be a major speedup in some cases.
+/// * `linearize_extrapolation`: Whether cubic methods should extrapolate linearly instead of the default
+///                            quadratic extrapolation. Linearization is recommended to prevent
+///                            the interpolant from diverging to extremes outside the grid.
+/// * `check_bounds_with_atol`: If provided, return an error if any observation points are outside the grid
+///                           by an amount exceeding the provided tolerance.
+/// * `max_threads`: If provided, limit number of threads used to at most this number. Otherwise,
+///                use a heuristic to choose the number that will provide the best throughput.
 #[cfg(feature = "par")]
 pub fn interpn<T: Float + Send + Sync>(
     grids: &[&[T]],
@@ -174,15 +219,21 @@ pub fn interpn<T: Float + Send + Sync>(
         return Err(MAXDIMS_ERR);
     }
 
+    // Resolve grid kind, checking the grid if the kind is not provided by the user.
+    // We do this once at the top level so that the work is not repeated by each thread.
+    let kind = resolve_grid_kind(assume_grid_kind, grids)?;
+
     // Chunk for parallelism.
     //
     // By default, use only physical cores, because on most machines as of
     // 2026, only half the available cores represent real compute capability due to
-    // the widespread adoption of hyperthreading.
+    // the widespread adoption of hyperthreading. If a larger number is requested for
+    // max_threads, that value is clamped to the total available threads so that we don't
+    // queue chunks unnecessarily.
     //
     // We also use a minimum chunk size of 1024 as a heuristic, because below that limit,
-    // single-threaded performance is usually faster due to a combination of threading overhead
-    // and memory page sizing.
+    // single-threaded performance is usually faster due to a combination of thread spawning overhead,
+    // memory page sizing, and improved vectorization over larger inputs.
     let num_cores_physical = num_cpus::get_physical(); // Real cores
     let num_cores_pool = rayon::current_num_threads(); // Available cores from rayon thread pool
     let num_cores_available = num_cores_physical.min(num_cores_pool).max(1); // Real max
@@ -225,7 +276,7 @@ pub fn interpn<T: Float + Send + Sync>(
             &obs_slices[..ndims],
             outc,
             method,
-            assume_grid_kind,
+            Some(kind),
             linearize_extrapolation,
             check_bounds_with_atol,
         );
@@ -258,34 +309,8 @@ pub fn interpn_serial<T: Float>(
         return Err(MAXDIMS_ERR);
     }
 
-    // Resolve grid kind, checking the grid if
-    // the kind is not provided by the user.
-    let kind = match assume_grid_kind {
-        Some(GridKind::Regular) => GridKind::Regular,
-        Some(GridKind::Rectilinear) => GridKind::Rectilinear,
-        None => {
-            // Check whether grid is regular
-            let mut is_regular = true;
-
-            for grid in grids.iter() {
-                if grid.len() < 2 {
-                    return Err("All grids must have at least two entries");
-                }
-                let step = grid[1] - grid[0];
-
-                if !grid.windows(2).all(|pair| pair[1] - pair[0] == step) {
-                    is_regular = false;
-                    break;
-                }
-            }
-
-            if is_regular {
-                GridKind::Regular
-            } else {
-                GridKind::Rectilinear
-            }
-        }
-    };
+    // Resolve grid kind, checking the grid if the kind is not provided by the user.
+    let kind = resolve_grid_kind(assume_grid_kind, grids)?;
 
     // Extract regular grid params
     let get_regular_grid = || {
@@ -392,6 +417,41 @@ pub fn interpn_serial<T: Float>(
     }
 }
 
+/// Figure out whether a grid is regular or rectilinear.
+fn resolve_grid_kind<T: Float>(
+    assume_grid_kind: Option<GridKind>,
+    grids: &[&[T]],
+) -> Result<GridKind, &'static str> {
+    let kind = match assume_grid_kind {
+        Some(GridKind::Regular) => GridKind::Regular,
+        Some(GridKind::Rectilinear) => GridKind::Rectilinear,
+        None => {
+            // Check whether grid is regular
+            let mut is_regular = true;
+
+            for grid in grids.iter() {
+                if grid.len() < 2 {
+                    return Err("All grids must have at least two entries");
+                }
+                let step = grid[1] - grid[0];
+
+                if !grid.windows(2).all(|pair| pair[1] - pair[0] == step) {
+                    is_regular = false;
+                    break;
+                }
+            }
+
+            if is_regular {
+                GridKind::Regular
+            } else {
+                GridKind::Rectilinear
+            }
+        }
+    };
+
+    Ok(kind)
+}
+
 /// Index a single value from an array
 #[inline]
 pub(crate) fn index_arr<T: Copy>(loc: &[usize], dimprod: &[usize], data: &[T]) -> T {

From a2ee55b5aa33cbf1dcb6382ec6df3a4da122e0e3 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Sat, 10 Jan 2026 13:02:20 -0500
Subject: [PATCH 13/20] add top level interpn_alloc

---
 src/lib.rs | 50 +++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 45 insertions(+), 5 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index fec1f82..fded30b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -157,7 +157,6 @@ const MAXDIMS: usize = 8;
 const MAXDIMS_ERR: &str =
     "Dimension exceeds maximum (8). Use interpolator struct directly for higher dimensions.";
 
-
 /// Evaluate multidimensional interpolation on a regular grid in up to 8 dimensions.
 /// Assumes C-style ordering of vals (z(x0, y0), z(x0, y1), ..., z(x0, yn), z(x1, y0), ...).
 ///
@@ -175,14 +174,14 @@ const MAXDIMS_ERR: &str =
 ///
 /// While this method initializes the interpolator struct on every call, the overhead of doing this
 /// is minimal even when using it to evaluate one observation point at a time.
-/// 
+///
 /// Like most grid search algorithms (including in the standard library), the uniqueness and
 /// monotonicity of the grid is the responsibility of the user, because checking it is often much
 /// more expensive than the algorithm that we will perform on it. Behavior with ill-posed grids
 /// is undefined.
-/// 
+///
 /// #### Args:
-/// 
+///
 /// * `grids`: `N` slices of each axis' grid coordinates. Must be unique and monotonically increasing.
 /// * `vals`:  Flattened `N`-dimensional array of data values at each grid point in C-style order.
 ///          Must be the same length as the cartesian product of the grids, (n_x * n_y * ...).
@@ -265,7 +264,10 @@ pub fn interpn<T: Float + Send + Sync>(
             let s = &o.get(start..end);
             match s {
                 Some(s) => obs_slices[j] = s,
-                None => write_err("Dimension mismatch"),
+                None => {
+                    write_err("Dimension mismatch");
+                    return;
+                }
             };
         }
 
@@ -294,6 +296,44 @@ pub fn interpn<T: Float + Send + Sync>(
     }
 }
 
+/// Allocating variant of [interpn].
+/// It is recommended to pre-allocate outputs and use the non-allocating variant
+/// whenever possible.
+#[cfg(feature = "par")]
+pub fn interpn_alloc<T: Float + Send + Sync>(
+    grids: &[&[T]],
+    vals: &[T],
+    obs: &[&[T]],
+    out: Option<Vec<T>>,
+    method: GridInterpMethod,
+    assume_grid_kind: Option<GridKind>,
+    linearize_extrapolation: bool,
+    check_bounds_with_atol: Option<T>,
+    max_threads: Option<usize>,
+) -> Result<Vec<T>, &'static str> {
+    // Empty input -> empty output
+    if obs.len() == 0 {
+        return Ok(Vec::with_capacity(0));
+    }
+
+    // If output storage was not provided, build it now
+    let mut out = out.unwrap_or_else(|| vec![T::zero(); obs[0].len()]);
+
+    interpn(
+        grids,
+        vals,
+        obs,
+        &mut out,
+        method,
+        assume_grid_kind,
+        linearize_extrapolation,
+        check_bounds_with_atol,
+        max_threads,
+    )?;
+
+    Ok(out)
+}
+
 pub fn interpn_serial<T: Float>(
     grids: &[&[T]],
     vals: &[T],

From 2c5603ad7267103a829127a0dc32ad2ac6b5c655 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Sat, 10 Jan 2026 13:31:01 -0500
Subject: [PATCH 14/20] use lazylock for physical cores

---
 src/lib.rs | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index fded30b..72ad5b6 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -122,7 +122,7 @@ use rayon::{
 };
 
 #[cfg(feature = "par")]
-use std::sync::Mutex;
+use std::sync::{LazyLock, Mutex};
 
 #[cfg(feature = "std")]
 pub mod utils;
@@ -157,6 +157,19 @@ const MAXDIMS: usize = 8;
 const MAXDIMS_ERR: &str =
     "Dimension exceeds maximum (8). Use interpolator struct directly for higher dimensions.";
 
+/// The number of physical cores present on the machine;
+/// initialized once, then never again, because each call involves some file I/O
+/// and allocations that can be slower than the function call that they support.
+/// 
+/// On subsequent accesses, each access is an atomic load without any waiting paths.
+/// 
+/// This lock can only be contended if multiple threads attempt access
+/// before it is initialized; in that case, the waiting threads may park
+/// until initialization is complete, which can cause ~20us delays
+/// on first access only.
+#[cfg(feature = "par")]
+static PHYSICAL_CORES: LazyLock<usize> = LazyLock::new(num_cpus::get_physical);
+
 /// Evaluate multidimensional interpolation on a regular grid in up to 8 dimensions.
 /// Assumes C-style ordering of vals (z(x0, y0), z(x0, y1), ..., z(x0, yn), z(x1, y0), ...).
 ///
@@ -233,7 +246,7 @@ pub fn interpn<T: Float + Send + Sync>(
     // We also use a minimum chunk size of 1024 as a heuristic, because below that limit,
     // single-threaded performance is usually faster due to a combination of thread spawning overhead,
     // memory page sizing, and improved vectorization over larger inputs.
-    let num_cores_physical = num_cpus::get_physical(); // Real cores
+    let num_cores_physical = *PHYSICAL_CORES; // Real cores, populated on first access
     let num_cores_pool = rayon::current_num_threads(); // Available cores from rayon thread pool
     let num_cores_available = num_cores_physical.min(num_cores_pool).max(1); // Real max
     let num_cores = match max_threads {

From a240647758d37a2aaa8d5b24fb2748850775e89c Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Sat, 10 Jan 2026 13:38:19 -0500
Subject: [PATCH 15/20] add serial path for top level interpn function if there
 are not enough points

---
 src/lib.rs | 103 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 59 insertions(+), 44 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 72ad5b6..b3ad133 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -160,9 +160,9 @@ const MAXDIMS_ERR: &str =
 /// The number of physical cores present on the machine;
 /// initialized once, then never again, because each call involves some file I/O
 /// and allocations that can be slower than the function call that they support.
-/// 
+///
 /// On subsequent accesses, each access is an atomic load without any waiting paths.
-/// 
+///
 /// This lock can only be contended if multiple threads attempt access
 /// before it is initialized; in that case, the waiting threads may park
 /// until initialization is complete, which can cause ~20us delays
@@ -256,56 +256,71 @@ pub fn interpn<T: Float + Send + Sync>(
     let n = out.len();
     let chunk = 1024.max(n / num_cores);
 
-    // Make a shared error indicator
-    let result: Mutex<Option<&'static str>> = Mutex::new(None);
-    let write_err = |msg: &'static str| {
-        let mut guard = result.lock().unwrap();
-        if guard.is_none() {
-            *guard = Some(msg);
-        }
-    };
+    // If there are enough points to justify it, run parallel
+    if chunk > 2 * n {
+        // Make a shared error indicator
+        let result: Mutex<Option<&'static str>> = Mutex::new(None);
+        let write_err = |msg: &'static str| {
+            let mut guard = result.lock().unwrap();
+            if guard.is_none() {
+                *guard = Some(msg);
+            }
+        };
+
+        // Run threaded
+        out.par_chunks_mut(chunk).enumerate().for_each(|(i, outc)| {
+            // Calculate the start and end of observation point chunks
+            let start = chunk * i;
+            let end = start + outc.len();
+
+            // Chunk observation points
+            let mut obs_slices: [&[T]; 8] = [&[]; 8];
+            for (j, o) in obs.iter().enumerate() {
+                let s = &o.get(start..end);
+                match s {
+                    Some(s) => obs_slices[j] = s,
+                    None => {
+                        write_err("Dimension mismatch");
+                        return;
+                    }
+                };
+            }
 
-    // Run threaded
-    out.par_chunks_mut(chunk).enumerate().for_each(|(i, outc)| {
-        // Calculate the start and end of observation point chunks
-        let start = chunk * i;
-        let end = start + outc.len();
-
-        // Chunk observation points
-        let mut obs_slices: [&[T]; 8] = [&[]; 8];
-        for (j, o) in obs.iter().enumerate() {
-            let s = &o.get(start..end);
-            match s {
-                Some(s) => obs_slices[j] = s,
-                None => {
-                    write_err("Dimension mismatch");
-                    return;
-                }
-            };
-        }
+            // Do interpolations
+            let res_inner = interpn_serial(
+                grids,
+                vals,
+                &obs_slices[..ndims],
+                outc,
+                method,
+                Some(kind),
+                linearize_extrapolation,
+                check_bounds_with_atol,
+            );
+
+            match res_inner {
+                Ok(()) => {}
+                Err(msg) => write_err(msg),
+            }
+        });
 
-        // Do interpolations
-        let res_inner = interpn_serial(
+        // Handle errors from threads
+        match *result.lock().unwrap() {
+            Some(msg) => Err(msg),
+            None => Ok(()),
+        }
+    } else {
+        // If there are not enough points to justify parallelism, run serial
+        interpn_serial(
             grids,
             vals,
-            &obs_slices[..ndims],
-            outc,
+            obs,
+            out,
             method,
             Some(kind),
             linearize_extrapolation,
             check_bounds_with_atol,
-        );
-
-        match res_inner {
-            Ok(()) => {}
-            Err(msg) => write_err(msg),
-        }
-    });
-
-    // Handle errors from threads
-    match *result.lock().unwrap() {
-        Some(msg) => Err(msg),
-        None => Ok(()),
+        )
     }
 }
 

From 0b1c48ae14623c546a669ece76230b3739389d36 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Sat, 10 Jan 2026 14:25:38 -0500
Subject: [PATCH 16/20] better heuristic for when to use serial path

---
 src/lib.rs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index b3ad133..46899bb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -156,6 +156,7 @@ pub enum GridKind {
 const MAXDIMS: usize = 8;
 const MAXDIMS_ERR: &str =
     "Dimension exceeds maximum (8). Use interpolator struct directly for higher dimensions.";
+const MIN_CHUNK_SIZE: usize = 1024;
 
 /// The number of physical cores present on the machine;
 /// initialized once, then never again, because each call involves some file I/O
@@ -254,10 +255,10 @@ pub fn interpn<T: Float + Send + Sync>(
         None => num_cores_available,
     };
     let n = out.len();
-    let chunk = 1024.max(n / num_cores);
+    let chunk = MIN_CHUNK_SIZE.max(n / num_cores);
 
     // If there are enough points to justify it, run parallel
-    if chunk > 2 * n {
+    if 2 * MIN_CHUNK_SIZE <= n {
         // Make a shared error indicator
         let result: Mutex<Option<&'static str>> = Mutex::new(None);
         let write_err = |msg: &'static str| {

From 8a49db6bbc28d5ff66c6089b7f77112be43215b0 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Sat, 10 Jan 2026 14:59:49 -0500
Subject: [PATCH 17/20] don't check cores if running serial. use 10 megapixel
 case for threading bench.

---
 benches/bench_cpu.py                      | 23 ++++++-------
 docs/speedup_vs_threads_10000000_obs.html |  2 ++
 src/lib.rs                                | 42 +++++++++++------------
 3 files changed, 34 insertions(+), 33 deletions(-)
 create mode 100644 docs/speedup_vs_threads_10000000_obs.html

diff --git a/benches/bench_cpu.py b/benches/bench_cpu.py
index 5356ff9..b5fcdc3 100644
--- a/benches/bench_cpu.py
+++ b/benches/bench_cpu.py
@@ -493,7 +493,8 @@ def _plot_speedup_vs_threads(
 ) -> None:
     fig = make_subplots(rows=1, cols=1)
     dash_styles = [
-        _normalized_line_style(i) for i in range(len(["linear", "cubic", "nearest"]) * 2)
+        _normalized_line_style(i)
+        for i in range(len(["linear", "cubic", "nearest"]) * 2)
     ]
     all_values = []
     thread_arr = np.array(thread_counts)
@@ -597,9 +598,9 @@ def _plot_speedup_vs_threads(
 
 
 def bench_thread_speedup_vs_threads():
-    nobs = 1_000_000
-    ndims = 3
-    ngrid = 20
+    nobs = 10_000_000
+    ndims = 2
+    ngrid = int(10e6**0.5)
     rng = np.random.default_rng(17)
 
     thread_counts = _thread_counts()
@@ -618,7 +619,7 @@ def make_grids(kind: str) -> list[NDArray]:
     for grid_kind in ["regular", "rectilinear"]:
         grids = make_grids(grid_kind)
         mesh = np.meshgrid(*grids, indexing="ij")
-        vals = (mesh[0] + 2.0 * mesh[1] - 0.5 * mesh[2]).astype(np.float64)
+        vals = (mesh[0] + 2.0 * mesh[1]).astype(np.float64)
         vals_flat = vals.ravel()
         obs = []
         for grid in grids:
@@ -650,11 +651,9 @@ def make_grids(kind: str) -> list[NDArray]:
                 )
                 timings.append(timed)
             baseline = timings[0]
-            speedups[grid_kind][method] = [
-                baseline / t if t else 0.0 for t in timings
-            ]
+            speedups[grid_kind][method] = [baseline / t if t else 0.0 for t in timings]
 
-    output_path = Path(__file__).parent / "../docs/speedup_vs_threads_1000000_obs.svg"
+    output_path = Path(__file__).parent / f"../docs/speedup_vs_threads_{nobs}_obs.svg"
     _plot_speedup_vs_threads(
         thread_counts=thread_counts,
         speedups=speedups,
@@ -847,7 +846,7 @@ def bench_4_dims_1_obs():
 
 
 def bench_3_dims_n_obs_unordered():
-    for preallocate in [False, True]:
+    for preallocate in [True, False]:
         ndims = 3  # Number of grid dimensions
         ngrid = 20  # Size of grid on each dimension
 
@@ -987,7 +986,7 @@ def bench_3_dims_n_obs_unordered():
 
 
 def bench_4_dims_n_obs_unordered():
-    for preallocate in [False, True]:
+    for preallocate in [True, False]:
         ndims = 4  # Number of grid dimensions
         ngrid = 20  # Size of grid on each dimension
 
@@ -1273,8 +1272,8 @@ def bench_throughput_vs_dims():
 
 
 def main():
-    bench_thread_speedup_vs_threads()
     bench_throughput_vs_dims()
+    bench_thread_speedup_vs_threads()
     bench_4_dims_1_obs()
     bench_4_dims_n_obs_unordered()
     bench_3_dims_n_obs_unordered()
diff --git a/docs/speedup_vs_threads_10000000_obs.html b/docs/speedup_vs_threads_10000000_obs.html
new file mode 100644
index 0000000..1c55871
--- /dev/null
+++ b/docs/speedup_vs_threads_10000000_obs.html
@@ -0,0 +1,2 @@
+<div>                        <script type="text/javascript">window.PlotlyConfig = {MathJaxConfig: 'local'};</script>
+        <script charset="utf-8" src="https://cdn.plot.ly/plotly-2.35.2.min.js"></script>                <div id="d5617fce-6e5f-465b-b7ac-67c419ba3952" class="plotly-graph-div" style="height:430px; width:100%;"></div>            <script type="text/javascript">                                    window.PLOTLYENV=window.PLOTLYENV || {};                                    if (document.getElementById("d5617fce-6e5f-465b-b7ac-67c419ba3952")) {                    Plotly.newPlot(                        "d5617fce-6e5f-465b-b7ac-67c419ba3952",                        [{"fill":"toself","fillcolor":"rgba(139, 196, 59, 0.25)","hoverinfo":"skip","line":{"width":0},"mode":"lines","showlegend":false,"x":[1,2,4,6,6,4,2,1],"y":[1.0,1.9231061279250716,3.7108747690570993,4.896694501272425,1.0,1.0,1.0,1.0],"type":"scatter","xaxis":"x","yaxis":"y"},{"fill":"toself","fillcolor":"rgba(139, 196, 59, 0.25)","hoverinfo":"skip","line":{"width":0},"mode":"lines","showlegend":false,"x":[1,2,4,6,6,4,2,1],"y":[1.0,1.9308491100928684,3.9098019654975893,5.848027074580322,1.0,1.0,1.0,1.0],"type":"scatter","xaxis":"x","yaxis":"y"},{"fill":"toself","fillcolor":"rgba(139, 196, 59, 0.25)","hoverinfo":"skip","line":{"width":0},"mode":"lines","showlegend":false,"x":[1,2,4,6,6,4,2,1],"y":[1.0,1.9132181972596194,3.694972076817514,4.876802995471296,1.0,1.0,1.0,1.0],"type":"scatter","xaxis":"x","yaxis":"y"},{"fill":"toself","fillcolor":"rgba(139, 196, 59, 0.25)","hoverinfo":"skip","line":{"width":0},"mode":"lines","showlegend":false,"x":[1,2,4,6,6,4,2,1],"y":[1.0,1.9972020650193796,3.949472542560401,5.67436328971166,1.0,1.0,1.0,1.0],"type":"scatter","xaxis":"x","yaxis":"y"},{"fill":"toself","fillcolor":"rgba(139, 196, 59, 0.25)","hoverinfo":"skip","line":{"width":0},"mode":"lines","showlegend":false,"x":[1,2,4,6,6,4,2,1],"y":[1.0,1.9631273680810757,3.942223778306255,5.644160950452179,1.0,1.0,1.0,1.0],"type":"scatter","xaxis":"x","yaxis":"y"},{"fill":"toself","fillcolor":"rgba(139, 196, 59, 0.25)","hoverinfo":"skip","line":{"width":0},"mode":"lines","showlegend":false,"x":[1,2,4,6,6,4,2,1],"y":[1.0,2.0527945898499493,4.082241246625384,6.016831217312652,1.0,1.0,1.0,1.0],"type":"scatter","xaxis":"x","yaxis":"y"},{"line":{"color":"black","dash":"solid","width":2},"marker":{"color":"black","size":7},"mode":"lines+markers","name":"Linear regular","showlegend":false,"x":[1,2,4,6],"y":[1.0,1.9231061279250716,3.7108747690570993,4.896694501272425],"type":"scatter","xaxis":"x","yaxis":"y"},{"line":{"color":"black","dash":"dash","width":2},"marker":{"color":"black","size":7},"mode":"lines+markers","name":"Cubic regular","showlegend":false,"x":[1,2,4,6],"y":[1.0,1.9308491100928684,3.9098019654975893,5.848027074580322],"type":"scatter","xaxis":"x","yaxis":"y"},{"line":{"color":"black","dash":"dot","width":2},"marker":{"color":"black","size":7},"mode":"lines+markers","name":"Nearest regular","showlegend":false,"x":[1,2,4,6],"y":[1.0,1.9132181972596194,3.694972076817514,4.876802995471296],"type":"scatter","xaxis":"x","yaxis":"y"},{"line":{"color":"black","dash":"dashdot","width":2},"marker":{"color":"black","size":7},"mode":"lines+markers","name":"Linear rectilinear","showlegend":false,"x":[1,2,4,6],"y":[1.0,1.9972020650193796,3.949472542560401,5.67436328971166],"type":"scatter","xaxis":"x","yaxis":"y"},{"line":{"color":"black","dash":"longdash","width":2},"marker":{"color":"black","size":7},"mode":"lines+markers","name":"Cubic rectilinear","showlegend":false,"x":[1,2,4,6],"y":[1.0,1.9631273680810757,3.942223778306255,5.644160950452179],"type":"scatter","xaxis":"x","yaxis":"y"},{"line":{"color":"black","dash":"longdashdot","width":2},"marker":{"color":"black","size":7},"mode":"lines+markers","name":"Nearest rectilinear","showlegend":false,"x":[1,2,4,6],"y":[1.0,2.0527945898499493,4.082241246625384,6.016831217312652],"type":"scatter","xaxis":"x","yaxis":"y"}],                        {"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"xaxis":{"anchor":"y","domain":[0.0,1.0],"title":{"text":"Threads"},"range":[1,6],"showline":true,"linecolor":"black","linewidth":1,"mirror":true,"ticks":"outside","tickcolor":"black","showgrid":false,"zeroline":false},"yaxis":{"anchor":"x","domain":[0.0,1.0],"title":{"text":"Speedup vs. 1 Thread"},"range":[1.0,6],"showline":true,"linecolor":"black","linewidth":1,"mirror":true,"ticks":"outside","tickcolor":"black","showgrid":false,"zeroline":false},"shapes":[{"line":{"color":"black","dash":"dot","width":1},"type":"line","x0":0,"x1":1,"xref":"x domain","y0":1.0,"y1":1.0,"yref":"y"}],"title":{"text":"InterpN Thread Speedup (10000000 Observation Points)","y":0.98,"yanchor":"top"},"margin":{"t":70,"l":60,"r":40,"b":80},"font":{"color":"black"},"height":430,"showlegend":false,"plot_bgcolor":"rgba(0,0,0,0)","paper_bgcolor":"rgba(0,0,0,0)"},                        {"responsive": true}                    )                };                            </script>        </div>
\ No newline at end of file
diff --git a/src/lib.rs b/src/lib.rs
index 46899bb..c5dac83 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -231,34 +231,34 @@ pub fn interpn<T: Float + Send + Sync>(
     if ndims > MAXDIMS {
         return Err(MAXDIMS_ERR);
     }
+    let n = out.len();
 
     // Resolve grid kind, checking the grid if the kind is not provided by the user.
     // We do this once at the top level so that the work is not repeated by each thread.
     let kind = resolve_grid_kind(assume_grid_kind, grids)?;
 
-    // Chunk for parallelism.
-    //
-    // By default, use only physical cores, because on most machines as of
-    // 2026, only half the available cores represent real compute capability due to
-    // the widespread adoption of hyperthreading. If a larger number is requested for
-    // max_threads, that value is clamped to the total available threads so that we don't
-    // queue chunks unnecessarily.
-    //
-    // We also use a minimum chunk size of 1024 as a heuristic, because below that limit,
-    // single-threaded performance is usually faster due to a combination of thread spawning overhead,
-    // memory page sizing, and improved vectorization over larger inputs.
-    let num_cores_physical = *PHYSICAL_CORES; // Real cores, populated on first access
-    let num_cores_pool = rayon::current_num_threads(); // Available cores from rayon thread pool
-    let num_cores_available = num_cores_physical.min(num_cores_pool).max(1); // Real max
-    let num_cores = match max_threads {
-        Some(num_cores_requested) => num_cores_requested.min(num_cores_available),
-        None => num_cores_available,
-    };
-    let n = out.len();
-    let chunk = MIN_CHUNK_SIZE.max(n / num_cores);
-
     // If there are enough points to justify it, run parallel
     if 2 * MIN_CHUNK_SIZE <= n {
+        // Chunk for parallelism.
+        //
+        // By default, use only physical cores, because on most machines as of
+        // 2026, only half the available cores represent real compute capability due to
+        // the widespread adoption of hyperthreading. If a larger number is requested for
+        // max_threads, that value is clamped to the total available threads so that we don't
+        // queue chunks unnecessarily.
+        //
+        // We also use a minimum chunk size of 1024 as a heuristic, because below that limit,
+        // single-threaded performance is usually faster due to a combination of thread spawning overhead,
+        // memory page sizing, and improved vectorization over larger inputs.
+        let num_cores_physical = *PHYSICAL_CORES; // Real cores, populated on first access
+        let num_cores_pool = rayon::current_num_threads(); // Available cores from rayon thread pool
+        let num_cores_available = num_cores_physical.min(num_cores_pool).max(1); // Real max
+        let num_cores = match max_threads {
+            Some(num_cores_requested) => num_cores_requested.min(num_cores_available),
+            None => num_cores_available,
+        };
+        let chunk = MIN_CHUNK_SIZE.max(n / num_cores);
+
         // Make a shared error indicator
         let result: Mutex<Option<&'static str>> = Mutex::new(None);
         let write_err = |msg: &'static str| {

From 58ff0a4e081d72491d026b5dd2bf01283a2efbb4 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Sat, 10 Jan 2026 15:37:00 -0500
Subject: [PATCH 18/20] update changelog and docstrings

---
 CHANGELOG.md            | 19 +++++++++++++++++++
 Cargo.lock              |  2 +-
 src/interpn/__init__.py | 13 ++++++++-----
 src/lib.rs              |  1 +
 4 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9a62923..df771bf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,24 @@
 # Changelog
 
+## 0.10.0 2026-01-10
+
+### Added
+
+* Rust
+  * Add top-level `interpn`, `interpn_alloc` and `interpn_serial` methods along with supporting enums for selecting methods
+  * Add `par` feature, enabled by default, that enables parallelism with rayon in `interpn` function
+  * Add lazy static for getting number of physical cores to advise thread chunk sizes
+
+### Changed
+
+* Rust
+  * Update deps
+* Python
+  * !Refactor `interpn` function
+    * Combine `check_bounds: bool` and `bounds_check_atol: float` to `check_bounds_with_atol: float | None`
+    * Add `max_threads: int | None` input to allow manually limiting parallelism
+    * Use rust top-level `interpn` function as backend for method selection, bounds checks, and parallelism
+
 ## 0.9.1 2025-12-31
 
 ### Changed
diff --git a/Cargo.lock b/Cargo.lock
index 5aeb1ee..c1d4854 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -238,7 +238,7 @@ checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5"
 
 [[package]]
 name = "interpn"
-version = "0.9.2"
+version = "0.10.0"
 dependencies = [
  "criterion",
  "crunchy",
diff --git a/src/interpn/__init__.py b/src/interpn/__init__.py
index 087f0af..64cc2ef 100644
--- a/src/interpn/__init__.py
+++ b/src/interpn/__init__.py
@@ -63,16 +63,19 @@ def interpn(
     """
     Evaluate an N-dimensional grid at the supplied observation points.
 
-    Performs some small allocations to prepare the inputs.
-    Use the persistent wrapper classes or raw bindings instead
-    to avoid this overhead entirely.
-
     Reallocates input arrays if and only if they are not contiguous yet.
 
+    Note: values must be defined in C-order, like made by
+    `numpy.meshgrid(*grids, indexing="ij")`. Values on meshgrids defined
+    in graphics-order without `indexing="ij"` will not have the desired effect.
+
+    If a pre-allocated output array is provided, the returned array is a
+    reference to that array.
+
     Args:
         obs: Observation coordinates, one array per dimension.
         grids: Grid axis coordinates, one array per dimension.
-        vals: Values defined on the full tensor-product grid.
+        vals: Values defined on the full cartesian-product grid.
         method: Interpolation kind, one of ``"linear"``, ``"cubic"``, or ``"nearest"``.
         out: Optional preallocated array that receives the result.
         linearize_extrapolation: Whether cubic extrapolation should fall back to
diff --git a/src/lib.rs b/src/lib.rs
index c5dac83..a407d68 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -363,6 +363,7 @@ pub fn interpn_alloc<T: Float + Send + Sync>(
     Ok(out)
 }
 
+/// Single-threaded, non-allocating variant of [interpn] available without `par` feature.
 pub fn interpn_serial<T: Float>(
     grids: &[&[T]],
     vals: &[T],

From dd46e69faf87266da4536f7525d4f1f03ef3a58c Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Sat, 10 Jan 2026 15:40:05 -0500
Subject: [PATCH 19/20] update changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index df771bf..1be8242 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,7 @@
 * Python
   * !Refactor `interpn` function
     * Combine `check_bounds: bool` and `bounds_check_atol: float` to `check_bounds_with_atol: float | None`
+    * Replace `assume_regular` input with optional `grid_kind` to allow assuming either regular or rectilinear, or making no assumption
     * Add `max_threads: int | None` input to allow manually limiting parallelism
     * Use rust top-level `interpn` function as backend for method selection, bounds checks, and parallelism
 

From 42f7b4d74eb44558d956872c59af0cbd54038a03 Mon Sep 17 00:00:00 2001
From: James Logan <jlogan03@gmail.com>
Date: Sat, 10 Jan 2026 15:43:37 -0500
Subject: [PATCH 20/20] format

---
 src/interpn/raw.pyi | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/interpn/raw.pyi b/src/interpn/raw.pyi
index 256ab30..e585c4c 100644
--- a/src/interpn/raw.pyi
+++ b/src/interpn/raw.pyi
@@ -53,7 +53,6 @@ def interpn_f32(
     check_bounds_with_atol: float | None = None,
     max_threads: int | None = None,
 ) -> None: ...
-
 def interpn_linear_regular_f64(
     dims: IntArray,
     starts: NDArrayF64,