diff --git a/numba_celltree/celltree.py b/numba_celltree/celltree.py
index 6d3677a..1591ad9 100644
--- a/numba_celltree/celltree.py
+++ b/numba_celltree/celltree.py
@@ -1,5 +1,6 @@
 from typing import Tuple
 
+import numba as nb
 import numpy as np
 
 from numba_celltree.algorithms import (
@@ -181,7 +182,8 @@ def locate_boxes(self, bbox_coords: FloatArray) -> Tuple[IntArray, IntArray]:
             Indices of the face.
         """
         bbox_coords = cast_bboxes(bbox_coords)
-        return locate_boxes(bbox_coords, self.celltree_data)
+        n_chunks = nb.get_num_threads()
+        return locate_boxes(bbox_coords, self.celltree_data, n_chunks)
 
     def intersect_boxes(
         self, bbox_coords: FloatArray
@@ -205,7 +207,8 @@ def intersect_boxes(
             Area of intersection between the two intersecting faces.
         """
         bbox_coords = cast_bboxes(bbox_coords)
-        i, j = locate_boxes(bbox_coords, self.celltree_data)
+        n_chunks = nb.get_num_threads()
+        i, j = locate_boxes(bbox_coords, self.celltree_data, n_chunks)
         area = box_area_of_intersection(
             bbox_coords=bbox_coords,
             vertices=self.vertices,
@@ -246,7 +249,10 @@ def locate_faces(
         """
         counter_clockwise(vertices, faces)
         bbox_coords = build_bboxes(faces, vertices)
-        shortlist_i, shortlist_j = locate_boxes(bbox_coords, self.celltree_data)
+        n_chunks = nb.get_num_threads()
+        shortlist_i, shortlist_j = locate_boxes(
+            bbox_coords, self.celltree_data, n_chunks
+        )
         intersects = polygons_intersect(
             vertices_a=vertices,
             vertices_b=self.vertices,
diff --git a/numba_celltree/constants.py b/numba_celltree/constants.py
index 4a2162b..bf127a8 100644
--- a/numba_celltree/constants.py
+++ b/numba_celltree/constants.py
@@ -143,7 +143,7 @@ class CellTreeData(NamedTuple):
 # (int(math.ceil(math.log(MAX_N_FACE, 2))) + 1)
 # This is only true for relatively balanced trees. MAX_N_FACE = int(2e9)
 # results in required stack of 32.
-INITIAL_TREE_DEPTH = 32
+INITIAL_STACK_LENGTH = 32
 # Floating point slack
 TOLERANCE_ON_EDGE = 1e-9
 
diff --git a/numba_celltree/creation.py b/numba_celltree/creation.py
index 648bf31..12d3f10 100644
--- a/numba_celltree/creation.py
+++ b/numba_celltree/creation.py
@@ -18,7 +18,11 @@
     NodeDType,
 )
 from numba_celltree.geometry_utils import build_bboxes
-from numba_celltree.utils import allocate_stack, pop, push
+from numba_celltree.utils import (
+    allocate_double_stack,
+    pop_both,
+    push_both,
+)
 
 
 @nb.njit(inline="always")
@@ -227,20 +231,6 @@ def pessimistic_n_nodes(n_polys: int):
     return n_nodes + 1
 
 
-@nb.njit(inline="always")
-def push_both(root_stack, dim_stack, root, dim, size):
-    root_stack, size_root = push(root_stack, root, size)
-    dim_stack, _ = push(dim_stack, dim, size)
-    return root_stack, dim_stack, size_root
-
-
-@nb.njit(inline="always")
-def pop_both(root_stack, dim_stack, size):
-    root, size_root = pop(root_stack, size)
-    dim, _ = pop(dim_stack, size)
-    return root, dim, size_root
-
-
 @nb.njit(cache=True)
 def build(
     nodes: NodeArray,
@@ -251,15 +241,14 @@ def build(
     cells_per_leaf: int,
 ):
     # Cannot compile ahead of time with Numba and recursion
-    # Just use a stack based approach instead
-    root_stack = allocate_stack()
-    dim_stack = allocate_stack()
-    root_stack[0] = 0
-    dim_stack[0] = 0
+    # Just use a stack based approach instead; store root and dim values.
+    stack = allocate_double_stack()
+    stack[0, 0] = 0
+    stack[0, 1] = 0
     size = 1
 
     while size > 0:
-        root_index, dim, size = pop_both(root_stack, dim_stack, size)
+        root_index, dim, size = pop_both(stack, size)
 
         dim_flag = dim
         if dim < 0:
@@ -303,7 +292,7 @@ def build(
                     0,  # size
                 )
             )
-        # NOTA BENE: do not change the default size (0) given to the bucket here
+        # NOTE: do not change the default size (0) given to the bucket here
         # it is used to detect empty buckets later on.
 
         # Now that the buckets are setup, sort them
@@ -361,9 +350,7 @@ def build(
                 if dim_flag >= 0:
                     dim_flag = (not dim) - 2
                     nodes[root_index]["dim"] = not root.dim
-                    root_stack, dim_stack, size = push_both(
-                        root_stack, dim_stack, root_index, dim_flag, size
-                    )
+                    stack, size = push_both(stack, root_index, dim_flag, size)
                 else:  # Already split once, convert to leaf.
                     nodes[root_index]["Lmax"] = -1
                     nodes[root_index]["Rmin"] = -1
@@ -389,12 +376,8 @@ def build(
         node_index = push_node(nodes, left_child, node_index)
         node_index = push_node(nodes, right_child, node_index)
 
-        root_stack, dim_stack, size = push_both(
-            root_stack, dim_stack, child_ind + 1, right_child.dim, size
-        )
-        root_stack, dim_stack, size = push_both(
-            root_stack, dim_stack, child_ind, left_child.dim, size
-        )
+        stack, size = push_both(stack, child_ind + 1, right_child.dim, size)
+        stack, size = push_both(stack, child_ind, left_child.dim, size)
 
     return node_index
 
diff --git a/numba_celltree/query.py b/numba_celltree/query.py
index 459eb03..b229967 100644
--- a/numba_celltree/query.py
+++ b/numba_celltree/query.py
@@ -1,3 +1,5 @@
+from typing import Tuple
+
 import numba as nb
 import numpy as np
 
@@ -25,7 +27,32 @@
     point_in_polygon_or_on_edge,
     to_vector,
 )
-from numba_celltree.utils import allocate_polygon, allocate_stack, pop, push
+from numba_celltree.utils import (
+    allocate_polygon,
+    allocate_stack,
+    allocate_triple_stack,
+    grow,
+    pop,
+    pop_triple,
+    push,
+    push_triple,
+)
+
+
+@nb.njit(inline="always")
+def concatenate_indices(
+    indices: IntArray, counts: IntArray
+) -> Tuple[IntArray, IntArray]:
+    total_size = sum(counts)
+    ii = np.empty(total_size, dtype=IntDType)
+    jj = np.empty(total_size, dtype=IntDType)
+    start = 0
+    for i, size in enumerate(counts):
+        end = start + size
+        ii[start:end] = indices[i][:size, 0]
+        jj[start:end] = indices[i][:size, 1]
+        start = end
+    return ii, jj
 
 
 # Inlining saves about 15% runtime
@@ -90,14 +117,33 @@ def locate_points(
 
 
 @nb.njit(inline="always")
-def locate_box(box: Box, tree: CellTreeData, indices: IntArray, store_indices: bool):
+def locate_box(
+    box: Box, tree: CellTreeData, indices: IntArray, indices_size: int, index: int
+) -> Tuple[int, int]:
+    """
+    Search the tree for a single axis-aligned box.
+
+    Parameters
+    ----------
+    box: Box named tuple
+    tree: CellTreeData
+    indices: IntArray
+        Array for results. Contains ``index`` of the box we're searching for
+        the first column, and the index of the box in the celltree (if any) in
+        the second.
+    indices_size: int
+        Current number of filled in values in ``indices``.
+    index: int
+        Current index of the box we're searching.
+    """
     tree_bbox = as_box(tree.bbox)
     if not boxes_intersect(box, tree_bbox):
-        return 0
+        return 0, indices_size
     stack = allocate_stack()
     stack[0] = 0
     size = 1
     count = 0
+    length = len(indices)
 
     while size > 0:
         node_index, size = pop(stack, size)
@@ -110,8 +156,15 @@ def locate_box(box: Box, tree: CellTreeData, indices: IntArray, store_indices: b
                 # As a named tuple: saves about 15% runtime
                 leaf_box = as_box(tree.bb_coords[bbox_index])
                 if boxes_intersect(box, leaf_box):
-                    if store_indices:
-                        indices[count] = bbox_index
+                    # Exit if we need to re-allocate the array. Exiting instead
+                    # of drawing the re-allocation logic in here makes a
+                    # significant runtime difference; seems like numba can
+                    # optimize this form better.
+                    if indices_size >= length:
+                        return -1, indices_size
+                    indices[indices_size, 0] = index
+                    indices[indices_size, 1] = bbox_index
+                    indices_size += 1
                     count += 1
         else:
             dim = 1 if node["dim"] else 0
@@ -130,51 +183,52 @@ def locate_box(box: Box, tree: CellTreeData, indices: IntArray, store_indices: b
             elif right:
                 stack, size = push(stack, right_child, size)
 
-    return count
+    return count, indices_size
 
 
-@nb.njit(parallel=PARALLEL, cache=True)
-def locate_boxes(
+@nb.njit(cache=True)
+def locate_boxes_helper(
     box_coords: FloatArray,
     tree: CellTreeData,
-):
-    # Numba does not support a concurrent list or bag like stucture:
-    # https://github.com/numba/numba/issues/5878
-    # (Standard lists are not thread safe.)
-    # To support parallel execution, we're stuck with numpy arrays therefore.
-    # Since we don't know the number of contained bounding boxes, we traverse
-    # the tree twice: first to count, then allocate, then another time to
-    # actually store the indices.
-    # The cost of traversing twice is roughly a factor two. Since many
-    # computers can parallellize over more than two threads, counting first --
-    # which enables parallelization -- should still result in a net speed up.
-    n_box = box_coords.shape[0]
-    counts = np.empty(n_box + 1, dtype=IntDType)
-    dummy = np.empty((0,), dtype=IntDType)
-    counts[0] = 0
-    # First run a count so we can allocate afterwards
-    for i in nb.prange(n_box):  # pylint: disable=not-an-iterable
-        box = as_box(box_coords[i])
-        counts[i + 1] = locate_box(box, tree, dummy, False)
-
-    # Run a cumulative sum
-    total = 0
-    for i in range(1, n_box + 1):
-        total += counts[i]
-        counts[i] = total
-
-    # Now allocate appropriately
-    ii = np.empty(total, dtype=IntDType)
-    jj = np.empty(total, dtype=IntDType)
-    for i in nb.prange(n_box):  # pylint: disable=not-an-iterable
-        start = counts[i]
-        end = counts[i + 1]
-        ii[start:end] = i
-        indices = jj[start:end]
-        box = as_box(box_coords[i])
-        locate_box(box, tree, indices, True)
-
-    return ii, jj
+    offset: int,
+) -> IntArray:
+    n_box = len(box_coords)
+    # Ensure the initial indices array isn't too small.
+    indices = np.empty((max(n_box, 256), 2), dtype=IntDType)
+    total_count = 0
+    indices_size = 0
+    for box_index in range(n_box):
+        box = as_box(box_coords[box_index])
+        # Re-allocating here is significantly faster than re-allocating inside
+        # of ``locate_box``; presumably because that function is kept simpler and
+        # numba can optimize better. Unfortunately, that means we have to keep
+        # trying until we succeed here; in most cases, success is immediate as
+        # the indices array will have enough capacity.
+        while True:
+            count, indices_size = locate_box(
+                box, tree, indices, indices_size, box_index + offset
+            )
+            if count != -1:
+                break
+            # Not enough capacity: grow capacity, discard partial work, retry.
+            indices_size = total_count
+            indices = grow(indices)
+        total_count += count
+    return indices, total_count
+
+
+@nb.njit(cache=True, parallel=PARALLEL)
+def locate_boxes(box_coords: FloatArray, tree: CellTreeData, n_chunks: int):
+    chunks = np.array_split(box_coords, n_chunks)
+    offsets = np.zeros(n_chunks, dtype=IntDType)
+    for i, chunk in enumerate(chunks[:-1]):
+        offsets[i + 1] = offsets[i] + len(chunk)
+    # Setup (dummy) typed list for numba to store parallel results.
+    indices = [np.empty((0, 2), dtype=IntDType) for _ in range(n_chunks)]
+    counts = np.empty(n_chunks, dtype=IntDType)
+    for i in nb.prange(n_chunks):
+        indices[i], counts[i] = locate_boxes_helper(chunks[i], tree, offsets[i])
+    return concatenate_indices(indices, counts)
 
 
 # Inlining this function drives compilation time through the roof. It's
@@ -340,27 +394,18 @@ def collect_node_bounds(tree: CellTreeData) -> FloatArray:
     node_bounds[0, 2] = tree.bbox[2]
     node_bounds[0, 3] = tree.bbox[3]
 
-    stack = allocate_stack()
-    parent_stack = allocate_stack()
-    side_stack = allocate_stack()
-
-    # Right child
-    stack[0] = 2
-    parent_stack[0] = 0
-    side_stack[0] = 0
-    # Left child
-    stack[1] = 1
-    parent_stack[1] = 0
-    side_stack[1] = 1
-    # Stack size starts at two.
+    # Stack contains: node_index, parent_index, side (right/left)
+    ROOT = 0
+    RIGHT = 0
+    LEFT = 1
+    stack = allocate_triple_stack()
+    stack[0, :] = (2, ROOT, RIGHT)  # Right child
+    stack[1, :] = (1, ROOT, LEFT)  # Left child
     size = 2
 
     while size > 0:
         # Collect from stacks
-        # Sizes are synchronized.
-        parent_index, _ = pop(parent_stack, size)
-        side, _ = pop(side_stack, size)
-        node_index, size = pop(stack, size)
+        node_index, parent_index, side, size = pop_triple(stack, size)
 
         parent = tree.nodes[parent_index]
         bbox = node_bounds[parent_index]
@@ -388,14 +433,9 @@ def collect_node_bounds(tree: CellTreeData) -> FloatArray:
         right_child = left_child + 1
 
         # Right child
-        push(parent_stack, node_index, size)
-        push(side_stack, 0, size)
-        stack, size = push(stack, right_child, size)
-
+        stack, size = push_triple(stack, right_child, node_index, RIGHT, size)
         # Left child
-        push(parent_stack, node_index, size)
-        push(side_stack, 1, size)
-        stack, size = push(stack, left_child, size)
+        stack, size = push_triple(stack, left_child, node_index, LEFT, size)
 
     return node_bounds
 
diff --git a/numba_celltree/utils.py b/numba_celltree/utils.py
index 15f78c5..4c77ebe 100644
--- a/numba_celltree/utils.py
+++ b/numba_celltree/utils.py
@@ -7,7 +7,7 @@
 from numba.extending import intrinsic
 
 from numba_celltree.constants import (
-    INITIAL_TREE_DEPTH,
+    INITIAL_STACK_LENGTH,
     MAX_N_VERTEX,
     NDIM,
     STACK_ALLOCATE_STATIC_ARRAYS,
@@ -16,6 +16,21 @@
 )
 
 
+@nb.njit(inline="always")
+def allocate_stack():
+    return np.empty(INITIAL_STACK_LENGTH, dtype=IntDType)
+
+
+@nb.njit(inline="always")
+def allocate_double_stack():
+    return np.empty((INITIAL_STACK_LENGTH, 2), dtype=IntDType)
+
+
+@nb.njit(inline="always")
+def allocate_triple_stack():
+    return np.empty((INITIAL_STACK_LENGTH, 3), dtype=IntDType)
+
+
 @nb.njit(inline="always")
 def pop(array, size):
     return array[size - 1], size - 1
@@ -23,10 +38,8 @@ def pop(array, size):
 
 @nb.njit(inline="always")
 def push(array, value, size):
-    if size >= array.size:
-        new = np.empty(array.size * 2, dtype=array.dtype)
-        new[: array.size] = array[:]
-        array = new
+    if size >= len(array):
+        array = grow(array)
     array[size] = value
     return array, size + 1
 
@@ -35,6 +48,53 @@ def push(array, value, size):
 def copy(src, dst, n) -> None:
     for i in range(n):
         dst[i] = src[i]
+    return
+
+
+@nb.njit(inline="always")
+def push_both(stack, a, b, size):
+    if size >= len(stack):
+        stack = grow(stack)
+    stack[size, 0] = a
+    stack[size, 1] = b
+    return stack, size + 1
+
+
+@nb.njit(inline="always")
+def pop_both(stack, size):
+    index = size - 1
+    a = stack[index, 0]
+    b = stack[index, 1]
+    return a, b, index
+
+
+@nb.njit(inline="always")
+def push_triple(stack, a, b, c, size):
+    if size >= len(stack):
+        stack = grow(stack)
+    stack[size, 0] = a
+    stack[size, 1] = b
+    stack[size, 2] = c
+    return stack, size + 1
+
+
+@nb.njit(inline="always")
+def pop_triple(stack, size):
+    index = size - 1
+    a = stack[index, 0]
+    b = stack[index, 1]
+    c = stack[index, 2]
+    return a, b, c, index
+
+
+@nb.njit(inline="always")
+def grow(array: np.ndarray) -> np.ndarray:
+    """Double storage capacity."""
+    n = len(array)
+    new_shape = (2 * n,) + array.shape[1:]
+    new = np.empty(new_shape, dtype=array.dtype)
+    new[:n] = array[:]
+    return new
 
 
 # Ensure these are constants for numba
@@ -94,8 +154,3 @@ def allocate_clip_polygon():
     @nb.njit(inline="always")
     def allocate_box_polygon():
         return np.empty((4, 2), dtype=FloatDType)
-
-
-@nb.njit(inline="always")
-def allocate_stack():
-    return np.empty(INITIAL_TREE_DEPTH, dtype=IntDType)
diff --git a/tests/test_celltree.py b/tests/test_celltree.py
index edd3ae0..41f090c 100644
--- a/tests/test_celltree.py
+++ b/tests/test_celltree.py
@@ -1,3 +1,7 @@
+import os
+
+os.environ["NUMBA_DISABLE_JIT"] = "1"
+
 import pathlib
 import shutil
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 0ead4e7..3863863 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -2,7 +2,19 @@
 import numpy as np
 
 from numba_celltree import utils as ut
-from numba_celltree.constants import INITIAL_TREE_DEPTH, MAX_N_VERTEX
+from numba_celltree.constants import INITIAL_STACK_LENGTH, MAX_N_VERTEX
+
+
+def test_grow():
+    a = np.arange(10)
+    b = ut.grow(a)
+    assert b.shape == (20,)
+    assert np.array_equal(a, b[:10])
+
+    a = np.arange(20).reshape((10, 2))
+    b = ut.grow(a)
+    assert b.shape == (20, 2)
+    assert np.array_equal(a, b[:10])
 
 
 def test_pop():
@@ -49,12 +61,59 @@ def test_copy():
     assert np.array_equal(dst, [0, 1, 2, 0, 0])
 
 
+def test_double_stack():
+    stack = ut.allocate_double_stack()
+    assert stack.shape == (INITIAL_STACK_LENGTH, 2)
+
+    size = 1
+    stack, size = ut.push_both(stack, 1, 2, size)
+    assert size == 2
+    assert len(stack) == INITIAL_STACK_LENGTH
+    assert np.array_equal(stack[1], (1, 2))
+
+    size = INITIAL_STACK_LENGTH
+    stack, size = ut.push_both(stack, 1, 2, size)
+    assert len(stack) == INITIAL_STACK_LENGTH * 2
+    assert size == INITIAL_STACK_LENGTH + 1
+    assert np.array_equal(stack[32], (1, 2))
+
+    a, b, size = ut.pop_both(stack, size)
+    assert size == INITIAL_STACK_LENGTH
+    assert len(stack) == INITIAL_STACK_LENGTH * 2
+    assert a == 1
+    assert b == 2
+
+
+def test_triple_stack():
+    stack = ut.allocate_triple_stack()
+    assert stack.shape == (INITIAL_STACK_LENGTH, 3)
+
+    size = 1
+    stack, size = ut.push_triple(stack, 1, 2, 3, size)
+    assert size == 2
+    assert len(stack) == INITIAL_STACK_LENGTH
+    assert np.array_equal(stack[1], (1, 2, 3))
+
+    size = INITIAL_STACK_LENGTH
+    stack, size = ut.push_triple(stack, 1, 2, 3, size)
+    assert len(stack) == INITIAL_STACK_LENGTH * 2
+    assert size == INITIAL_STACK_LENGTH + 1
+    assert np.array_equal(stack[32], (1, 2, 3))
+
+    a, b, c, size = ut.pop_triple(stack, size)
+    assert size == INITIAL_STACK_LENGTH
+    assert len(stack) == INITIAL_STACK_LENGTH * 2
+    assert a == 1
+    assert b == 2
+    assert c == 3
+
+
 # These array is not returned properly to dynamic python. This is OK: these
 # arrays are exclusively for internal use to temporarily store values.
 @nb.njit
 def do_allocate_stack():
     stack = ut.allocate_stack()
-    return (stack.size == INITIAL_TREE_DEPTH) and (stack[:5].size == 5)
+    return (stack.size == INITIAL_STACK_LENGTH) and (stack[:5].size == 5)
 
 
 def test_allocate_stack():