From 7dffda8fe811671c2822d9ad23f185fe8e5a265c Mon Sep 17 00:00:00 2001 From: suyash67 Date: Sat, 4 Apr 2026 07:40:48 +0000 Subject: [PATCH 1/6] fixed base tables in cycle group batch_mul. --- .../stdlib/primitives/group/cycle_group.cpp | 209 ++++++++++++++++++ .../stdlib/primitives/group/cycle_group.hpp | 24 +- .../primitives/group/cycle_group.test.cpp | 143 ++++++++++++ .../primitives/group/straus_plookup_table.cpp | 140 ++++++++++++ .../primitives/group/straus_plookup_table.hpp | 54 +++++ 5 files changed, 569 insertions(+), 1 deletion(-) create mode 100644 barretenberg/cpp/src/barretenberg/stdlib/primitives/group/straus_plookup_table.cpp create mode 100644 barretenberg/cpp/src/barretenberg/stdlib/primitives/group/straus_plookup_table.hpp diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.cpp b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.cpp index cd1c8acfbe54..9dd85c8292d4 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.cpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.cpp @@ -968,6 +968,215 @@ typename cycle_group::batch_mul_internal_output cycle_group::_ return { accumulator, offset_generator_accumulator }; } +/** + * @brief Internal algorithm to perform a fixed-base batch mul using plookup tables. + * + * @details Computes a batch mul of constant base points using the Straus multiscalar multiplication algorithm. + * For each constant base point, a plookup table (BasicTable) is created with (1 << ROM_TABLE_BITS) entries. + * Unlike ROM tables, plookup tables have zero construction cost and zero finalization overhead. + * Each table read costs exactly 1 lookup gate. 
+ * + * @param scalars Witness scalars to multiply with base points + * @param base_points Constant affine points (SRS elements or similar) + * @param offset_generators Offset points to prevent infinity edge cases (size = base_points.size() + 1) + * @return {accumulator, offset_generator_delta} where result = accumulator - offset_generator_delta + */ +template +typename cycle_group::batch_mul_internal_output cycle_group::_fixed_base_plookup_batch_mul_internal( + const std::span scalars, + const std::span base_points, + const std::span offset_generators) +{ + BB_ASSERT_EQ(!scalars.empty(), true, "Empty scalars provided to fixed base plookup batch mul!"); + BB_ASSERT_EQ(scalars.size(), base_points.size(), "Points/scalars size mismatch in fixed base plookup batch mul!"); + BB_ASSERT_EQ(offset_generators.size(), base_points.size() + 1, "Too few offset generators provided!"); + const size_t num_points = scalars.size(); + + Builder* context = nullptr; + for (const auto& scalar : scalars) { + if (context = scalar.get_context(); context != nullptr) { + break; + } + } + BB_ASSERT(context != nullptr); + + constexpr size_t num_rounds = numeric::ceil_div(cycle_scalar::NUM_BITS, ROM_TABLE_BITS); + + // Decompose each scalar into ROM_TABLE_BITS-bit slices (also enforces range constraints) + std::vector scalar_slices; + scalar_slices.reserve(num_points); + for (const auto& scalar : scalars) { + scalar_slices.emplace_back(context, scalar, ROM_TABLE_BITS); + } + + // Create plookup tables for each constant base point (zero gate cost) + std::vector point_tables; + point_tables.reserve(num_points); + for (size_t i = 0; i < num_points; ++i) { + point_tables.emplace_back(context, base_points[i], offset_generators[i + 1], ROM_TABLE_BITS); + } + + // Compute all intermediate points natively for use as hints in the in-circuit Straus algorithm. + // Using projective coordinates + batch normalize to avoid per-operation modular inversions. 
+ std::vector operation_transcript; + Element offset_generator_accumulator = offset_generators[0]; + { + // Build native straus tables + std::vector> native_straus_tables; + for (size_t i = 0; i < num_points; ++i) { + std::vector table(1UL << ROM_TABLE_BITS); + table[0] = Element(offset_generators[i + 1]); + Element base_proj(base_points[i]); + for (size_t j = 1; j < table.size(); ++j) { + table[j] = table[j - 1] + base_proj; + } + native_straus_tables.emplace_back(std::move(table)); + } + + // Perform Straus algorithm natively + Element accumulator = offset_generators[0]; + for (size_t i = 0; i < num_rounds; ++i) { + if (i != 0) { + for (size_t j = 0; j < ROM_TABLE_BITS; ++j) { + accumulator = accumulator.dbl(); + operation_transcript.push_back(accumulator); + offset_generator_accumulator = offset_generator_accumulator.dbl(); + } + } + for (size_t j = 0; j < num_points; ++j) { + auto slice_value = static_cast(scalar_slices[j].slices_native[num_rounds - i - 1]); + const Element point = native_straus_tables[j][slice_value]; + accumulator += point; + operation_transcript.push_back(accumulator); + offset_generator_accumulator += Element(offset_generators[j + 1]); + } + } + } + + // Batch-normalize all hint points + Element::batch_normalize(operation_transcript.data(), operation_transcript.size()); + std::vector operation_hints; + operation_hints.reserve(operation_transcript.size()); + for (const Element& element : operation_transcript) { + operation_hints.emplace_back(element.x, element.y); + } + + // Execute Straus algorithm in-circuit using plookup reads and precomputed hints + AffineElement* hint_ptr = operation_hints.data(); + cycle_group accumulator = offset_generators[0]; + + for (size_t i = 0; i < num_rounds; ++i) { + if (i != 0) { + for (size_t j = 0; j < ROM_TABLE_BITS; ++j) { + accumulator = accumulator.dbl(*hint_ptr); + hint_ptr++; + } + } + for (size_t j = 0; j < num_points; ++j) { + const field_t scalar_slice = scalar_slices[j][num_rounds - i - 1]; + 
const cycle_group point = point_tables[j].read(scalar_slice); + // Safe to use unconditional_add: all base points are constants hence linearly independent of offset + // generators + accumulator = accumulator.unconditional_add(point, *hint_ptr); + hint_ptr++; + } + } + + accumulator.set_origin_tag(OriginTag::constant()); + return { accumulator, AffineElement(offset_generator_accumulator) }; +} + +/** + * @brief Fixed-base multiscalar multiplication using plookup tables. + * + * @details Optimized MSM for the case where all base points are circuit constants (e.g. SRS elements). + * Uses plookup tables instead of ROM tables, eliminating table construction gates and finalization overhead. + * All base points MUST be constants; witness base points will trigger an assertion failure. + * + * @param constant_points Vector of constant cycle_group points + * @param scalars Vector of cycle_scalar values (may be witnesses or constants) + * @param context Generator context for offset generators + * @return cycle_group The result of sum(scalars[i] * constant_points[i]) + */ +template +cycle_group cycle_group::fixed_batch_mul(const std::vector& constant_points, + const std::vector& scalars, + const GeneratorContext& context) +{ + BB_ASSERT_EQ(scalars.size(), constant_points.size(), "Points/scalars size mismatch in fixed_batch_mul!"); + + if (scalars.empty()) { + return cycle_group{ Group::point_at_infinity }; + } + + // Merge all tags + OriginTag result_tag = OriginTag::constant(); + for (auto [point, scalar] : zip_view(constant_points, scalars)) { + result_tag = OriginTag(result_tag, OriginTag(point.get_origin_tag(), scalar.get_origin_tag())); + } + + std::vector plookup_scalars; + std::vector plookup_points; + bool has_non_constant_component = false; + Element constant_acc = Group::point_at_infinity; + + for (const auto [point, scalar] : zip_view(constant_points, scalars)) { + BB_ASSERT(point.is_constant()); + if (scalar.is_constant()) { + // Both constant: compute natively + 
constant_acc += point.get_value() * scalar.get_value(); + } else { + if (point.get_value().is_point_at_infinity()) { + // Constant infinity contributes nothing, but still need range constraints on scalar + auto* ctx = scalar.get_context(); + ctx->create_limbed_range_constraint(scalar.lo().get_witness_index(), + cycle_scalar::LO_BITS, + ROM_TABLE_BITS, + "fixed_batch_mul: lo range constraint for scalar with constant " + "infinity"); + ctx->create_limbed_range_constraint(scalar.hi().get_witness_index(), + cycle_scalar::HI_BITS, + ROM_TABLE_BITS, + "fixed_batch_mul: hi range constraint for scalar with constant " + "infinity"); + continue; + } + plookup_scalars.push_back(scalar); + plookup_points.push_back(point.get_value()); + has_non_constant_component = true; + } + } + + if (!has_non_constant_component) { + auto result = cycle_group(constant_acc); + result.set_origin_tag(result_tag); + return result; + } + + // Compute offset generators + const size_t num_offset_generators = plookup_points.size() + 1; + const std::span offset_generators = + context.generators->get(num_offset_generators, 0, OFFSET_GENERATOR_DOMAIN_SEPARATOR); + + // Run the plookup-based Straus algorithm + Element offset_accumulator = -constant_acc; + const auto [accumulator, offset_generator_delta] = + _fixed_base_plookup_batch_mul_internal(plookup_scalars, plookup_points, offset_generators); + offset_accumulator += offset_generator_delta; + + // Subtract offset. Since all points are constants and linearly independent of offset generators, + // we can safely use unconditional_add when constant_acc is non-trivial. + cycle_group result; + if (!constant_acc.is_point_at_infinity()) { + result = accumulator.unconditional_add(AffineElement(-offset_accumulator)); + } else { + result = accumulator - cycle_group(AffineElement(offset_accumulator)); + } + + result.set_origin_tag(result_tag); + return result; +} + /** * @brief Multiscalar multiplication algorithm. 
* diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.hpp b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.hpp index 641f52e2fc1f..d38de4e50867 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.hpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.hpp @@ -14,6 +14,7 @@ #include "barretenberg/stdlib/primitives/field/field.hpp" #include "barretenberg/stdlib/primitives/group/cycle_scalar.hpp" #include "barretenberg/stdlib/primitives/group/straus_lookup_table.hpp" +#include "barretenberg/stdlib/primitives/group/straus_plookup_table.hpp" #include "barretenberg/stdlib/primitives/group/straus_scalar_slice.hpp" #include "barretenberg/stdlib_circuit_builders/plookup_tables/fixed_base/fixed_base_params.hpp" #include "barretenberg/transcript/origin_tag.hpp" @@ -52,6 +53,7 @@ template class cycle_group { using BigScalarField = stdlib::bigfield; using cycle_scalar = ::bb::stdlib::cycle_scalar; using straus_lookup_table = ::bb::stdlib::straus_lookup_table; + using straus_plookup_table = ::bb::stdlib::straus_plookup_table; using straus_scalar_slices = ::bb::stdlib::straus_scalar_slices; // Bit-size for scalars represented in the ROM lookup tables used in the variable-base MSM algorithm @@ -128,6 +130,20 @@ template class cycle_group { static cycle_group batch_mul(const std::vector& base_points, const std::vector& scalars, const GeneratorContext& context = {}); + + static cycle_group fixed_batch_mul(const std::vector& constant_points, + const std::vector& scalars, + GeneratorContext context = {}) + { + std::vector cycle_scalars; + for (auto scalar : scalars) { + cycle_scalars.emplace_back(scalar); + } + return fixed_batch_mul(constant_points, cycle_scalars, context); + } + static cycle_group fixed_batch_mul(const std::vector& constant_points, + const std::vector& scalars, + const GeneratorContext& context = {}); cycle_group operator*(const cycle_scalar& scalar) const; 
cycle_group& operator*=(const cycle_scalar& scalar); cycle_group operator*(const BigScalarField& scalar) const; @@ -205,8 +221,9 @@ template class cycle_group { } private: - // Allow straus_lookup_table to access the private constructor for efficiency + // Allow straus_lookup_table and straus_plookup_table to access the private constructor for efficiency friend class ::bb::stdlib::straus_lookup_table; + friend class ::bb::stdlib::straus_plookup_table; // Private constructor that allows explicit control over infinity flag. // Use public constructors or factory methods instead - they auto-detect infinity from coordinates. @@ -225,6 +242,11 @@ template class cycle_group { static batch_mul_internal_output _fixed_base_batch_mul_internal(std::span scalars, std::span base_points); + static batch_mul_internal_output _fixed_base_plookup_batch_mul_internal( + std::span scalars, + std::span base_points, + std::span offset_generators); + // Internal implementation for unconditional_add and unconditional_subtract cycle_group _unconditional_add_or_subtract(const cycle_group& other, bool is_addition, diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.test.cpp b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.test.cpp index 15416d1a5988..0c48b48a87c9 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.test.cpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.test.cpp @@ -2080,4 +2080,147 @@ TYPED_TEST(CycleGroupTest, TestInfinityAutoDetectionInConstructor) EXPECT_FALSE(builder.failed()); EXPECT_TRUE(CircuitChecker::check(builder)); } + +/** + * @brief Test fixed_batch_mul correctness with constant points and witness scalars + */ +TYPED_TEST(CycleGroupTest, TestFixedBatchMul) +{ + STDLIB_TYPE_ALIASES; + auto builder = Builder(); + + constexpr size_t num_points = 8; + std::vector points; + std::vector scalars; + Element expected = Group::point_at_infinity; + + for (size_t 
i = 0; i < num_points; ++i) { + auto element = TestFixture::generators[i]; + typename Group::Fr scalar = Group::Fr::random_element(&engine); + expected += (element * scalar); + // Points are constant, scalars are witnesses + points.emplace_back(cycle_group_ct(element)); + scalars.emplace_back(cycle_group_ct::cycle_scalar::from_witness(&builder, scalar)); + } + + auto result = cycle_group_ct::fixed_batch_mul(points, scalars); + EXPECT_EQ(result.get_value(), AffineElement(expected)); + + EXPECT_FALSE(builder.failed()); + EXPECT_TRUE(CircuitChecker::check(builder)); +} + +/** + * @brief Test fixed_batch_mul with a single constant point + */ +TYPED_TEST(CycleGroupTest, TestFixedBatchMulSinglePoint) +{ + STDLIB_TYPE_ALIASES; + auto builder = Builder(); + + auto element = TestFixture::generators[0]; + typename Group::Fr scalar = Group::Fr::random_element(&engine); + Element expected = element * scalar; + + std::vector points{ cycle_group_ct(element) }; + std::vector scalars{ + cycle_group_ct::cycle_scalar::from_witness(&builder, scalar) + }; + + auto result = cycle_group_ct::fixed_batch_mul(points, scalars); + EXPECT_EQ(result.get_value(), AffineElement(expected)); + + EXPECT_FALSE(builder.failed()); + EXPECT_TRUE(CircuitChecker::check(builder)); +} + +/** + * @brief Test fixed_batch_mul with a zero scalar + */ +TYPED_TEST(CycleGroupTest, TestFixedBatchMulZeroScalar) +{ + STDLIB_TYPE_ALIASES; + auto builder = Builder(); + + auto element = TestFixture::generators[0]; + typename Group::Fr zero_scalar = 0; + + std::vector points{ cycle_group_ct(element) }; + std::vector scalars{ + cycle_group_ct::cycle_scalar::from_witness(&builder, zero_scalar) + }; + + auto result = cycle_group_ct::fixed_batch_mul(points, scalars); + EXPECT_TRUE(result.is_point_at_infinity().get_value()); + + EXPECT_FALSE(builder.failed()); + EXPECT_TRUE(CircuitChecker::check(builder)); +} + +/** + * @brief Profiling comparison: fixed_batch_mul (plookup) vs batch_mul (ROM) for constant points + * @details 
Both approaches compute the same MSM on constant base points with witness scalars. + * fixed_batch_mul should use significantly fewer gates due to zero table construction + * and zero finalization overhead. + */ +TYPED_TEST(CycleGroupTest, TestFixedBatchMulGateComparison) +{ + STDLIB_TYPE_ALIASES; + + constexpr size_t num_points = 128; + + // Generate random constant points and witness scalars + std::vector native_points; + std::vector native_scalars; + for (size_t i = 0; i < num_points; ++i) { + native_points.push_back(Group::one * Group::Fr::random_element(&engine)); + native_scalars.push_back(Group::Fr::random_element(&engine)); + } + + // Compute expected result natively + Element expected = Group::point_at_infinity; + for (size_t i = 0; i < num_points; ++i) { + expected += native_points[i] * native_scalars[i]; + } + + // --- ROM-based batch_mul --- + size_t rom_gates; + { + Builder rom_builder; + std::vector points; + std::vector scalars; + for (size_t i = 0; i < num_points; ++i) { + points.emplace_back(cycle_group_ct(native_points[i])); + scalars.emplace_back(cycle_group_ct::cycle_scalar::from_witness(&rom_builder, native_scalars[i])); + } + auto result = cycle_group_ct::batch_mul(points, scalars); + EXPECT_EQ(result.get_value(), AffineElement(expected)); + EXPECT_TRUE(CircuitChecker::check(rom_builder)); + rom_gates = rom_builder.get_num_finalized_gates_inefficient(); + } + + // --- Plookup-based fixed_batch_mul --- + size_t plookup_gates; + { + Builder plookup_builder; + std::vector points; + std::vector scalars; + for (size_t i = 0; i < num_points; ++i) { + points.emplace_back(cycle_group_ct(native_points[i])); + scalars.emplace_back(cycle_group_ct::cycle_scalar::from_witness(&plookup_builder, native_scalars[i])); + } + auto result = cycle_group_ct::fixed_batch_mul(points, scalars); + EXPECT_EQ(result.get_value(), AffineElement(expected)); + EXPECT_TRUE(CircuitChecker::check(plookup_builder)); + plookup_gates = 
plookup_builder.get_num_finalized_gates_inefficient(); + } + + info("batch_mul (ROM) gates: ", rom_gates); + info("fixed_batch_mul (plookup) gates: ", plookup_gates); + info("gate savings: ", static_cast(rom_gates) - static_cast(plookup_gates)); + + // fixed_batch_mul should be strictly cheaper than ROM-based batch_mul + EXPECT_LT(plookup_gates, rom_gates); +} + #pragma GCC diagnostic pop diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/straus_plookup_table.cpp b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/straus_plookup_table.cpp new file mode 100644 index 000000000000..2f4bdb718918 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/straus_plookup_table.cpp @@ -0,0 +1,140 @@ +#include "./straus_plookup_table.hpp" +#include "./cycle_group.hpp" +#include "barretenberg/stdlib/primitives/circuit_builders/circuit_builders.hpp" + +namespace bb::stdlib { + +/** + * @brief Construct a plookup-based Straus lookup table for a constant base point. + * + * @details Creates a BasicTable with (1 << table_bits) entries of the form: + * { offset_generator + i * base_point } for i in [0, 1 << table_bits) + * + * The table is pushed directly into the builder's lookup_tables deque. Table data becomes part of the + * proving polynomial (zero gate cost). Each subsequent read costs exactly 1 lookup gate. 
+ * + * @param context The circuit builder + * @param base_point Constant base point (must not be a witness) + * @param offset_generator Offset to prevent point-at-infinity edge cases + * @param table_bits Number of bits per table (table has 1 << table_bits entries) + */ +template +straus_plookup_table::straus_plookup_table(Builder* context, + const AffineElement& base_point, + const AffineElement& offset_generator, + size_t table_bits) + : _context(context) +{ + const size_t table_size = 1UL << table_bits; + + // Compute native table entries using projective coordinates, then batch-normalize + std::vector projective_points(table_size); + projective_points[0] = Element(offset_generator); + Element base_proj(base_point); + for (size_t i = 1; i < table_size; ++i) { + projective_points[i] = projective_points[i - 1] + base_proj; + } + Element::batch_normalize(projective_points.data(), table_size); + + native_table.resize(table_size); + for (size_t i = 0; i < table_size; ++i) { + native_table[i] = AffineElement(projective_points[i].x, projective_points[i].y); + } + + // Create a BasicTable and populate its columns + plookup::BasicTable table; + table.id = plookup::BasicTableId::KECCAK_RHO_9; // unused sentinel; table_index is what matters + table.use_twin_keys = false; + table.column_1_step_size = bb::fr(0); + table.column_2_step_size = bb::fr(0); + table.column_3_step_size = bb::fr(0); + table.get_values_from_key = nullptr; + + table.column_1.resize(table_size); + table.column_2.resize(table_size); + table.column_3.resize(table_size); + for (size_t i = 0; i < table_size; ++i) { + table.column_1[i] = bb::fr(i); + table.column_2[i] = native_table[i].x; + table.column_3[i] = native_table[i].y; + } + + // Assign table_index and push into the builder's lookup_tables deque + table.table_index = context->get_num_lookup_tables(); + auto& tables = context->get_lookup_tables(); + tables.emplace_back(std::move(table)); + _table = &tables.back(); + + // This table is built 
entirely from native constants (base_point and offset_generator are AffineElements), + // so the tag is pure constant. If left as the default FREE_WITNESS, merging with a transcript-tagged + // scalar index in read() would trigger "free witness interacting with origin" errors. + tag = OriginTag::constant(); +} + +/** + * @brief Read from the plookup table at the given index. + * + * @details Creates a single lookup gate that constrains: (index, x, y) is a valid row in this table. + * The index's own witness is reused as wire_1 of the gate (not a new variable), so the gate directly + * constrains the scalar slice to a valid (x, y) point — matching the pattern of + * create_gates_from_plookup_accumulators where key_a_index is reused in the first lookup gate. + * + * @param _index The lookup index (witness or constant field element, typically a 4-bit scalar slice) + * @return cycle_group The point at native_table[index] + */ +template cycle_group straus_plookup_table::read(const field_t& _index) +{ + // A plookup gate key must be a witness; convert constants to a witness constrained to the constant value + // (mirrors the same pattern in straus_lookup_table::read and create_gates_from_plookup_accumulators). 
+ field_t index(_index); + if (index.is_constant()) { + index = field_t::from_witness(_context, _index.get_value()); + index.assert_equal(_index.get_value()); + } + + // Get native index value and look up the corresponding point + auto native_index = static_cast(uint256_t(index.get_value())); + BB_ASSERT(native_index < native_table.size()); + const auto& point = native_table[native_index]; + + // Create witnesses for x and y outputs + auto x_idx = _context->add_variable(point.x); + auto y_idx = _context->add_variable(point.y); + + // Record lookup entry in the table's lookup_gates (needed for read_counts construction) + plookup::BasicTable::LookupEntry entry; + entry.key = { uint256_t(native_index), 0 }; + entry.value = { point.x, point.y }; + _table->lookup_gates.emplace_back(entry); + + // Write lookup gate reusing the index's own witness index as the key (wire_1). + // This matches the pattern in create_gates_from_plookup_accumulators where key_a_index is reused + // in the first (and here only) lookup gate, ensuring the key is the actual scalar slice witness. 
+ auto& block = _context->blocks.lookup; + block.populate_wires(index.get_witness_index(), x_idx, y_idx, _context->zero_idx()); + block.set_gate_selector(1); + block.q_3().emplace_back(bb::fr(_table->table_index)); // table identifier + block.q_2().emplace_back(0); // column_1 step size (0 = standalone lookup) + block.q_m().emplace_back(0); // column_2 step size + block.q_c().emplace_back(0); // column_3 step size + block.q_1().emplace_back(0); + block.q_4().emplace_back(0); + + _context->check_selector_length_consistency(); + _context->increment_num_gates(); + + // Wrap output witnesses in field_t and propagate origin tag from the index + field_t x = field_t::from_witness_index(_context, x_idx); + field_t y = field_t::from_witness_index(_context, y_idx); + OriginTag merged_tag(tag, index.get_origin_tag()); + x.set_origin_tag(merged_tag); + y.set_origin_tag(merged_tag); + + // Result is never at infinity due to offset generator in every table entry + return cycle_group(x, y, /*is_infinity=*/bool_t(_context, false), /*assert_on_curve=*/false); +} + +template class straus_plookup_table; +template class straus_plookup_table; + +} // namespace bb::stdlib diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/straus_plookup_table.hpp b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/straus_plookup_table.hpp new file mode 100644 index 000000000000..0d422deaced0 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/straus_plookup_table.hpp @@ -0,0 +1,54 @@ +#pragma once + +#include "barretenberg/stdlib/primitives/field/field.hpp" +#include "barretenberg/stdlib_circuit_builders/plookup_tables/types.hpp" +#include "barretenberg/transcript/origin_tag.hpp" +#include + +namespace bb::stdlib { + +// Forward declaration +template class cycle_group; + +/** + * @brief straus_plookup_table computes a plookup-based lookup table of size 1 << table_bits + * + * @details For a CONSTANT base_point [P] and offset_generator point [G], 
where N = 1 << table_bits, + * the following is computed: + * + * { [G] + 0.[P], [G] + 1.[P], ..., [G] + (N - 1).[P] } + * + * Unlike straus_lookup_table (which uses ROM tables), this class creates plookup BasicTable entries. + * Plookup tables have zero construction cost (table data is part of the proving polynomial) and each + * read costs exactly 1 lookup gate with no finalization overhead. This makes them significantly cheaper + * than ROM tables for fixed/constant base points. + * + * @note This class requires the base point to be a circuit constant (not a witness). For witness base + * points, use straus_lookup_table instead. + * + * @note The offset generator [G] prevents point-at-infinity edge cases, same as in straus_lookup_table. + */ +template class straus_plookup_table { + public: + using field_t = stdlib::field_t; + using bool_t = stdlib::bool_t; + using Curve = typename Builder::EmbeddedCurve; + using Group = typename Curve::Group; + using Element = typename Curve::Element; + using AffineElement = typename Curve::AffineElement; + + straus_plookup_table() = default; + straus_plookup_table(Builder* context, + const AffineElement& base_point, + const AffineElement& offset_generator, + size_t table_bits); + cycle_group read(const field_t& index); + + private: + Builder* _context = nullptr; + plookup::BasicTable* _table = nullptr; // pointer into builder's lookup_tables deque + std::vector native_table; // precomputed table entries for witness generation + OriginTag tag; +}; + +} // namespace bb::stdlib From 5ca8fa7a906ff3a51d34ca165cedc0edf11899ae Mon Sep 17 00:00:00 2001 From: suyash67 Date: Sat, 4 Apr 2026 07:41:09 +0000 Subject: [PATCH 2/6] ipa verifier uses fixed base batch_mul. 
--- .../cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp | 5 +++-- .../barretenberg/dsl/acir_format/gate_count_constants.hpp | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp index e4a3b8555ca2..bde8c2417f10 100644 --- a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp +++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp @@ -803,11 +803,12 @@ template class IPA } // Compute G_zero - // In the native verifier, this uses pippenger. Here we use batch_mul. + // In the native verifier, this uses pippenger. Here we use fixed_batch_mul since all SRS points are + // circuit constants, which uses plookup tables instead of ROM tables and is significantly cheaper. std::vector srs_elements = vk.get_monomial_points(); BB_ASSERT_GTE(srs_elements.size(), poly_length, "Not enough SRS points for IPA!"); srs_elements.resize(poly_length); - Commitment computed_G_zero = Commitment::batch_mul(srs_elements, s_vec); + Commitment computed_G_zero = Commitment::fixed_batch_mul(srs_elements, s_vec); // check the computed G_zero and the claimed G_zero are the same. 
claimed_G_zero.assert_equal(computed_G_zero); BB_ASSERT_EQ(computed_G_zero.get_value(), claimed_G_zero.get_value(), "G_zero doesn't match received G_zero."); diff --git a/barretenberg/cpp/src/barretenberg/dsl/acir_format/gate_count_constants.hpp b/barretenberg/cpp/src/barretenberg/dsl/acir_format/gate_count_constants.hpp index 1daf3fa3d135..7dc4c5d65a46 100644 --- a/barretenberg/cpp/src/barretenberg/dsl/acir_format/gate_count_constants.hpp +++ b/barretenberg/cpp/src/barretenberg/dsl/acir_format/gate_count_constants.hpp @@ -55,7 +55,7 @@ template inline constexpr size_t ASSERT_EQUALITY = ZERO_GATE // Honk Recursion Constants // ======================================== -inline constexpr size_t ROOT_ROLLUP_GATE_COUNT = 12904885; +inline constexpr size_t ROOT_ROLLUP_GATE_COUNT = 9038443; template constexpr std::tuple HONK_RECURSION_CONSTANTS( From 33e1e07cef6b5d3dbec7435ef037c06fd4487583 Mon Sep 17 00:00:00 2001 From: suyash67 Date: Sun, 5 Apr 2026 09:51:40 +0000 Subject: [PATCH 3/6] use 8-bit tables. --- .../commitment_schemes/ipa/ipa.hpp | 5 +- .../dsl/acir_format/gate_count_constants.hpp | 2 +- .../stdlib/primitives/group/cycle_group.cpp | 32 ++++++---- .../stdlib/primitives/group/cycle_group.hpp | 11 ++-- .../primitives/group/cycle_group.test.cpp | 62 +++++++++++++++---- 5 files changed, 82 insertions(+), 30 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp index bde8c2417f10..b0bd4b72f780 100644 --- a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp +++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp @@ -805,10 +805,13 @@ template class IPA // Compute G_zero // In the native verifier, this uses pippenger. Here we use fixed_batch_mul since all SRS points are // circuit constants, which uses plookup tables instead of ROM tables and is significantly cheaper. 
+ // We use 8-bit tables (table_bits=8, 32 rounds) rather than the default 4-bit (64 rounds) because + // table rows are preprocessed and don't cost witness rows; halving the rounds halves lookup/add gates. + // 8-bit is valid since cycle_scalar::LO_BITS (128) is evenly divisible by 8. std::vector srs_elements = vk.get_monomial_points(); BB_ASSERT_GTE(srs_elements.size(), poly_length, "Not enough SRS points for IPA!"); srs_elements.resize(poly_length); - Commitment computed_G_zero = Commitment::fixed_batch_mul(srs_elements, s_vec); + Commitment computed_G_zero = Commitment::fixed_batch_mul(srs_elements, s_vec, {}, 8); // check the computed G_zero and the claimed G_zero are the same. claimed_G_zero.assert_equal(computed_G_zero); BB_ASSERT_EQ(computed_G_zero.get_value(), claimed_G_zero.get_value(), "G_zero doesn't match received G_zero."); diff --git a/barretenberg/cpp/src/barretenberg/dsl/acir_format/gate_count_constants.hpp b/barretenberg/cpp/src/barretenberg/dsl/acir_format/gate_count_constants.hpp index 7dc4c5d65a46..cc316058617e 100644 --- a/barretenberg/cpp/src/barretenberg/dsl/acir_format/gate_count_constants.hpp +++ b/barretenberg/cpp/src/barretenberg/dsl/acir_format/gate_count_constants.hpp @@ -55,7 +55,7 @@ template inline constexpr size_t ASSERT_EQUALITY = ZERO_GATE // Honk Recursion Constants // ======================================== -inline constexpr size_t ROOT_ROLLUP_GATE_COUNT = 9038443; +inline constexpr size_t ROOT_ROLLUP_GATE_COUNT = 6351560; template constexpr std::tuple HONK_RECURSION_CONSTANTS( diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.cpp b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.cpp index 9dd85c8292d4..852175b3c033 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.cpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.cpp @@ -985,7 +985,8 @@ template typename cycle_group::batch_mul_internal_output 
cycle_group::_fixed_base_plookup_batch_mul_internal( const std::span scalars, const std::span base_points, - const std::span offset_generators) + const std::span offset_generators, + const size_t table_bits) { BB_ASSERT_EQ(!scalars.empty(), true, "Empty scalars provided to fixed base plookup batch mul!"); BB_ASSERT_EQ(scalars.size(), base_points.size(), "Points/scalars size mismatch in fixed base plookup batch mul!"); @@ -999,21 +1000,27 @@ typename cycle_group::batch_mul_internal_output cycle_group::_ } } BB_ASSERT(context != nullptr); + BB_ASSERT_EQ(cycle_scalar::LO_BITS % table_bits, + 0UL, + "table_bits must evenly divide cycle_scalar::LO_BITS. The Straus algorithm splits the scalar " + "into lo/hi limbs and decomposes each separately; if LO_BITS is not a multiple of table_bits, " + "the hi-limb slices start at the wrong bit-offset and the MSM result is incorrect. " + "Valid values for table_bits (given LO_BITS=128) are: 1, 2, 4, 8, 16, 32, 64, 128."); - constexpr size_t num_rounds = numeric::ceil_div(cycle_scalar::NUM_BITS, ROM_TABLE_BITS); + const size_t num_rounds = numeric::ceil_div(cycle_scalar::NUM_BITS, table_bits); - // Decompose each scalar into ROM_TABLE_BITS-bit slices (also enforces range constraints) + // Decompose each scalar into table_bits-bit slices (also enforces range constraints) std::vector scalar_slices; scalar_slices.reserve(num_points); for (const auto& scalar : scalars) { - scalar_slices.emplace_back(context, scalar, ROM_TABLE_BITS); + scalar_slices.emplace_back(context, scalar, table_bits); } // Create plookup tables for each constant base point (zero gate cost) std::vector point_tables; point_tables.reserve(num_points); for (size_t i = 0; i < num_points; ++i) { - point_tables.emplace_back(context, base_points[i], offset_generators[i + 1], ROM_TABLE_BITS); + point_tables.emplace_back(context, base_points[i], offset_generators[i + 1], table_bits); } // Compute all intermediate points natively for use as hints in the in-circuit Straus 
algorithm. @@ -1024,7 +1031,7 @@ typename cycle_group::batch_mul_internal_output cycle_group::_ // Build native straus tables std::vector> native_straus_tables; for (size_t i = 0; i < num_points; ++i) { - std::vector table(1UL << ROM_TABLE_BITS); + std::vector table(1UL << table_bits); table[0] = Element(offset_generators[i + 1]); Element base_proj(base_points[i]); for (size_t j = 1; j < table.size(); ++j) { @@ -1037,7 +1044,7 @@ typename cycle_group::batch_mul_internal_output cycle_group::_ Element accumulator = offset_generators[0]; for (size_t i = 0; i < num_rounds; ++i) { if (i != 0) { - for (size_t j = 0; j < ROM_TABLE_BITS; ++j) { + for (size_t j = 0; j < table_bits; ++j) { accumulator = accumulator.dbl(); operation_transcript.push_back(accumulator); offset_generator_accumulator = offset_generator_accumulator.dbl(); @@ -1067,7 +1074,7 @@ typename cycle_group::batch_mul_internal_output cycle_group::_ for (size_t i = 0; i < num_rounds; ++i) { if (i != 0) { - for (size_t j = 0; j < ROM_TABLE_BITS; ++j) { + for (size_t j = 0; j < table_bits; ++j) { accumulator = accumulator.dbl(*hint_ptr); hint_ptr++; } @@ -1101,7 +1108,8 @@ typename cycle_group::batch_mul_internal_output cycle_group::_ template cycle_group cycle_group::fixed_batch_mul(const std::vector& constant_points, const std::vector& scalars, - const GeneratorContext& context) + const GeneratorContext& context, + const size_t table_bits) { BB_ASSERT_EQ(scalars.size(), constant_points.size(), "Points/scalars size mismatch in fixed_batch_mul!"); @@ -1131,12 +1139,12 @@ cycle_group cycle_group::fixed_batch_mul(const std::vectorcreate_limbed_range_constraint(scalar.lo().get_witness_index(), cycle_scalar::LO_BITS, - ROM_TABLE_BITS, + table_bits, "fixed_batch_mul: lo range constraint for scalar with constant " "infinity"); ctx->create_limbed_range_constraint(scalar.hi().get_witness_index(), cycle_scalar::HI_BITS, - ROM_TABLE_BITS, + table_bits, "fixed_batch_mul: hi range constraint for scalar with constant " 
"infinity"); continue; @@ -1161,7 +1169,7 @@ cycle_group cycle_group::fixed_batch_mul(const std::vector class cycle_group { static cycle_group fixed_batch_mul(const std::vector& constant_points, const std::vector& scalars, - GeneratorContext context = {}) + GeneratorContext context = {}, + size_t table_bits = ROM_TABLE_BITS) { std::vector cycle_scalars; for (auto scalar : scalars) { cycle_scalars.emplace_back(scalar); } - return fixed_batch_mul(constant_points, cycle_scalars, context); + return fixed_batch_mul(constant_points, cycle_scalars, context, table_bits); } static cycle_group fixed_batch_mul(const std::vector& constant_points, const std::vector& scalars, - const GeneratorContext& context = {}); + const GeneratorContext& context = {}, + size_t table_bits = ROM_TABLE_BITS); cycle_group operator*(const cycle_scalar& scalar) const; cycle_group& operator*=(const cycle_scalar& scalar); cycle_group operator*(const BigScalarField& scalar) const; @@ -245,7 +247,8 @@ template class cycle_group { static batch_mul_internal_output _fixed_base_plookup_batch_mul_internal( std::span scalars, std::span base_points, - std::span offset_generators); + std::span offset_generators, + size_t table_bits = ROM_TABLE_BITS); // Internal implementation for unconditional_add and unconditional_subtract cycle_group _unconditional_add_or_subtract(const cycle_group& other, diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.test.cpp b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.test.cpp index 0c48b48a87c9..568be7e95d8c 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.test.cpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.test.cpp @@ -2123,9 +2123,8 @@ TYPED_TEST(CycleGroupTest, TestFixedBatchMulSinglePoint) Element expected = element * scalar; std::vector points{ cycle_group_ct(element) }; - std::vector scalars{ - cycle_group_ct::cycle_scalar::from_witness(&builder, 
scalar) - }; + std::vector scalars{ cycle_group_ct::cycle_scalar::from_witness(&builder, + scalar) }; auto result = cycle_group_ct::fixed_batch_mul(points, scalars); EXPECT_EQ(result.get_value(), AffineElement(expected)); @@ -2146,9 +2145,8 @@ TYPED_TEST(CycleGroupTest, TestFixedBatchMulZeroScalar) typename Group::Fr zero_scalar = 0; std::vector points{ cycle_group_ct(element) }; - std::vector scalars{ - cycle_group_ct::cycle_scalar::from_witness(&builder, zero_scalar) - }; + std::vector scalars{ cycle_group_ct::cycle_scalar::from_witness( + &builder, zero_scalar) }; auto result = cycle_group_ct::fixed_batch_mul(points, scalars); EXPECT_TRUE(result.is_point_at_infinity().get_value()); @@ -2183,8 +2181,18 @@ TYPED_TEST(CycleGroupTest, TestFixedBatchMulGateComparison) expected += native_points[i] * native_scalars[i]; } + // Helper: sum all lookup table rows across the builder's registered tables + auto total_table_rows = [](const Builder& b) { + size_t total = 0; + for (const auto& table : b.get_lookup_tables()) { + total += table.size(); + } + return total; + }; + // --- ROM-based batch_mul --- size_t rom_gates; + size_t rom_table_rows; { Builder rom_builder; std::vector points; @@ -2196,11 +2204,13 @@ TYPED_TEST(CycleGroupTest, TestFixedBatchMulGateComparison) auto result = cycle_group_ct::batch_mul(points, scalars); EXPECT_EQ(result.get_value(), AffineElement(expected)); EXPECT_TRUE(CircuitChecker::check(rom_builder)); + rom_table_rows = total_table_rows(rom_builder); rom_gates = rom_builder.get_num_finalized_gates_inefficient(); } - // --- Plookup-based fixed_batch_mul --- + // --- Plookup-based fixed_batch_mul (4-bit) --- size_t plookup_gates; + size_t plookup_table_rows; { Builder plookup_builder; std::vector points; @@ -2209,18 +2219,46 @@ TYPED_TEST(CycleGroupTest, TestFixedBatchMulGateComparison) points.emplace_back(cycle_group_ct(native_points[i])); scalars.emplace_back(cycle_group_ct::cycle_scalar::from_witness(&plookup_builder, native_scalars[i])); } - 
auto result = cycle_group_ct::fixed_batch_mul(points, scalars); + auto result = cycle_group_ct::fixed_batch_mul(points, scalars); // default table_bits=4 EXPECT_EQ(result.get_value(), AffineElement(expected)); EXPECT_TRUE(CircuitChecker::check(plookup_builder)); + plookup_table_rows = total_table_rows(plookup_builder); plookup_gates = plookup_builder.get_num_finalized_gates_inefficient(); } - info("batch_mul (ROM) gates: ", rom_gates); - info("fixed_batch_mul (plookup) gates: ", plookup_gates); - info("gate savings: ", static_cast(rom_gates) - static_cast(plookup_gates)); + // --- Plookup-based fixed_batch_mul with 8-bit tables --- + // 8-bit is valid since cycle_scalar::LO_BITS (128) is divisible by 8. + // Table rows are preprocessed (not witness rows) so increasing table size from 16→256 entries + // is essentially free; only the number of rounds (64→32) and reads/adds matter for gate count. + size_t plookup_8bit_gates; + size_t plookup_8bit_table_rows; + { + Builder plookup_builder; + std::vector points; + std::vector scalars; + for (size_t i = 0; i < num_points; ++i) { + points.emplace_back(cycle_group_ct(native_points[i])); + scalars.emplace_back(cycle_group_ct::cycle_scalar::from_witness(&plookup_builder, native_scalars[i])); + } + auto result = cycle_group_ct::fixed_batch_mul(points, scalars, {}, 8); + EXPECT_EQ(result.get_value(), AffineElement(expected)); + EXPECT_TRUE(CircuitChecker::check(plookup_builder)); + plookup_8bit_table_rows = total_table_rows(plookup_builder); + plookup_8bit_gates = plookup_builder.get_num_finalized_gates_inefficient(); + } + + info(" gates table_rows"); + info("batch_mul (ROM, 4-bit): ", rom_gates, " ", rom_table_rows); + info("fixed_batch_mul (4-bit): ", plookup_gates, " ", plookup_table_rows); + info("fixed_batch_mul (8-bit): ", plookup_8bit_gates, " ", plookup_8bit_table_rows); + info("4-bit savings vs ROM: ", static_cast(rom_gates) - static_cast(plookup_gates)); + info("8-bit savings vs ROM: ", static_cast(rom_gates) - 
static_cast(plookup_8bit_gates)); + info("8-bit savings vs 4-bit: ", + static_cast(plookup_gates) - static_cast(plookup_8bit_gates)); - // fixed_batch_mul should be strictly cheaper than ROM-based batch_mul + // Both plookup variants should be cheaper than ROM; 8-bit should be cheapest EXPECT_LT(plookup_gates, rom_gates); + EXPECT_LT(plookup_8bit_gates, plookup_gates); } #pragma GCC diagnostic pop From ee3f550172bc85e2ab5a18ea71e088f28d22a392 Mon Sep 17 00:00:00 2001 From: suyash67 Date: Sun, 5 Apr 2026 10:19:17 +0000 Subject: [PATCH 4/6] get below 2^23. --- .../barretenberg/commitment_schemes/ipa/ipa.hpp | 16 ++++++++++++---- .../dsl/acir_format/gate_count_constants.hpp | 2 +- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp index b0bd4b72f780..4aa5d23bd9c8 100644 --- a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp +++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp @@ -805,13 +805,21 @@ template class IPA // Compute G_zero // In the native verifier, this uses pippenger. Here we use fixed_batch_mul since all SRS points are // circuit constants, which uses plookup tables instead of ROM tables and is significantly cheaper. - // We use 8-bit tables (table_bits=8, 32 rounds) rather than the default 4-bit (64 rounds) because - // table rows are preprocessed and don't cost witness rows; halving the rounds halves lookup/add gates. - // 8-bit is valid since cycle_scalar::LO_BITS (128) is evenly divisible by 8. + // We use 8-bit tables (table_bits=8, 32 rounds) to minimise gate count. However, with N=32768 SRS points + // and 8-bit tables, the total table rows = 32768 × 256 = 2^23 exactly. The 5 mandatory overhead rows + // (NUM_DISABLED_ROWS_IN_SUMCHECK=4, NUM_ZERO_ROWS=1) push the total to 2^23+5, forcing dyadic_size = 2^24. 
+ // To stay within 2^23 we handle the first SRS point separately with a 4-bit table (16 entries instead of + // 256): total table rows = 16 + 32767×256 = 8,388,368 < 2^23, giving dyadic_size = 2^23 and ~2× speedup. std::vector srs_elements = vk.get_monomial_points(); BB_ASSERT_GTE(srs_elements.size(), poly_length, "Not enough SRS points for IPA!"); srs_elements.resize(poly_length); - Commitment computed_G_zero = Commitment::fixed_batch_mul(srs_elements, s_vec, {}, 8); + std::vector first_srs_point(1, srs_elements[0]); + std::vector first_s_scalar(1, s_vec[0]); + std::vector remaining_srs(srs_elements.begin() + 1, srs_elements.end()); + std::vector remaining_s(s_vec.begin() + 1, s_vec.end()); + Commitment first_term = Commitment::fixed_batch_mul(first_srs_point, first_s_scalar, {}, /*table_bits=*/4); + Commitment remaining_term = Commitment::fixed_batch_mul(remaining_srs, remaining_s, {}, /*table_bits=*/8); + Commitment computed_G_zero = first_term.unconditional_add(remaining_term); // check the computed G_zero and the claimed G_zero are the same. 
claimed_G_zero.assert_equal(computed_G_zero); BB_ASSERT_EQ(computed_G_zero.get_value(), claimed_G_zero.get_value(), "G_zero doesn't match received G_zero."); diff --git a/barretenberg/cpp/src/barretenberg/dsl/acir_format/gate_count_constants.hpp b/barretenberg/cpp/src/barretenberg/dsl/acir_format/gate_count_constants.hpp index cc316058617e..89cf4baf1754 100644 --- a/barretenberg/cpp/src/barretenberg/dsl/acir_format/gate_count_constants.hpp +++ b/barretenberg/cpp/src/barretenberg/dsl/acir_format/gate_count_constants.hpp @@ -55,7 +55,7 @@ template inline constexpr size_t ASSERT_EQUALITY = ZERO_GATE // Honk Recursion Constants // ======================================== -inline constexpr size_t ROOT_ROLLUP_GATE_COUNT = 6351560; +inline constexpr size_t ROOT_ROLLUP_GATE_COUNT = 6351579; template constexpr std::tuple HONK_RECURSION_CONSTANTS( From 6d90fae423daa5d55b9dd7b882da9e1efab66d0d Mon Sep 17 00:00:00 2001 From: suyash67 Date: Sun, 5 Apr 2026 11:09:33 +0000 Subject: [PATCH 5/6] use operator* for first point. --- .../cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp index 4aa5d23bd9c8..85478908e02d 100644 --- a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp +++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp @@ -808,16 +808,13 @@ template class IPA // We use 8-bit tables (table_bits=8, 32 rounds) to minimise gate count. However, with N=32768 SRS points // and 8-bit tables, the total table rows = 32768 × 256 = 2^23 exactly. The 5 mandatory overhead rows // (NUM_DISABLED_ROWS_IN_SUMCHECK=4, NUM_ZERO_ROWS=1) push the total to 2^23+5, forcing dyadic_size = 2^24. 
- // To stay within 2^23 we handle the first SRS point separately with a 4-bit table (16 entries instead of - // 256): total table rows = 16 + 32767×256 = 8,388,368 < 2^23, giving dyadic_size = 2^23 and ~2× speedup. + // To stay within 2^23 we handle the first SRS point separately using operator*. std::vector srs_elements = vk.get_monomial_points(); BB_ASSERT_GTE(srs_elements.size(), poly_length, "Not enough SRS points for IPA!"); srs_elements.resize(poly_length); - std::vector first_srs_point(1, srs_elements[0]); - std::vector first_s_scalar(1, s_vec[0]); std::vector remaining_srs(srs_elements.begin() + 1, srs_elements.end()); std::vector remaining_s(s_vec.begin() + 1, s_vec.end()); - Commitment first_term = Commitment::fixed_batch_mul(first_srs_point, first_s_scalar, {}, /*table_bits=*/4); + Commitment first_term = srs_elements[0] * s_vec[0]; Commitment remaining_term = Commitment::fixed_batch_mul(remaining_srs, remaining_s, {}, /*table_bits=*/8); Commitment computed_G_zero = first_term.unconditional_add(remaining_term); // check the computed G_zero and the claimed G_zero are the same. From 7b62fea1d13adc7920820bd3a16cc21c848d2b05 Mon Sep 17 00:00:00 2001 From: suyash67 Date: Mon, 6 Apr 2026 09:46:52 +0000 Subject: [PATCH 6/6] bench root rollup circuit. 
--- .../src/barretenberg/benchmark/CMakeLists.txt | 1 + .../root_rollup_bench/CMakeLists.txt | 9 + .../root_rollup_bench/root_rollup.bench.cpp | 250 ++++++++++++ .../primitives/group/STRAUS_MSM_ALGORITHM.md | 385 ++++++++++++++++++ 4 files changed, 645 insertions(+) create mode 100644 barretenberg/cpp/src/barretenberg/benchmark/root_rollup_bench/CMakeLists.txt create mode 100644 barretenberg/cpp/src/barretenberg/benchmark/root_rollup_bench/root_rollup.bench.cpp create mode 100644 barretenberg/cpp/src/barretenberg/stdlib/primitives/group/STRAUS_MSM_ALGORITHM.md diff --git a/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt b/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt index 12d08f15e49a..890382d5e867 100644 --- a/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt +++ b/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt @@ -11,3 +11,4 @@ add_subdirectory(indexed_tree_bench) add_subdirectory(append_only_tree_bench) add_subdirectory(ultra_bench) add_subdirectory(circuit_construction_bench) +add_subdirectory(root_rollup_bench) diff --git a/barretenberg/cpp/src/barretenberg/benchmark/root_rollup_bench/CMakeLists.txt b/barretenberg/cpp/src/barretenberg/benchmark/root_rollup_bench/CMakeLists.txt new file mode 100644 index 000000000000..b0eb686766f7 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/benchmark/root_rollup_bench/CMakeLists.txt @@ -0,0 +1,9 @@ +barretenberg_module( + root_rollup_bench + dsl + ultra_honk +) + +if(NOT WASM AND NOT FUZZING) + target_link_libraries(root_rollup_bench PRIVATE vm2_stub) +endif() diff --git a/barretenberg/cpp/src/barretenberg/benchmark/root_rollup_bench/root_rollup.bench.cpp b/barretenberg/cpp/src/barretenberg/benchmark/root_rollup_bench/root_rollup.bench.cpp new file mode 100644 index 000000000000..9f92205ae23f --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/benchmark/root_rollup_bench/root_rollup.bench.cpp @@ -0,0 +1,250 @@ +/** + * @brief Benchmark for root rollup circuit proving and 
verification. + * + * Constructs the root rollup circuit (2 recursive Honk verifications + IPA verification) + * and benchmarks the full UltraZK proving pipeline. This is the same circuit as the + * GateCountRootRollup test in honk_recursion_constraint.test.cpp, but exercises the prover. + * + * Usage: + * HARDWARE_CONCURRENCY=32 ./bin/root_rollup_bench + */ +#include +#include + +#include "barretenberg/common/bb_bench.hpp" +#include "barretenberg/dsl/acir_format/acir_format.hpp" +#include "barretenberg/dsl/acir_format/acir_to_constraint_buf.hpp" +#include "barretenberg/dsl/acir_format/honk_recursion_constraint.hpp" +#include "barretenberg/dsl/acir_format/recursion_constraint.hpp" +#include "barretenberg/dsl/acir_format/serde/index.hpp" +#include "barretenberg/dsl/acir_format/utils.hpp" +#include "barretenberg/dsl/acir_format/witness_constant.hpp" +#include "barretenberg/numeric/uint256/uint256.hpp" +#include "barretenberg/special_public_inputs/special_public_inputs.hpp" +#include "barretenberg/stdlib/primitives/circuit_builders/circuit_builders.hpp" +#include "barretenberg/stdlib_circuit_builders/mock_circuits.hpp" +#include "barretenberg/ultra_honk/prover_instance.hpp" +#include "barretenberg/ultra_honk/ultra_prover.hpp" +#include "barretenberg/ultra_honk/ultra_verifier.hpp" + +using namespace acir_format; +using namespace bb; + +namespace { + +using RecursiveFlavor = UltraRecursiveFlavor_; +using InnerFlavor = RecursiveFlavor::NativeFlavor; +using InnerBuilder = InnerFlavor::CircuitBuilder; +using InnerIO = bb::stdlib::recursion::honk::RollupIO; +using InnerProverInstance = ProverInstance_; +using InnerVerificationKey = InnerFlavor::VerificationKey; +using InnerProver = UltraProver_; + +static constexpr size_t NUM_PUBLIC_INPUTS = 2; +static constexpr uint32_t INNER_PROOF_TYPE = ROLLUP_HONK; + +// Helpers to convert RecursionConstraint -> Acir::Opcode (extracted from test_class.hpp to avoid gtest dependency) +Acir::FunctionInput witness_to_function_input(uint32_t 
witness_index) +{ + return Acir::FunctionInput{ .value = + Acir::FunctionInput::Witness{ .value = Acir::Witness{ .value = witness_index } } }; +} + +Acir::FunctionInput witness_or_constant_to_function_input(const WitnessOrConstant& input) +{ + if (input.is_constant) { + return Acir::FunctionInput{ .value = Acir::FunctionInput::Constant{ .value = input.value.to_buffer() } }; + } + return Acir::FunctionInput{ .value = + Acir::FunctionInput::Witness{ .value = Acir::Witness{ .value = input.index } } }; +} + +Acir::Opcode recursion_constraint_to_acir_opcode(const RecursionConstraint& constraint) +{ + std::vector verification_key; + for (const auto& key_idx : constraint.key) { + verification_key.push_back(witness_to_function_input(key_idx)); + } + std::vector proof; + for (const auto& proof_idx : constraint.proof) { + proof.push_back(witness_to_function_input(proof_idx)); + } + std::vector public_inputs; + for (const auto& pub_input_idx : constraint.public_inputs) { + public_inputs.push_back(witness_to_function_input(pub_input_idx)); + } + return Acir::Opcode{ .value = Acir::Opcode::BlackBoxFuncCall{ + .value = Acir::BlackBoxFuncCall{ + .value = Acir::BlackBoxFuncCall::RecursiveAggregation{ + .verification_key = std::move(verification_key), + .proof = std::move(proof), + .public_inputs = std::move(public_inputs), + .key_hash = witness_to_function_input(constraint.key_hash), + .proof_type = constraint.proof_type, + .predicate = witness_or_constant_to_function_input(constraint.predicate), + } } } }; +} + +AcirFormat constraints_to_acir_format(const std::vector& constraints) +{ + std::vector opcodes; + for (const auto& c : constraints) { + opcodes.push_back(recursion_constraint_to_acir_opcode(c)); + } + Acir::Circuit circuit{ + .function_name = "root_rollup_bench", + .opcodes = opcodes, + .private_parameters = {}, + .public_parameters = Acir::PublicInputs{ .value = {} }, + .return_values = Acir::PublicInputs{ .value = {} }, + .assert_messages = {}, + }; + return 
circuit_serde_to_acir_format(circuit); +} + +InnerBuilder create_inner_circuit() +{ + InnerBuilder builder; + MockCircuits::add_arithmetic_gates(builder); + MockCircuits::add_lookup_gates(builder); + for (size_t idx = 0; idx < NUM_PUBLIC_INPUTS; idx++) { + builder.add_public_variable(InnerBuilder::FF::random_element()); + } + InnerIO::add_default(builder); + return builder; +} + +std::pair circuit_to_recursion_constraint(InnerBuilder& builder) +{ + for (size_t idx = builder.num_public_inputs(); idx < NUM_PUBLIC_INPUTS; idx++) { + builder.add_public_variable(InnerBuilder::FF::random_element()); + } + auto prover_instance = std::make_shared(builder); + auto verification_key = std::make_shared(prover_instance->get_precomputed()); + InnerProver prover(prover_instance, verification_key); + auto proof = prover.construct_proof(); + + WitnessVector witness_values; + RecursionConstraint constraint = recursion_data_to_recursion_constraint(witness_values, + proof, + verification_key->to_field_elements(), + verification_key->hash(), + bb::fr::one(), + builder.num_public_inputs() - + InnerIO::PUBLIC_INPUTS_SIZE, + INNER_PROOF_TYPE); + return { constraint, witness_values }; +} + +void generate_root_rollup_constraints(std::vector& honk_recursion_constraints, + WitnessVector& witness_values) +{ + std::vector constraints; + std::vector witness_vectors; + + for (size_t idx = 0; idx < 2; idx++) { + auto builder = create_inner_circuit(); + auto [constraint, witnesses] = circuit_to_recursion_constraint(builder); + constraints.emplace_back(std::move(constraint)); + witness_vectors.emplace_back(std::move(witnesses)); + } + + for (auto [constraint, witnesses] : zip_view(constraints, witness_vectors)) { + uint32_t offset = static_cast(witness_values.size()); + auto shift = [&offset](std::vector& indices) { + for (auto& index : indices) { + index += offset; + } + }; + shift(constraint.key); + shift(constraint.proof); + shift(constraint.public_inputs); + constraint.key_hash += offset; + 
constraint.predicate.index += offset; + constraint.proof_type = static_cast(ROOT_ROLLUP_HONK); + witness_values.insert(witness_values.end(), witnesses.begin(), witnesses.end()); + } + + honk_recursion_constraints = std::move(constraints); +} + +size_t get_peak_rss_mib() +{ + struct rusage usage {}; + getrusage(RUSAGE_SELF, &usage); + return static_cast(usage.ru_maxrss) / 1024; // Linux: ru_maxrss is in KB +} + +} // namespace + +static void root_rollup_prove(benchmark::State& state) +{ + bb::srs::init_file_crs_factory(bb::srs::bb_crs_path()); + + for (auto _ : state) { + state.PauseTiming(); + + info("Generating root rollup constraints (2 inner circuits)..."); + std::vector constraints; + WitnessVector witness_values; + generate_root_rollup_constraints(constraints, witness_values); + + info("Building outer circuit..."); + AcirFormat constraint_system = constraints_to_acir_format(constraints); + AcirProgram program{ constraint_system, witness_values }; + ProgramMetadata metadata{ .has_ipa_claim = false }; + auto builder = create_circuit(program, metadata); + + size_t num_gates = builder.get_num_finalized_gates_inefficient(); + info("Root rollup circuit: ", num_gates, " gates"); + + info("Creating prover instance..."); + auto prover_instance = std::make_shared>(builder); + auto verification_key = + std::make_shared(prover_instance->get_precomputed()); + + size_t dyadic_size = prover_instance->dyadic_size(); + info("Dyadic size: ", dyadic_size, " (log2: ", numeric::get_msb(dyadic_size), ")"); + + size_t rss_before = get_peak_rss_mib(); + info("Peak RSS before proving: ", rss_before, " MiB"); + + UltraZKProver prover(prover_instance, verification_key); + + info("Starting proof construction..."); + state.ResumeTiming(); + + auto proof = prover.construct_proof(); + + state.PauseTiming(); + + size_t rss_after = get_peak_rss_mib(); + info("Peak RSS after proving: ", rss_after, " MiB"); + + info("Verifying proof..."); + auto vk_and_hash = std::make_shared(verification_key); 
+ UltraZKVerifier verifier(vk_and_hash); + auto output = verifier.verify_proof(proof); + info(output.result ? "Proof verified successfully" : "ERROR: Proof verification FAILED"); + + state.ResumeTiming(); + } +} + +BENCHMARK(root_rollup_prove)->Unit(benchmark::kMillisecond)->Iterations(1); + +int main(int argc, char** argv) +{ + bb::detail::use_bb_bench = true; + + ::benchmark::Initialize(&argc, argv); + if (::benchmark::ReportUnrecognizedArguments(argc, argv)) + return 1; + ::benchmark::RunSpecifiedBenchmarks(); + ::benchmark::Shutdown(); + + std::cout << "\n=== Detailed BB_BENCH Profiling Stats ===\n"; + bb::detail::GLOBAL_BENCH_STATS.print_aggregate_counts_hierarchical(std::cout); + + return 0; +} diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/STRAUS_MSM_ALGORITHM.md b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/STRAUS_MSM_ALGORITHM.md new file mode 100644 index 000000000000..9d4cf7dfd3b0 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/STRAUS_MSM_ALGORITHM.md @@ -0,0 +1,385 @@ +# Straus Multi-Scalar Multiplication: Precise Mathematical Description + +This document describes the exact step-by-step mathematics of the Straus MSM algorithm as +implemented in `cycle_group` (`cycle_group.cpp`). All notation is made concrete so that every +formula corresponds to an exact line of code. + +--- + +## 1. Problem Statement + +Given $N$ elliptic curve points $P_0, P_1, \ldots, P_{N-1}$ on the Grumpkin curve and $N$ +scalars $s_0, s_1, \ldots, s_{N-1}$ in the Grumpkin scalar field +$\mathbb{F}_r$ (equivalently, the BN254 base field), compute in-circuit the multi-scalar +multiplication (MSM): + +$$\text{MSM} = \sum_{j=0}^{N-1} s_j \cdot P_j$$ + +--- + +## 2. 
Parameters + +| Symbol | Value | Source | +| -------------------- | ---------------------------- | ------------------------ | +| $\texttt{NUM\_BITS}$ | $254$ | `cycle_scalar::NUM_BITS` | +| $\texttt{LO\_BITS}$ | $128$ | `cycle_scalar::LO_BITS` | +| $\texttt{HI\_BITS}$ | $126$ | `cycle_scalar::HI_BITS` | +| $w$ | $4$ | `ROM_TABLE_BITS` | +| $R$ | $\lceil 254 / 4 \rceil = 64$ | `num_rounds` | +| $T$ | $2^w = 16$ | table size per point | + +--- + +## 3. Scalar Representation (`cycle_scalar`) + +Each 254-bit scalar $s \in \mathbb{F}_r$ is split into two **limbs**: + +$$s = s_{\text{lo}} + 2^{128} \cdot s_{\text{hi}}$$ + +where $s_{\text{lo}} \in [0, 2^{128})$ is a 128-bit integer and $s_{\text{hi}} \in [0, 2^{126})$ +is a 126-bit integer. Both limbs are represented as native `field_t` circuit elements. Crucially, +the range constraints on these limbs are **deferred** to the MSM algorithm — `cycle_scalar` +alone does not add range-constraint gates. + +--- + +## 4. Scalar Decomposition into $w$-Bit Slices (`straus_scalar_slices`) + +Each limb is independently decomposed into $w = 4$ bit slices via `create_limbed_range_constraint`, +which simultaneously performs the decomposition and enforces the range constraint in-circuit. + +### 4.1 Lo-limb slices + +$s_{\text{lo}}$ (128 bits, exactly divisible by 4) is split into $128/4 = 32$ slices: + +$$s_{\text{lo},k} = \left\lfloor \frac{s_{\text{lo}}}{16^k} \right\rfloor \bmod 16, \quad k = 0, 1, \ldots, 31$$ + +Each slice satisfies $s_{\text{lo},k} \in \{0,\ldots,15\}$. 
The lo-limb is reconstructed as: + +$$s_{\text{lo}} = \sum_{k=0}^{31} s_{\text{lo},k} \cdot 16^k$$ + +### 4.2 Hi-limb slices + +$s_{\text{hi}}$ (126 bits, $126 = 31 \cdot 4 + 2$) is split into 32 slices: + +$$s_{\text{hi},k} = \left\lfloor \frac{s_{\text{hi}}}{16^k} \right\rfloor \bmod 16, \quad k = 0, 1, \ldots, 30$$ +$$s_{\text{hi},31} = \left\lfloor \frac{s_{\text{hi}}}{16^{31}} \right\rfloor \bmod 4 \quad\text{(2-bit slice)}$$ + +All slices $s_{\text{hi},k} \in \{0,\ldots,15\}$ for $k \le 30$; the final slice $s_{\text{hi},31} \in \{0,1,2,3\}$. +The hi-limb is reconstructed as: + +$$s_{\text{hi}} = \sum_{k=0}^{31} s_{\text{hi},k} \cdot 16^k$$ + +### 4.3 Unified slice vector + +The two decompositions are concatenated into a single vector of $R = 64$ slices: + +$$\sigma[k] = \begin{cases} s_{\text{lo},k} & 0 \le k \le 31 \\ s_{\text{hi},k-32} & 32 \le k \le 63 \end{cases}$$ + +The full scalar reconstruct identity is: + +$$s = \sum_{k=0}^{63} \sigma[k] \cdot 16^k$$ + +because: + +$$\sum_{k=0}^{63} \sigma[k] \cdot 16^k = \sum_{k=0}^{31} s_{\text{lo},k} \cdot 16^k + \sum_{k=0}^{31} s_{\text{hi},k} \cdot 16^{k+32} = s_{\text{lo}} + 2^{128} \cdot s_{\text{hi}} = s$$ + +At step $i$ of the Straus loop (0-indexed, MSB-first), the slice accessed is: + +$$\sigma_{\text{round}(i)} = \sigma[R - 1 - i] = \sigma[63 - i]$$ + +so round $i = 0$ processes the **most significant** slice $\sigma[63]$, and round $i = 63$ processes +the **least significant** slice $\sigma[0]$. + +--- + +## 5. Lookup Table Construction + +For each point $P_j$ with associated offset generator $G_{j+1}$, a lookup table $\mathcal{T}_j$ of +size $T = 16$ is precomputed: + +$$\mathcal{T}_j[v] = G_{j+1} + v \cdot P_j, \quad v = 0, 1, \ldots, 15$$ + +The offset generator $G_{j+1}$ is drawn from a domain-separated hash-to-curve +(`"cycle_group_offset_generator"`) and is linearly independent of all $P_j$ and of each other. 
+It ensures $\mathcal{T}_j[0] = G_{j+1} \ne \mathcal{O}$, preventing the point-at-infinity edge +case when a slice value is zero. + +Two implementations exist with different circuit costs: + +| Implementation | Table stored as | Construction cost | Read cost | +| ------------------------------------- | ----------------------------------------- | ----------------------------------------------------------- | ---------------------- | +| `straus_lookup_table` (variable-base) | ROM array (witnesses) | $15$ `unconditional_add` gates per table + ROM finalization | 1 ROM gate per read | +| `straus_plookup_table` (fixed-base) | Plookup `BasicTable` (proving polynomial) | **0 gates** (table data is not in the trace) | 1 lookup gate per read | + +### 5.1 Full Table Structure for $w = 4$ bits + +For a single base point $P$ and offset generator $G$, the table $\mathcal{T}$ has $2^4 = 16$ entries. + +**Construction (projective arithmetic, then batch-normalize):** + +$$\mathcal{T}[0] = G, \quad \mathcal{T}[v] = \mathcal{T}[v-1] + P \text{ for } v = 1, \ldots, 15$$ + +**Complete 16-entry table ($w = 4$):** + +| Index $v$ | Binary $v_3 v_2 v_1 v_0$ | Table entry $\mathcal{T}[v] = G + v \cdot P$ | +| --------- | ------------------------- | --------------------------------------------- | +| 0 | `0000` | $G$ | +| 1 | `0001` | $G + P$ | +| 2 | `0010` | $G + 2P$ | +| 3 | `0011` | $G + 3P$ | +| 4 | `0100` | $G + 4P$ | +| 5 | `0101` | $G + 5P$ | +| 6 | `0110` | $G + 6P$ | +| 7 | `0111` | $G + 7P$ | +| 8 | `1000` | $G + 8P$ | +| 9 | `1001` | $G + 9P$ | +| 10 | `1010` | $G + 10P$ | +| 11 | `1011` | $G + 11P$ | +| 12 | `1100` | $G + 12P$ | +| 13 | `1101` | $G + 13P$ | +| 14 | `1110` | $G + 14P$ | +| 15 | `1111` | $G + 15P$ | + +**`BasicTable` column mapping (as stored in the proving polynomial):** + +$$\texttt{column\_1}[v] = v, \quad \texttt{column\_2}[v] = \mathcal{T}[v].x, \quad \texttt{column\_3}[v] = \mathcal{T}[v].y$$ + +Concretely, with affine coordinates $(x_v, y_v) = 
\mathcal{T}[v]$: + +| `column_1` (key) | `column_2` ($x$-coordinate) | `column_3` ($y$-coordinate) | +| ---------------- | --------------------------- | --------------------------- | +| 0 | $x_0 = G_x$ | $y_0 = G_y$ | +| 1 | $x_1$ | $y_1$ | +| 2 | $x_2$ | $y_2$ | +| $\vdots$ | $\vdots$ | $\vdots$ | +| 15 | $x_{15}$ | $y_{15}$ | + +**Plookup gate for a single read at index $v$ (witness $w_1$):** + +A single lookup gate constrains the triple $(w_1, w_2, w_3)$ to be a valid row $(v,\, x_v,\, y_v)$ of the table: + +$$w_1 = v, \quad w_2 = \texttt{column\_2}[v] = x_v, \quad w_3 = \texttt{column\_3}[v] = y_v$$ + +Gate selectors: $q_{\text{lookup}} = 1$, $q_3 = \texttt{table\_index}$, $q_2 = q_m = q_c = q_1 = q_4 = 0$ (step sizes all zero, indicating a standalone lookup with no chained accumulation). + +**Why $\mathcal{T}[0] = G \ne \mathcal{O}$:** The offset generator $G$ is a hash-to-curve output linearly independent of $P$, so $G \ne \mathcal{O}$ by construction. Even when a scalar slice $\sigma[k] = 0$ (which occurs for any scalar whose $k$-th 4-bit chunk is zero — e.g., $s = 16$ has $\sigma[0] = 0$), the table read returns $\mathcal{T}[0] = G \ne \mathcal{O}$, making `unconditional_add` safe. + +--- + +## 6. The Offset Generator Mechanism + +### 6.1 Why it is needed + +The implementation uses `unconditional_add` for all in-circuit additions, which requires that the +two operand points have **distinct** $x$-coordinates. Without offset generators, two failure modes arise: + +1. **Zero slice:** If $\sigma_j[k] = 0$ for some $j, k$, then $\mathcal{T}_j[0] = \mathcal{O}$ + (the point at infinity). Even with non-zero scalars, slices can be zero — e.g., + $s = 16$ has $\sigma[0] = 0$. +2. **Accumulator collision:** The rolling accumulator could coincidentally share an + $x$-coordinate with an upcoming table entry. 
+ +### 6.2 Offset generator set + +$N + 1$ linearly independent points are used: + +$$G_0, G_1, G_2, \ldots, G_N \in E(\mathbb{F}_q)$$ + +all distinct, hash-to-curve outputs linearly independent of every $P_j$. + +- $G_0$: initial accumulator value +- $G_{j+1}$: offset embedded in table $\mathcal{T}_j$ + +### 6.3 Tracking the total offset + +A **native** (non-circuit) parallel computation tracks the accumulated contribution of the offset +generators. Define the offset accumulator $\Delta$, initialised as: + +$$\Delta_{\text{init}} = G_0$$ + +In each round $i$ the same doublings and additions are applied to $\Delta$ as to the main +accumulator, but using the **offset generators** in place of the table reads: + +- **Doublings (rounds $i \ge 1$):** $\Delta \leftarrow 2^w \cdot \Delta$ (4 consecutive doublings $= \times 16$) +- **Additions:** $\Delta \leftarrow \Delta + G_{j+1}$ for each $j = 0, \ldots, N-1$ + +The closed-form value of $\Delta$ after the complete $R = 64$ rounds is derived below. + +--- + +## 7. The Straus Algorithm — Step by Step + +### 7.1 Initialisation + +$$A \leftarrow G_0, \qquad \Delta \leftarrow G_0$$ + +### 7.2 Main Loop + +For $i = 0, 1, \ldots, R-1$ (i.e., $64$ rounds): + +**Step 7.2a — Doublings (skip when $i = 0$):** + +If $i \ge 1$, perform $w = 4$ point doublings in-circuit: + +$$A \leftarrow 2^4 \cdot A = 16 \cdot A$$ + +and natively: + +$$\Delta \leftarrow 16 \cdot \Delta$$ + +**Step 7.2b — Table lookups and additions:** + +For each point index $j = 0, 1, \ldots, N-1$: + +1. Read the scalar slice for this round: $v = \sigma_j[R - 1 - i] = \sigma_j[63 - i]$ +2. Look up: $Q \leftarrow \mathcal{T}_j[v] = G_{j+1} + v \cdot P_j$ +3. Add in-circuit: $A \leftarrow A + Q$ +4. Update offset natively: $\Delta \leftarrow \Delta + G_{j+1}$ + +(For the variable-base case, step 3 uses a conditional safety check on $x$-coordinates unless the +`unconditional_add` flag is set.) 
+ +### 7.3 State at the end of round $i$ + +After completing round $i$ (both doublings and all $N$ additions), the accumulated value satisfies +the recurrence: + +$$A_0 = G_0 + \sum_{j=0}^{N-1} \mathcal{T}_j[\sigma_j[63]]$$ + +$$A_i = 16 \cdot A_{i-1} + \sum_{j=0}^{N-1} \mathcal{T}_j[\sigma_j[63-i]], \quad i \ge 1$$ + +Unrolling this recurrence over all 64 rounds yields: + +$$A_{63} = 16^{63} \cdot G_0 + \sum_{i=0}^{63} 16^{63-i} \cdot \sum_{j=0}^{N-1} \mathcal{T}_j[\sigma_j[63-i]]$$ + +Substituting $k = 63 - i$: + +$$A_{63} = 16^{63} \cdot G_0 + \sum_{j=0}^{N-1} \sum_{k=0}^{63} 16^{k} \cdot \mathcal{T}_j[\sigma_j[k]]$$ + +Expanding the table definition $\mathcal{T}_j[v] = G_{j+1} + v \cdot P_j$: + +$$A_{63} = 16^{63} \cdot G_0 + \sum_{j=0}^{N-1} \left[ G_{j+1} \cdot \sum_{k=0}^{63} 16^{k} + P_j \cdot \sum_{k=0}^{63} \sigma_j[k] \cdot 16^{k} \right]$$ + +Using the geometric sum $\displaystyle\sum_{k=0}^{63} 16^k = \frac{16^{64}-1}{15}$ and the scalar reconstruction identity $\displaystyle\sum_{k=0}^{63} \sigma_j[k] \cdot 16^k = s_j$: + +$$\boxed{A_{63} = 16^{63} \cdot G_0 + \sum_{j=0}^{N-1} G_{j+1} \cdot \frac{16^{64}-1}{15} + \sum_{j=0}^{N-1} s_j \cdot P_j}$$ + +### 7.4 Offset accumulator value + +Applying the same recurrence to $\Delta$: + +$$\Delta_0 = G_0 + \sum_{j=0}^{N-1} G_{j+1}$$ +$$\Delta_i = 16 \cdot \Delta_{i-1} + \sum_{j=0}^{N-1} G_{j+1}, \quad i \ge 1$$ + +This has the closed-form solution: + +$$\Delta_{63} = 16^{63} \cdot G_0 + \left(\sum_{j=0}^{N-1} G_{j+1}\right) \cdot \sum_{k=0}^{63} 16^{k} = 16^{63} \cdot G_0 + \left(\sum_{j=0}^{N-1} G_{j+1}\right) \cdot \frac{16^{64}-1}{15}$$ + +### 7.5 Cancellation + +Subtracting the offset: + +$$A_{63} - \Delta_{63} = \sum_{j=0}^{N-1} s_j \cdot P_j$$ + +This is the desired MSM result. $\square$ + +--- + +## 8. 
Outer Function: `batch_mul` / `fixed_batch_mul` + +The outer function partitions the $N$ input pairs $(P_j, s_j)$ into categories before calling the +internal algorithm: + +| Category | Condition | Treatment | +| ----------- | ----------------------------------------------------------- | ----------------------------------------------------------------------- | +| **Case 1** | $P_j$ constant **and** $s_j$ constant | Accumulate natively into `constant_acc` (0 gates) | +| **Case 2A** | $P_j$ is one of the two hardcoded generators, $s_j$ witness | Use `_fixed_base_batch_mul_internal` (precomputed plookup multi-tables) | +| **Case 2B** | $P_j$ constant (not a hardcoded generator), $s_j$ witness | Use `_variable_base_batch_mul_internal` with ROM tables | +| **Case 3** | $P_j$ witness | Use `_variable_base_batch_mul_internal` with ROM tables | + +`fixed_batch_mul` (new) handles **Case 2B** points using plookup `BasicTable`s instead of ROM +arrays, with an otherwise identical Straus computation. + +### 8.1 Result Assembly + +Let $C$ = `constant_acc` $= \sum_{\text{Case 1}} s_j \cdot P_j$ (constant, free). + +The internal function returns $(A_{63},\, \Delta_{63})$. The outer function computes: + +$$\text{Result} = A_{63} - (- C + \Delta_{63}) = A_{63} - \Delta_{63} + C = \sum_j s_j \cdot P_j + C$$ + +which is the full MSM over all $N$ pairs. + +The subtraction is executed as an `unconditional_add` with $-\Delta_{63} + C$ (a constant point) +when $C \ne \mathcal{O}$, or as a full `operator-` otherwise. + +--- + +## 9. Circuit Gate Cost (per Internal Call) + +The following counts assume $N$ points, $R = 64$ rounds, $w = 4$ bits. + +### 9.1 Scalar decomposition + +Each scalar $s_j$ contributes two `create_limbed_range_constraint` calls: + +- lo (128 bits, 32 slices of 4 bits): 32 range gates +- hi (126 bits, 32 slices, last is 2-bit): 32 range gates + +Total across $N$ scalars: $64N$ range-constraint gates. 
+ +### 9.2 Table construction + +**Variable-base ROM** (`straus_lookup_table`): for each point $P_j$: + +- 15 `unconditional_add` gates to populate $\mathcal{T}_j[1], \ldots, \mathcal{T}_j[15]$ +- 2 witness conversions (1 gate each) for $P_j$ and $G_{j+1}$ +- ROM finalisation: $O(T \log T)$ sorted-ROM gates per table + +Total construction: $\approx 17N + O(16N\log 16)$ gates. + +**Fixed-base Plookup** (`straus_plookup_table`): **0 gates**. Table data lives entirely in the +proving polynomial (not in the arithmetic trace). + +### 9.3 Main Straus loop + +| Operation | Count | Gate cost | +| ------------------------------------------------------------ | ---------------------------------- | --------------------------- | +| Doublings | $(R-1) \cdot w = 63 \cdot 4 = 252$ | 252 gates | +| Table reads (ROM or plookup) | $R \cdot N = 64N$ | $64N$ gates | +| `unconditional_add` | $R \cdot N = 64N$ | $64N$ gates | +| $x$-coord batch collision check (variable-base, witness pts) | 1 assertion | $\approx 2 \cdot 64N$ gates | + +Total Straus loop: $\approx 252 + 128N$ gates (plus collision check if applicable). + +### 9.4 Final offset subtraction + +1 group subtraction (or `unconditional_add` with negated constant): $\approx 2$–$5$ gates. + +### 9.5 Summary comparison (128-point MSM, constant base points) + +| Method | Table construction | ROM finalization | Straus loop | Total (approx) | +| --------------------------- | ------------------------------------ | --------------------- | ----------------------------------------------------- | ---------------- | +| `batch_mul` (ROM) | $\approx 17 \times 128 = 2176$ gates | $\sim 12{,}000$ gates | $\approx 252 + 128 \times 128 \approx 16{,}636$ gates | **41,201 gates** | +| `fixed_batch_mul` (plookup) | **0** | **0** | $\approx 252 + 128 \times 128 \approx 16{,}636$ gates | **26,083 gates** | + +The plookup approach eliminates all table-construction and ROM-finalization gates, reducing the +total by **~37%** for 128 points. 
At 32,768 SRS points (IPA), the absolute savings are
+proportionally larger.
+
+---
+
+## 10. Correctness of the Scalar Reconstruction
+
+**Claim:** $\displaystyle\sum_{k=0}^{63} \sigma_j[k] \cdot 16^k = s_j$ for every scalar $s_j \in [0, 2^{254})$.
+
+**Proof:**
+
+$$\sum_{k=0}^{63} \sigma_j[k] \cdot 16^k = \underbrace{\sum_{k=0}^{31} s_{\text{lo},k} \cdot 16^k}_{= s_{\text{lo}}} + \underbrace{\sum_{k=0}^{31} s_{\text{hi},k} \cdot 16^{k+32}}_{= s_{\text{hi}} \cdot 16^{32} = s_{\text{hi}} \cdot 2^{128}} = s_{\text{lo}} + 2^{128} \cdot s_{\text{hi}} = s_j \qquad \square$$
+
+**Range validity:** The `create_limbed_range_constraint` call on each limb simultaneously decomposes
+the limb into $w$-bit slices and proves in-circuit that each slice lies in $\{0,\ldots,2^w - 1\}$.
+The final (partial) slice of $s_{\text{hi}}$ has only 2 bits ($s_{\text{hi},31} \in \{0,1,2,3\}$),
+and its range constraint covers exactly those 2 bits; when used as an index into a 16-entry
+lookup table it therefore always reads a valid entry.