From 7dffda8fe811671c2822d9ad23f185fe8e5a265c Mon Sep 17 00:00:00 2001 From: suyash67 Date: Sat, 4 Apr 2026 07:40:48 +0000 Subject: [PATCH 1/6] fixed base tables in cycle group batch_mul. --- .../stdlib/primitives/group/cycle_group.cpp | 209 ++++++++++++++++++ .../stdlib/primitives/group/cycle_group.hpp | 24 +- .../primitives/group/cycle_group.test.cpp | 143 ++++++++++++ .../primitives/group/straus_plookup_table.cpp | 140 ++++++++++++ .../primitives/group/straus_plookup_table.hpp | 54 +++++ 5 files changed, 569 insertions(+), 1 deletion(-) create mode 100644 barretenberg/cpp/src/barretenberg/stdlib/primitives/group/straus_plookup_table.cpp create mode 100644 barretenberg/cpp/src/barretenberg/stdlib/primitives/group/straus_plookup_table.hpp diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.cpp b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.cpp index cd1c8acfbe54..9dd85c8292d4 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.cpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.cpp @@ -968,6 +968,215 @@ typename cycle_group::batch_mul_internal_output cycle_group::_ return { accumulator, offset_generator_accumulator }; } +/** + * @brief Internal algorithm to perform a fixed-base batch mul using plookup tables. + * + * @details Computes a batch mul of constant base points using the Straus multiscalar multiplication algorithm. + * For each constant base point, a plookup table (BasicTable) is created with (1 << ROM_TABLE_BITS) entries. + * Unlike ROM tables, plookup tables have zero construction cost and zero finalization overhead. + * Each table read costs exactly 1 lookup gate. 
+ * + * @param scalars Witness scalars to multiply with base points + * @param base_points Constant affine points (SRS elements or similar) + * @param offset_generators Offset points to prevent infinity edge cases (size = base_points.size() + 1) + * @return {accumulator, offset_generator_delta} where result = accumulator - offset_generator_delta + */ +template +typename cycle_group::batch_mul_internal_output cycle_group::_fixed_base_plookup_batch_mul_internal( + const std::span scalars, + const std::span base_points, + const std::span offset_generators) +{ + BB_ASSERT_EQ(!scalars.empty(), true, "Empty scalars provided to fixed base plookup batch mul!"); + BB_ASSERT_EQ(scalars.size(), base_points.size(), "Points/scalars size mismatch in fixed base plookup batch mul!"); + BB_ASSERT_EQ(offset_generators.size(), base_points.size() + 1, "Too few offset generators provided!"); + const size_t num_points = scalars.size(); + + Builder* context = nullptr; + for (const auto& scalar : scalars) { + if (context = scalar.get_context(); context != nullptr) { + break; + } + } + BB_ASSERT(context != nullptr); + + constexpr size_t num_rounds = numeric::ceil_div(cycle_scalar::NUM_BITS, ROM_TABLE_BITS); + + // Decompose each scalar into ROM_TABLE_BITS-bit slices (also enforces range constraints) + std::vector scalar_slices; + scalar_slices.reserve(num_points); + for (const auto& scalar : scalars) { + scalar_slices.emplace_back(context, scalar, ROM_TABLE_BITS); + } + + // Create plookup tables for each constant base point (zero gate cost) + std::vector point_tables; + point_tables.reserve(num_points); + for (size_t i = 0; i < num_points; ++i) { + point_tables.emplace_back(context, base_points[i], offset_generators[i + 1], ROM_TABLE_BITS); + } + + // Compute all intermediate points natively for use as hints in the in-circuit Straus algorithm. + // Using projective coordinates + batch normalize to avoid per-operation modular inversions. 
+ std::vector operation_transcript; + Element offset_generator_accumulator = offset_generators[0]; + { + // Build native straus tables + std::vector> native_straus_tables; + for (size_t i = 0; i < num_points; ++i) { + std::vector table(1UL << ROM_TABLE_BITS); + table[0] = Element(offset_generators[i + 1]); + Element base_proj(base_points[i]); + for (size_t j = 1; j < table.size(); ++j) { + table[j] = table[j - 1] + base_proj; + } + native_straus_tables.emplace_back(std::move(table)); + } + + // Perform Straus algorithm natively + Element accumulator = offset_generators[0]; + for (size_t i = 0; i < num_rounds; ++i) { + if (i != 0) { + for (size_t j = 0; j < ROM_TABLE_BITS; ++j) { + accumulator = accumulator.dbl(); + operation_transcript.push_back(accumulator); + offset_generator_accumulator = offset_generator_accumulator.dbl(); + } + } + for (size_t j = 0; j < num_points; ++j) { + auto slice_value = static_cast(scalar_slices[j].slices_native[num_rounds - i - 1]); + const Element point = native_straus_tables[j][slice_value]; + accumulator += point; + operation_transcript.push_back(accumulator); + offset_generator_accumulator += Element(offset_generators[j + 1]); + } + } + } + + // Batch-normalize all hint points + Element::batch_normalize(operation_transcript.data(), operation_transcript.size()); + std::vector operation_hints; + operation_hints.reserve(operation_transcript.size()); + for (const Element& element : operation_transcript) { + operation_hints.emplace_back(element.x, element.y); + } + + // Execute Straus algorithm in-circuit using plookup reads and precomputed hints + AffineElement* hint_ptr = operation_hints.data(); + cycle_group accumulator = offset_generators[0]; + + for (size_t i = 0; i < num_rounds; ++i) { + if (i != 0) { + for (size_t j = 0; j < ROM_TABLE_BITS; ++j) { + accumulator = accumulator.dbl(*hint_ptr); + hint_ptr++; + } + } + for (size_t j = 0; j < num_points; ++j) { + const field_t scalar_slice = scalar_slices[j][num_rounds - i - 1]; + 
const cycle_group point = point_tables[j].read(scalar_slice); + // Safe to use unconditional_add: all base points are constants hence linearly independent of offset + // generators + accumulator = accumulator.unconditional_add(point, *hint_ptr); + hint_ptr++; + } + } + + accumulator.set_origin_tag(OriginTag::constant()); + return { accumulator, AffineElement(offset_generator_accumulator) }; +} + +/** + * @brief Fixed-base multiscalar multiplication using plookup tables. + * + * @details Optimized MSM for the case where all base points are circuit constants (e.g. SRS elements). + * Uses plookup tables instead of ROM tables, eliminating table construction gates and finalization overhead. + * All base points MUST be constants; witness base points will trigger an assertion failure. + * + * @param constant_points Vector of constant cycle_group points + * @param scalars Vector of cycle_scalar values (may be witnesses or constants) + * @param context Generator context for offset generators + * @return cycle_group The result of sum(scalars[i] * constant_points[i]) + */ +template +cycle_group cycle_group::fixed_batch_mul(const std::vector& constant_points, + const std::vector& scalars, + const GeneratorContext& context) +{ + BB_ASSERT_EQ(scalars.size(), constant_points.size(), "Points/scalars size mismatch in fixed_batch_mul!"); + + if (scalars.empty()) { + return cycle_group{ Group::point_at_infinity }; + } + + // Merge all tags + OriginTag result_tag = OriginTag::constant(); + for (auto [point, scalar] : zip_view(constant_points, scalars)) { + result_tag = OriginTag(result_tag, OriginTag(point.get_origin_tag(), scalar.get_origin_tag())); + } + + std::vector plookup_scalars; + std::vector plookup_points; + bool has_non_constant_component = false; + Element constant_acc = Group::point_at_infinity; + + for (const auto [point, scalar] : zip_view(constant_points, scalars)) { + BB_ASSERT(point.is_constant()); + if (scalar.is_constant()) { + // Both constant: compute natively + 
constant_acc += point.get_value() * scalar.get_value(); + } else { + if (point.get_value().is_point_at_infinity()) { + // Constant infinity contributes nothing, but still need range constraints on scalar + auto* ctx = scalar.get_context(); + ctx->create_limbed_range_constraint(scalar.lo().get_witness_index(), + cycle_scalar::LO_BITS, + ROM_TABLE_BITS, + "fixed_batch_mul: lo range constraint for scalar with constant " + "infinity"); + ctx->create_limbed_range_constraint(scalar.hi().get_witness_index(), + cycle_scalar::HI_BITS, + ROM_TABLE_BITS, + "fixed_batch_mul: hi range constraint for scalar with constant " + "infinity"); + continue; + } + plookup_scalars.push_back(scalar); + plookup_points.push_back(point.get_value()); + has_non_constant_component = true; + } + } + + if (!has_non_constant_component) { + auto result = cycle_group(constant_acc); + result.set_origin_tag(result_tag); + return result; + } + + // Compute offset generators + const size_t num_offset_generators = plookup_points.size() + 1; + const std::span offset_generators = + context.generators->get(num_offset_generators, 0, OFFSET_GENERATOR_DOMAIN_SEPARATOR); + + // Run the plookup-based Straus algorithm + Element offset_accumulator = -constant_acc; + const auto [accumulator, offset_generator_delta] = + _fixed_base_plookup_batch_mul_internal(plookup_scalars, plookup_points, offset_generators); + offset_accumulator += offset_generator_delta; + + // Subtract offset. Since all points are constants and linearly independent of offset generators, + // we can safely use unconditional_add when constant_acc is non-trivial. + cycle_group result; + if (!constant_acc.is_point_at_infinity()) { + result = accumulator.unconditional_add(AffineElement(-offset_accumulator)); + } else { + result = accumulator - cycle_group(AffineElement(offset_accumulator)); + } + + result.set_origin_tag(result_tag); + return result; +} + /** * @brief Multiscalar multiplication algorithm. 
* diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.hpp b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.hpp index 641f52e2fc1f..d38de4e50867 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.hpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.hpp @@ -14,6 +14,7 @@ #include "barretenberg/stdlib/primitives/field/field.hpp" #include "barretenberg/stdlib/primitives/group/cycle_scalar.hpp" #include "barretenberg/stdlib/primitives/group/straus_lookup_table.hpp" +#include "barretenberg/stdlib/primitives/group/straus_plookup_table.hpp" #include "barretenberg/stdlib/primitives/group/straus_scalar_slice.hpp" #include "barretenberg/stdlib_circuit_builders/plookup_tables/fixed_base/fixed_base_params.hpp" #include "barretenberg/transcript/origin_tag.hpp" @@ -52,6 +53,7 @@ template class cycle_group { using BigScalarField = stdlib::bigfield; using cycle_scalar = ::bb::stdlib::cycle_scalar; using straus_lookup_table = ::bb::stdlib::straus_lookup_table; + using straus_plookup_table = ::bb::stdlib::straus_plookup_table; using straus_scalar_slices = ::bb::stdlib::straus_scalar_slices; // Bit-size for scalars represented in the ROM lookup tables used in the variable-base MSM algorithm @@ -128,6 +130,20 @@ template class cycle_group { static cycle_group batch_mul(const std::vector& base_points, const std::vector& scalars, const GeneratorContext& context = {}); + + static cycle_group fixed_batch_mul(const std::vector& constant_points, + const std::vector& scalars, + GeneratorContext context = {}) + { + std::vector cycle_scalars; + for (auto scalar : scalars) { + cycle_scalars.emplace_back(scalar); + } + return fixed_batch_mul(constant_points, cycle_scalars, context); + } + static cycle_group fixed_batch_mul(const std::vector& constant_points, + const std::vector& scalars, + const GeneratorContext& context = {}); cycle_group operator*(const cycle_scalar& scalar) const; 
cycle_group& operator*=(const cycle_scalar& scalar); cycle_group operator*(const BigScalarField& scalar) const; @@ -205,8 +221,9 @@ template class cycle_group { } private: - // Allow straus_lookup_table to access the private constructor for efficiency + // Allow straus_lookup_table and straus_plookup_table to access the private constructor for efficiency friend class ::bb::stdlib::straus_lookup_table; + friend class ::bb::stdlib::straus_plookup_table; // Private constructor that allows explicit control over infinity flag. // Use public constructors or factory methods instead - they auto-detect infinity from coordinates. @@ -225,6 +242,11 @@ template class cycle_group { static batch_mul_internal_output _fixed_base_batch_mul_internal(std::span scalars, std::span base_points); + static batch_mul_internal_output _fixed_base_plookup_batch_mul_internal( + std::span scalars, + std::span base_points, + std::span offset_generators); + // Internal implementation for unconditional_add and unconditional_subtract cycle_group _unconditional_add_or_subtract(const cycle_group& other, bool is_addition, diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.test.cpp b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.test.cpp index 15416d1a5988..0c48b48a87c9 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.test.cpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.test.cpp @@ -2080,4 +2080,147 @@ TYPED_TEST(CycleGroupTest, TestInfinityAutoDetectionInConstructor) EXPECT_FALSE(builder.failed()); EXPECT_TRUE(CircuitChecker::check(builder)); } + +/** + * @brief Test fixed_batch_mul correctness with constant points and witness scalars + */ +TYPED_TEST(CycleGroupTest, TestFixedBatchMul) +{ + STDLIB_TYPE_ALIASES; + auto builder = Builder(); + + constexpr size_t num_points = 8; + std::vector points; + std::vector scalars; + Element expected = Group::point_at_infinity; + + for (size_t 
i = 0; i < num_points; ++i) { + auto element = TestFixture::generators[i]; + typename Group::Fr scalar = Group::Fr::random_element(&engine); + expected += (element * scalar); + // Points are constant, scalars are witnesses + points.emplace_back(cycle_group_ct(element)); + scalars.emplace_back(cycle_group_ct::cycle_scalar::from_witness(&builder, scalar)); + } + + auto result = cycle_group_ct::fixed_batch_mul(points, scalars); + EXPECT_EQ(result.get_value(), AffineElement(expected)); + + EXPECT_FALSE(builder.failed()); + EXPECT_TRUE(CircuitChecker::check(builder)); +} + +/** + * @brief Test fixed_batch_mul with a single constant point + */ +TYPED_TEST(CycleGroupTest, TestFixedBatchMulSinglePoint) +{ + STDLIB_TYPE_ALIASES; + auto builder = Builder(); + + auto element = TestFixture::generators[0]; + typename Group::Fr scalar = Group::Fr::random_element(&engine); + Element expected = element * scalar; + + std::vector points{ cycle_group_ct(element) }; + std::vector scalars{ + cycle_group_ct::cycle_scalar::from_witness(&builder, scalar) + }; + + auto result = cycle_group_ct::fixed_batch_mul(points, scalars); + EXPECT_EQ(result.get_value(), AffineElement(expected)); + + EXPECT_FALSE(builder.failed()); + EXPECT_TRUE(CircuitChecker::check(builder)); +} + +/** + * @brief Test fixed_batch_mul with a zero scalar + */ +TYPED_TEST(CycleGroupTest, TestFixedBatchMulZeroScalar) +{ + STDLIB_TYPE_ALIASES; + auto builder = Builder(); + + auto element = TestFixture::generators[0]; + typename Group::Fr zero_scalar = 0; + + std::vector points{ cycle_group_ct(element) }; + std::vector scalars{ + cycle_group_ct::cycle_scalar::from_witness(&builder, zero_scalar) + }; + + auto result = cycle_group_ct::fixed_batch_mul(points, scalars); + EXPECT_TRUE(result.is_point_at_infinity().get_value()); + + EXPECT_FALSE(builder.failed()); + EXPECT_TRUE(CircuitChecker::check(builder)); +} + +/** + * @brief Profiling comparison: fixed_batch_mul (plookup) vs batch_mul (ROM) for constant points + * @details 
Both approaches compute the same MSM on constant base points with witness scalars. + * fixed_batch_mul should use significantly fewer gates due to zero table construction + * and zero finalization overhead. + */ +TYPED_TEST(CycleGroupTest, TestFixedBatchMulGateComparison) +{ + STDLIB_TYPE_ALIASES; + + constexpr size_t num_points = 128; + + // Generate random constant points and witness scalars + std::vector native_points; + std::vector native_scalars; + for (size_t i = 0; i < num_points; ++i) { + native_points.push_back(Group::one * Group::Fr::random_element(&engine)); + native_scalars.push_back(Group::Fr::random_element(&engine)); + } + + // Compute expected result natively + Element expected = Group::point_at_infinity; + for (size_t i = 0; i < num_points; ++i) { + expected += native_points[i] * native_scalars[i]; + } + + // --- ROM-based batch_mul --- + size_t rom_gates; + { + Builder rom_builder; + std::vector points; + std::vector scalars; + for (size_t i = 0; i < num_points; ++i) { + points.emplace_back(cycle_group_ct(native_points[i])); + scalars.emplace_back(cycle_group_ct::cycle_scalar::from_witness(&rom_builder, native_scalars[i])); + } + auto result = cycle_group_ct::batch_mul(points, scalars); + EXPECT_EQ(result.get_value(), AffineElement(expected)); + EXPECT_TRUE(CircuitChecker::check(rom_builder)); + rom_gates = rom_builder.get_num_finalized_gates_inefficient(); + } + + // --- Plookup-based fixed_batch_mul --- + size_t plookup_gates; + { + Builder plookup_builder; + std::vector points; + std::vector scalars; + for (size_t i = 0; i < num_points; ++i) { + points.emplace_back(cycle_group_ct(native_points[i])); + scalars.emplace_back(cycle_group_ct::cycle_scalar::from_witness(&plookup_builder, native_scalars[i])); + } + auto result = cycle_group_ct::fixed_batch_mul(points, scalars); + EXPECT_EQ(result.get_value(), AffineElement(expected)); + EXPECT_TRUE(CircuitChecker::check(plookup_builder)); + plookup_gates = 
plookup_builder.get_num_finalized_gates_inefficient(); + } + + info("batch_mul (ROM) gates: ", rom_gates); + info("fixed_batch_mul (plookup) gates: ", plookup_gates); + info("gate savings: ", static_cast(rom_gates) - static_cast(plookup_gates)); + + // fixed_batch_mul should be strictly cheaper than ROM-based batch_mul + EXPECT_LT(plookup_gates, rom_gates); +} + #pragma GCC diagnostic pop diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/straus_plookup_table.cpp b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/straus_plookup_table.cpp new file mode 100644 index 000000000000..2f4bdb718918 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/straus_plookup_table.cpp @@ -0,0 +1,140 @@ +#include "./straus_plookup_table.hpp" +#include "./cycle_group.hpp" +#include "barretenberg/stdlib/primitives/circuit_builders/circuit_builders.hpp" + +namespace bb::stdlib { + +/** + * @brief Construct a plookup-based Straus lookup table for a constant base point. + * + * @details Creates a BasicTable with (1 << table_bits) entries of the form: + * { offset_generator + i * base_point } for i in [0, 1 << table_bits) + * + * The table is pushed directly into the builder's lookup_tables deque. Table data becomes part of the + * proving polynomial (zero gate cost). Each subsequent read costs exactly 1 lookup gate. 
+ * + * @param context The circuit builder + * @param base_point Constant base point (must not be a witness) + * @param offset_generator Offset to prevent point-at-infinity edge cases + * @param table_bits Number of bits per table (table has 1 << table_bits entries) + */ +template +straus_plookup_table::straus_plookup_table(Builder* context, + const AffineElement& base_point, + const AffineElement& offset_generator, + size_t table_bits) + : _context(context) +{ + const size_t table_size = 1UL << table_bits; + + // Compute native table entries using projective coordinates, then batch-normalize + std::vector projective_points(table_size); + projective_points[0] = Element(offset_generator); + Element base_proj(base_point); + for (size_t i = 1; i < table_size; ++i) { + projective_points[i] = projective_points[i - 1] + base_proj; + } + Element::batch_normalize(projective_points.data(), table_size); + + native_table.resize(table_size); + for (size_t i = 0; i < table_size; ++i) { + native_table[i] = AffineElement(projective_points[i].x, projective_points[i].y); + } + + // Create a BasicTable and populate its columns + plookup::BasicTable table; + table.id = plookup::BasicTableId::KECCAK_RHO_9; // unused sentinel; table_index is what matters + table.use_twin_keys = false; + table.column_1_step_size = bb::fr(0); + table.column_2_step_size = bb::fr(0); + table.column_3_step_size = bb::fr(0); + table.get_values_from_key = nullptr; + + table.column_1.resize(table_size); + table.column_2.resize(table_size); + table.column_3.resize(table_size); + for (size_t i = 0; i < table_size; ++i) { + table.column_1[i] = bb::fr(i); + table.column_2[i] = native_table[i].x; + table.column_3[i] = native_table[i].y; + } + + // Assign table_index and push into the builder's lookup_tables deque + table.table_index = context->get_num_lookup_tables(); + auto& tables = context->get_lookup_tables(); + tables.emplace_back(std::move(table)); + _table = &tables.back(); + + // This table is built 
entirely from native constants (base_point and offset_generator are AffineElements), + // so the tag is pure constant. If left as the default FREE_WITNESS, merging with a transcript-tagged + // scalar index in read() would trigger "free witness interacting with origin" errors. + tag = OriginTag::constant(); +} + +/** + * @brief Read from the plookup table at the given index. + * + * @details Creates a single lookup gate that constrains: (index, x, y) is a valid row in this table. + * The index's own witness is reused as wire_1 of the gate (not a new variable), so the gate directly + * constrains the scalar slice to a valid (x, y) point — matching the pattern of + * create_gates_from_plookup_accumulators where key_a_index is reused in the first lookup gate. + * + * @param _index The lookup index (witness or constant field element, typically a 4-bit scalar slice) + * @return cycle_group The point at native_table[index] + */ +template cycle_group straus_plookup_table::read(const field_t& _index) +{ + // A plookup gate key must be a witness; convert constants to a witness constrained to the constant value + // (mirrors the same pattern in straus_lookup_table::read and create_gates_from_plookup_accumulators). 
+ field_t index(_index); + if (index.is_constant()) { + index = field_t::from_witness(_context, _index.get_value()); + index.assert_equal(_index.get_value()); + } + + // Get native index value and look up the corresponding point + auto native_index = static_cast(uint256_t(index.get_value())); + BB_ASSERT(native_index < native_table.size()); + const auto& point = native_table[native_index]; + + // Create witnesses for x and y outputs + auto x_idx = _context->add_variable(point.x); + auto y_idx = _context->add_variable(point.y); + + // Record lookup entry in the table's lookup_gates (needed for read_counts construction) + plookup::BasicTable::LookupEntry entry; + entry.key = { uint256_t(native_index), 0 }; + entry.value = { point.x, point.y }; + _table->lookup_gates.emplace_back(entry); + + // Write lookup gate reusing the index's own witness index as the key (wire_1). + // This matches the pattern in create_gates_from_plookup_accumulators where key_a_index is reused + // in the first (and here only) lookup gate, ensuring the key is the actual scalar slice witness. 
+ auto& block = _context->blocks.lookup; + block.populate_wires(index.get_witness_index(), x_idx, y_idx, _context->zero_idx()); + block.set_gate_selector(1); + block.q_3().emplace_back(bb::fr(_table->table_index)); // table identifier + block.q_2().emplace_back(0); // column_1 step size (0 = standalone lookup) + block.q_m().emplace_back(0); // column_2 step size + block.q_c().emplace_back(0); // column_3 step size + block.q_1().emplace_back(0); + block.q_4().emplace_back(0); + + _context->check_selector_length_consistency(); + _context->increment_num_gates(); + + // Wrap output witnesses in field_t and propagate origin tag from the index + field_t x = field_t::from_witness_index(_context, x_idx); + field_t y = field_t::from_witness_index(_context, y_idx); + OriginTag merged_tag(tag, index.get_origin_tag()); + x.set_origin_tag(merged_tag); + y.set_origin_tag(merged_tag); + + // Result is never at infinity due to offset generator in every table entry + return cycle_group(x, y, /*is_infinity=*/bool_t(_context, false), /*assert_on_curve=*/false); +} + +template class straus_plookup_table; +template class straus_plookup_table; + +} // namespace bb::stdlib diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/straus_plookup_table.hpp b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/straus_plookup_table.hpp new file mode 100644 index 000000000000..0d422deaced0 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/straus_plookup_table.hpp @@ -0,0 +1,54 @@ +#pragma once + +#include "barretenberg/stdlib/primitives/field/field.hpp" +#include "barretenberg/stdlib_circuit_builders/plookup_tables/types.hpp" +#include "barretenberg/transcript/origin_tag.hpp" +#include + +namespace bb::stdlib { + +// Forward declaration +template class cycle_group; + +/** + * @brief straus_plookup_table computes a plookup-based lookup table of size 1 << table_bits + * + * @details For a CONSTANT base_point [P] and offset_generator point [G], 
where N = 1 << table_bits, + * the following is computed: + * + * { [G] + 0.[P], [G] + 1.[P], ..., [G] + (N - 1).[P] } + * + * Unlike straus_lookup_table (which uses ROM tables), this class creates plookup BasicTable entries. + * Plookup tables have zero construction cost (table data is part of the proving polynomial) and each + * read costs exactly 1 lookup gate with no finalization overhead. This makes them significantly cheaper + * than ROM tables for fixed/constant base points. + * + * @note This class requires the base point to be a circuit constant (not a witness). For witness base + * points, use straus_lookup_table instead. + * + * @note The offset generator [G] prevents point-at-infinity edge cases, same as in straus_lookup_table. + */ +template class straus_plookup_table { + public: + using field_t = stdlib::field_t; + using bool_t = stdlib::bool_t; + using Curve = typename Builder::EmbeddedCurve; + using Group = typename Curve::Group; + using Element = typename Curve::Element; + using AffineElement = typename Curve::AffineElement; + + straus_plookup_table() = default; + straus_plookup_table(Builder* context, + const AffineElement& base_point, + const AffineElement& offset_generator, + size_t table_bits); + cycle_group read(const field_t& index); + + private: + Builder* _context = nullptr; + plookup::BasicTable* _table = nullptr; // pointer into builder's lookup_tables deque + std::vector native_table; // precomputed table entries for witness generation + OriginTag tag; +}; + +} // namespace bb::stdlib From 5ca8fa7a906ff3a51d34ca165cedc0edf11899ae Mon Sep 17 00:00:00 2001 From: suyash67 Date: Sat, 4 Apr 2026 07:41:09 +0000 Subject: [PATCH 2/6] ipa verifier uses fixed base batch_mul. 
--- .../cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp | 5 +++-- .../barretenberg/dsl/acir_format/gate_count_constants.hpp | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp index e4a3b8555ca2..bde8c2417f10 100644 --- a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp +++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp @@ -803,11 +803,12 @@ template class IPA } // Compute G_zero - // In the native verifier, this uses pippenger. Here we use batch_mul. + // In the native verifier, this uses pippenger. Here we use fixed_batch_mul since all SRS points are + // circuit constants, which uses plookup tables instead of ROM tables and is significantly cheaper. std::vector srs_elements = vk.get_monomial_points(); BB_ASSERT_GTE(srs_elements.size(), poly_length, "Not enough SRS points for IPA!"); srs_elements.resize(poly_length); - Commitment computed_G_zero = Commitment::batch_mul(srs_elements, s_vec); + Commitment computed_G_zero = Commitment::fixed_batch_mul(srs_elements, s_vec); // check the computed G_zero and the claimed G_zero are the same. 
claimed_G_zero.assert_equal(computed_G_zero); BB_ASSERT_EQ(computed_G_zero.get_value(), claimed_G_zero.get_value(), "G_zero doesn't match received G_zero."); diff --git a/barretenberg/cpp/src/barretenberg/dsl/acir_format/gate_count_constants.hpp b/barretenberg/cpp/src/barretenberg/dsl/acir_format/gate_count_constants.hpp index 1daf3fa3d135..7dc4c5d65a46 100644 --- a/barretenberg/cpp/src/barretenberg/dsl/acir_format/gate_count_constants.hpp +++ b/barretenberg/cpp/src/barretenberg/dsl/acir_format/gate_count_constants.hpp @@ -55,7 +55,7 @@ template inline constexpr size_t ASSERT_EQUALITY = ZERO_GATE // Honk Recursion Constants // ======================================== -inline constexpr size_t ROOT_ROLLUP_GATE_COUNT = 12904885; +inline constexpr size_t ROOT_ROLLUP_GATE_COUNT = 9038443; template constexpr std::tuple HONK_RECURSION_CONSTANTS( From 33e1e07cef6b5d3dbec7435ef037c06fd4487583 Mon Sep 17 00:00:00 2001 From: suyash67 Date: Sun, 5 Apr 2026 09:51:40 +0000 Subject: [PATCH 3/6] use 8-bit tables. --- .../commitment_schemes/ipa/ipa.hpp | 5 +- .../dsl/acir_format/gate_count_constants.hpp | 2 +- .../stdlib/primitives/group/cycle_group.cpp | 32 ++++++---- .../stdlib/primitives/group/cycle_group.hpp | 11 ++-- .../primitives/group/cycle_group.test.cpp | 62 +++++++++++++++---- 5 files changed, 82 insertions(+), 30 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp index bde8c2417f10..b0bd4b72f780 100644 --- a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp +++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp @@ -805,10 +805,13 @@ template class IPA // Compute G_zero // In the native verifier, this uses pippenger. Here we use fixed_batch_mul since all SRS points are // circuit constants, which uses plookup tables instead of ROM tables and is significantly cheaper. 
+ // We use 8-bit tables (table_bits=8, 32 rounds) rather than the default 4-bit (64 rounds) because + // table rows are preprocessed and don't cost witness rows; halving the rounds halves lookup/add gates. + // 8-bit is valid since cycle_scalar::LO_BITS (128) is evenly divisible by 8. std::vector srs_elements = vk.get_monomial_points(); BB_ASSERT_GTE(srs_elements.size(), poly_length, "Not enough SRS points for IPA!"); srs_elements.resize(poly_length); - Commitment computed_G_zero = Commitment::fixed_batch_mul(srs_elements, s_vec); + Commitment computed_G_zero = Commitment::fixed_batch_mul(srs_elements, s_vec, {}, 8); // check the computed G_zero and the claimed G_zero are the same. claimed_G_zero.assert_equal(computed_G_zero); BB_ASSERT_EQ(computed_G_zero.get_value(), claimed_G_zero.get_value(), "G_zero doesn't match received G_zero."); diff --git a/barretenberg/cpp/src/barretenberg/dsl/acir_format/gate_count_constants.hpp b/barretenberg/cpp/src/barretenberg/dsl/acir_format/gate_count_constants.hpp index 7dc4c5d65a46..cc316058617e 100644 --- a/barretenberg/cpp/src/barretenberg/dsl/acir_format/gate_count_constants.hpp +++ b/barretenberg/cpp/src/barretenberg/dsl/acir_format/gate_count_constants.hpp @@ -55,7 +55,7 @@ template inline constexpr size_t ASSERT_EQUALITY = ZERO_GATE // Honk Recursion Constants // ======================================== -inline constexpr size_t ROOT_ROLLUP_GATE_COUNT = 9038443; +inline constexpr size_t ROOT_ROLLUP_GATE_COUNT = 6351560; template constexpr std::tuple HONK_RECURSION_CONSTANTS( diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.cpp b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.cpp index 9dd85c8292d4..852175b3c033 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.cpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.cpp @@ -985,7 +985,8 @@ template typename cycle_group::batch_mul_internal_output 
cycle_group::_fixed_base_plookup_batch_mul_internal( const std::span scalars, const std::span base_points, - const std::span offset_generators) + const std::span offset_generators, + const size_t table_bits) { BB_ASSERT_EQ(!scalars.empty(), true, "Empty scalars provided to fixed base plookup batch mul!"); BB_ASSERT_EQ(scalars.size(), base_points.size(), "Points/scalars size mismatch in fixed base plookup batch mul!"); @@ -999,21 +1000,27 @@ typename cycle_group::batch_mul_internal_output cycle_group::_ } } BB_ASSERT(context != nullptr); + BB_ASSERT_EQ(cycle_scalar::LO_BITS % table_bits, + 0UL, + "table_bits must evenly divide cycle_scalar::LO_BITS. The Straus algorithm splits the scalar " + "into lo/hi limbs and decomposes each separately; if LO_BITS is not a multiple of table_bits, " + "the hi-limb slices start at the wrong bit-offset and the MSM result is incorrect. " + "Valid values for table_bits (given LO_BITS=128) are: 1, 2, 4, 8, 16, 32, 64, 128."); - constexpr size_t num_rounds = numeric::ceil_div(cycle_scalar::NUM_BITS, ROM_TABLE_BITS); + const size_t num_rounds = numeric::ceil_div(cycle_scalar::NUM_BITS, table_bits); - // Decompose each scalar into ROM_TABLE_BITS-bit slices (also enforces range constraints) + // Decompose each scalar into table_bits-bit slices (also enforces range constraints) std::vector scalar_slices; scalar_slices.reserve(num_points); for (const auto& scalar : scalars) { - scalar_slices.emplace_back(context, scalar, ROM_TABLE_BITS); + scalar_slices.emplace_back(context, scalar, table_bits); } // Create plookup tables for each constant base point (zero gate cost) std::vector point_tables; point_tables.reserve(num_points); for (size_t i = 0; i < num_points; ++i) { - point_tables.emplace_back(context, base_points[i], offset_generators[i + 1], ROM_TABLE_BITS); + point_tables.emplace_back(context, base_points[i], offset_generators[i + 1], table_bits); } // Compute all intermediate points natively for use as hints in the in-circuit Straus 
algorithm. @@ -1024,7 +1031,7 @@ typename cycle_group::batch_mul_internal_output cycle_group::_ // Build native straus tables std::vector> native_straus_tables; for (size_t i = 0; i < num_points; ++i) { - std::vector table(1UL << ROM_TABLE_BITS); + std::vector table(1UL << table_bits); table[0] = Element(offset_generators[i + 1]); Element base_proj(base_points[i]); for (size_t j = 1; j < table.size(); ++j) { @@ -1037,7 +1044,7 @@ typename cycle_group::batch_mul_internal_output cycle_group::_ Element accumulator = offset_generators[0]; for (size_t i = 0; i < num_rounds; ++i) { if (i != 0) { - for (size_t j = 0; j < ROM_TABLE_BITS; ++j) { + for (size_t j = 0; j < table_bits; ++j) { accumulator = accumulator.dbl(); operation_transcript.push_back(accumulator); offset_generator_accumulator = offset_generator_accumulator.dbl(); @@ -1067,7 +1074,7 @@ typename cycle_group::batch_mul_internal_output cycle_group::_ for (size_t i = 0; i < num_rounds; ++i) { if (i != 0) { - for (size_t j = 0; j < ROM_TABLE_BITS; ++j) { + for (size_t j = 0; j < table_bits; ++j) { accumulator = accumulator.dbl(*hint_ptr); hint_ptr++; } @@ -1101,7 +1108,8 @@ typename cycle_group::batch_mul_internal_output cycle_group::_ template cycle_group cycle_group::fixed_batch_mul(const std::vector& constant_points, const std::vector& scalars, - const GeneratorContext& context) + const GeneratorContext& context, + const size_t table_bits) { BB_ASSERT_EQ(scalars.size(), constant_points.size(), "Points/scalars size mismatch in fixed_batch_mul!"); @@ -1131,12 +1139,12 @@ cycle_group cycle_group::fixed_batch_mul(const std::vectorcreate_limbed_range_constraint(scalar.lo().get_witness_index(), cycle_scalar::LO_BITS, - ROM_TABLE_BITS, + table_bits, "fixed_batch_mul: lo range constraint for scalar with constant " "infinity"); ctx->create_limbed_range_constraint(scalar.hi().get_witness_index(), cycle_scalar::HI_BITS, - ROM_TABLE_BITS, + table_bits, "fixed_batch_mul: hi range constraint for scalar with constant " 
"infinity"); continue; @@ -1161,7 +1169,7 @@ cycle_group cycle_group::fixed_batch_mul(const std::vector class cycle_group { static cycle_group fixed_batch_mul(const std::vector& constant_points, const std::vector& scalars, - GeneratorContext context = {}) + GeneratorContext context = {}, + size_t table_bits = ROM_TABLE_BITS) { std::vector cycle_scalars; for (auto scalar : scalars) { cycle_scalars.emplace_back(scalar); } - return fixed_batch_mul(constant_points, cycle_scalars, context); + return fixed_batch_mul(constant_points, cycle_scalars, context, table_bits); } static cycle_group fixed_batch_mul(const std::vector& constant_points, const std::vector& scalars, - const GeneratorContext& context = {}); + const GeneratorContext& context = {}, + size_t table_bits = ROM_TABLE_BITS); cycle_group operator*(const cycle_scalar& scalar) const; cycle_group& operator*=(const cycle_scalar& scalar); cycle_group operator*(const BigScalarField& scalar) const; @@ -245,7 +247,8 @@ template class cycle_group { static batch_mul_internal_output _fixed_base_plookup_batch_mul_internal( std::span scalars, std::span base_points, - std::span offset_generators); + std::span offset_generators, + size_t table_bits = ROM_TABLE_BITS); // Internal implementation for unconditional_add and unconditional_subtract cycle_group _unconditional_add_or_subtract(const cycle_group& other, diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.test.cpp b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.test.cpp index 0c48b48a87c9..568be7e95d8c 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.test.cpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.test.cpp @@ -2123,9 +2123,8 @@ TYPED_TEST(CycleGroupTest, TestFixedBatchMulSinglePoint) Element expected = element * scalar; std::vector points{ cycle_group_ct(element) }; - std::vector scalars{ - cycle_group_ct::cycle_scalar::from_witness(&builder, 
scalar) - }; + std::vector scalars{ cycle_group_ct::cycle_scalar::from_witness(&builder, + scalar) }; auto result = cycle_group_ct::fixed_batch_mul(points, scalars); EXPECT_EQ(result.get_value(), AffineElement(expected)); @@ -2146,9 +2145,8 @@ TYPED_TEST(CycleGroupTest, TestFixedBatchMulZeroScalar) typename Group::Fr zero_scalar = 0; std::vector points{ cycle_group_ct(element) }; - std::vector scalars{ - cycle_group_ct::cycle_scalar::from_witness(&builder, zero_scalar) - }; + std::vector scalars{ cycle_group_ct::cycle_scalar::from_witness( + &builder, zero_scalar) }; auto result = cycle_group_ct::fixed_batch_mul(points, scalars); EXPECT_TRUE(result.is_point_at_infinity().get_value()); @@ -2183,8 +2181,18 @@ TYPED_TEST(CycleGroupTest, TestFixedBatchMulGateComparison) expected += native_points[i] * native_scalars[i]; } + // Helper: sum all lookup table rows across the builder's registered tables + auto total_table_rows = [](const Builder& b) { + size_t total = 0; + for (const auto& table : b.get_lookup_tables()) { + total += table.size(); + } + return total; + }; + // --- ROM-based batch_mul --- size_t rom_gates; + size_t rom_table_rows; { Builder rom_builder; std::vector points; @@ -2196,11 +2204,13 @@ TYPED_TEST(CycleGroupTest, TestFixedBatchMulGateComparison) auto result = cycle_group_ct::batch_mul(points, scalars); EXPECT_EQ(result.get_value(), AffineElement(expected)); EXPECT_TRUE(CircuitChecker::check(rom_builder)); + rom_table_rows = total_table_rows(rom_builder); rom_gates = rom_builder.get_num_finalized_gates_inefficient(); } - // --- Plookup-based fixed_batch_mul --- + // --- Plookup-based fixed_batch_mul (4-bit) --- size_t plookup_gates; + size_t plookup_table_rows; { Builder plookup_builder; std::vector points; @@ -2209,18 +2219,46 @@ TYPED_TEST(CycleGroupTest, TestFixedBatchMulGateComparison) points.emplace_back(cycle_group_ct(native_points[i])); scalars.emplace_back(cycle_group_ct::cycle_scalar::from_witness(&plookup_builder, native_scalars[i])); } - 
auto result = cycle_group_ct::fixed_batch_mul(points, scalars); + auto result = cycle_group_ct::fixed_batch_mul(points, scalars); // default table_bits=4 EXPECT_EQ(result.get_value(), AffineElement(expected)); EXPECT_TRUE(CircuitChecker::check(plookup_builder)); + plookup_table_rows = total_table_rows(plookup_builder); plookup_gates = plookup_builder.get_num_finalized_gates_inefficient(); } - info("batch_mul (ROM) gates: ", rom_gates); - info("fixed_batch_mul (plookup) gates: ", plookup_gates); - info("gate savings: ", static_cast(rom_gates) - static_cast(plookup_gates)); + // --- Plookup-based fixed_batch_mul with 8-bit tables --- + // 8-bit is valid since cycle_scalar::LO_BITS (128) is divisible by 8. + // Table rows are preprocessed (not witness rows) so increasing table size from 16→256 entries + // is essentially free; only the number of rounds (64→32) and reads/adds matter for gate count. + size_t plookup_8bit_gates; + size_t plookup_8bit_table_rows; + { + Builder plookup_builder; + std::vector points; + std::vector scalars; + for (size_t i = 0; i < num_points; ++i) { + points.emplace_back(cycle_group_ct(native_points[i])); + scalars.emplace_back(cycle_group_ct::cycle_scalar::from_witness(&plookup_builder, native_scalars[i])); + } + auto result = cycle_group_ct::fixed_batch_mul(points, scalars, {}, 8); + EXPECT_EQ(result.get_value(), AffineElement(expected)); + EXPECT_TRUE(CircuitChecker::check(plookup_builder)); + plookup_8bit_table_rows = total_table_rows(plookup_builder); + plookup_8bit_gates = plookup_builder.get_num_finalized_gates_inefficient(); + } + + info(" gates table_rows"); + info("batch_mul (ROM, 4-bit): ", rom_gates, " ", rom_table_rows); + info("fixed_batch_mul (4-bit): ", plookup_gates, " ", plookup_table_rows); + info("fixed_batch_mul (8-bit): ", plookup_8bit_gates, " ", plookup_8bit_table_rows); + info("4-bit savings vs ROM: ", static_cast(rom_gates) - static_cast(plookup_gates)); + info("8-bit savings vs ROM: ", static_cast(rom_gates) - 
static_cast(plookup_8bit_gates)); + info("8-bit savings vs 4-bit: ", + static_cast(plookup_gates) - static_cast(plookup_8bit_gates)); - // fixed_batch_mul should be strictly cheaper than ROM-based batch_mul + // Both plookup variants should be cheaper than ROM; 8-bit should be cheapest EXPECT_LT(plookup_gates, rom_gates); + EXPECT_LT(plookup_8bit_gates, plookup_gates); } #pragma GCC diagnostic pop From ee3f550172bc85e2ab5a18ea71e088f28d22a392 Mon Sep 17 00:00:00 2001 From: suyash67 Date: Sun, 5 Apr 2026 10:19:17 +0000 Subject: [PATCH 4/6] get below 2^23. --- .../barretenberg/commitment_schemes/ipa/ipa.hpp | 16 ++++++++++++---- .../dsl/acir_format/gate_count_constants.hpp | 2 +- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp index b0bd4b72f780..4aa5d23bd9c8 100644 --- a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp +++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp @@ -805,13 +805,21 @@ template class IPA // Compute G_zero // In the native verifier, this uses pippenger. Here we use fixed_batch_mul since all SRS points are // circuit constants, which uses plookup tables instead of ROM tables and is significantly cheaper. - // We use 8-bit tables (table_bits=8, 32 rounds) rather than the default 4-bit (64 rounds) because - // table rows are preprocessed and don't cost witness rows; halving the rounds halves lookup/add gates. - // 8-bit is valid since cycle_scalar::LO_BITS (128) is evenly divisible by 8. + // We use 8-bit tables (table_bits=8, 32 rounds) to minimise gate count. However, with N=32768 SRS points + // and 8-bit tables, the total table rows = 32768 × 256 = 2^23 exactly. The 5 mandatory overhead rows + // (NUM_DISABLED_ROWS_IN_SUMCHECK=4, NUM_ZERO_ROWS=1) push the total to 2^23+5, forcing dyadic_size = 2^24. 
+ // To stay within 2^23 we handle the first SRS point separately with a 4-bit table (16 entries instead of + // 256): total table rows = 16 + 32767×256 = 8,388,368 < 2^23, giving dyadic_size = 2^23 and ~2× speedup. std::vector srs_elements = vk.get_monomial_points(); BB_ASSERT_GTE(srs_elements.size(), poly_length, "Not enough SRS points for IPA!"); srs_elements.resize(poly_length); - Commitment computed_G_zero = Commitment::fixed_batch_mul(srs_elements, s_vec, {}, 8); + std::vector first_srs_point(1, srs_elements[0]); + std::vector first_s_scalar(1, s_vec[0]); + std::vector remaining_srs(srs_elements.begin() + 1, srs_elements.end()); + std::vector remaining_s(s_vec.begin() + 1, s_vec.end()); + Commitment first_term = Commitment::fixed_batch_mul(first_srs_point, first_s_scalar, {}, /*table_bits=*/4); + Commitment remaining_term = Commitment::fixed_batch_mul(remaining_srs, remaining_s, {}, /*table_bits=*/8); + Commitment computed_G_zero = first_term.unconditional_add(remaining_term); // check the computed G_zero and the claimed G_zero are the same. 
claimed_G_zero.assert_equal(computed_G_zero); BB_ASSERT_EQ(computed_G_zero.get_value(), claimed_G_zero.get_value(), "G_zero doesn't match received G_zero."); diff --git a/barretenberg/cpp/src/barretenberg/dsl/acir_format/gate_count_constants.hpp b/barretenberg/cpp/src/barretenberg/dsl/acir_format/gate_count_constants.hpp index cc316058617e..89cf4baf1754 100644 --- a/barretenberg/cpp/src/barretenberg/dsl/acir_format/gate_count_constants.hpp +++ b/barretenberg/cpp/src/barretenberg/dsl/acir_format/gate_count_constants.hpp @@ -55,7 +55,7 @@ template inline constexpr size_t ASSERT_EQUALITY = ZERO_GATE // Honk Recursion Constants // ======================================== -inline constexpr size_t ROOT_ROLLUP_GATE_COUNT = 6351560; +inline constexpr size_t ROOT_ROLLUP_GATE_COUNT = 6351579; template constexpr std::tuple HONK_RECURSION_CONSTANTS( From 6d90fae423daa5d55b9dd7b882da9e1efab66d0d Mon Sep 17 00:00:00 2001 From: suyash67 Date: Sun, 5 Apr 2026 11:09:33 +0000 Subject: [PATCH 5/6] use operator* for first point. --- .../cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp index 4aa5d23bd9c8..85478908e02d 100644 --- a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp +++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp @@ -808,16 +808,13 @@ template class IPA // We use 8-bit tables (table_bits=8, 32 rounds) to minimise gate count. However, with N=32768 SRS points // and 8-bit tables, the total table rows = 32768 × 256 = 2^23 exactly. The 5 mandatory overhead rows // (NUM_DISABLED_ROWS_IN_SUMCHECK=4, NUM_ZERO_ROWS=1) push the total to 2^23+5, forcing dyadic_size = 2^24. 
- // To stay within 2^23 we handle the first SRS point separately with a 4-bit table (16 entries instead of - // 256): total table rows = 16 + 32767×256 = 8,388,368 < 2^23, giving dyadic_size = 2^23 and ~2× speedup. + // To stay within 2^23 we handle the first SRS point separately using operator*. std::vector srs_elements = vk.get_monomial_points(); BB_ASSERT_GTE(srs_elements.size(), poly_length, "Not enough SRS points for IPA!"); srs_elements.resize(poly_length); - std::vector first_srs_point(1, srs_elements[0]); - std::vector first_s_scalar(1, s_vec[0]); std::vector remaining_srs(srs_elements.begin() + 1, srs_elements.end()); std::vector remaining_s(s_vec.begin() + 1, s_vec.end()); - Commitment first_term = Commitment::fixed_batch_mul(first_srs_point, first_s_scalar, {}, /*table_bits=*/4); + Commitment first_term = srs_elements[0] * s_vec[0]; Commitment remaining_term = Commitment::fixed_batch_mul(remaining_srs, remaining_s, {}, /*table_bits=*/8); Commitment computed_G_zero = first_term.unconditional_add(remaining_term); // check the computed G_zero and the claimed G_zero are the same. From 7b62fea1d13adc7920820bd3a16cc21c848d2b05 Mon Sep 17 00:00:00 2001 From: suyash67 Date: Mon, 6 Apr 2026 09:46:52 +0000 Subject: [PATCH 6/6] bench root rollup circuit. 
--- .../src/barretenberg/benchmark/CMakeLists.txt | 1 + .../root_rollup_bench/CMakeLists.txt | 9 + .../root_rollup_bench/root_rollup.bench.cpp | 250 ++++++++++++ .../primitives/group/STRAUS_MSM_ALGORITHM.md | 385 ++++++++++++++++++ 4 files changed, 645 insertions(+) create mode 100644 barretenberg/cpp/src/barretenberg/benchmark/root_rollup_bench/CMakeLists.txt create mode 100644 barretenberg/cpp/src/barretenberg/benchmark/root_rollup_bench/root_rollup.bench.cpp create mode 100644 barretenberg/cpp/src/barretenberg/stdlib/primitives/group/STRAUS_MSM_ALGORITHM.md diff --git a/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt b/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt index 12d08f15e49a..890382d5e867 100644 --- a/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt +++ b/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt @@ -11,3 +11,4 @@ add_subdirectory(indexed_tree_bench) add_subdirectory(append_only_tree_bench) add_subdirectory(ultra_bench) add_subdirectory(circuit_construction_bench) +add_subdirectory(root_rollup_bench) diff --git a/barretenberg/cpp/src/barretenberg/benchmark/root_rollup_bench/CMakeLists.txt b/barretenberg/cpp/src/barretenberg/benchmark/root_rollup_bench/CMakeLists.txt new file mode 100644 index 000000000000..b0eb686766f7 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/benchmark/root_rollup_bench/CMakeLists.txt @@ -0,0 +1,9 @@ +barretenberg_module( + root_rollup_bench + dsl + ultra_honk +) + +if(NOT WASM AND NOT FUZZING) + target_link_libraries(root_rollup_bench PRIVATE vm2_stub) +endif() diff --git a/barretenberg/cpp/src/barretenberg/benchmark/root_rollup_bench/root_rollup.bench.cpp b/barretenberg/cpp/src/barretenberg/benchmark/root_rollup_bench/root_rollup.bench.cpp new file mode 100644 index 000000000000..9f92205ae23f --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/benchmark/root_rollup_bench/root_rollup.bench.cpp @@ -0,0 +1,250 @@ +/** + * @brief Benchmark for root rollup circuit proving and 
verification. + * + * Constructs the root rollup circuit (2 recursive Honk verifications + IPA verification) + * and benchmarks the full UltraZK proving pipeline. This is the same circuit as the + * GateCountRootRollup test in honk_recursion_constraint.test.cpp, but exercises the prover. + * + * Usage: + * HARDWARE_CONCURRENCY=32 ./bin/root_rollup_bench + */ +#include +#include + +#include "barretenberg/common/bb_bench.hpp" +#include "barretenberg/dsl/acir_format/acir_format.hpp" +#include "barretenberg/dsl/acir_format/acir_to_constraint_buf.hpp" +#include "barretenberg/dsl/acir_format/honk_recursion_constraint.hpp" +#include "barretenberg/dsl/acir_format/recursion_constraint.hpp" +#include "barretenberg/dsl/acir_format/serde/index.hpp" +#include "barretenberg/dsl/acir_format/utils.hpp" +#include "barretenberg/dsl/acir_format/witness_constant.hpp" +#include "barretenberg/numeric/uint256/uint256.hpp" +#include "barretenberg/special_public_inputs/special_public_inputs.hpp" +#include "barretenberg/stdlib/primitives/circuit_builders/circuit_builders.hpp" +#include "barretenberg/stdlib_circuit_builders/mock_circuits.hpp" +#include "barretenberg/ultra_honk/prover_instance.hpp" +#include "barretenberg/ultra_honk/ultra_prover.hpp" +#include "barretenberg/ultra_honk/ultra_verifier.hpp" + +using namespace acir_format; +using namespace bb; + +namespace { + +using RecursiveFlavor = UltraRecursiveFlavor_; +using InnerFlavor = RecursiveFlavor::NativeFlavor; +using InnerBuilder = InnerFlavor::CircuitBuilder; +using InnerIO = bb::stdlib::recursion::honk::RollupIO; +using InnerProverInstance = ProverInstance_; +using InnerVerificationKey = InnerFlavor::VerificationKey; +using InnerProver = UltraProver_; + +static constexpr size_t NUM_PUBLIC_INPUTS = 2; +static constexpr uint32_t INNER_PROOF_TYPE = ROLLUP_HONK; + +// Helpers to convert RecursionConstraint -> Acir::Opcode (extracted from test_class.hpp to avoid gtest dependency) +Acir::FunctionInput witness_to_function_input(uint32_t 
witness_index) +{ + return Acir::FunctionInput{ .value = + Acir::FunctionInput::Witness{ .value = Acir::Witness{ .value = witness_index } } }; +} + +Acir::FunctionInput witness_or_constant_to_function_input(const WitnessOrConstant& input) +{ + if (input.is_constant) { + return Acir::FunctionInput{ .value = Acir::FunctionInput::Constant{ .value = input.value.to_buffer() } }; + } + return Acir::FunctionInput{ .value = + Acir::FunctionInput::Witness{ .value = Acir::Witness{ .value = input.index } } }; +} + +Acir::Opcode recursion_constraint_to_acir_opcode(const RecursionConstraint& constraint) +{ + std::vector verification_key; + for (const auto& key_idx : constraint.key) { + verification_key.push_back(witness_to_function_input(key_idx)); + } + std::vector proof; + for (const auto& proof_idx : constraint.proof) { + proof.push_back(witness_to_function_input(proof_idx)); + } + std::vector public_inputs; + for (const auto& pub_input_idx : constraint.public_inputs) { + public_inputs.push_back(witness_to_function_input(pub_input_idx)); + } + return Acir::Opcode{ .value = Acir::Opcode::BlackBoxFuncCall{ + .value = Acir::BlackBoxFuncCall{ + .value = Acir::BlackBoxFuncCall::RecursiveAggregation{ + .verification_key = std::move(verification_key), + .proof = std::move(proof), + .public_inputs = std::move(public_inputs), + .key_hash = witness_to_function_input(constraint.key_hash), + .proof_type = constraint.proof_type, + .predicate = witness_or_constant_to_function_input(constraint.predicate), + } } } }; +} + +AcirFormat constraints_to_acir_format(const std::vector& constraints) +{ + std::vector opcodes; + for (const auto& c : constraints) { + opcodes.push_back(recursion_constraint_to_acir_opcode(c)); + } + Acir::Circuit circuit{ + .function_name = "root_rollup_bench", + .opcodes = opcodes, + .private_parameters = {}, + .public_parameters = Acir::PublicInputs{ .value = {} }, + .return_values = Acir::PublicInputs{ .value = {} }, + .assert_messages = {}, + }; + return 
circuit_serde_to_acir_format(circuit); +} + +InnerBuilder create_inner_circuit() +{ + InnerBuilder builder; + MockCircuits::add_arithmetic_gates(builder); + MockCircuits::add_lookup_gates(builder); + for (size_t idx = 0; idx < NUM_PUBLIC_INPUTS; idx++) { + builder.add_public_variable(InnerBuilder::FF::random_element()); + } + InnerIO::add_default(builder); + return builder; +} + +std::pair circuit_to_recursion_constraint(InnerBuilder& builder) +{ + for (size_t idx = builder.num_public_inputs(); idx < NUM_PUBLIC_INPUTS; idx++) { + builder.add_public_variable(InnerBuilder::FF::random_element()); + } + auto prover_instance = std::make_shared(builder); + auto verification_key = std::make_shared(prover_instance->get_precomputed()); + InnerProver prover(prover_instance, verification_key); + auto proof = prover.construct_proof(); + + WitnessVector witness_values; + RecursionConstraint constraint = recursion_data_to_recursion_constraint(witness_values, + proof, + verification_key->to_field_elements(), + verification_key->hash(), + bb::fr::one(), + builder.num_public_inputs() - + InnerIO::PUBLIC_INPUTS_SIZE, + INNER_PROOF_TYPE); + return { constraint, witness_values }; +} + +void generate_root_rollup_constraints(std::vector& honk_recursion_constraints, + WitnessVector& witness_values) +{ + std::vector constraints; + std::vector witness_vectors; + + for (size_t idx = 0; idx < 2; idx++) { + auto builder = create_inner_circuit(); + auto [constraint, witnesses] = circuit_to_recursion_constraint(builder); + constraints.emplace_back(std::move(constraint)); + witness_vectors.emplace_back(std::move(witnesses)); + } + + for (auto [constraint, witnesses] : zip_view(constraints, witness_vectors)) { + uint32_t offset = static_cast(witness_values.size()); + auto shift = [&offset](std::vector& indices) { + for (auto& index : indices) { + index += offset; + } + }; + shift(constraint.key); + shift(constraint.proof); + shift(constraint.public_inputs); + constraint.key_hash += offset; + 
constraint.predicate.index += offset; + constraint.proof_type = static_cast(ROOT_ROLLUP_HONK); + witness_values.insert(witness_values.end(), witnesses.begin(), witnesses.end()); + } + + honk_recursion_constraints = std::move(constraints); +} + +size_t get_peak_rss_mib() +{ + struct rusage usage {}; + getrusage(RUSAGE_SELF, &usage); + return static_cast(usage.ru_maxrss) / 1024; // Linux: ru_maxrss is in KB +} + +} // namespace + +static void root_rollup_prove(benchmark::State& state) +{ + bb::srs::init_file_crs_factory(bb::srs::bb_crs_path()); + + for (auto _ : state) { + state.PauseTiming(); + + info("Generating root rollup constraints (2 inner circuits)..."); + std::vector constraints; + WitnessVector witness_values; + generate_root_rollup_constraints(constraints, witness_values); + + info("Building outer circuit..."); + AcirFormat constraint_system = constraints_to_acir_format(constraints); + AcirProgram program{ constraint_system, witness_values }; + ProgramMetadata metadata{ .has_ipa_claim = false }; + auto builder = create_circuit(program, metadata); + + size_t num_gates = builder.get_num_finalized_gates_inefficient(); + info("Root rollup circuit: ", num_gates, " gates"); + + info("Creating prover instance..."); + auto prover_instance = std::make_shared>(builder); + auto verification_key = + std::make_shared(prover_instance->get_precomputed()); + + size_t dyadic_size = prover_instance->dyadic_size(); + info("Dyadic size: ", dyadic_size, " (log2: ", numeric::get_msb(dyadic_size), ")"); + + size_t rss_before = get_peak_rss_mib(); + info("Peak RSS before proving: ", rss_before, " MiB"); + + UltraZKProver prover(prover_instance, verification_key); + + info("Starting proof construction..."); + state.ResumeTiming(); + + auto proof = prover.construct_proof(); + + state.PauseTiming(); + + size_t rss_after = get_peak_rss_mib(); + info("Peak RSS after proving: ", rss_after, " MiB"); + + info("Verifying proof..."); + auto vk_and_hash = std::make_shared(verification_key); 
+ UltraZKVerifier verifier(vk_and_hash); + auto output = verifier.verify_proof(proof); + info(output.result ? "Proof verified successfully" : "ERROR: Proof verification FAILED"); + + state.ResumeTiming(); + } +} + +BENCHMARK(root_rollup_prove)->Unit(benchmark::kMillisecond)->Iterations(1); + +int main(int argc, char** argv) +{ + bb::detail::use_bb_bench = true; + + ::benchmark::Initialize(&argc, argv); + if (::benchmark::ReportUnrecognizedArguments(argc, argv)) + return 1; + ::benchmark::RunSpecifiedBenchmarks(); + ::benchmark::Shutdown(); + + std::cout << "\n=== Detailed BB_BENCH Profiling Stats ===\n"; + bb::detail::GLOBAL_BENCH_STATS.print_aggregate_counts_hierarchical(std::cout); + + return 0; +} diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/STRAUS_MSM_ALGORITHM.md b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/STRAUS_MSM_ALGORITHM.md new file mode 100644 index 000000000000..9d4cf7dfd3b0 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/STRAUS_MSM_ALGORITHM.md @@ -0,0 +1,385 @@ +# Straus Multi-Scalar Multiplication: Precise Mathematical Description + +This document describes the exact step-by-step mathematics of the Straus MSM algorithm as +implemented in `cycle_group` (`cycle_group.cpp`). All notation is made concrete so that every +formula corresponds to an exact line of code. + +--- + +## 1. Problem Statement + +Given $N$ elliptic curve points $P_0, P_1, \ldots, P_{N-1}$ on the Grumpkin curve and $N$ +scalars $s_0, s_1, \ldots, s_{N-1}$ in the Grumpkin scalar field +$\mathbb{F}_r$ (equivalently, the BN254 base field), compute in-circuit the multi-scalar +multiplication (MSM): + +$$\text{MSM} = \sum_{j=0}^{N-1} s_j \cdot P_j$$ + +--- + +## 2. 
Parameters + +| Symbol | Value | Source | +| -------------------- | ---------------------------- | ------------------------ | +| $\texttt{NUM\_BITS}$ | $254$ | `cycle_scalar::NUM_BITS` | +| $\texttt{LO\_BITS}$ | $128$ | `cycle_scalar::LO_BITS` | +| $\texttt{HI\_BITS}$ | $126$ | `cycle_scalar::HI_BITS` | +| $w$ | $4$ | `ROM_TABLE_BITS` | +| $R$ | $\lceil 254 / 4 \rceil = 64$ | `num_rounds` | +| $T$ | $2^w = 16$ | table size per point | + +--- + +## 3. Scalar Representation (`cycle_scalar`) + +Each 254-bit scalar $s \in \mathbb{F}_r$ is split into two **limbs**: + +$$s = s_{\text{lo}} + 2^{128} \cdot s_{\text{hi}}$$ + +where $s_{\text{lo}} \in [0, 2^{128})$ is a 128-bit integer and $s_{\text{hi}} \in [0, 2^{126})$ +is a 126-bit integer. Both limbs are represented as native `field_t` circuit elements. Crucially, +the range constraints on these limbs are **deferred** to the MSM algorithm — `cycle_scalar` +alone does not add range-constraint gates. + +--- + +## 4. Scalar Decomposition into $w$-Bit Slices (`straus_scalar_slices`) + +Each limb is independently decomposed into $w = 4$ bit slices via `create_limbed_range_constraint`, +which simultaneously performs the decomposition and enforces the range constraint in-circuit. + +### 4.1 Lo-limb slices + +$s_{\text{lo}}$ (128 bits, exactly divisible by 4) is split into $128/4 = 32$ slices: + +$$s_{\text{lo},k} = \left\lfloor \frac{s_{\text{lo}}}{16^k} \right\rfloor \bmod 16, \quad k = 0, 1, \ldots, 31$$ + +Each slice satisfies $s_{\text{lo},k} \in \{0,\ldots,15\}$. 
The lo-limb is reconstructed as: + +$$s_{\text{lo}} = \sum_{k=0}^{31} s_{\text{lo},k} \cdot 16^k$$ + +### 4.2 Hi-limb slices + +$s_{\text{hi}}$ (126 bits, $126 = 31 \cdot 4 + 2$) is split into 32 slices: + +$$s_{\text{hi},k} = \left\lfloor \frac{s_{\text{hi}}}{16^k} \right\rfloor \bmod 16, \quad k = 0, 1, \ldots, 30$$ +$$s_{\text{hi},31} = \left\lfloor \frac{s_{\text{hi}}}{16^{31}} \right\rfloor \bmod 4 \quad\text{(2-bit slice)}$$ + +All slices $s_{\text{hi},k} \in \{0,\ldots,15\}$ for $k \le 30$; the final slice $s_{\text{hi},31} \in \{0,1,2,3\}$. +The hi-limb is reconstructed as: + +$$s_{\text{hi}} = \sum_{k=0}^{31} s_{\text{hi},k} \cdot 16^k$$ + +### 4.3 Unified slice vector + +The two decompositions are concatenated into a single vector of $R = 64$ slices: + +$$\sigma[k] = \begin{cases} s_{\text{lo},k} & 0 \le k \le 31 \\ s_{\text{hi},k-32} & 32 \le k \le 63 \end{cases}$$ + +The full scalar reconstruct identity is: + +$$s = \sum_{k=0}^{63} \sigma[k] \cdot 16^k$$ + +because: + +$$\sum_{k=0}^{63} \sigma[k] \cdot 16^k = \sum_{k=0}^{31} s_{\text{lo},k} \cdot 16^k + \sum_{k=0}^{31} s_{\text{hi},k} \cdot 16^{k+32} = s_{\text{lo}} + 2^{128} \cdot s_{\text{hi}} = s$$ + +At step $i$ of the Straus loop (0-indexed, MSB-first), the slice accessed is: + +$$\sigma_{\text{round}(i)} = \sigma[R - 1 - i] = \sigma[63 - i]$$ + +so round $i = 0$ processes the **most significant** slice $\sigma[63]$, and round $i = 63$ processes +the **least significant** slice $\sigma[0]$. + +--- + +## 5. Lookup Table Construction + +For each point $P_j$ with associated offset generator $G_{j+1}$, a lookup table $\mathcal{T}_j$ of +size $T = 16$ is precomputed: + +$$\mathcal{T}_j[v] = G_{j+1} + v \cdot P_j, \quad v = 0, 1, \ldots, 15$$ + +The offset generator $G_{j+1}$ is drawn from a domain-separated hash-to-curve +(`"cycle_group_offset_generator"`) and is linearly independent of all $P_j$ and of each other. 
+It ensures $\mathcal{T}_j[0] = G_{j+1} \ne \mathcal{O}$, preventing the point-at-infinity edge +case when a slice value is zero. + +Two implementations exist with different circuit costs: + +| Implementation | Table stored as | Construction cost | Read cost | +| ------------------------------------- | ----------------------------------------- | ----------------------------------------------------------- | ---------------------- | +| `straus_lookup_table` (variable-base) | ROM array (witnesses) | $15$ `unconditional_add` gates per table + ROM finalization | 1 ROM gate per read | +| `straus_plookup_table` (fixed-base) | Plookup `BasicTable` (proving polynomial) | **0 gates** (table data is not in the trace) | 1 lookup gate per read | + +### 5.1 Full Table Structure for $w = 4$ bits + +For a single base point $P$ and offset generator $G$, the table $\mathcal{T}$ has $2^4 = 16$ entries. + +**Construction (projective arithmetic, then batch-normalize):** + +$$\mathcal{T}[0] = G, \quad \mathcal{T}[v] = \mathcal{T}[v-1] + P \text{ for } v = 1, \ldots, 15$$ + +**Complete 16-entry table ($w = 4$):** + +| Index $v$ | Binary $v_3 v_2 v_1 v_0$ | Table entry $\mathcal{T}[v] = G + v \cdot P$ | +| --------- | ------------------------- | --------------------------------------------- | +| 0 | `0000` | $G$ | +| 1 | `0001` | $G + P$ | +| 2 | `0010` | $G + 2P$ | +| 3 | `0011` | $G + 3P$ | +| 4 | `0100` | $G + 4P$ | +| 5 | `0101` | $G + 5P$ | +| 6 | `0110` | $G + 6P$ | +| 7 | `0111` | $G + 7P$ | +| 8 | `1000` | $G + 8P$ | +| 9 | `1001` | $G + 9P$ | +| 10 | `1010` | $G + 10P$ | +| 11 | `1011` | $G + 11P$ | +| 12 | `1100` | $G + 12P$ | +| 13 | `1101` | $G + 13P$ | +| 14 | `1110` | $G + 14P$ | +| 15 | `1111` | $G + 15P$ | + +**`BasicTable` column mapping (as stored in the proving polynomial):** + +$$\texttt{column\_1}[v] = v, \quad \texttt{column\_2}[v] = \mathcal{T}[v].x, \quad \texttt{column\_3}[v] = \mathcal{T}[v].y$$ + +Concretely, with affine coordinates $(x_v, y_v) = 
\mathcal{T}[v]$: + +| `column_1` (key) | `column_2` ($x$-coordinate) | `column_3` ($y$-coordinate) | +| ---------------- | --------------------------- | --------------------------- | +| 0 | $x_0 = G_x$ | $y_0 = G_y$ | +| 1 | $x_1$ | $y_1$ | +| 2 | $x_2$ | $y_2$ | +| $\vdots$ | $\vdots$ | $\vdots$ | +| 15 | $x_{15}$ | $y_{15}$ | + +**Plookup gate for a single read at index $v$ (witness $w_1$):** + +A single lookup gate constrains the triple $(w_1, w_2, w_3)$ to be a valid row $(v,\, x_v,\, y_v)$ of the table: + +$$w_1 = v, \quad w_2 = \texttt{column\_2}[v] = x_v, \quad w_3 = \texttt{column\_3}[v] = y_v$$ + +Gate selectors: $q_{\text{lookup}} = 1$, $q_3 = \texttt{table\_index}$, $q_2 = q_m = q_c = q_1 = q_4 = 0$ (step sizes all zero, indicating a standalone lookup with no chained accumulation). + +**Why $\mathcal{T}[0] = G \ne \mathcal{O}$:** The offset generator $G$ is a hash-to-curve output linearly independent of $P$, so $G \ne \mathcal{O}$ by construction. Even when a scalar slice $\sigma[k] = 0$ (which occurs for any scalar whose $k$-th 4-bit chunk is zero — e.g., $s = 16$ has $\sigma[0] = 0$), the table read returns $\mathcal{T}[0] = G \ne \mathcal{O}$, making `unconditional_add` safe. + +--- + +## 6. The Offset Generator Mechanism + +### 6.1 Why it is needed + +The implementation uses `unconditional_add` for all in-circuit additions, which requires that the +two operand points have **distinct** $x$-coordinates. Without offset generators, two failure modes arise: + +1. **Zero slice:** If $\sigma_j[k] = 0$ for some $j, k$, then $\mathcal{T}_j[0] = \mathcal{O}$ + (the point at infinity). Even with non-zero scalars, slices can be zero — e.g., + $s = 16$ has $\sigma[0] = 0$. +2. **Accumulator collision:** The rolling accumulator could coincidentally share an + $x$-coordinate with an upcoming table entry. 
+ +### 6.2 Offset generator set + +$N + 1$ linearly independent points are used: + +$$G_0, G_1, G_2, \ldots, G_N \in E(\mathbb{F}_q)$$ + +all distinct, hash-to-curve outputs linearly independent of every $P_j$. + +- $G_0$: initial accumulator value +- $G_{j+1}$: offset embedded in table $\mathcal{T}_j$ + +### 6.3 Tracking the total offset + +A **native** (non-circuit) parallel computation tracks the accumulated contribution of the offset +generators. Define the offset accumulator $\Delta$, initialised as: + +$$\Delta_{\text{init}} = G_0$$ + +In each round $i$ the same doublings and additions are applied to $\Delta$ as to the main +accumulator, but using the **offset generators** in place of the table reads: + +- **Doublings (rounds $i \ge 1$):** $\Delta \leftarrow 2^w \cdot \Delta$ (4 consecutive doublings $= \times 16$) +- **Additions:** $\Delta \leftarrow \Delta + G_{j+1}$ for each $j = 0, \ldots, N-1$ + +The closed-form value of $\Delta$ after the complete $R = 64$ rounds is derived below. + +--- + +## 7. The Straus Algorithm — Step by Step + +### 7.1 Initialisation + +$$A \leftarrow G_0, \qquad \Delta \leftarrow G_0$$ + +### 7.2 Main Loop + +For $i = 0, 1, \ldots, R-1$ (i.e., $64$ rounds): + +**Step 7.2a — Doublings (skip when $i = 0$):** + +If $i \ge 1$, perform $w = 4$ point doublings in-circuit: + +$$A \leftarrow 2^4 \cdot A = 16 \cdot A$$ + +and natively: + +$$\Delta \leftarrow 16 \cdot \Delta$$ + +**Step 7.2b — Table lookups and additions:** + +For each point index $j = 0, 1, \ldots, N-1$: + +1. Read the scalar slice for this round: $v = \sigma_j[R - 1 - i] = \sigma_j[63 - i]$ +2. Look up: $Q \leftarrow \mathcal{T}_j[v] = G_{j+1} + v \cdot P_j$ +3. Add in-circuit: $A \leftarrow A + Q$ +4. Update offset natively: $\Delta \leftarrow \Delta + G_{j+1}$ + +(For the variable-base case, step 3 uses a conditional safety check on $x$-coordinates unless the +`unconditional_add` flag is set.) 
+ +### 7.3 State at the end of round $i$ + +After completing round $i$ (both doublings and all $N$ additions), the accumulated value satisfies +the recurrence: + +$$A_0 = G_0 + \sum_{j=0}^{N-1} \mathcal{T}_j[\sigma_j[63]]$$ + +$$A_i = 16 \cdot A_{i-1} + \sum_{j=0}^{N-1} \mathcal{T}_j[\sigma_j[63-i]], \quad i \ge 1$$ + +Unrolling this recurrence over all 64 rounds yields: + +$$A_{63} = 16^{63} \cdot G_0 + \sum_{i=0}^{63} 16^{63-i} \cdot \sum_{j=0}^{N-1} \mathcal{T}_j[\sigma_j[63-i]]$$ + +Substituting $k = 63 - i$: + +$$A_{63} = 16^{63} \cdot G_0 + \sum_{j=0}^{N-1} \sum_{k=0}^{63} 16^{k} \cdot \mathcal{T}_j[\sigma_j[k]]$$ + +Expanding the table definition $\mathcal{T}_j[v] = G_{j+1} + v \cdot P_j$: + +$$A_{63} = 16^{63} \cdot G_0 + \sum_{j=0}^{N-1} \left[ G_{j+1} \cdot \sum_{k=0}^{63} 16^{k} + P_j \cdot \sum_{k=0}^{63} \sigma_j[k] \cdot 16^{k} \right]$$ + +Using the geometric sum $\displaystyle\sum_{k=0}^{63} 16^k = \frac{16^{64}-1}{15}$ and the scalar reconstruction identity $\displaystyle\sum_{k=0}^{63} \sigma_j[k] \cdot 16^k = s_j$: + +$$\boxed{A_{63} = 16^{63} \cdot G_0 + \sum_{j=0}^{N-1} G_{j+1} \cdot \frac{16^{64}-1}{15} + \sum_{j=0}^{N-1} s_j \cdot P_j}$$ + +### 7.4 Offset accumulator value + +Applying the same recurrence to $\Delta$: + +$$\Delta_0 = G_0 + \sum_{j=0}^{N-1} G_{j+1}$$ +$$\Delta_i = 16 \cdot \Delta_{i-1} + \sum_{j=0}^{N-1} G_{j+1}, \quad i \ge 1$$ + +This has the closed-form solution: + +$$\Delta_{63} = 16^{63} \cdot G_0 + \left(\sum_{j=0}^{N-1} G_{j+1}\right) \cdot \sum_{k=0}^{63} 16^{k} = 16^{63} \cdot G_0 + \left(\sum_{j=0}^{N-1} G_{j+1}\right) \cdot \frac{16^{64}-1}{15}$$ + +### 7.5 Cancellation + +Subtracting the offset: + +$$A_{63} - \Delta_{63} = \sum_{j=0}^{N-1} s_j \cdot P_j$$ + +This is the desired MSM result. $\square$ + +--- + +## 8. 
Outer Function: `batch_mul` / `fixed_batch_mul` + +The outer function partitions the $N$ input pairs $(P_j, s_j)$ into categories before calling the +internal algorithm: + +| Category | Condition | Treatment | +| ----------- | ----------------------------------------------------------- | ----------------------------------------------------------------------- | +| **Case 1** | $P_j$ constant **and** $s_j$ constant | Accumulate natively into `constant_acc` (0 gates) | +| **Case 2A** | $P_j$ is one of the two hardcoded generators, $s_j$ witness | Use `_fixed_base_batch_mul_internal` (precomputed plookup multi-tables) | +| **Case 2B** | $P_j$ constant (not a hardcoded generator), $s_j$ witness | Use `_variable_base_batch_mul_internal` with ROM tables | +| **Case 3** | $P_j$ witness | Use `_variable_base_batch_mul_internal` with ROM tables | + +`fixed_batch_mul` (new) handles **Case 2B** points using plookup `BasicTable`s instead of ROM +arrays, with an otherwise identical Straus computation. + +### 8.1 Result Assembly + +Let $C$ = `constant_acc` $= \sum_{\text{Case 1}} s_j \cdot P_j$ (constant, free). + +The internal function returns $(A_{63},\, \Delta_{63})$. The outer function computes: + +$$\text{Result} = A_{63} - (- C + \Delta_{63}) = A_{63} - \Delta_{63} + C = \sum_j s_j \cdot P_j + C$$ + +which is the full MSM over all $N$ pairs. + +The subtraction is executed as an `unconditional_add` with $-\Delta_{63} + C$ (a constant point) +when $C \ne \mathcal{O}$, or as a full `operator-` otherwise. + +--- + +## 9. Circuit Gate Cost (per Internal Call) + +The following counts assume $N$ points, $R = 64$ rounds, $w = 4$ bits. + +### 9.1 Scalar decomposition + +Each scalar $s_j$ contributes two `create_limbed_range_constraint` calls: + +- lo (128 bits, 32 slices of 4 bits): 32 range gates +- hi (126 bits, 32 slices, last is 2-bit): 32 range gates + +Total across $N$ scalars: $64N$ range-constraint gates. 
+ +### 9.2 Table construction + +**Variable-base ROM** (`straus_lookup_table`): for each point $P_j$: + +- 15 `unconditional_add` gates to populate $\mathcal{T}_j[1], \ldots, \mathcal{T}_j[15]$ +- 2 witness conversions (1 gate each) for $P_j$ and $G_{j+1}$ +- ROM finalisation: $O(T \log T)$ sorted-ROM gates per table + +Total construction: $\approx 17N + O(16N\log 16)$ gates. + +**Fixed-base Plookup** (`straus_plookup_table`): **0 gates**. Table data lives entirely in the +proving polynomial (not in the arithmetic trace). + +### 9.3 Main Straus loop + +| Operation | Count | Gate cost | +| ------------------------------------------------------------ | ---------------------------------- | --------------------------- | +| Doublings | $(R-1) \cdot w = 63 \cdot 4 = 252$ | 252 gates | +| Table reads (ROM or plookup) | $R \cdot N = 64N$ | $64N$ gates | +| `unconditional_add` | $R \cdot N = 64N$ | $64N$ gates | +| $x$-coord batch collision check (variable-base, witness pts) | 1 assertion | $\approx 2 \cdot 64N$ gates | + +Total Straus loop: $\approx 252 + 128N$ gates (plus collision check if applicable). + +### 9.4 Final offset subtraction + +1 group subtraction (or `unconditional_add` with negated constant): $\approx 2$–$5$ gates. + +### 9.5 Summary comparison (128-point MSM, constant base points) + +| Method | Table construction | ROM finalization | Straus loop | Total (approx) | +| --------------------------- | ------------------------------------ | --------------------- | ----------------------------------------------------- | ---------------- | +| `batch_mul` (ROM) | $\approx 17 \times 128 = 2176$ gates | $\sim 12{,}000$ gates | $\approx 252 + 128 \times 128 \approx 16{,}636$ gates | **41,201 gates** | +| `fixed_batch_mul` (plookup) | **0** | **0** | $\approx 252 + 128 \times 128 \approx 16{,}636$ gates | **26,083 gates** | + +The plookup approach eliminates all table-construction and ROM-finalization gates, reducing the +total by **~37%** for 128 points. 
At 32,768 SRS points (IPA), the absolute savings are
+proportionally larger.
+
+---
+
+## 10. Correctness of the Scalar Reconstruction
+
+**Claim:** $\displaystyle\sum_{k=0}^{63} \sigma_j[k] \cdot 16^k = s_j$ for every scalar $s_j \in [0, 2^{254})$.
+
+**Proof:**
+
+$$\sum_{k=0}^{63} \sigma_j[k] \cdot 16^k = \underbrace{\sum_{k=0}^{31} s_{\text{lo},k} \cdot 16^k}_{= s_{\text{lo}}} + \underbrace{\sum_{k=0}^{31} s_{\text{hi},k} \cdot 16^{k+32}}_{= s_{\text{hi}} \cdot 16^{32} = s_{\text{hi}} \cdot 2^{128}} = s_{\text{lo}} + 2^{128} \cdot s_{\text{hi}} = s_j \qquad \square$$
+
+**Range validity:** The `create_limbed_range_constraint` call on each limb simultaneously decomposes
+the limb into $w$-bit slices and proves in-circuit that each slice lies in $\{0,\ldots,2^w - 1\}$.
+The final (partial) slice of $s_{\text{hi}}$ has only 2 bits ($s_{\text{hi},31} \in \{0,1,2,3\}$),
+and its range constraint covers exactly those 2 bits; when used as an index into a 16-entry
+lookup table it therefore always reads a valid entry.