Barretenberg: src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp Source File

// === AUDIT STATUS ===

// internal:    { status: Planned, auditors: [Sergei], commit: }

// external_1:  { status: not started, auditors: [], commit: }

// external_2:  { status: not started, auditors: [], commit: }

// =====================

#include "barretenberg/common/assert.hpp"

#include "barretenberg/common/bb_bench.hpp"

#include "barretenberg/ecc/groups/precomputed_generators_bn254_impl.hpp"

#include "barretenberg/ecc/groups/precomputed_generators_grumpkin_impl.hpp"


#include "./process_buckets.hpp"

#include "./scalar_multiplication.hpp"

#include "barretenberg/common/thread.hpp"

#include "barretenberg/ecc/curves/bn254/bn254.hpp"

#include "barretenberg/ecc/curves/grumpkin/grumpkin.hpp"

#include "barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp"

#include "barretenberg/numeric/general/general.hpp"

#include "barretenberg/polynomials/polynomial.hpp"


#include "barretenberg/common/mem.hpp"

#include "barretenberg/numeric/bitop/get_msb.hpp"


namespace bb::scalar_multiplication {


// Naive double-and-add fallback for small inputs (< PIPPENGER_THRESHOLD points): every selected point is
// multiplied by its own scalar and the products are summed.
//
// Only the entries listed in msm_data.scalar_indices are visited. Each scalar is passed through
// to_montgomery_form() before the multiplication — presumably because the MSM pipeline has already taken
// the scalars out of Montgomery form while the point * scalar operator expects Montgomery input
// (NOTE(review): confirm against the operator's contract).
//
// Returns the point at infinity when scalar_indices is empty.
template <typename Curve> typename Curve::Element small_mul(const typename MSM<Curve>::MSMData& msm_data) noexcept
{
    using Element = typename Curve::Element;

    const auto& indices = msm_data.scalar_indices;
    const size_t num_terms = indices.size();

    Element sum = Curve::Group::point_at_infinity;
    for (size_t term = 0; term < num_terms; ++term) {
        const auto idx = indices[term];
        Element point = msm_data.points[idx];
        sum += point * msm_data.scalars[idx].to_montgomery_form();
    }
    return sum;
}


// Converts every scalar out of Montgomery form IN PLACE and gathers the indices of the scalars that are
// nonzero after conversion.
//
// scalars                 - mutated: each element has self_from_montgomery_form_reduced() applied.
// nonzero_scalar_indices  - output: resized to the number of nonzero scalars and overwritten. Indices are
//                           written in thread order (thread 0's indices first, then thread 1's, ...).
//
// Indices are stored as uint32_t, so positions are truncated by the static_cast if scalars.size() exceeds
// the uint32_t range.
template <typename Curve>
void MSM<Curve>::transform_scalar_and_get_nonzero_scalar_indices(std::span<typename Curve::ScalarField> scalars,
                                                                 std::vector<uint32_t>& nonzero_scalar_indices) noexcept
{
    // One private index list per thread, so pass 1 needs no synchronisation between threads.
    std::vector<std::vector<uint32_t>> thread_indices(get_num_cpus());

    // Pass 1: convert this thread's share of the scalars and remember which ones are nonzero.
    parallel_for([&](const ThreadChunk& chunk) {
        BB_BENCH_TRACY_NAME("MSM::convert_scalars");
        BB_ASSERT_EQ(chunk.total_threads, thread_indices.size());
        auto chunk_range = chunk.range(scalars.size());
        if (chunk_range.empty()) {
            return;
        }
        std::vector<uint32_t>& local_indices = thread_indices[chunk.thread_index];
        // Worst case every scalar in the chunk is nonzero.
        local_indices.reserve(chunk_range.size());
        for (size_t i : chunk_range) {
            BB_ASSERT_DEBUG(i < scalars.size());
            auto& scalar = scalars[i];
            scalar.self_from_montgomery_form_reduced();

            if (scalar.is_zero()) {
                continue;
            }
            local_indices.push_back(static_cast<uint32_t>(i));
        }
    });

    // Size the output to hold every thread's contribution.
    size_t total_nonzero = 0;
    for (const auto& per_thread : thread_indices) {
        total_nonzero += per_thread.size();
    }
    nonzero_scalar_indices.resize(total_nonzero);

    // Pass 2: each thread copies its own list into its slot of the output vector.
    parallel_for([&](const ThreadChunk& chunk) {
        BB_BENCH_TRACY_NAME("MSM::copy_indices");
        BB_ASSERT_EQ(chunk.total_threads, thread_indices.size());

        // This thread's slot starts after the indices of all lower-numbered threads.
        size_t write_pos = 0;
        for (size_t earlier = 0; earlier < chunk.thread_index; ++earlier) {
            write_pos += thread_indices[earlier].size();
        }

        const std::vector<uint32_t>& source = thread_indices[chunk.thread_index];
        for (const uint32_t scalar_index : source) {
            nonzero_scalar_indices[write_pos] = scalar_index;
            ++write_pos;
        }
    });
}


// Computes a per-scalar cost estimate used to balance work between threads.
//
// scalars          - scalar storage; only the entries named by nonzero_indices are read.
// nonzero_indices  - indices into scalars; weights[k] describes scalars[nonzero_indices[k]].
// bits_per_slice   - slice width used for the estimate; must be > 0.
// weights          - output: resized to nonzero_indices.size() and overwritten.
template <typename Curve>
void MSM<Curve>::compute_scalar_slice_weights(std::span<const typename Curve::ScalarField> scalars,
                                              std::span<const uint32_t> nonzero_indices,
                                              uint32_t bits_per_slice,
                                              std::vector<uint16_t>& weights) noexcept
{
    // weight = ceil(bit_length / bits_per_slice) + FIXED_PER_SCALAR_WEIGHT.
    // The constant term stands in for the O(num_rounds) per-scalar overhead in build_schedule,
    // sort_schedule, and reduce_buckets, which does not shrink for short scalars. Without it, threads
    // that receive many lightweight scalars end up with disproportionate build/sort/reduce work
    // (empirically observed via per-phase profiling).
    // The largest possible weight is ceil(NUM_BITS_IN_FIELD / 1) + FIXED_PER_SCALAR_WEIGHT.
    static constexpr uint16_t FIXED_PER_SCALAR_WEIGHT = 4;
    static_assert(NUM_BITS_IN_FIELD + FIXED_PER_SCALAR_WEIGHT <= std::numeric_limits<uint16_t>::max(),
                  "slice-count weight overflows uint16_t");
    BB_ASSERT_GT(bits_per_slice, 0U);

    const size_t num_weights = nonzero_indices.size();
    weights.resize(num_weights);

    parallel_for([&](const ThreadChunk& chunk) {
        for (size_t pos : chunk.range(num_weights)) {
            const auto& scalar = scalars[nonzero_indices[pos]];
            // The scalars were filtered for nonzero and are in non-Montgomery form, so get_msb()
            // yields a valid bit index in [0, NUM_BITS_IN_FIELD).
            const uint64_t msb = uint256_t{ scalar.data[0], scalar.data[1], scalar.data[2], scalar.data[3] }.get_msb();
            const size_t bit_length = static_cast<size_t>(msb) + 1;
            const size_t num_slices = (bit_length + bits_per_slice - 1) / bits_per_slice;
            weights[pos] = static_cast<uint16_t>(num_slices + FIXED_PER_SCALAR_WEIGHT);
        }
    });
}


// Splits the per-MSM scalar weights into num_threads lists of work units of roughly equal total weight.
//
// msm_scalar_weights - msm_scalar_weights[i][k] is the weight of the k-th (nonzero) scalar of MSM i.
// num_threads        - number of lists to produce; must be > 0.
//
// Returns a vector with exactly num_threads entries; entries may be empty. If the total weight is zero,
// every entry is empty.
//
// Scalars are walked in order. A work unit is closed and the walk moves on to the next thread as soon as
// the weight accumulated for the current thread reaches ceil(total_weight / num_threads). The last
// thread is never closed this way, so it receives everything that is left. A work unit never spans two
// MSMs: whatever remains of an MSM is handed to the current thread when that MSM ends, and the
// accumulated weight carries over into the next MSM.
template <typename Curve>
std::vector<typename MSM<Curve>::ThreadWorkUnits> MSM<Curve>::partition_by_weight(
    std::span<const std::vector<uint16_t>> msm_scalar_weights, size_t num_threads) noexcept
{
    BB_ASSERT_GT(num_threads, 0U);
    std::vector<ThreadWorkUnits> partitions(num_threads);

    size_t total_weight = 0;
    for (const auto& msm_weights : msm_scalar_weights) {
        for (const uint16_t weight : msm_weights) {
            total_weight += weight;
        }
    }
    if (total_weight == 0) {
        return partitions;
    }

    const size_t target_weight = numeric::ceil_div(total_weight, num_threads);
    const size_t last_thread = num_threads - 1;

    size_t thread = 0;
    size_t pending_weight = 0;

    // Appends the half-open scalar range [begin, end) of MSM `msm_idx` to the current thread's list.
    const auto emit_unit = [&](const size_t msm_idx, const size_t begin, const size_t end) {
        partitions[thread].push_back(MSMWorkUnit{
            .batch_msm_index = msm_idx,
            .start_index = begin,
            .size = end - begin,
        });
    };

    for (size_t msm_idx = 0; msm_idx < msm_scalar_weights.size(); ++msm_idx) {
        const auto& msm_weights = msm_scalar_weights[msm_idx];
        const size_t count = msm_weights.size();

        size_t unit_begin = 0;
        for (size_t pos = 0; pos < count; ++pos) {
            pending_weight += msm_weights[pos];

            // Keep accumulating while the budget is not reached, or once we are on the last thread.
            if (thread >= last_thread || pending_weight < target_weight) {
                continue;
            }

            emit_unit(msm_idx, unit_begin, pos + 1);
            unit_begin = pos + 1;
            ++thread;
            pending_weight = 0;
        }

        // Hand the unfinished tail of this MSM to the current thread.
        if (unit_begin < count) {
            emit_unit(msm_idx, unit_begin, count);
        }
    }
    return partitions;
}


// Prepares a batch of MSMs for multi-threaded evaluation and decides which thread handles which scalars.
//
// scalars             - one span per MSM. MUTATED: every scalar is converted out of Montgomery form by
//                       transform_scalar_and_get_nonzero_scalar_indices.
// msm_scalar_indices  - output: resized to scalars.size(); entry i receives the indices of the nonzero
//                       scalars of MSM i.
//
// Returns one list of work units per thread (get_num_cpus() entries). When there are fewer nonzero
// scalars in total than threads, every MSM is assigned in full to thread 0 and the other lists stay
// empty; otherwise the split is delegated to partition_by_weight.
template <typename Curve>
std::vector<typename MSM<Curve>::ThreadWorkUnits> MSM<Curve>::get_work_units(
    std::span<std::span<ScalarField>> scalars, std::vector<std::vector<uint32_t>>& msm_scalar_indices) noexcept
{
    const size_t batch_size = scalars.size();
    msm_scalar_indices.resize(batch_size);

    // Scalars are weighted by their Pippenger cost (slice count + fixed overhead, see
    // compute_scalar_slice_weights) so that threads receive comparable amounts of work.
    std::vector<std::vector<uint16_t>> weights_per_msm(batch_size);
    size_t nonzero_total = 0;
    for (size_t msm_idx = 0; msm_idx < batch_size; ++msm_idx) {
        transform_scalar_and_get_nonzero_scalar_indices(scalars[msm_idx], msm_scalar_indices[msm_idx]);
        const size_t nonzero_count = msm_scalar_indices[msm_idx].size();
        nonzero_total += nonzero_count;
        if (nonzero_count != 0) {
            // An MSM without nonzero scalars keeps an empty weight list.
            const uint32_t slice_bits = get_optimal_log_num_buckets(nonzero_count);
            compute_scalar_slice_weights(
                scalars[msm_idx], msm_scalar_indices[msm_idx], slice_bits, weights_per_msm[msm_idx]);
        }
    }

    const size_t num_threads = get_num_cpus();

    // Common case: at least one nonzero scalar per thread, so split by weight.
    if (num_threads <= nonzero_total) {
        return partition_by_weight(weights_per_msm, num_threads);
    }

    // Not enough work for every thread: give each MSM, whole, to the first thread.
    std::vector<ThreadWorkUnits> single_thread_units(num_threads);
    for (size_t msm_idx = 0; msm_idx < batch_size; ++msm_idx) {
        single_thread_units[0].push_back(MSMWorkUnit{
            .batch_msm_index = msm_idx,
            .start_index = 0,
            .size = msm_scalar_indices[msm_idx].size(),
        });
    }
    return single_thread_units;
}


// Extracts the bucket index (a window of up to slice_size bits) of `scalar` for Pippenger round `round`.
//
// scalar      - read through its 64-bit limbs scalar.data[]; callers in this file pass scalars that
//               have already been converted out of Montgomery form.
// round       - round number; round 0 addresses the most significant window.
// slice_size  - nominal window width in bits.
//
// The window for a round is [lo_bit, hi_bit) with hi_bit = NUM_BITS_IN_FIELD - round * slice_size.
// The lowest window is clamped at bit 0 and may therefore be narrower than slice_size.
// A window may straddle two adjacent limbs; both parts are stitched together.
template <typename Curve>
uint32_t MSM<Curve>::get_scalar_slice(const typename Curve::ScalarField& scalar,
                                      size_t round,
                                      size_t slice_size) noexcept
{
    constexpr size_t LIMB_BITS = 64;

    const size_t hi_bit = NUM_BITS_IN_FIELD - (round * slice_size);
    const size_t lo_bit = (hi_bit < slice_size) ? 0 : hi_bit - slice_size;

    BB_ASSERT_DEBUG(lo_bit < hi_bit);
    // Also catches unsigned wrap-around of hi_bit when round * slice_size exceeds NUM_BITS_IN_FIELD.
    // NOTE(review): the limb index below stays in range only if NUM_BITS_IN_FIELD < 256 for a
    // 4-limb scalar — confirm against the constant's definition.
    BB_ASSERT_DEBUG(hi_bit <= NUM_BITS_IN_FIELD);

    const size_t window_bits = hi_bit - lo_bit;
    const size_t first_limb = lo_bit / LIMB_BITS;
    const size_t last_limb = hi_bit / LIMB_BITS;
    const size_t shift_in_first_limb = lo_bit % LIMB_BITS;

    // Number of window bits available in the first limb; the rest (if any) come from the next limb.
    const size_t room_in_first_limb = LIMB_BITS - shift_in_first_limb;
    const size_t bits_from_first = (room_in_first_limb < window_bits) ? room_in_first_limb : window_bits;
    const size_t bits_from_last = window_bits - bits_from_first;

    const uint64_t first_mask = (1ULL << bits_from_first) - 1;
    const uint64_t low_part = (scalar.data[first_limb] >> shift_in_first_limb) & first_mask;

    // The upper limb is only touched when the two limb indices differ.
    uint64_t high_part = 0;
    if (first_limb != last_limb) {
        const uint64_t last_mask = (1ULL << bits_from_last) - 1;
        high_part = scalar.data[last_limb] & last_mask;
    }

    return static_cast<uint32_t>(low_part | (high_part << bits_from_first));
}


// Chooses the slice width (log2 of the bucket count) that minimises the estimated Pippenger cost for
// `num_points` points. Candidates are 1 .. MAX_SLICE_BITS - 1; on equal cost the smaller width wins.
template <typename Curve> uint32_t MSM<Curve>::get_optimal_log_num_buckets(const size_t num_points) noexcept
{
    // Cost model: total_cost = num_rounds * (num_points + num_buckets * BUCKET_ACCUMULATION_COST)
    const auto estimate_cost = [num_points](const uint32_t slice_bits) {
        const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, static_cast<size_t>(slice_bits));
        const size_t num_buckets = size_t{ 1 } << slice_bits;
        return num_rounds * (num_points + num_buckets * BUCKET_ACCUMULATION_COST);
    };

    uint32_t winner = 1;
    size_t winner_cost = estimate_cost(1);
    for (uint32_t candidate = 2; candidate < MAX_SLICE_BITS; ++candidate) {
        const size_t candidate_cost = estimate_cost(candidate);
        // Only a strictly cheaper candidate replaces the current winner.
        if (candidate_cost >= winner_cost) {
            continue;
        }
        winner = candidate;
        winner_cost = candidate_cost;
    }
    return winner;
}


// Decides whether the batched-affine addition path is expected to beat plain Jacobian additions.
//
// num_points   - number of points handled per Pippenger round.
// num_buckets  - number of buckets per round.
//
// Returns false outright below AFFINE_TRICK_THRESHOLD points; otherwise true iff the estimated
// per-round savings exceed the estimated per-round cost of the modular inversions.
template <typename Curve> bool MSM<Curve>::use_affine_trick(const size_t num_points, const size_t num_buckets) noexcept
{
    if (num_points < AFFINE_TRICK_THRESHOLD) {
        return false;
    }

    // Modular inversion by exponentiation is modelled as:
    //   NUM_BITS_IN_FIELD squarings
    // + (NUM_BITS_IN_FIELD + 3) / 4 multiplications (4-bit windows)
    // + INVERSION_TABLE_COST multiplications to build the lookup table.
    constexpr size_t COST_OF_INVERSION = NUM_BITS_IN_FIELD + ((NUM_BITS_IN_FIELD + 3) / 4) + INVERSION_TABLE_COST;

    // The affine trick needs log(N) inversions per Pippenger round ...
    const double inversions_per_round = log2(static_cast<double>(num_points));
    const double inversion_cost_per_round = inversions_per_round * static_cast<double>(COST_OF_INVERSION);

    // ... and saves AFFINE_TRICK_SAVINGS_PER_OP field muls per point, plus JACOBIAN_Z_NOT_ONE_PENALTY
    // field muls per bucket (buckets have Z = 1 with the affine trick).
    const size_t savings_per_round =
        (num_points * AFFINE_TRICK_SAVINGS_PER_OP) + (num_buckets * JACOBIAN_Z_NOT_ONE_PENALTY);

    return static_cast<double>(savings_per_round) > inversion_cost_per_round;
}


// Thin wrapper that forwards to bb::group_elements::batch_affine_add_interleaved.
//
// points         - buffer of points to add; its exact layout and where the sums are written are
//                  defined by batch_affine_add_interleaved.
// num_points     - number of entries of `points` to process.
// scratch_space  - caller-provided BaseField scratch buffer, forwarded unchanged.
template <typename Curve>
void MSM<Curve>::add_affine_points(typename Curve::AffineElement* points,
                                   const size_t num_points,
                                   typename Curve::BaseField* scratch_space) noexcept
{
    // The interleaved variant is Pippenger-specific: it prefetches directly and avoids the aliasing
    // overhead of the generic batch_affine_add_impl, where lhs_base == rhs_base makes the compiler
    // reload the lhs coordinates after each output write.
    bb::group_elements::batch_affine_add_interleaved<typename Curve::AffineElement, typename Curve::BaseField>(
        points, num_points, scratch_space);
}


// Pippenger MSM that accumulates buckets in Jacobian coordinates.
//
// msm_data - the scalars/points to combine; only entries listed in msm_data.scalar_indices are used.
//            The scalars are sliced with get_scalar_slice, so they are expected to be in the
//            transformed (non-Montgomery) representation produced earlier in the pipeline.
//
// Returns the accumulated result (point at infinity if there is nothing to add).
template <typename Curve>
typename Curve::Element MSM<Curve>::jacobian_pippenger_with_transformed_scalars(MSMData& msm_data) noexcept
{
    const size_t num_points = msm_data.scalar_indices.size();
    const uint32_t slice_bits = get_optimal_log_num_buckets(num_points);
    const size_t bucket_count = size_t{ 1 } << slice_bits;
    const uint32_t round_count = static_cast<uint32_t>((NUM_BITS_IN_FIELD + slice_bits - 1) / slice_bits);
    // Width of the final (lowest) window when NUM_BITS_IN_FIELD is not a multiple of slice_bits.
    const uint32_t tail_bits = NUM_BITS_IN_FIELD % slice_bits;

    JacobianBucketAccumulators accumulators(bucket_count);
    Element total = Curve::Group::point_at_infinity;

    for (uint32_t round = 0; round < round_count; ++round) {
        // Drop every point into the bucket selected by its scalar's window for this round.
        for (size_t i = 0; i < num_points; ++i) {
            const uint32_t scalar_idx = msm_data.scalar_indices[i];
            const uint32_t slot = get_scalar_slice(msm_data.scalars[scalar_idx], round, slice_bits);
            if (slot == 0) {
                // A zero window is skipped: bucket 0 is never populated.
                continue;
            }
            if (!accumulators.bucket_exists.get(slot)) {
                // First point for this bucket: overwrite whatever a previous round left behind.
                accumulators.buckets[slot] = msm_data.points[scalar_idx];
                accumulators.bucket_exists.set(slot, true);
                continue;
            }
            accumulators.buckets[slot] += msm_data.points[scalar_idx];
        }

        // Collapse the buckets into this round's contribution, then reset the occupancy flags.
        const Element round_sum = accumulate_buckets(accumulators);
        accumulators.bucket_exists.clear();

        // Shift the running result left by the width of the window just processed. Only the last
        // round can have a shorter window (tail_bits).
        const bool short_last_round = (round == round_count - 1) && (tail_bits != 0);
        const uint32_t doublings = short_last_round ? tail_bits : slice_bits;
        for (uint32_t d = 0; d < doublings; ++d) {
            total.self_dbl();
        }
        total += round_sum;
    }
    return total;
}


// Pippenger MSM that accumulates buckets with batched affine additions.
//
// msm_data - the scalars/points to combine; only entries listed in msm_data.scalar_indices are used.
//            msm_data.point_schedule is used as scratch and is overwritten every round; it must hold
//            at least scalar_indices.size() entries.
//
// Falls back to jacobian_pippenger_with_transformed_scalars when use_affine_trick() judges the affine
// path not worthwhile for this input size.
template <typename Curve>
typename Curve::Element MSM<Curve>::affine_pippenger_with_transformed_scalars(MSMData& msm_data) noexcept
{
    const size_t num_points = msm_data.scalar_indices.size();
    const uint32_t slice_bits = get_optimal_log_num_buckets(num_points);
    const size_t bucket_count = size_t{ 1 } << slice_bits;

    if (!use_affine_trick(num_points, bucket_count)) {
        return jacobian_pippenger_with_transformed_scalars(msm_data);
    }

    const uint32_t round_count = static_cast<uint32_t>((NUM_BITS_IN_FIELD + slice_bits - 1) / slice_bits);
    // Width of the final (lowest) window when NUM_BITS_IN_FIELD is not a multiple of slice_bits.
    const uint32_t tail_bits = NUM_BITS_IN_FIELD % slice_bits;

    // Allocated per call for WASM compatibility (thread_local causes issues in WASM).
    AffineAdditionData addition_scratch;
    BucketAccumulators accumulators(bucket_count);

    Element total = Curve::Group::point_at_infinity;

    for (uint32_t round = 0; round < round_count; ++round) {
        // Encode (point index, bucket index) for every point into the schedule buffer.
        for (size_t i = 0; i < num_points; ++i) {
            const uint32_t scalar_idx = msm_data.scalar_indices[i];
            const uint32_t slot = get_scalar_slice(msm_data.scalars[scalar_idx], round, slice_bits);
            msm_data.point_schedule[i] = PointScheduleEntry::create(scalar_idx, slot).data;
        }

        // Sort by bucket; the helper also reports how many entries landed in bucket zero.
        const size_t zero_bucket_entries =
            sort_point_schedule_and_count_zero_buckets(&msm_data.point_schedule[0], num_points, slice_bits);
        const size_t active_entries = num_points - zero_bucket_entries;

        // Accumulate the remaining entries; the first zero_bucket_entries schedule slots are skipped.
        Element round_sum = Curve::Group::point_at_infinity;
        if (active_entries > 0) {
            std::span<uint64_t> active_schedule(&msm_data.point_schedule[zero_bucket_entries], active_entries);
            batch_accumulate_points_into_buckets(active_schedule, msm_data.points, addition_scratch, accumulators);
            round_sum = accumulate_buckets(accumulators);
            accumulators.bucket_exists.clear();
        }

        // Shift the running result left by the width of the window just processed. Only the last
        // round can have a shorter window (tail_bits).
        const bool short_last_round = (round == round_count - 1) && (tail_bits != 0);
        const uint32_t doublings = short_last_round ? tail_bits : slice_bits;
        for (uint32_t d = 0; d < doublings; ++d) {
            total.self_dbl();
        }
        total += round_sum;
    }

    return total;
}


// Processes a point schedule into bucket accumulators using batched affine additions.
//
// point_schedule - packed schedule entries, decoded through PointScheduleEntry (point_index() and
//                  bucket_index()). The caller in this file passes a schedule that is sorted by bucket
//                  and has the zero-bucket entries stripped.
// points         - point storage indexed by PointScheduleEntry::point_index().
// affine_data    - scratch state: points_to_add (pending additions and their outputs),
//                  inversion_scratch_space, and addition_result_bucket_destinations.
// bucket_data    - bucket accumulators, updated through process_bucket_pair / process_single_point.
//
// NOTE(review): point_it, scratch_it, new_scratch_it and output_it are never modified directly in this
// function; the loops terminate only if process_bucket_pair / process_single_point advance them
// (presumably taken by reference) — confirm against those helpers.
template <typename Curve>


void MSM<Curve>::batch_accumulate_points_into_buckets(std::span<const uint64_t> point_schedule,

                                                      std::span<const typename Curve::AffineElement> points,

                                                      MSM<Curve>::AffineAdditionData& affine_data,

                                                      MSM<Curve>::BucketAccumulators& bucket_data) noexcept

{


    // Nothing to do; this also guarantees num_points - 1 below cannot wrap around.
    if (point_schedule.empty()) {

        return;

    }


    size_t point_it = 0;

    size_t scratch_it = 0;

    const size_t num_points = point_schedule.size();

    // Prefetching stops PREFETCH_LOOKAHEAD entries before the end so point_schedule[point_it + i] stays in bounds.
    const size_t prefetch_max = (num_points >= PREFETCH_LOOKAHEAD) ? (num_points - PREFETCH_LOOKAHEAD) : 0;

    const size_t last_index = num_points - 1;


    // Iterative loop - continues until all points processed and no work remains in scratch space
    // (scratch_it != 0 means the previous pass recirculated addition outputs that still need adding)

    while (point_it < num_points || scratch_it != 0) {

        // Step 1: Fill scratch space with up to BATCH_SIZE/2 independent additions
        // The loop stops one entry before the end of the schedule because it reads entries in pairs.

        while (((scratch_it + 1) < AffineAdditionData::BATCH_SIZE) && (point_it < last_index)) {

            // Prefetch points we'll need soon (every PREFETCH_INTERVAL iterations)

            if ((point_it < prefetch_max) && ((point_it & PREFETCH_INTERVAL_MASK) == 0)) {

                // Only the second half of the lookahead window is prefetched here.
                for (size_t i = PREFETCH_LOOKAHEAD / 2; i < PREFETCH_LOOKAHEAD; ++i) {

                    PointScheduleEntry entry{ point_schedule[point_it + i] };

                    __builtin_prefetch(&points[entry.point_index()]);

                }

            }


            PointScheduleEntry lhs{ point_schedule[point_it] };

            PointScheduleEntry rhs{ point_schedule[point_it + 1] };


            process_bucket_pair(lhs.bucket_index(),

                                rhs.bucket_index(),

                                &points[lhs.point_index()],

                                &points[rhs.point_index()],

                                affine_data,

                                bucket_data,

                                scratch_it,

                                point_it);

        }


        // Handle the last point (odd count case) - separate to avoid bounds check on point_schedule[point_it + 1]

        if (point_it == last_index) {

            PointScheduleEntry last{ point_schedule[point_it] };

            process_single_point(

                last.bucket_index(), &points[last.point_index()], affine_data, bucket_data, scratch_it, point_it);

        }


        // Compute independent additions using Montgomery's batch inversion trick
        // (skipped when fewer than two points are queued, i.e. there is no pair to add)

        size_t num_points_to_add = scratch_it;

        if (num_points_to_add >= 2) {

            add_affine_points(

                affine_data.points_to_add.data(), num_points_to_add, affine_data.inversion_scratch_space.data());

        }


        // add_affine_points stores results in the top-half of scratch space

        AffineElement* affine_output = affine_data.points_to_add.data() + (num_points_to_add / 2);


        // Recirculate addition outputs back into scratch space or bucket accumulators
        // Each output's target bucket is read from addition_result_bucket_destinations.

        size_t new_scratch_it = 0;

        size_t output_it = 0;

        size_t num_outputs = num_points_to_add / 2;


        while ((num_outputs > 1) && (output_it + 1 < num_outputs)) {

            uint32_t lhs_bucket = affine_data.addition_result_bucket_destinations[output_it];

            uint32_t rhs_bucket = affine_data.addition_result_bucket_destinations[output_it + 1];


            process_bucket_pair(lhs_bucket,

                                rhs_bucket,

                                &affine_output[output_it],

                                &affine_output[output_it + 1],

                                affine_data,

                                bucket_data,

                                new_scratch_it,

                                output_it);

        }


        // Handle the last output (odd count case)

        if (num_outputs > 0 && output_it == num_outputs - 1) {

            uint32_t bucket = affine_data.addition_result_bucket_destinations[output_it];

            process_single_point(

                bucket, &affine_output[output_it], affine_data, bucket_data, new_scratch_it, output_it);

        }


        // Continue with recirculated points

        scratch_it = new_scratch_it;

    }

}


// Computes several MSMs at once, balancing the work across all threads.
//
// points             - one span of points per MSM.
// scalars            - one span of scalars per MSM, same count as `points`. The scalars are converted
//                      out of Montgomery form for the computation and converted back before returning.
// handle_edge_cases  - true selects the Jacobian Pippenger (safe, handles edge cases); false selects
//                      the affine Pippenger (faster, assumes linearly independent points).
//
// Returns one affine point per MSM, in the same order as the inputs.
template <typename Curve>
std::vector<typename Curve::AffineElement> MSM<Curve>::batch_multi_scalar_mul(
    std::span<std::span<const typename Curve::AffineElement>> points,
    std::span<std::span<ScalarField>> scalars,
    bool handle_edge_cases) noexcept
{
    BB_BENCH_NAME("MSM::batch_multi_scalar_mul");
    BB_ASSERT_EQ(points.size(), scalars.size());
    const size_t num_msms = points.size();

    // get_work_units transforms the scalars in place and collects the nonzero-scalar indices per MSM.
    std::vector<std::vector<uint32_t>> msm_scalar_indices;
    std::vector<ThreadWorkUnits> thread_work_units = get_work_units(scalars, msm_scalar_indices);
    const size_t num_cpus = get_num_cpus();
    // Per thread: (partial result, index of the MSM it belongs to).
    std::vector<std::vector<std::pair<Element, size_t>>> thread_msm_results(num_cpus);
    BB_ASSERT_EQ(thread_work_units.size(), num_cpus);

    // Pick the Pippenger variant once, outside the hot loop.
    auto pippenger_impl =
        handle_edge_cases ? jacobian_pippenger_with_transformed_scalars : affine_pippenger_with_transformed_scalars;

    // With the work units fixed, every thread evaluates its own units independently.
    {
        BB_BENCH_NAME("MSM::batch_multi_scalar_mul/evaluate_work_units");
        parallel_for(num_cpus, [&](size_t thread_idx) {
            BB_BENCH_TRACY_NAME("MSM::evaluate_work_units");
            const std::vector<MSMWorkUnit>& assigned_units = thread_work_units[thread_idx];
            if (assigned_units.empty()) {
                return;
            }

            std::vector<std::pair<Element, size_t>>& partial_results = thread_msm_results[thread_idx];
            partial_results.reserve(assigned_units.size());

            // One schedule buffer per thread, reused across work units to avoid repeated allocation.
            std::vector<uint64_t> schedule_storage;

            for (const MSMWorkUnit& unit : assigned_units) {
                schedule_storage.resize(unit.size);
                MSMData unit_data =
                    MSMData::from_work_unit(scalars, points, msm_scalar_indices, schedule_storage, unit);

                // Small units use the naive fallback instead of Pippenger.
                const bool below_threshold = unit.size < PIPPENGER_THRESHOLD;
                Element unit_result = below_threshold ? small_mul<Curve>(unit_data) : pippenger_impl(unit_data);

                partial_results.emplace_back(unit_result, unit.batch_msm_index);
            }
        });
    }

    // Sum the partial results per MSM. Single-threaded, but negligible in practice.
    // Benchmarked (192-core, 256 threads): ~512us for 2^16 MSM (~1.2% of total), ~207us for 2^20 (<0.1%).
    std::vector<Element> results(num_msms, Curve::Group::point_at_infinity);
    {
        BB_BENCH_NAME("MSM::batch_multi_scalar_mul/accumulate_results");
        for (const auto& partials : thread_msm_results) {
            for (const auto& partial : partials) {
                results[partial.second] += partial.first;
            }
        }
    }
    {
        BB_BENCH_NAME("MSM::batch_multi_scalar_mul/batch_normalize");
        Element::batch_normalize(results.data(), num_msms);
    }

    // Restore the scalars to Montgomery form so the caller sees them unchanged.
    {
        BB_BENCH_NAME("MSM::batch_multi_scalar_mul/scalars_to_montgomery");
        for (auto& scalar_span : scalars) {
            parallel_for_range(scalar_span.size(), [&](size_t start, size_t end) {
                BB_BENCH_TRACY_NAME("MSM::scalars_to_montgomery/chunk");
                for (size_t pos = start; pos < end; ++pos) {
                    scalar_span[pos].self_to_montgomery_form();
                }
            });
        }
    }

    return std::vector<AffineElement>(results.begin(), results.end());
}


// Single-MSM entry point: wraps the input into a batch of size one and delegates to
// batch_multi_scalar_mul, which takes care of multi-threading.
//
// points             - point storage; must hold at least scalars.start_index + scalars.size() points.
//                      Points are consumed starting at offset scalars.start_index.
// scalars            - scalar span. Although declared const, the underlying storage is temporarily
//                      modified (see the const_cast note below) and restored before returning.
// handle_edge_cases  - forwarded to batch_multi_scalar_mul.
//
// Returns the affine point at infinity for an empty scalar span.
template <typename Curve>
typename Curve::AffineElement MSM<Curve>::msm(std::span<const typename Curve::AffineElement> points,
                                              PolynomialSpan<const ScalarField> scalars,
                                              bool handle_edge_cases) noexcept
{
    const size_t num_scalars = scalars.size();
    if (num_scalars == 0) {
        return Curve::Group::affine_point_at_infinity;
    }
    BB_ASSERT_GTE(points.size(), scalars.start_index + num_scalars);

    // const_cast is safe: the scalars are converted from Montgomery form, used, then converted back,
    // so they are unchanged from the caller's perspective.
    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
    ScalarField* scalar_ptr = const_cast<ScalarField*>(&scalars[scalars.start_index]);

    // Build the size-1 batch. The points are offset by start_index so they line up with the scalars.
    std::array<std::span<ScalarField>, 1> scalars_batch{ std::span<ScalarField>(scalar_ptr, num_scalars) };
    std::array<std::span<const AffineElement>, 1> points_batch{ points.subspan(scalars.start_index) };

    const auto batch_results =
        batch_multi_scalar_mul(std::span(points_batch), std::span(scalars_batch), handle_edge_cases);
    return batch_results[0];
}


template <typename Curve>


typename Curve::Element pippenger(PolynomialSpan<const typename Curve::ScalarField> scalars,

                                  std::span<const typename Curve::AffineElement> points,

                                  [[maybe_unused]] bool handle_edge_cases) noexcept

{

    return MSM<Curve>::msm(points, scalars, handle_edge_cases);

}


// Free-function convenience wrapper around MSM<Curve>::msm with handle_edge_cases fixed to false.
// In this file that selects the affine Pippenger path (faster, assumes linearly independent points).
//
// scalars - scalars of the multi-scalar multiplication.
// points  - points of the multi-scalar multiplication.
//
// Note the argument order: this wrapper takes (scalars, points) while MSM<Curve>::msm takes
// (points, scalars). The result of msm is converted to Curve::Element on return.
template <typename Curve>


typename Curve::Element pippenger_unsafe(PolynomialSpan<const typename Curve::ScalarField> scalars,

                                         std::span<const typename Curve::AffineElement> points) noexcept

{

    return MSM<Curve>::msm(points, scalars, false);

}


// Explicit instantiations of the free-function entry points (pippenger / pippenger_unsafe) for the
// Grumpkin and BN254 curves.
// NOTE(review): only the first directive repeats `noexcept`, although both templates are declared
// noexcept above, and two directives spell a `= true` default argument. Consider making the four
// directives consistent — confirm what the supported compilers accept before changing.
template curve::Grumpkin::Element pippenger<curve::Grumpkin>(PolynomialSpan<const curve::Grumpkin::ScalarField> scalars,

                                                             std::span<const curve::Grumpkin::AffineElement> points,

                                                             bool handle_edge_cases = true) noexcept;


template curve::Grumpkin::Element pippenger_unsafe<curve::Grumpkin>(

    PolynomialSpan<const curve::Grumpkin::ScalarField> scalars, std::span<const curve::Grumpkin::AffineElement> points);


template curve::BN254::Element pippenger<curve::BN254>(PolynomialSpan<const curve::BN254::ScalarField> scalars,

                                                       std::span<const curve::BN254::AffineElement> points,

                                                       bool handle_edge_cases = true);


template curve::BN254::Element pippenger_unsafe<curve::BN254>(PolynomialSpan<const curve::BN254::ScalarField> scalars,

                                                              std::span<const curve::BN254::AffineElement> points);


} // namespace bb::scalar_multiplication


// Explicit instantiations of the MSM class template for both supported curves. They sit outside the
// bb::scalar_multiplication namespace, hence the fully qualified names.
template class bb::scalar_multiplication::MSM<bb::curve::Grumpkin>;

template class bb::scalar_multiplication::MSM<bb::curve::BN254>;

assert.hpp

BB_ASSERT_GTE
#define BB_ASSERT_GTE(left, right,...)
Definition assert.hpp:128

BB_ASSERT_GT
#define BB_ASSERT_GT(left, right,...)
Definition assert.hpp:113

BB_ASSERT_DEBUG
#define BB_ASSERT_DEBUG(expression,...)
Definition assert.hpp:55

BB_ASSERT_EQ
#define BB_ASSERT_EQ(actual, expected,...)
Definition assert.hpp:83

bb_bench.hpp

BB_BENCH_NAME
#define BB_BENCH_NAME(name)
Definition bb_bench.hpp:264

BB_BENCH_TRACY_NAME
#define BB_BENCH_TRACY_NAME(name)
Definition bb_bench.hpp:256

BitVector::get
BB_INLINE bool get(size_t index) const noexcept
Definition bitvector.hpp:44

BitVector::set
BB_INLINE void set(size_t index, bool value) noexcept
Definition bitvector.hpp:30

BitVector::clear
void clear()
Definition bitvector.hpp:52

bb::curve::Grumpkin::Element
typename Group::element Element
Definition grumpkin.hpp:63

bb::curve::Grumpkin::BaseField
bb::fr BaseField
Definition grumpkin.hpp:61

bb::curve::Grumpkin::AffineElement
typename Group::affine_element AffineElement
Definition grumpkin.hpp:64

bb::numeric::uint256_t
Definition uint256.hpp:32

bb::numeric::uint256_t::data
uint64_t data[4]
Definition uint256.hpp:219

bb::scalar_multiplication::MSM
Definition scalar_multiplication.hpp:19

bb::scalar_multiplication::MSM::BaseField
typename Curve::BaseField BaseField
Definition scalar_multiplication.hpp:23

bb::scalar_multiplication::MSM::use_affine_trick
static bool use_affine_trick(size_t num_points, size_t num_buckets) noexcept
Decide if batch inversion saves work vs Jacobian additions.
Definition scalar_multiplication.cpp:274

bb::scalar_multiplication::MSM::jacobian_pippenger_with_transformed_scalars
static Element jacobian_pippenger_with_transformed_scalars(MSMData &msm_data) noexcept
Pippenger using Jacobian buckets (handles edge cases: doubling, infinity)
Definition scalar_multiplication.cpp:312

bb::scalar_multiplication::MSM::get_scalar_slice
static uint32_t get_scalar_slice(const ScalarField &scalar, size_t round, size_t slice_size) noexcept
Extract c-bit slice from scalar for bucket index computation.
Definition scalar_multiplication.cpp:227

bb::scalar_multiplication::MSM::partition_by_weight
static std::vector< ThreadWorkUnits > partition_by_weight(std::span< const std::vector< uint16_t > > msm_scalar_weights, size_t num_threads) noexcept
Partition per-MSM scalar weights into num_threads work units of approximately equal cumulative weight...
Definition scalar_multiplication.cpp:121

bb::scalar_multiplication::MSM::affine_pippenger_with_transformed_scalars
static Element affine_pippenger_with_transformed_scalars(MSMData &msm_data) noexcept
Pippenger using affine buckets with batch inversion (faster, no edge case handling)
Definition scalar_multiplication.cpp:352

bb::scalar_multiplication::MSM::compute_scalar_slice_weights
static void compute_scalar_slice_weights(std::span< const ScalarField > scalars, std::span< const uint32_t > nonzero_indices, uint32_t bits_per_slice, std::vector< uint16_t > &weights) noexcept
Compute per-scalar slice-count weights ceil(bit_length / bits_per_slice).
Definition scalar_multiplication.cpp:89

bb::scalar_multiplication::MSM::add_affine_points
static void add_affine_points(AffineElement *points, const size_t num_points, typename Curve::BaseField *scratch_space) noexcept
Batch add n/2 independent point pairs using Montgomery's trick.
Definition scalar_multiplication.cpp:298

bb::scalar_multiplication::MSM::get_work_units
static std::vector< ThreadWorkUnits > get_work_units(std::span< std::span< ScalarField > > scalars, std::vector< std::vector< uint32_t > > &msm_scalar_indices) noexcept
Distribute multiple MSMs across threads with balanced bucket-accumulation work.
Definition scalar_multiplication.cpp:172

bb::scalar_multiplication::MSM::Element
typename Curve::Element Element
Definition scalar_multiplication.hpp:21

bb::scalar_multiplication::MSM::get_optimal_log_num_buckets
static uint32_t get_optimal_log_num_buckets(size_t num_points) noexcept
Compute the optimal number of bits per slice, c, by minimizing cost over c in [1, MAX_SLICE_BITS).
Definition scalar_multiplication.cpp:253

bb::scalar_multiplication::MSM::batch_multi_scalar_mul
static std::vector< AffineElement > batch_multi_scalar_mul(std::span< std::span< const AffineElement > > points, std::span< std::span< ScalarField > > scalars, bool handle_edge_cases=true) noexcept
Compute multiple MSMs in parallel with work balancing.
Definition scalar_multiplication.cpp:497

bb::scalar_multiplication::MSM::batch_accumulate_points_into_buckets
static void batch_accumulate_points_into_buckets(std::span< const uint64_t > point_schedule, std::span< const AffineElement > points, AffineAdditionData &affine_data, BucketAccumulators &bucket_data) noexcept
Process sorted point schedule into bucket accumulators using batched affine additions.
Definition scalar_multiplication.cpp:407

bb::scalar_multiplication::MSM::ScalarField
typename Curve::ScalarField ScalarField
Definition scalar_multiplication.hpp:22

bb::scalar_multiplication::MSM::AffineElement
typename Curve::AffineElement AffineElement
Definition scalar_multiplication.hpp:24

bb::scalar_multiplication::MSM::transform_scalar_and_get_nonzero_scalar_indices
static void transform_scalar_and_get_nonzero_scalar_indices(std::span< ScalarField > scalars, std::vector< uint32_t > &nonzero_scalar_indices) noexcept
Convert scalars from Montgomery form and collect indices of nonzero scalars.
Definition scalar_multiplication.cpp:42

VariableRefMutationOptions::index
@ index

bn254.hpp

grumpkin.hpp

Element
bb::curve::BN254::Element Element
Definition eccvm.fuzzer.cpp:21

offset
ssize_t offset
Definition engine.cpp:62

BN254
@ BN254
Definition fuzzer_constants.hpp:5

general.hpp

get_msb.hpp

mem.hpp

bb::numeric::ceil_div
constexpr T ceil_div(const T &numerator, const T &denominator)
Computes the ceiling of the division of two integral types.
Definition general.hpp:23

bb::scalar_multiplication
Definition process_buckets.cpp:11

bb::scalar_multiplication::small_mul
Curve::Element small_mul(const typename MSM< Curve >::MSMData &msm_data) noexcept
Definition scalar_multiplication.cpp:26

bb::scalar_multiplication::pippenger
Curve::Element pippenger(PolynomialSpan< const typename Curve::ScalarField > scalars, std::span< const typename Curve::AffineElement > points, bool handle_edge_cases) noexcept
Safe MSM wrapper (defaults to handle_edge_cases=true)
Definition scalar_multiplication.cpp:601

bb::scalar_multiplication::sort_point_schedule_and_count_zero_buckets
size_t sort_point_schedule_and_count_zero_buckets(uint64_t *point_schedule, const size_t num_entries, const uint32_t bucket_index_bits) noexcept
Sort point schedule by bucket index and count zero-bucket entries.
Definition process_buckets.cpp:83

bb::scalar_multiplication::pippenger_unsafe
Curve::Element pippenger_unsafe(PolynomialSpan< const typename Curve::ScalarField > scalars, std::span< const typename Curve::AffineElement > points) noexcept
Fast MSM wrapper for linearly independent points (no edge case handling)
Definition scalar_multiplication.cpp:609

bb
Entry point for Barretenberg command-line interface.
Definition api.hpp:5

bb::get_num_cpus
size_t get_num_cpus()
Definition thread.cpp:33

bb::parallel_for
void parallel_for(size_t num_iterations, const std::function< void(size_t)> &func)
Definition thread.cpp:111

bb::parallel_for_range
void parallel_for_range(size_t num_points, const std::function< void(size_t, size_t)> &func, size_t no_multhreading_if_less_or_equal)
Split a loop into several loops running in parallel.
Definition thread.cpp:141

std
STL namespace.

std::get
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
Definition tuple.hpp:13

polynomial.hpp

precomputed_generators_bn254_impl.hpp

precomputed_generators_grumpkin_impl.hpp

process_buckets.hpp

scalar_multiplication.hpp

bb::PolynomialSpan
Definition polynomial.hpp:27

bb::ThreadChunk
Definition thread.hpp:149

bb::ThreadChunk::total_threads
size_t total_threads
Definition thread.hpp:151

bb::ThreadChunk::thread_index
size_t thread_index
Definition thread.hpp:150

bb::ThreadChunk::range
auto range(size_t size, size_t offset=0) const
Definition thread.hpp:152

bb::field< Bn254FqParams >

bb::scalar_multiplication::MSM::AffineAdditionData
Scratch space for batched affine point additions (one per thread)
Definition scalar_multiplication.hpp:171

bb::scalar_multiplication::MSM::BucketAccumulators
Affine bucket accumulators for the fast affine-trick Pippenger variant.
Definition scalar_multiplication.hpp:142

bb::scalar_multiplication::MSM::BucketAccumulators::bucket_exists
BitVector bucket_exists
Definition scalar_multiplication.hpp:144

bb::scalar_multiplication::MSM::JacobianBucketAccumulators
Jacobian bucket accumulators for the safe Pippenger variant.
Definition scalar_multiplication.hpp:159

bb::scalar_multiplication::MSM::JacobianBucketAccumulators::bucket_exists
BitVector bucket_exists
Definition scalar_multiplication.hpp:161

bb::scalar_multiplication::MSM::JacobianBucketAccumulators::buckets
std::vector< Element > buckets
Definition scalar_multiplication.hpp:160

bb::scalar_multiplication::MSM::MSMData
Container for MSM input data passed between algorithm stages.
Definition scalar_multiplication.hpp:105

bb::scalar_multiplication::MSM::MSMWorkUnit
MSMWorkUnit describes an MSM that may be part of a larger MSM.
Definition scalar_multiplication.hpp:94

bb::scalar_multiplication::MSM::MSMWorkUnit::batch_msm_index
size_t batch_msm_index
Definition scalar_multiplication.hpp:95

bb::scalar_multiplication::MSM::PointScheduleEntry
Packed point schedule entry: (point_index << 32) | bucket_index.
Definition scalar_multiplication.hpp:191

thread.hpp