Barretenberg
The ZK-SNARK library at the core of Aztec
Loading...
Searching...
No Matches
scalar_multiplication.hpp
Go to the documentation of this file.
1// === AUDIT STATUS ===
2// internal: { status: Planned, auditors: [Sergei], commit: }
3// external_1: { status: not started, auditors: [], commit: }
4// external_2: { status: not started, auditors: [], commit: }
5// =====================
6
7#pragma once
10
14
15#include "./bitvector.hpp"
16#include "./process_buckets.hpp"
18
19template <typename Curve> class MSM {
20 public:
21 using Element = typename Curve::Element;
23 using BaseField = typename Curve::BaseField;
25
26 static constexpr size_t NUM_BITS_IN_FIELD = ScalarField::modulus.get_msb() + 1;
27
28 // ======================= Algorithm Tuning Constants =======================
29 //
30 // These constants control the behavior of the Pippenger MSM algorithm.
31 // They are empirically tuned for performance on typical hardware.
32
33 // Below this threshold, use naive scalar multiplication instead of Pippenger
34 static constexpr size_t PIPPENGER_THRESHOLD = 16;
35
36 // Below this threshold, the affine batch inversion trick is not beneficial
37 // (cost of inversions exceeds savings from cheaper affine additions)
38 static constexpr size_t AFFINE_TRICK_THRESHOLD = 128;
39
40 // Maximum bits per scalar slice (2^20 = 1M buckets, far beyond practical use)
41 static constexpr size_t MAX_SLICE_BITS = 20;
42 static_assert(MAX_SLICE_BITS < 64,
43 "get_scalar_slice uses 1ULL << lo_slice_bits where lo_slice_bits <= MAX_SLICE_BITS - 1; "
44 "shifting uint64_t by >= 64 is UB.");
45
46 // Number of points to look ahead for memory prefetching
47 static constexpr size_t PREFETCH_LOOKAHEAD = 32;
48
49 // Prefetch every N iterations (must be power of 2); mask is N-1 for efficient modulo
50 static constexpr size_t PREFETCH_INTERVAL = 16;
51 static constexpr size_t PREFETCH_INTERVAL_MASK = PREFETCH_INTERVAL - 1;
52
53 // ======================= Cost Model Constants =======================
54 //
55 // These constants define the relative costs of various operations,
56 // used to decide between algorithm variants.
57
58 // Cost of bucket accumulation relative to a single point addition
59 // (2 Jacobian adds per bucket, each ~2.5x cost of affine add)
60 static constexpr size_t BUCKET_ACCUMULATION_COST = 5;
61
62 // Field multiplications saved per group operation when using affine trick
63 static constexpr size_t AFFINE_TRICK_SAVINGS_PER_OP = 5;
64
65 // Extra cost of Jacobian group operation when Z coordinate != 1
66 static constexpr size_t JACOBIAN_Z_NOT_ONE_PENALTY = 5;
67
68 // Cost of computing 4-bit lookup table for modular exponentiation (14 muls)
69 static constexpr size_t INVERSION_TABLE_COST = 14;
70 // ===========================================================================
71
72 // Offset generator used in bucket reduction to probabilistically avoid incomplete-addition
73 // edge cases in the accumulator. Derived from domain-separated precomputed generators.
75 {
76 static const AffineElement offset_generator = []() {
78 return get_precomputed_generators<typename Curve::Group, "ECCVM_OFFSET_GENERATOR", 1>()[0];
79 } else {
80 return get_precomputed_generators<typename Curve::Group, "DEFAULT_DOMAIN_SEPARATOR", 8>()[0];
81 }
82 }();
83 return offset_generator;
84 }
85
94 struct MSMWorkUnit {
95 size_t batch_msm_index = 0;
96 size_t start_index = 0;
97 size_t size = 0;
98 };
100
105 struct MSMData {
106 std::span<const ScalarField> scalars; // Scalars (non-Montgomery form)
107 std::span<const AffineElement> points; // Input points
108 std::span<const uint32_t> scalar_indices; // Indices of nonzero scalars
109 std::span<uint64_t> point_schedule; // Scratch space for point scheduling
110
115 static MSMData from_work_unit(std::span<std::span<ScalarField>> all_scalars,
116 std::span<std::span<const AffineElement>> all_points,
117 const std::vector<std::vector<uint32_t>>& all_indices,
118 std::span<uint64_t> point_schedule_buffer,
119 const MSMWorkUnit& work_unit) noexcept
120 {
121 const auto& indices = all_indices[work_unit.batch_msm_index];
122 // Avoid indexing into an empty vector when all scalars are zero (work_unit.size == 0)
123 std::span<const uint32_t> scalar_indices =
124 work_unit.size > 0 ? std::span<const uint32_t>{ &indices[work_unit.start_index], work_unit.size }
125 : std::span<const uint32_t>{};
126 return MSMData{
127 .scalars = all_scalars[work_unit.batch_msm_index],
128 .points = all_points[work_unit.batch_msm_index],
129 .scalar_indices = scalar_indices,
130 .point_schedule = point_schedule_buffer,
131 };
132 }
133 };
134
143 std::vector<AffineElement> buckets;
145
146 BucketAccumulators(size_t num_buckets) noexcept
147 : buckets(num_buckets)
148 , bucket_exists(num_buckets)
149 {}
150 };
151
160 std::vector<Element> buckets;
162
164 : buckets(num_buckets)
165 , bucket_exists(num_buckets)
166 {}
167 };
172 static constexpr size_t BATCH_SIZE = 2048;
173 // when adding affine points, we have an edge case where the number of points in the batch can overflow by 2
174 static constexpr size_t BATCH_OVERFLOW_SIZE = 2;
175 std::vector<AffineElement> points_to_add;
176 std::vector<BaseField> inversion_scratch_space; // Used for Montgomery batch inversion denominators
178 AffineElement null_location{}; // Dummy write target for branchless conditional moves
179
185 };
186
192 uint64_t data;
193
194 [[nodiscard]] static constexpr PointScheduleEntry create(uint32_t point_index, uint32_t bucket_index) noexcept
195 {
196 return { (static_cast<uint64_t>(point_index) << 32) | bucket_index };
197 }
198 [[nodiscard]] constexpr uint32_t point_index() const noexcept { return static_cast<uint32_t>(data >> 32); }
199 [[nodiscard]] constexpr uint32_t bucket_index() const noexcept { return static_cast<uint32_t>(data); }
200 };
201
202 // ======================= Public Methods =======================
203 // See README.md for algorithm details and mathematical derivations.
204
210 static AffineElement msm(std::span<const AffineElement> points,
212 bool handle_edge_cases = false) noexcept;
213
219 static std::vector<AffineElement> batch_multi_scalar_mul(std::span<std::span<const AffineElement>> points,
220 std::span<std::span<ScalarField>> scalars,
221 bool handle_edge_cases = true) noexcept;
222
223 // ======================= Test-Visible Methods =======================
224 // Exposed for unit testing; not part of the public API.
225
226 static uint32_t get_num_rounds(size_t num_points) noexcept
227 {
228 const uint32_t bits_per_slice = get_optimal_log_num_buckets(num_points);
229 return static_cast<uint32_t>((NUM_BITS_IN_FIELD + bits_per_slice - 1) / bits_per_slice);
230 }
231
233 static void add_affine_points(AffineElement* points,
234 const size_t num_points,
235 typename Curve::BaseField* scratch_space) noexcept;
236
238 static uint32_t get_scalar_slice(const ScalarField& scalar, size_t round, size_t slice_size) noexcept;
239
241 static uint32_t get_optimal_log_num_buckets(size_t num_points) noexcept;
242
248 static std::vector<ThreadWorkUnits> partition_by_weight(std::span<const std::vector<uint16_t>> msm_scalar_weights,
249 size_t num_threads) noexcept;
250
253 std::span<const AffineElement> points,
254 AffineAdditionData& affine_data,
255 BucketAccumulators& bucket_data) noexcept;
256
258 template <typename BucketType> static Element accumulate_buckets(BucketType& bucket_accumulators) noexcept
259 {
260 auto& buckets = bucket_accumulators.buckets;
261 BB_ASSERT_DEBUG(buckets.size() > static_cast<size_t>(0));
262 int starting_index = static_cast<int>(buckets.size() - 1);
263 Element running_sum;
264 bool found_start = false;
265 while (!found_start && starting_index > 0) {
266 const size_t idx = static_cast<size_t>(starting_index);
267 if (bucket_accumulators.bucket_exists.get(idx)) {
268
269 running_sum = buckets[idx];
270 found_start = true;
271 } else {
272 starting_index -= 1;
273 }
274 }
275 if (!found_start) {
276 return Curve::Group::point_at_infinity;
277 }
278 BB_ASSERT_DEBUG(starting_index > 0);
279 const auto& offset_generator = get_offset_generator();
280 Element sum = running_sum + offset_generator;
281 for (int i = starting_index - 1; i > 0; --i) {
282 size_t idx = static_cast<size_t>(i);
283 BB_ASSERT_DEBUG(idx < bucket_accumulators.bucket_exists.size());
284 if (bucket_accumulators.bucket_exists.get(idx)) {
285 running_sum += buckets[idx];
286 }
287 sum += running_sum;
288 }
289 return sum - offset_generator;
290 }
291
292 private:
293 // ======================= Private Implementation =======================
294
297 std::vector<uint32_t>& nonzero_scalar_indices) noexcept;
298
303 static void compute_scalar_slice_weights(std::span<const ScalarField> scalars,
304 std::span<const uint32_t> nonzero_indices,
305 uint32_t bits_per_slice,
306 std::vector<uint16_t>& weights) noexcept;
307
314 std::vector<std::vector<uint32_t>>& msm_scalar_indices) noexcept;
315
317 static bool use_affine_trick(size_t num_points, size_t num_buckets) noexcept;
318
320 static Element jacobian_pippenger_with_transformed_scalars(MSMData& msm_data) noexcept;
321
323 static Element affine_pippenger_with_transformed_scalars(MSMData& msm_data) noexcept;
324
325 // Helpers for batch_accumulate_points_into_buckets. Inlined for performance.
326
327 // Process single point: if bucket has accumulator, pair them for addition; else cache in bucket.
328 __attribute__((always_inline)) static void process_single_point(size_t bucket,
332 size_t& scratch_it,
333 size_t& point_it) noexcept
334 {
335 bool has_accumulator = bucket_data.bucket_exists.get(bucket);
336 if (has_accumulator) {
338 affine_data.points_to_add[scratch_it + 1] = bucket_data.buckets[bucket];
339 bucket_data.bucket_exists.set(bucket, false);
340 affine_data.addition_result_bucket_destinations[scratch_it >> 1] = static_cast<uint32_t>(bucket);
341 scratch_it += 2;
342 } else {
343 bucket_data.buckets[bucket] = *point_source;
344 bucket_data.bucket_exists.set(bucket, true);
345 }
347 }
348
349 // Branchless bucket pair processing. Updates point_it (by 2 if same bucket, else 1) and scratch_it.
350 // See README.md "batch_accumulate_points_into_buckets Algorithm" for case analysis.
351 __attribute__((always_inline)) static void process_bucket_pair(size_t lhs_bucket,
357 size_t& scratch_it,
358 size_t& point_it) noexcept
359 {
360 bool has_bucket_accumulator = bucket_data.bucket_exists.get(lhs_bucket);
361 bool buckets_match = lhs_bucket == rhs_bucket;
362 bool do_affine_add = buckets_match || has_bucket_accumulator;
363
365
370
372 dest_bucket = do_affine_add ? static_cast<uint32_t>(lhs_bucket) : dest_bucket;
373
376
377 bucket_data.bucket_exists.set(lhs_bucket, (has_bucket_accumulator && buckets_match) || !do_affine_add);
379 point_it += (do_affine_add && buckets_match) ? 2 : 1;
380 }
381};
382
384template <typename Curve>
387 bool handle_edge_cases = true) noexcept;
388
390template <typename Curve>
391typename Curve::Element pippenger_unsafe(PolynomialSpan<const typename Curve::ScalarField> scalars,
392 std::span<const typename Curve::AffineElement> points) noexcept;
393
394extern template class MSM<curve::Grumpkin>;
395extern template class MSM<curve::BN254>;
396
397} // namespace bb::scalar_multiplication
#define BB_ASSERT_DEBUG(expression,...)
Definition assert.hpp:55
Custom class to handle packed vectors of bits.
Definition bitvector.hpp:23
typename Group::element Element
Definition grumpkin.hpp:63
typename grumpkin::g1 Group
Definition grumpkin.hpp:62
typename Group::affine_element AffineElement
Definition grumpkin.hpp:64
const AffineElement AffineAdditionData BucketAccumulators size_t & scratch_it
typename Curve::BaseField BaseField
static constexpr size_t BUCKET_ACCUMULATION_COST
static bool use_affine_trick(size_t num_points, size_t num_buckets) noexcept
Decide if batch inversion saves work vs Jacobian additions.
static Element jacobian_pippenger_with_transformed_scalars(MSMData &msm_data) noexcept
Pippenger using Jacobian buckets (handles edge cases: doubling, infinity)
__attribute__((always_inline)) static void process_single_point(size_t bucket
static Element accumulate_buckets(BucketType &bucket_accumulators) noexcept
Reduce buckets to single point using running (suffix) sum from high to low: R = sum(k * B_k)
static constexpr size_t INVERSION_TABLE_COST
static uint32_t get_scalar_slice(const ScalarField &scalar, size_t round, size_t slice_size) noexcept
Extract c-bit slice from scalar for bucket index computation.
static constexpr size_t PREFETCH_INTERVAL
static std::vector< ThreadWorkUnits > partition_by_weight(std::span< const std::vector< uint16_t > > msm_scalar_weights, size_t num_threads) noexcept
Partition per-MSM scalar weights into num_threads work units of approximately equal cumulative weight...
static constexpr size_t PREFETCH_INTERVAL_MASK
const AffineElement AffineAdditionData & affine_data
static const AffineElement & get_offset_generator() noexcept
static AffineElement msm(std::span< const AffineElement > points, PolynomialSpan< const ScalarField > scalars, bool handle_edge_cases=false) noexcept
Main entry point for single MSM computation.
static Element affine_pippenger_with_transformed_scalars(MSMData &msm_data) noexcept
Pippenger using affine buckets with batch inversion (faster, no edge case handling)
static constexpr size_t AFFINE_TRICK_SAVINGS_PER_OP
static void compute_scalar_slice_weights(std::span< const ScalarField > scalars, std::span< const uint32_t > nonzero_indices, uint32_t bits_per_slice, std::vector< uint16_t > &weights) noexcept
Compute per-scalar slice-count weights ceil(bit_length / bits_per_slice).
size_t const AffineElement const AffineElement * rhs_source_if_match
static void add_affine_points(AffineElement *points, const size_t num_points, typename Curve::BaseField *scratch_space) noexcept
Batch add n/2 independent point pairs using Montgomery's trick.
const AffineElement AffineAdditionData BucketAccumulators & bucket_data
static constexpr size_t PREFETCH_LOOKAHEAD
static constexpr size_t PIPPENGER_THRESHOLD
static uint32_t get_num_rounds(size_t num_points) noexcept
static constexpr size_t JACOBIAN_Z_NOT_ONE_PENALTY
__attribute__((always_inline)) static void process_bucket_pair(size_t lhs_bucket
std::vector< MSMWorkUnit > ThreadWorkUnits
static std::vector< ThreadWorkUnits > get_work_units(std::span< std::span< ScalarField > > scalars, std::vector< std::vector< uint32_t > > &msm_scalar_indices) noexcept
Distribute multiple MSMs across threads with balanced bucket-accumulation work.
size_t const AffineElement * lhs_source
static constexpr size_t NUM_BITS_IN_FIELD
static uint32_t get_optimal_log_num_buckets(size_t num_points) noexcept
Compute optimal bits per slice by minimizing cost over c in [1, MAX_SLICE_BITS)
static std::vector< AffineElement > batch_multi_scalar_mul(std::span< std::span< const AffineElement > > points, std::span< std::span< ScalarField > > scalars, bool handle_edge_cases=true) noexcept
Compute multiple MSMs in parallel with work balancing.
static constexpr size_t AFFINE_TRICK_THRESHOLD
static void batch_accumulate_points_into_buckets(std::span< const uint64_t > point_schedule, std::span< const AffineElement > points, AffineAdditionData &affine_data, BucketAccumulators &bucket_data) noexcept
Process sorted point schedule into bucket accumulators using batched affine additions.
typename Curve::ScalarField ScalarField
const AffineElement AffineAdditionData BucketAccumulators size_t size_t &point_it noexcept
typename Curve::AffineElement AffineElement
static void transform_scalar_and_get_nonzero_scalar_indices(std::span< ScalarField > scalars, std::vector< uint32_t > &nonzero_scalar_indices) noexcept
Convert scalars from Montgomery form and collect indices of nonzero scalars.
bb::curve::BN254::Element Element
Curve::Element pippenger(PolynomialSpan< const typename Curve::ScalarField > scalars, std::span< const typename Curve::AffineElement > points, bool handle_edge_cases) noexcept
Safe MSM wrapper (defaults to handle_edge_cases=true)
Curve::Element pippenger_unsafe(PolynomialSpan< const typename Curve::ScalarField > scalars, std::span< const typename Curve::AffineElement > points) noexcept
Fast MSM wrapper for linearly independent points (no edge case handling)
Inner sum(Cont< Inner, Args... > const &in)
Definition container.hpp:70
constexpr std::span< const typename Group::affine_element > get_precomputed_generators()
@ BN254
Definition types.hpp:10
STL namespace.
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
Definition tuple.hpp:13
Scratch space for batched affine point additions (one per thread)
Affine bucket accumulators for the fast affine-trick Pippenger variant.
Jacobian bucket accumulators for the safe Pippenger variant.
Container for MSM input data passed between algorithm stages.
static MSMData from_work_unit(std::span< std::span< ScalarField > > all_scalars, std::span< std::span< const AffineElement > > all_points, const std::vector< std::vector< uint32_t > > &all_indices, std::span< uint64_t > point_schedule_buffer, const MSMWorkUnit &work_unit) noexcept
Factory method to construct MSMData from a work unit.
MSMWorkUnit describes an MSM that may be part of a larger MSM.
Packed point schedule entry: (point_index << 32) | bucket_index.
constexpr uint32_t point_index() const noexcept
uint64_t data
constexpr uint32_t bucket_index() const noexcept
static constexpr PointScheduleEntry create(uint32_t point_index, uint32_t bucket_index) noexcept