Barretenberg
The ZK-SNARK library at the core of Aztec
Loading...
Searching...
No Matches
scalar_multiplication.cpp
Go to the documentation of this file.
1// === AUDIT STATUS ===
2// internal: { status: Planned, auditors: [Sergei], commit: }
3// external_1: { status: not started, auditors: [], commit: }
4// external_2: { status: not started, auditors: [], commit: }
5// =====================
10
11#include "./process_buckets.hpp"
19
22
24
25// Naive double-and-add fallback for small inputs (< PIPPENGER_THRESHOLD points).
26template <typename Curve> typename Curve::Element small_mul(const typename MSM<Curve>::MSMData& msm_data) noexcept
27{
28 const auto& scalars = msm_data.scalars;
29 const auto& points = msm_data.points;
30 const auto& scalar_indices = msm_data.scalar_indices;
31 const size_t range = scalar_indices.size();
32
33 typename Curve::Element r = Curve::Group::point_at_infinity;
34 for (size_t i = 0; i < range; ++i) {
35 typename Curve::Element f = points[scalar_indices[i]];
36 r += f * scalars[scalar_indices[i]].to_montgomery_form();
37 }
38 return r;
39}
40
41template <typename Curve>
43 std::vector<uint32_t>& nonzero_scalar_indices) noexcept
44{
46
47 // Pass 1: Each thread converts from Montgomery and collects nonzero indices into its own vector
48 parallel_for([&](const ThreadChunk& chunk) {
49 BB_BENCH_TRACY_NAME("MSM::convert_scalars");
50 BB_ASSERT_EQ(chunk.total_threads, thread_indices.size());
51 auto range = chunk.range(scalars.size());
52 if (range.empty()) {
53 return;
54 }
55 std::vector<uint32_t>& thread_scalar_indices = thread_indices[chunk.thread_index];
56 thread_scalar_indices.reserve(range.size());
57 for (size_t i : range) {
58 BB_ASSERT_DEBUG(i < scalars.size());
59 auto& scalar = scalars[i];
60 scalar.self_from_montgomery_form_reduced();
61
62 if (!scalar.is_zero()) {
63 thread_scalar_indices.push_back(static_cast<uint32_t>(i));
64 }
65 }
66 });
67
68 size_t num_entries = 0;
69 for (const auto& indices : thread_indices) {
70 num_entries += indices.size();
71 }
72 nonzero_scalar_indices.resize(num_entries);
73
74 // Pass 2: Copy each thread's indices to the output vector (no branching)
75 parallel_for([&](const ThreadChunk& chunk) {
76 BB_BENCH_TRACY_NAME("MSM::copy_indices");
77 BB_ASSERT_EQ(chunk.total_threads, thread_indices.size());
78 size_t offset = 0;
79 for (size_t i = 0; i < chunk.thread_index; ++i) {
80 offset += thread_indices[i].size();
81 }
82 for (size_t i = offset; i < offset + thread_indices[chunk.thread_index].size(); ++i) {
83 nonzero_scalar_indices[i] = thread_indices[chunk.thread_index][i - offset];
84 }
85 });
86}
87
88template <typename Curve>
90 std::span<const uint32_t> nonzero_indices,
91 uint32_t bits_per_slice,
92 std::vector<uint16_t>& weights) noexcept
93{
94 // weight = ceil(bit_length / bps) + FIXED_PER_SCALAR_WEIGHT. The fixed term approximates the
95 // O(num_rounds) per-scalar overhead in build_schedule, sort_schedule, and reduce_buckets that
96 // doesn't scale with bit_length. Without it, threads assigned many lightweight scalars end up
97 // with disproportionate build/sort/reduce work (empirically observed via per-phase profiling).
98 // Max is ceil(NUM_BITS_IN_FIELD / 1) + FIXED.
99 static constexpr uint16_t FIXED_PER_SCALAR_WEIGHT = 4;
100 static_assert(NUM_BITS_IN_FIELD + FIXED_PER_SCALAR_WEIGHT <= std::numeric_limits<uint16_t>::max(),
101 "slice-count weight overflows uint16_t");
102 BB_ASSERT_GT(bits_per_slice, 0U);
103
104 const size_t n = nonzero_indices.size();
105 weights.resize(n);
106
107 parallel_for([&](const ThreadChunk& chunk) {
108 for (size_t k : chunk.range(n)) {
109 const auto& scalar = scalars[nonzero_indices[k]];
110 // Scalars were filtered for nonzero and are in non-Montgomery form, so get_msb()
111 // returns a valid bit index in [0, NUM_BITS_IN_FIELD).
112 const uint64_t msb = uint256_t{ scalar.data[0], scalar.data[1], scalar.data[2], scalar.data[3] }.get_msb();
113 const size_t bit_length = static_cast<size_t>(msb) + 1;
114 weights[k] =
115 static_cast<uint16_t>((bit_length + bits_per_slice - 1) / bits_per_slice) + FIXED_PER_SCALAR_WEIGHT;
116 }
117 });
118}
119
120template <typename Curve>
122 std::span<const std::vector<uint16_t>> msm_scalar_weights, size_t num_threads) noexcept
123{
124 BB_ASSERT_GT(num_threads, 0U);
125 std::vector<ThreadWorkUnits> work_units(num_threads);
126
127 size_t grand_total_weight = 0;
128 for (const auto& weights : msm_scalar_weights) {
129 for (uint16_t w : weights) {
130 grand_total_weight += w;
131 }
132 }
133 if (grand_total_weight == 0) {
134 return work_units;
135 }
136
137 const size_t weight_per_thread = numeric::ceil_div(grand_total_weight, num_threads);
138
139 size_t thread_accumulated_weight = 0;
140 size_t current_thread_idx = 0;
141 for (size_t i = 0; i < msm_scalar_weights.size(); ++i) {
142 const auto& weights = msm_scalar_weights[i];
143 const size_t n = weights.size();
144
145 size_t start = 0;
146 for (size_t k = 0; k < n; ++k) {
147 thread_accumulated_weight += weights[k];
148
149 if (current_thread_idx < num_threads - 1 && thread_accumulated_weight >= weight_per_thread) {
150 work_units[current_thread_idx].push_back(MSMWorkUnit{
151 .batch_msm_index = i,
152 .start_index = start,
153 .size = k + 1 - start,
154 });
155 start = k + 1;
156 current_thread_idx++;
157 thread_accumulated_weight = 0;
158 }
159 }
160 if (start < n) {
161 work_units[current_thread_idx].push_back(MSMWorkUnit{
162 .batch_msm_index = i,
163 .start_index = start,
164 .size = n - start,
165 });
166 }
167 }
168 return work_units;
169}
170
171template <typename Curve>
173 std::span<std::span<ScalarField>> scalars, std::vector<std::vector<uint32_t>>& msm_scalar_indices) noexcept
174{
175 const size_t num_msms = scalars.size();
176 msm_scalar_indices.resize(num_msms);
177
178 // Weight scalars by their Pippenger cost (slice count + fixed overhead, see
179 // compute_scalar_slice_weights) to improve thread balancing.
180 std::vector<std::vector<uint16_t>> msm_scalar_weights(num_msms);
181 size_t total_work = 0;
182 for (size_t i = 0; i < num_msms; ++i) {
183 transform_scalar_and_get_nonzero_scalar_indices(scalars[i], msm_scalar_indices[i]);
184 const size_t n = msm_scalar_indices[i].size();
185 total_work += n;
186 if (n == 0) {
187 continue;
188 }
189 const uint32_t bps = get_optimal_log_num_buckets(n);
190 compute_scalar_slice_weights(scalars[i], msm_scalar_indices[i], bps, msm_scalar_weights[i]);
191 }
192
193 const size_t num_threads = get_num_cpus();
194
195 // Only use a single work unit if we don't have enough work for every thread
196 if (num_threads > total_work) {
197 std::vector<ThreadWorkUnits> work_units(num_threads);
198 for (size_t i = 0; i < num_msms; ++i) {
199 work_units[0].push_back(MSMWorkUnit{
200 .batch_msm_index = i,
201 .start_index = 0,
202 .size = msm_scalar_indices[i].size(),
203 });
204 }
205 return work_units;
206 }
207
208 return partition_by_weight(msm_scalar_weights, num_threads);
209}
210
226template <typename Curve>
227uint32_t MSM<Curve>::get_scalar_slice(const typename Curve::ScalarField& scalar,
228 size_t round,
229 size_t slice_size) noexcept
230{
231 constexpr size_t LIMB_BITS = 64;
232
233 size_t hi_bit = NUM_BITS_IN_FIELD - (round * slice_size);
234 size_t lo_bit = (hi_bit < slice_size) ? 0 : hi_bit - slice_size;
235
236 BB_ASSERT_DEBUG(lo_bit < hi_bit);
237 BB_ASSERT_DEBUG(hi_bit <= NUM_BITS_IN_FIELD); // Ensures hi_bit < 256, so end_limb <= 3
238
239 size_t start_limb = lo_bit / LIMB_BITS;
240 size_t end_limb = hi_bit / LIMB_BITS;
241 size_t lo_slice_offset = lo_bit & (LIMB_BITS - 1);
242 size_t actual_slice_size = hi_bit - lo_bit;
243 size_t lo_slice_bits =
244 (LIMB_BITS - lo_slice_offset < actual_slice_size) ? (LIMB_BITS - lo_slice_offset) : actual_slice_size;
245 size_t hi_slice_bits = actual_slice_size - lo_slice_bits;
246
247 uint64_t lo_slice = (scalar.data[start_limb] >> lo_slice_offset) & ((1ULL << lo_slice_bits) - 1);
248 uint64_t hi_slice = (start_limb != end_limb) ? (scalar.data[end_limb] & ((1ULL << hi_slice_bits) - 1)) : 0;
249
250 return static_cast<uint32_t>(lo_slice | (hi_slice << lo_slice_bits));
251}
252
253template <typename Curve> uint32_t MSM<Curve>::get_optimal_log_num_buckets(const size_t num_points) noexcept
254{
255 // Cost model: total_cost = num_rounds * (num_points + num_buckets * BUCKET_ACCUMULATION_COST)
256 auto compute_cost = [&](uint32_t bits) {
257 size_t rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, static_cast<size_t>(bits));
258 size_t buckets = size_t{ 1 } << bits;
259 return rounds * (num_points + buckets * BUCKET_ACCUMULATION_COST);
260 };
261
262 uint32_t best_bits = 1;
263 size_t best_cost = compute_cost(1);
264 for (uint32_t bits = 2; bits < MAX_SLICE_BITS; ++bits) {
265 size_t cost = compute_cost(bits);
266 if (cost < best_cost) {
267 best_cost = cost;
268 best_bits = bits;
269 }
270 }
271 return best_bits;
272}
273
274template <typename Curve> bool MSM<Curve>::use_affine_trick(const size_t num_points, const size_t num_buckets) noexcept
275{
276 if (num_points < AFFINE_TRICK_THRESHOLD) {
277 return false;
278 }
279
280 // Affine trick requires log(N) modular inversions per Pippenger round.
281 // It saves num_points * AFFINE_TRICK_SAVINGS_PER_OP field muls, plus
282 // num_buckets * JACOBIAN_Z_NOT_ONE_PENALTY field muls (buckets have Z=1 with affine trick)
283
284 // Cost of modular inversion via exponentiation:
285 // - NUM_BITS_IN_FIELD squarings
286 // - (NUM_BITS_IN_FIELD + 3) / 4 multiplications (4-bit windows)
287 // - INVERSION_TABLE_COST multiplications for lookup table
288 constexpr size_t COST_OF_INVERSION = NUM_BITS_IN_FIELD + ((NUM_BITS_IN_FIELD + 3) / 4) + INVERSION_TABLE_COST;
289
290 double log2_num_points = log2(static_cast<double>(num_points));
291 size_t savings_per_round = (num_points * AFFINE_TRICK_SAVINGS_PER_OP) + (num_buckets * JACOBIAN_Z_NOT_ONE_PENALTY);
292 double inversion_cost_per_round = log2_num_points * static_cast<double>(COST_OF_INVERSION);
293
294 return static_cast<double>(savings_per_round) > inversion_cost_per_round;
295}
296
297template <typename Curve>
299 const size_t num_points,
300 typename Curve::BaseField* scratch_space) noexcept
301{
302 using AffineElement = typename Curve::AffineElement;
303 using BaseField = typename Curve::BaseField;
304
305 // Pippenger-specific interleaved batch add with direct prefetch and no aliasing overhead.
306 // The generic batch_affine_add_impl suffers from aliasing (lhs_base == rhs_base) causing
307 // the compiler to reload lhs coordinates after writing output. This version avoids that.
308 bb::group_elements::batch_affine_add_interleaved<AffineElement, BaseField>(points, num_points, scratch_space);
309}
310
311template <typename Curve>
313{
314 const size_t size = msm_data.scalar_indices.size();
315 const uint32_t bits_per_slice = get_optimal_log_num_buckets(size);
316 const size_t num_buckets = size_t{ 1 } << bits_per_slice;
317 const uint32_t num_rounds = static_cast<uint32_t>((NUM_BITS_IN_FIELD + bits_per_slice - 1) / bits_per_slice);
318 const uint32_t remainder = NUM_BITS_IN_FIELD % bits_per_slice;
319
320 JacobianBucketAccumulators bucket_data(num_buckets);
321 Element msm_result = Curve::Group::point_at_infinity;
322
323 for (uint32_t round = 0; round < num_rounds; ++round) {
324 // Populate buckets using Jacobian accumulation
325 for (size_t i = 0; i < size; ++i) {
326 uint32_t idx = msm_data.scalar_indices[i];
327 uint32_t bucket = get_scalar_slice(msm_data.scalars[idx], round, bits_per_slice);
328 if (bucket > 0) {
329 if (bucket_data.bucket_exists.get(bucket)) {
330 bucket_data.buckets[bucket] += msm_data.points[idx];
331 } else {
332 bucket_data.buckets[bucket] = msm_data.points[idx];
333 bucket_data.bucket_exists.set(bucket, true);
334 }
335 }
336 }
337
338 // Reduce buckets and accumulate into result
339 Element bucket_result = accumulate_buckets(bucket_data);
340 bucket_data.bucket_exists.clear();
341
342 uint32_t num_doublings = (round == num_rounds - 1 && remainder != 0) ? remainder : bits_per_slice;
343 for (uint32_t i = 0; i < num_doublings; ++i) {
344 msm_result.self_dbl();
345 }
346 msm_result += bucket_result;
347 }
348 return msm_result;
349}
350
351template <typename Curve>
353{
354 const size_t num_points = msm_data.scalar_indices.size();
355 const uint32_t bits_per_slice = get_optimal_log_num_buckets(num_points);
356 const size_t num_buckets = size_t{ 1 } << bits_per_slice;
357
358 if (!use_affine_trick(num_points, num_buckets)) {
359 return jacobian_pippenger_with_transformed_scalars(msm_data);
360 }
361
362 const uint32_t num_rounds = static_cast<uint32_t>((NUM_BITS_IN_FIELD + bits_per_slice - 1) / bits_per_slice);
363 const uint32_t remainder = NUM_BITS_IN_FIELD % bits_per_slice;
364
365 // Per-call allocation for WASM compatibility (thread_local causes issues in WASM)
366 AffineAdditionData affine_data;
367 BucketAccumulators bucket_data(num_buckets);
368
369 Element msm_result = Curve::Group::point_at_infinity;
370
371 for (uint32_t round = 0; round < num_rounds; ++round) {
372 // Build point schedule for this round
373 {
374 for (size_t i = 0; i < num_points; ++i) {
375 uint32_t idx = msm_data.scalar_indices[i];
376 uint32_t bucket_idx = get_scalar_slice(msm_data.scalars[idx], round, bits_per_slice);
377 msm_data.point_schedule[i] = PointScheduleEntry::create(idx, bucket_idx).data;
378 }
379 }
380
381 // Sort by bucket and count zero-bucket entries
382 size_t num_zero_bucket_entries =
383 sort_point_schedule_and_count_zero_buckets(&msm_data.point_schedule[0], num_points, bits_per_slice);
384 size_t round_size = num_points - num_zero_bucket_entries;
385
386 // Accumulate points into buckets
387 Element bucket_result = Curve::Group::point_at_infinity;
388 if (round_size > 0) {
389 std::span<uint64_t> schedule(&msm_data.point_schedule[num_zero_bucket_entries], round_size);
390 batch_accumulate_points_into_buckets(schedule, msm_data.points, affine_data, bucket_data);
391 bucket_result = accumulate_buckets(bucket_data);
392 bucket_data.bucket_exists.clear();
393 }
394
395 // Combine into running result
396 uint32_t num_doublings = (round == num_rounds - 1 && remainder != 0) ? remainder : bits_per_slice;
397 for (uint32_t i = 0; i < num_doublings; ++i) {
398 msm_result.self_dbl();
399 }
400 msm_result += bucket_result;
401 }
402
403 return msm_result;
404}
405
406template <typename Curve>
410 MSM<Curve>::BucketAccumulators& bucket_data) noexcept
411{
412
413 if (point_schedule.empty()) {
414 return;
415 }
416
417 size_t point_it = 0;
418 size_t scratch_it = 0;
419 const size_t num_points = point_schedule.size();
420 const size_t prefetch_max = (num_points >= PREFETCH_LOOKAHEAD) ? (num_points - PREFETCH_LOOKAHEAD) : 0;
421 const size_t last_index = num_points - 1;
422
423 // Iterative loop - continues until all points processed and no work remains in scratch space
424 while (point_it < num_points || scratch_it != 0) {
425 // Step 1: Fill scratch space with up to BATCH_SIZE/2 independent additions
426 while (((scratch_it + 1) < AffineAdditionData::BATCH_SIZE) && (point_it < last_index)) {
427 // Prefetch points we'll need soon (every PREFETCH_INTERVAL iterations)
428 if ((point_it < prefetch_max) && ((point_it & PREFETCH_INTERVAL_MASK) == 0)) {
429 for (size_t i = PREFETCH_LOOKAHEAD / 2; i < PREFETCH_LOOKAHEAD; ++i) {
430 PointScheduleEntry entry{ point_schedule[point_it + i] };
431 __builtin_prefetch(&points[entry.point_index()]);
432 }
433 }
434
435 PointScheduleEntry lhs{ point_schedule[point_it] };
436 PointScheduleEntry rhs{ point_schedule[point_it + 1] };
437
438 process_bucket_pair(lhs.bucket_index(),
439 rhs.bucket_index(),
440 &points[lhs.point_index()],
441 &points[rhs.point_index()],
442 affine_data,
443 bucket_data,
444 scratch_it,
445 point_it);
446 }
447
448 // Handle the last point (odd count case) - separate to avoid bounds check on point_schedule[point_it + 1]
449 if (point_it == last_index) {
450 PointScheduleEntry last{ point_schedule[point_it] };
451 process_single_point(
452 last.bucket_index(), &points[last.point_index()], affine_data, bucket_data, scratch_it, point_it);
453 }
454
455 // Compute independent additions using Montgomery's batch inversion trick
456 size_t num_points_to_add = scratch_it;
457 if (num_points_to_add >= 2) {
458 add_affine_points(
459 affine_data.points_to_add.data(), num_points_to_add, affine_data.inversion_scratch_space.data());
460 }
461
462 // add_affine_points stores results in the top-half of scratch space
463 AffineElement* affine_output = affine_data.points_to_add.data() + (num_points_to_add / 2);
464
465 // Recirculate addition outputs back into scratch space or bucket accumulators
466 size_t new_scratch_it = 0;
467 size_t output_it = 0;
468 size_t num_outputs = num_points_to_add / 2;
469
470 while ((num_outputs > 1) && (output_it + 1 < num_outputs)) {
471 uint32_t lhs_bucket = affine_data.addition_result_bucket_destinations[output_it];
472 uint32_t rhs_bucket = affine_data.addition_result_bucket_destinations[output_it + 1];
473
474 process_bucket_pair(lhs_bucket,
475 rhs_bucket,
476 &affine_output[output_it],
477 &affine_output[output_it + 1],
478 affine_data,
479 bucket_data,
480 new_scratch_it,
481 output_it);
482 }
483
484 // Handle the last output (odd count case)
485 if (num_outputs > 0 && output_it == num_outputs - 1) {
486 uint32_t bucket = affine_data.addition_result_bucket_destinations[output_it];
487 process_single_point(
488 bucket, &affine_output[output_it], affine_data, bucket_data, new_scratch_it, output_it);
489 }
490
491 // Continue with recirculated points
492 scratch_it = new_scratch_it;
493 }
494}
495
496template <typename Curve>
499 std::span<std::span<ScalarField>> scalars,
500 bool handle_edge_cases) noexcept
501{
502 BB_BENCH_NAME("MSM::batch_multi_scalar_mul");
503 BB_ASSERT_EQ(points.size(), scalars.size());
504 const size_t num_msms = points.size();
505
506 std::vector<std::vector<uint32_t>> msm_scalar_indices;
507 std::vector<ThreadWorkUnits> thread_work_units = get_work_units(scalars, msm_scalar_indices);
508 const size_t num_cpus = get_num_cpus();
509 std::vector<std::vector<std::pair<Element, size_t>>> thread_msm_results(num_cpus);
510 BB_ASSERT_EQ(thread_work_units.size(), num_cpus);
511
512 // Select Pippenger implementation once (hoisting branch outside hot loop)
513 // Jacobian: safe, handles edge cases | Affine: faster, assumes linearly independent points
514 auto pippenger_impl =
515 handle_edge_cases ? jacobian_pippenger_with_transformed_scalars : affine_pippenger_with_transformed_scalars;
516
517 // Once we have our work units, each thread can independently evaluate its assigned msms
518 {
519 BB_BENCH_NAME("MSM::batch_multi_scalar_mul/evaluate_work_units");
520 parallel_for(num_cpus, [&](size_t thread_idx) {
521 BB_BENCH_TRACY_NAME("MSM::evaluate_work_units");
522 if (!thread_work_units[thread_idx].empty()) {
523 const std::vector<MSMWorkUnit>& msms = thread_work_units[thread_idx];
524 std::vector<std::pair<Element, size_t>>& msm_results = thread_msm_results[thread_idx];
525 msm_results.reserve(msms.size());
526
527 // Point schedule buffer for this thread - avoids per-work-unit heap allocation
528 std::vector<uint64_t> point_schedule_buffer;
529
530 for (const MSMWorkUnit& msm : msms) {
531 point_schedule_buffer.resize(msm.size);
532 MSMData msm_data =
533 MSMData::from_work_unit(scalars, points, msm_scalar_indices, point_schedule_buffer, msm);
534 Element msm_result =
535 (msm.size < PIPPENGER_THRESHOLD) ? small_mul<Curve>(msm_data) : pippenger_impl(msm_data);
536
537 msm_results.emplace_back(msm_result, msm.batch_msm_index);
538 }
539 }
540 });
541 }
542
543 // Accumulate results. Single-threaded, but negligible in practice.
544 // Benchmarked (192-core, 256 threads): ~512us for 2^16 MSM (~1.2% of total), ~207us for 2^20 (<0.1%).
545 std::vector<Element> results(num_msms, Curve::Group::point_at_infinity);
546 {
547 BB_BENCH_NAME("MSM::batch_multi_scalar_mul/accumulate_results");
548 for (const auto& single_thread_msm_results : thread_msm_results) {
549 for (const auto& [element, index] : single_thread_msm_results) {
550 results[index] += element;
551 }
552 }
553 }
554 {
555 BB_BENCH_NAME("MSM::batch_multi_scalar_mul/batch_normalize");
556 Element::batch_normalize(results.data(), num_msms);
557 }
558
559 // Convert scalars back TO Montgomery form so they remain unchanged from caller's perspective
560 {
561 BB_BENCH_NAME("MSM::batch_multi_scalar_mul/scalars_to_montgomery");
562 for (auto& scalar_span : scalars) {
563 parallel_for_range(scalar_span.size(), [&](size_t start, size_t end) {
564 BB_BENCH_TRACY_NAME("MSM::scalars_to_montgomery/chunk");
565 for (size_t i = start; i < end; ++i) {
566 scalar_span[i].self_to_montgomery_form();
567 }
568 });
569 }
570 }
571
572 return std::vector<AffineElement>(results.begin(), results.end());
573}
574
575template <typename Curve>
578 bool handle_edge_cases) noexcept
579{
580 if (scalars.size() == 0) {
581 return Curve::Group::affine_point_at_infinity;
582 }
583 const size_t num_scalars = scalars.size();
584 BB_ASSERT_GTE(points.size(), scalars.start_index + num_scalars);
585
586 // const_cast is safe: we convert from Montgomery, compute, then convert back.
587 // Scalars are unchanged from the caller's perspective.
588 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
589 ScalarField* scalar_ptr = const_cast<ScalarField*>(&scalars[scalars.start_index]);
590 std::span<ScalarField> scalar_span(scalar_ptr, num_scalars);
591
592 // Wrap into a size-1 batch and delegate to the general method that properly handles multi-threading
593 std::array<std::span<const AffineElement>, 1> points_batch{ points.subspan(scalars.start_index) };
594 std::array<std::span<ScalarField>, 1> scalars_batch{ scalar_span };
595
596 auto results = batch_multi_scalar_mul(std::span(points_batch), std::span(scalars_batch), handle_edge_cases);
597 return results[0];
598}
599
600template <typename Curve>
603 [[maybe_unused]] bool handle_edge_cases) noexcept
604{
605 return MSM<Curve>::msm(points, scalars, handle_edge_cases);
606}
607
608template <typename Curve>
614
617 bool handle_edge_cases = true) noexcept;
618
619template curve::Grumpkin::Element pippenger_unsafe<curve::Grumpkin>(
620 PolynomialSpan<const curve::Grumpkin::ScalarField> scalars, std::span<const curve::Grumpkin::AffineElement> points);
621
622template curve::BN254::Element pippenger<curve::BN254>(PolynomialSpan<const curve::BN254::ScalarField> scalars,
623 std::span<const curve::BN254::AffineElement> points,
624 bool handle_edge_cases = true);
625
626template curve::BN254::Element pippenger_unsafe<curve::BN254>(PolynomialSpan<const curve::BN254::ScalarField> scalars,
627 std::span<const curve::BN254::AffineElement> points);
628
629} // namespace bb::scalar_multiplication
630
631template class bb::scalar_multiplication::MSM<bb::curve::Grumpkin>;
632template class bb::scalar_multiplication::MSM<bb::curve::BN254>;
#define BB_ASSERT_GTE(left, right,...)
Definition assert.hpp:128
#define BB_ASSERT_GT(left, right,...)
Definition assert.hpp:113
#define BB_ASSERT_DEBUG(expression,...)
Definition assert.hpp:55
#define BB_ASSERT_EQ(actual, expected,...)
Definition assert.hpp:83
#define BB_BENCH_NAME(name)
Definition bb_bench.hpp:264
#define BB_BENCH_TRACY_NAME(name)
Definition bb_bench.hpp:256
BB_INLINE bool get(size_t index) const noexcept
Definition bitvector.hpp:44
BB_INLINE void set(size_t index, bool value) noexcept
Definition bitvector.hpp:30
void clear()
Definition bitvector.hpp:52
typename Group::element Element
Definition grumpkin.hpp:63
typename Group::affine_element AffineElement
Definition grumpkin.hpp:64
typename Curve::BaseField BaseField
static bool use_affine_trick(size_t num_points, size_t num_buckets) noexcept
Decide if batch inversion saves work vs Jacobian additions.
static Element jacobian_pippenger_with_transformed_scalars(MSMData &msm_data) noexcept
Pippenger using Jacobian buckets (handles edge cases: doubling, infinity)
static uint32_t get_scalar_slice(const ScalarField &scalar, size_t round, size_t slice_size) noexcept
Extract c-bit slice from scalar for bucket index computation.
static std::vector< ThreadWorkUnits > partition_by_weight(std::span< const std::vector< uint16_t > > msm_scalar_weights, size_t num_threads) noexcept
Partition per-MSM scalar weights into num_threads work units of approximately equal cumulative weight...
static Element affine_pippenger_with_transformed_scalars(MSMData &msm_data) noexcept
Pippenger using affine buckets with batch inversion (faster, no edge case handling)
static void compute_scalar_slice_weights(std::span< const ScalarField > scalars, std::span< const uint32_t > nonzero_indices, uint32_t bits_per_slice, std::vector< uint16_t > &weights) noexcept
Compute per-scalar slice-count weights ceil(bit_length / bits_per_slice).
static void add_affine_points(AffineElement *points, const size_t num_points, typename Curve::BaseField *scratch_space) noexcept
Batch add n/2 independent point pairs using Montgomery's trick.
static std::vector< ThreadWorkUnits > get_work_units(std::span< std::span< ScalarField > > scalars, std::vector< std::vector< uint32_t > > &msm_scalar_indices) noexcept
Distribute multiple MSMs across threads with balanced bucket-accumulation work.
static uint32_t get_optimal_log_num_buckets(size_t num_points) noexcept
Compute optimal bits per slice by minimizing cost over c in [1, MAX_SLICE_BITS)
static std::vector< AffineElement > batch_multi_scalar_mul(std::span< std::span< const AffineElement > > points, std::span< std::span< ScalarField > > scalars, bool handle_edge_cases=true) noexcept
Compute multiple MSMs in parallel with work balancing.
static void batch_accumulate_points_into_buckets(std::span< const uint64_t > point_schedule, std::span< const AffineElement > points, AffineAdditionData &affine_data, BucketAccumulators &bucket_data) noexcept
Process sorted point schedule into bucket accumulators using batched affine additions.
typename Curve::ScalarField ScalarField
typename Curve::AffineElement AffineElement
static void transform_scalar_and_get_nonzero_scalar_indices(std::span< ScalarField > scalars, std::vector< uint32_t > &nonzero_scalar_indices) noexcept
Convert scalars from Montgomery form and collect indices of nonzero scalars.
bb::curve::BN254::Element Element
ssize_t offset
Definition engine.cpp:62
@ BN254
constexpr T ceil_div(const T &numerator, const T &denominator)
Computes the ceiling of the division of two integral types.
Definition general.hpp:23
Curve::Element small_mul(const typename MSM< Curve >::MSMData &msm_data) noexcept
Curve::Element pippenger(PolynomialSpan< const typename Curve::ScalarField > scalars, std::span< const typename Curve::AffineElement > points, bool handle_edge_cases) noexcept
Safe MSM wrapper (defaults to handle_edge_cases=true)
size_t sort_point_schedule_and_count_zero_buckets(uint64_t *point_schedule, const size_t num_entries, const uint32_t bucket_index_bits) noexcept
Sort point schedule by bucket index and count zero-bucket entries.
Curve::Element pippenger_unsafe(PolynomialSpan< const typename Curve::ScalarField > scalars, std::span< const typename Curve::AffineElement > points) noexcept
Fast MSM wrapper for linearly independent points (no edge case handling)
Entry point for Barretenberg command-line interface.
Definition api.hpp:5
size_t get_num_cpus()
Definition thread.cpp:33
void parallel_for(size_t num_iterations, const std::function< void(size_t)> &func)
Definition thread.cpp:111
void parallel_for_range(size_t num_points, const std::function< void(size_t, size_t)> &func, size_t no_multhreading_if_less_or_equal)
Split a loop into several loops running in parallel.
Definition thread.cpp:141
STL namespace.
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
Definition tuple.hpp:13
size_t total_threads
Definition thread.hpp:151
size_t thread_index
Definition thread.hpp:150
auto range(size_t size, size_t offset=0) const
Definition thread.hpp:152
Scratch space for batched affine point additions (one per thread)
Affine bucket accumulators for the fast affine-trick Pippenger variant.
Jacobian bucket accumulators for the safe Pippenger variant.
Container for MSM input data passed between algorithm stages.
MSMWorkUnit describes an MSM that may be part of a larger MSM.
Packed point schedule entry: (point_index << 32) | bucket_index.