Barretenberg
The ZK-SNARK library at the core of Aztec
Loading...
Searching...
No Matches
polynomial_arithmetic.cpp
Go to the documentation of this file.
1// === AUDIT STATUS ===
2// internal: { status: Complete, auditors: [Nishat], commit: 94f596f8b3bbbc216f9ad7dc33253256141156b2 }
3// external_1: { status: not started, auditors: [], commit: }
4// external_2: { status: not started, auditors: [], commit: }
5// =====================
6
12#include <math.h>
13#include <memory.h>
14#include <memory>
15#include <mutex>
16
18
19inline uint32_t reverse_bits(uint32_t x, uint32_t bit_length)
20{
21 x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
22 x = (((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2));
23 x = (((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4));
24 x = (((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8));
25 return (((x >> 16) | (x << 16))) >> (32 - bit_length);
26}
27
28inline bool is_power_of_two(uint64_t x)
29{
30 return x && !(x & (x - 1));
31}
32
33template <typename Fr>
35 Fr* target,
36 const EvaluationDomain<Fr>& domain,
37 const Fr& generator_start,
38 const Fr& generator_shift,
39 const size_t generator_size)
40{
41 BB_ASSERT(generator_size % domain.num_threads == 0,
42 "generator_size must be divisible by num_threads to avoid silently skipping elements");
43 parallel_for(domain.num_threads, [&](size_t j) {
44 Fr thread_shift = generator_shift.pow(static_cast<uint64_t>(j * (generator_size / domain.num_threads)));
45 Fr work_generator = generator_start * thread_shift;
46 const size_t offset = j * (generator_size / domain.num_threads);
47 const size_t end = offset + (generator_size / domain.num_threads);
48 for (size_t i = offset; i < end; ++i) {
49 target[i] = coeffs[i] * work_generator;
50 work_generator *= generator_shift;
51 }
52 });
53}
54
55template <typename Fr>
56 requires SupportsFFT<Fr>
58 Fr* coeffs, Fr* target, const EvaluationDomain<Fr>& domain, const Fr&, const std::vector<Fr*>& root_table)
59{
60 BB_ASSERT(coeffs != target, "fft_inner_parallel does not support in-place operation");
61 parallel_for(domain.num_threads, [&](size_t j) {
62 Fr temp_1;
63 Fr temp_2;
64 for (size_t i = (j * domain.thread_size); i < ((j + 1) * domain.thread_size); i += 2) {
65 uint32_t next_index_1 = (uint32_t)reverse_bits((uint32_t)i + 2, (uint32_t)domain.log2_size);
66 uint32_t next_index_2 = (uint32_t)reverse_bits((uint32_t)i + 3, (uint32_t)domain.log2_size);
67 __builtin_prefetch(&coeffs[next_index_1]);
68 __builtin_prefetch(&coeffs[next_index_2]);
69
70 uint32_t swap_index_1 = (uint32_t)reverse_bits((uint32_t)i, (uint32_t)domain.log2_size);
71 uint32_t swap_index_2 = (uint32_t)reverse_bits((uint32_t)i + 1, (uint32_t)domain.log2_size);
72
73 Fr::__copy(coeffs[swap_index_1], temp_1);
74 Fr::__copy(coeffs[swap_index_2], temp_2);
75 target[i + 1] = temp_1 - temp_2;
76 target[i] = temp_1 + temp_2;
77 }
78 });
79
80 // outer FFT loop
81 for (size_t m = 2; m < (domain.size); m <<= 1) {
82 parallel_for(domain.num_threads, [&](size_t j) {
83 Fr temp;
84
85 // Ok! So, what's going on here? This is the inner loop of the FFT algorithm, and we want to break it
86 // out into multiple independent threads. For `num_threads`, each thread will evaluation `domain.size /
87 // num_threads` of the polynomial. The actual iteration length will be half of this, because we leverage
88 // the fact that \omega^{n/2} = -\omega (where \omega is a root of unity)
89
90 // Here, `start` and `end` are used as our iterator limits, so that we can use our iterator `i` to
91 // directly access the roots of unity lookup table
92 const size_t start = j * (domain.thread_size >> 1);
93 const size_t end = (j + 1) * (domain.thread_size >> 1);
94
95 // For all but the last round of our FFT, the roots of unity that we need, will be a subset of our
96 // lookup table. e.g. for a size 2^n FFT, the 2^n'th roots create a multiplicative subgroup of order 2^n
97 // the 1st round will use the roots from the multiplicative subgroup of order 2 : the 2'th roots of
98 // unity the 2nd round will use the roots from the multiplicative subgroup of order 4 : the 4'th
99 // roots of unity
100 // i.e. each successive FFT round will double the set of roots that we need to index.
101 // We have already laid out the `root_table` container so that each FFT round's roots are linearly
102 // ordered in memory. For all FFT rounds, the number of elements we're iterating over is greater than
103 // the size of our lookup table. We need to access this table in a cyclical fasion - i.e. for a subgroup
104 // of size x, the first x iterations will index the subgroup elements in order, then for the next x
105 // iterations, we loop back to the start.
106
107 // We could implement the algorithm by having 2 nested loops (where the inner loop iterates over the
108 // root table), but we want to flatten this out - as for the first few rounds, the inner loop will be
109 // tiny and we'll have quite a bit of unneccesary branch checks For each iteration of our flattened
110 // loop, indexed by `i`, the element of the root table we need to access will be `i % (current round
111 // subgroup size)` Given that each round subgroup size is `m`, which is a power of 2, we can index the
112 // root table with a very cheap `i & (m - 1)` Which is why we have this odd `block_mask` variable
113 const size_t block_mask = m - 1;
114
115 // The next problem to tackle, is we now need to efficiently index the polynomial element in
116 // `scratch_space` in our flattened loop If we used nested loops, the outer loop (e.g. `y`) iterates
117 // from 0 to 'domain size', in steps of 2 * m, with the inner loop (e.g. `z`) iterating from 0 to m. We
118 // have our inner loop indexer with `i & (m - 1)`. We need to add to this our outer loop indexer, which
119 // is equivalent to taking our indexer `i`, masking out the bits used in the 'inner loop', and doubling
120 // the result. i.e. polynomial indexer = (i & (m - 1)) + ((i & ~(m - 1)) >> 1) To simplify this, we
121 // cache index_mask = ~block_mask, meaning that our indexer is just `((i & index_mask) << 1 + (i &
122 // block_mask)`
123 const size_t index_mask = ~block_mask;
124
125 // `round_roots` fetches the pointer to this round's lookup table. We use `numeric::get_msb(m) - 1` as
126 // our indexer, because we don't store the precomputed root values for the 1st round (because they're
127 // all 1).
128 const Fr* round_roots = root_table[static_cast<size_t>(numeric::get_msb(m)) - 1];
129
130 // Finally, we want to treat the final round differently from the others,
131 // so that we can reduce out of our 'coarse' reduction and store the output in `coeffs` instead of
132 // `scratch_space`
133 for (size_t i = start; i < end; ++i) {
134 size_t k1 = (i & index_mask) << 1;
135 size_t j1 = i & block_mask;
136 temp = round_roots[j1] * target[k1 + j1 + m];
137 target[k1 + j1 + m] = target[k1 + j1] - temp;
138 target[k1 + j1] += temp;
139 }
140 });
141 }
142}
143
144template <typename Fr>
145 requires SupportsFFT<Fr>
146void ifft(Fr* coeffs, Fr* target, const EvaluationDomain<Fr>& domain)
147{
148 fft_inner_parallel(coeffs, target, domain, domain.root_inverse, domain.get_inverse_round_roots());
149
150 parallel_for(domain.num_threads, [&](size_t j) {
151 const size_t start = j * domain.thread_size;
152 const size_t end = (j + 1) * domain.thread_size;
153 for (size_t i = start; i < end; ++i) {
154 target[i] *= domain.domain_inverse;
155 }
156 });
157}
158
159template <typename Fr> Fr evaluate(const Fr* coeffs, const Fr& z, const size_t n)
160{
161 const size_t num_threads = get_num_cpus();
162 std::vector<Fr> evaluations(num_threads, Fr::zero());
163 parallel_for([&](const ThreadChunk& chunk) {
164 // parallel_for with ThreadChunk uses get_num_cpus() threads
165 BB_ASSERT_EQ(chunk.total_threads, evaluations.size());
166 auto range = chunk.range(n);
167 if (range.empty()) {
168 return;
169 }
170 size_t start = *range.begin();
171 Fr z_acc = z.pow(static_cast<uint64_t>(start));
172 for (size_t i : range) {
173 Fr work_var = z_acc * coeffs[i];
174 evaluations[chunk.thread_index] += work_var;
175 z_acc *= z;
176 }
177 });
178
179 Fr r = Fr::zero();
180 for (const auto& eval : evaluations) {
181 r += eval;
182 }
183 return r;
184}
185
186// This function computes sum of all scalars in a given array.
187template <typename Fr> Fr compute_sum(const Fr* src, const size_t n)
188{
189 Fr result = 0;
190 for (size_t i = 0; i < n; ++i) {
191 result += src[i];
192 }
193 return result;
194}
195
196// This function computes the polynomial (x - a)(x - b)(x - c)... given n distinct roots (a, b, c, ...).
197//
198// Build the product incrementally by multiplying in one root at a time. After the i-th iteration,
199// dest[0..i+1] holds the coefficients of ∏_{k=0}^{i} (X - roots[k]). Multiplying the existing
200// polynomial P(X) by (X - r) gives
201// P(X) · X shifts every coefficient up by one index, and
202// -P(X) · r scales every coefficient by -r.
203// Walking k high-to-low allows writing the shift-and-combine update in place with total cost O(n^2).
204template <typename Fr> void compute_linear_polynomial_product(const Fr* roots, Fr* dest, const size_t n)
205{
206 if (n == 0) {
207 return;
208 }
209
210 dest[0] = -roots[0];
211 dest[1] = Fr(1);
212 for (size_t i = 1; i < n; ++i) {
213 const Fr r = roots[i];
214 dest[i + 1] = dest[i];
215 for (size_t k = i; k >= 1; --k) {
216 dest[k] = dest[k - 1] - r * dest[k];
217 }
218 dest[0] = -r * dest[0];
219 }
220}
221
222template <typename Fr> Fr compute_linear_polynomial_product_evaluation(const Fr* roots, const Fr z, const size_t n)
223{
224 Fr result = 1;
225 for (size_t i = 0; i < n; ++i) {
226 result *= (z - roots[i]);
227 }
228 return result;
229}
230
231template <typename Fr>
232void compute_efficient_interpolation(const Fr* src, Fr* dest, const Fr* evaluation_points, const size_t n)
233{
234 /*
235 We use Lagrange technique to compute polynomial interpolation.
236 Given: (x_i, y_i) for i ∈ {0, 1, ..., n} =: [n]
237 Compute function f(X) such that f(x_i) = y_i for all i ∈ [n].
238 (X - x1)(X - x2)...(X - xn) (X - x0)(X - x2)...(X - xn)
239 F(X) = y0-------------------------------- + y1---------------------------------- + ...
240 (x0 - x_1)(x0 - x_2)...(x0 - xn) (x1 - x_0)(x1 - x_2)...(x1 - xn)
241 We write this as:
242 [ yi ]
243 F(X) = N(X) * |∑_i --------------- |
244 [ (X - xi) * di ]
245 where:
246 N(X) = ∏_{i \in [n]} (X - xi),
247 di = ∏_{j != i} (xi - xj)
248 For division of N(X) by (X - xi), we use the same trick that was used in compute_opening_polynomial()
249 function in the Kate commitment scheme.
250 We denote
251 q_{x_i} = N(X)/(X-x_i) * y_i * (d_i)^{-1} = q_{x_i,0}*1 + ... + q_{x_i,n-1} * X^{n-1} for i=0,..., n-1.
252
253 The computation of F(X) is split into two cases:
254
255 - if 0 is not in the interpolation domain, then the numerator polynomial N(X) has a non-zero constant term
256 that is used to initialize the division algorithm mentioned above; the monomial coefficients q_{x_i, j} of
257 q_{x_i} are accumulated into dest[j] for j=0,..., n-1
258
259 - if 0 is in the domain at index i_0, the constant term of N(X) is 0 and the division algorithm computing
260 q_{x_i} for i != i_0 is initialized with the constant term of N(X)/X. Note that its coefficients are given
261 by numerator_polynomial[j] for j=1,...,n. The monomial coefficients of q_{x_i} are then accumuluated in
262 dest[j] for j=1,..., n-1. Whereas the coefficients of
263 q_{0} = N(X)/X * f(0) * (d_{i_0})^{-1}
264 are added to dest[j] for j=0,..., n-1. Note that these coefficients do not require performing the division
265 algorithm used in Kate commitment scheme, as the coefficients of N(X)/X are given by numerator_polynomial[j]
266 for j=1,...,n.
267 */
268 // Lagrange interpolation is mathematically ill-defined when any two evaluation points coincide:
269 // the denominator d_i contains a zero factor. batch_invert silently skips zero entries, so
270 // without this check duplicate points produce an incorrect result.
271 for (size_t i = 0; i < n; ++i) {
272 for (size_t j = i + 1; j < n; ++j) {
273 BB_ASSERT(evaluation_points[i] != evaluation_points[j],
274 "compute_efficient_interpolation requires distinct evaluation points");
275 }
276 }
277
278 std::vector<Fr> numerator_polynomial(n + 1);
279 polynomial_arithmetic::compute_linear_polynomial_product(evaluation_points, numerator_polynomial.data(), n);
280 // First half contains roots, second half contains denominators (to be inverted)
281 std::vector<Fr> roots_and_denominators(2 * n);
282 std::vector<Fr> temp_src(n);
283 for (size_t i = 0; i < n; ++i) {
284 roots_and_denominators[i] = -evaluation_points[i];
285 temp_src[i] = src[i];
286 dest[i] = 0;
287 // compute constant denominators
288 roots_and_denominators[n + i] = 1;
289 for (size_t j = 0; j < n; ++j) {
290 if (j == i) {
291 continue;
292 }
293 roots_and_denominators[n + i] *= (evaluation_points[i] - evaluation_points[j]);
294 }
295 }
296 // at this point roots_and_denominators is populated as follows
297 // (x_0,\ldots, x_{n-1}, d_0, \ldots, d_{n-1})
298 Fr::batch_invert(roots_and_denominators.data(), 2 * n);
299
300 Fr z, multiplier;
301 std::vector<Fr> temp_dest(n);
302 size_t idx_zero = 0;
303 bool interpolation_domain_contains_zero = false;
304 // if the constant term of the numerator polynomial N(X) is 0, then the interpolation domain contains 0
305 // we find the index i_0, such that x_{i_0} = 0
306 if (numerator_polynomial[0] == Fr(0)) {
307 for (size_t i = 0; i < n; ++i) {
308 if (evaluation_points[i] == Fr(0)) {
309 idx_zero = i;
310 interpolation_domain_contains_zero = true;
311 break;
312 }
313 }
314 };
315
316 if (!interpolation_domain_contains_zero) {
317 for (size_t i = 0; i < n; ++i) {
318 // set z = - 1/x_i for x_i <> 0
319 z = roots_and_denominators[i];
320 // temp_src[i] is y_i, it gets multiplied by 1/d_i
321 multiplier = temp_src[i] * roots_and_denominators[n + i];
322 temp_dest[0] = multiplier * numerator_polynomial[0];
323 temp_dest[0] *= z;
324 dest[0] += temp_dest[0];
325 for (size_t j = 1; j < n; ++j) {
326 temp_dest[j] = multiplier * numerator_polynomial[j] - temp_dest[j - 1];
327 temp_dest[j] *= z;
328 dest[j] += temp_dest[j];
329 }
330 }
331 } else {
332 for (size_t i = 0; i < n; ++i) {
333 if (i == idx_zero) {
334 // the contribution from the term corresponding to i_0 is computed separately
335 continue;
336 }
337 // get the next inverted root
338 z = roots_and_denominators[i];
339 // compute f(x_i) * d_{x_i}^{-1}
340 multiplier = temp_src[i] * roots_and_denominators[n + i];
341 // get x_i^{-1} * f(x_i) * d_{x_i}^{-1} into the "free" term
342 temp_dest[1] = multiplier * numerator_polynomial[1];
343 temp_dest[1] *= z;
344 // correct the first coefficient as it is now accumulating free terms from
345 // f(x_i) d_i^{-1} prod_(X-x_i, x_i != 0) (X-x_i) * 1/(X-x_i)
346 dest[1] += temp_dest[1];
347 // compute the quotient N(X)/(X-x_i) f(x_i)/d_{x_i} and its contribution to the target coefficients
348 for (size_t j = 2; j < n; ++j) {
349 temp_dest[j] = multiplier * numerator_polynomial[j] - temp_dest[j - 1];
350 temp_dest[j] *= z;
351 dest[j] += temp_dest[j];
352 };
353 }
354 // correct the target coefficients by the contribution from q_{0} = N(X)/X * d_{i_0}^{-1} * f(0)
355 for (size_t i = 0; i < n; ++i) {
356 dest[i] += temp_src[idx_zero] * roots_and_denominators[n + idx_zero] * numerator_polynomial[i + 1];
357 }
358 }
359}
360
361template fr evaluate<fr>(const fr*, const fr&, const size_t);
362template void fft_inner_parallel<fr>(fr*, fr*, const EvaluationDomain<fr>&, const fr&, const std::vector<fr*>&);
363template void ifft<fr>(fr*, fr*, const EvaluationDomain<fr>&);
364template fr compute_sum<fr>(const fr*, const size_t);
365template void compute_linear_polynomial_product<fr>(const fr*, fr*, const size_t);
366template void compute_efficient_interpolation<fr>(const fr*, fr*, const fr*, const size_t);
367
368template grumpkin::fr evaluate<grumpkin::fr>(const grumpkin::fr*, const grumpkin::fr&, const size_t);
369template grumpkin::fr compute_sum<grumpkin::fr>(const grumpkin::fr*, const size_t);
370template void compute_linear_polynomial_product<grumpkin::fr>(const grumpkin::fr*, grumpkin::fr*, const size_t);
371template void compute_efficient_interpolation<grumpkin::fr>(const grumpkin::fr*,
373 const grumpkin::fr*,
374 const size_t);
375
376} // namespace bb::polynomial_arithmetic
#define BB_ASSERT(expression,...)
Definition assert.hpp:70
#define BB_ASSERT_EQ(actual, expected,...)
Definition assert.hpp:83
const std::vector< FF * > & get_inverse_round_roots() const
Fr compute_linear_polynomial_product_evaluation(const Fr *roots, const Fr z, const size_t n)
uint32_t reverse_bits(uint32_t x, uint32_t bit_length)
void ifft(Fr *coeffs, Fr *target, const EvaluationDomain< Fr > &domain)
void compute_linear_polynomial_product(const Fr *roots, Fr *dest, const size_t n)
Fr evaluate(const Fr *coeffs, const Fr &z, const size_t n)
void fft_inner_parallel(Fr *coeffs, Fr *target, const EvaluationDomain< Fr > &domain, const Fr &, const std::vector< Fr * > &root_table)
void compute_efficient_interpolation(const Fr *src, Fr *dest, const Fr *evaluation_points, const size_t n)
void scale_by_generator(Fr *coeffs, Fr *target, const EvaluationDomain< Fr > &domain, const Fr &generator_start, const Fr &generator_shift, const size_t generator_size)
Fr compute_sum(const Fr *src, const size_t n)
size_t get_num_cpus()
Definition thread.cpp:33
void parallel_for(size_t num_iterations, const std::function< void(size_t)> &func)
Definition thread.cpp:111
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
Definition tuple.hpp:13
Curve::ScalarField Fr
size_t total_threads
Definition thread.hpp:151
size_t thread_index
Definition thread.hpp:150
auto range(size_t size, size_t offset=0) const
Definition thread.hpp:152
BB_INLINE constexpr field pow(const uint256_t &exponent) const noexcept
static void batch_invert(C &coeffs) noexcept
Batch invert a collection of field elements using Montgomery's trick.
static constexpr field zero()