21 x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
22 x = (((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2));
23 x = (((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4));
24 x = (((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8));
25 return (((x >> 16) | (x << 16))) >> (32 - bit_length);
37 const Fr& generator_start,
38 const Fr& generator_shift,
39 const size_t generator_size)
42 "generator_size must be divisible by num_threads to avoid silently skipping elements");
44 Fr thread_shift = generator_shift.pow(static_cast<uint64_t>(j * (generator_size / domain.num_threads)));
45 Fr work_generator = generator_start * thread_shift;
46 const size_t offset = j * (generator_size / domain.num_threads);
47 const size_t end = offset + (generator_size / domain.num_threads);
48 for (size_t i = offset; i < end; ++i) {
49 target[i] = coeffs[i] * work_generator;
50 work_generator *= generator_shift;
60 BB_ASSERT(coeffs != target,
"fft_inner_parallel does not support in-place operation");
64 for (size_t i = (j * domain.thread_size); i < ((j + 1) * domain.thread_size); i += 2) {
65 uint32_t next_index_1 = (uint32_t)reverse_bits((uint32_t)i + 2, (uint32_t)domain.log2_size);
66 uint32_t next_index_2 = (uint32_t)reverse_bits((uint32_t)i + 3, (uint32_t)domain.log2_size);
67 __builtin_prefetch(&coeffs[next_index_1]);
68 __builtin_prefetch(&coeffs[next_index_2]);
70 uint32_t swap_index_1 = (uint32_t)reverse_bits((uint32_t)i, (uint32_t)domain.log2_size);
71 uint32_t swap_index_2 = (uint32_t)reverse_bits((uint32_t)i + 1, (uint32_t)domain.log2_size);
73 Fr::__copy(coeffs[swap_index_1], temp_1);
74 Fr::__copy(coeffs[swap_index_2], temp_2);
75 target[i + 1] = temp_1 - temp_2;
76 target[i] = temp_1 + temp_2;
81 for (
size_t m = 2; m < (domain.size); m <<= 1) {
92 const size_t start = j * (domain.thread_size >> 1);
93 const size_t end = (j + 1) * (domain.thread_size >> 1);
113 const size_t block_mask = m - 1;
123 const size_t index_mask = ~block_mask;
128 const Fr* round_roots = root_table[static_cast<size_t>(numeric::get_msb(m)) - 1];
133 for (size_t i = start; i < end; ++i) {
134 size_t k1 = (i & index_mask) << 1;
135 size_t j1 = i & block_mask;
136 temp = round_roots[j1] * target[k1 + j1 + m];
137 target[k1 + j1 + m] = target[k1 + j1] - temp;
138 target[k1 + j1] += temp;
271 for (
size_t i = 0; i < n; ++i) {
272 for (
size_t j = i + 1; j < n; ++j) {
273 BB_ASSERT(evaluation_points[i] != evaluation_points[j],
274 "compute_efficient_interpolation requires distinct evaluation points");
278 std::vector<Fr> numerator_polynomial(n + 1);
279 polynomial_arithmetic::compute_linear_polynomial_product(evaluation_points, numerator_polynomial.
data(), n);
281 std::vector<Fr> roots_and_denominators(2 * n);
282 std::vector<Fr> temp_src(n);
283 for (
size_t i = 0; i < n; ++i) {
284 roots_and_denominators[i] = -evaluation_points[i];
285 temp_src[i] = src[i];
288 roots_and_denominators[n + i] = 1;
289 for (
size_t j = 0; j < n; ++j) {
293 roots_and_denominators[n + i] *= (evaluation_points[i] - evaluation_points[j]);
301 std::vector<Fr> temp_dest(n);
303 bool interpolation_domain_contains_zero =
false;
306 if (numerator_polynomial[0] ==
Fr(0)) {
307 for (
size_t i = 0; i < n; ++i) {
308 if (evaluation_points[i] ==
Fr(0)) {
310 interpolation_domain_contains_zero =
true;
316 if (!interpolation_domain_contains_zero) {
317 for (
size_t i = 0; i < n; ++i) {
319 z = roots_and_denominators[i];
321 multiplier = temp_src[i] * roots_and_denominators[n + i];
322 temp_dest[0] = multiplier * numerator_polynomial[0];
324 dest[0] += temp_dest[0];
325 for (
size_t j = 1; j < n; ++j) {
326 temp_dest[j] = multiplier * numerator_polynomial[j] - temp_dest[j - 1];
328 dest[j] += temp_dest[j];
332 for (
size_t i = 0; i < n; ++i) {
338 z = roots_and_denominators[i];
340 multiplier = temp_src[i] * roots_and_denominators[n + i];
342 temp_dest[1] = multiplier * numerator_polynomial[1];
346 dest[1] += temp_dest[1];
348 for (
size_t j = 2; j < n; ++j) {
349 temp_dest[j] = multiplier * numerator_polynomial[j] - temp_dest[j - 1];
351 dest[j] += temp_dest[j];
355 for (
size_t i = 0; i < n; ++i) {
356 dest[i] += temp_src[idx_zero] * roots_and_denominators[n + idx_zero] * numerator_polynomial[i + 1];