Barretenberg
The ZK-SNARK library at the core of Aztec
Loading...
Searching...
No Matches
bb_bench.cpp
Go to the documentation of this file.
2#include <cstdint>
3#include <sys/types.h>
4#if !defined(__wasm__) || defined(ENABLE_WASM_BENCH)
6#include "bb_bench.hpp"
7#include <algorithm>
8#include <atomic>
9#include <cassert>
10#include <chrono>
11#include <cmath>
12#include <functional>
13#include <iomanip>
14#include <iostream>
15#include <mutex>
16#include <ostream>
17#include <set>
18#include <sstream>
19#include <thread>
20#include <unordered_map>
21#include <vector>
22
23namespace {
24// ANSI color codes
25struct Colors {
26 static constexpr const char* WHITE = "\033[37m";
27 static constexpr const char* RESET = "\033[0m";
28 static constexpr const char* BOLD = "\033[1m";
29 static constexpr const char* CYAN = "\033[36m";
30 static constexpr const char* GREEN = "\033[32m";
31 static constexpr const char* YELLOW = "\033[33m";
32 static constexpr const char* MAGENTA = "\033[35m";
33 static constexpr const char* DIM = "\033[2m";
34 static constexpr const char* RED = "\033[31m";
35};
36
37// Format time value with appropriate unit
38std::string format_time(double time_ms)
39{
40 std::ostringstream oss;
41 if (time_ms >= 1000.0) {
42 oss << std::fixed << std::setprecision(2) << (time_ms / 1000.0) << " s";
43 } else if (time_ms >= 1.0 && time_ms < 1000.0) {
44 oss << std::fixed << std::setprecision(2) << time_ms << " ms";
45 } else {
46 oss << std::fixed << std::setprecision(1) << (time_ms * 1000.0) << " μs";
47 }
48 return oss.str();
49}
50
51// Format time with fixed width for alignment
52std::string format_time_aligned(double time_ms)
53{
54 std::ostringstream oss;
55 if (time_ms >= 1000.0) {
56 std::ostringstream time_oss;
57 time_oss << std::fixed << std::setprecision(2) << (time_ms / 1000.0) << "s";
58 oss << std::left << std::setw(10) << time_oss.str();
59 } else {
60 std::ostringstream time_oss;
61 time_oss << std::fixed << std::setprecision(1) << time_ms << "ms";
62 oss << std::left << std::setw(10) << time_oss.str();
63 }
64 return oss.str();
65}
66
67// Helper to format percentage value
68std::string format_percentage_value(double percentage, const char* color)
69{
70 std::ostringstream oss;
71 oss << color << " " << std::left << std::fixed << std::setprecision(1) << std::setw(5) << percentage << "%"
72 << Colors::RESET;
73 return oss.str();
74}
75
76// Helper to format percentage with color based on percentage value
77std::string format_percentage(double value, double total, double min_threshold = 0.0)
78{
79 double percentage = (total <= 0) ? 0.0 : (value / total) * 100.0;
80 if (total <= 0 || percentage < min_threshold) {
81 return " ";
82 }
83
84 // Choose color based on percentage value (like time colors)
85 const char* color = Colors::CYAN; // Default color
86
87 return format_percentage_value(percentage, color);
88}
89
90// Helper to format percentage section
91std::string format_percentage_section(double time_ms, double parent_time, size_t indent_level)
92{
93 if (parent_time > 0 && indent_level > 0) {
94 return format_percentage(time_ms * 1000000.0, parent_time);
95 }
96 return " ";
97}
98
99// Helper to format time section
100std::string format_time_section(double time_ms)
101{
102 std::ostringstream oss;
103 oss << " ";
104 if (time_ms >= 100.0 && time_ms < 1000.0) {
105 oss << Colors::DIM << format_time_aligned(time_ms) << Colors::RESET;
106 } else {
107 oss << format_time_aligned(time_ms);
108 }
109 return oss.str();
110}
111
112// Helper to format call stats
113std::string format_call_stats(double time_ms, uint64_t count)
114{
115 if (!(time_ms >= 100.0 && count > 1)) {
116 return "";
117 }
118 double avg_ms = time_ms / static_cast<double>(count);
119 std::ostringstream oss;
120 oss << Colors::DIM << " (" << format_time(avg_ms) << " x " << count << ")" << Colors::RESET;
121 return oss.str();
122}
123
124std::string format_aligned_section(double time_ms, double parent_time, uint64_t count, size_t indent_level)
125{
126 std::ostringstream oss;
127
128 // Add indent level indicator at the beginning with different color
129 oss << Colors::MAGENTA << "[" << indent_level << "] " << Colors::RESET;
130
131 // Format percentage FIRST
132 oss << format_percentage_section(time_ms, parent_time, indent_level);
133
134 // Format time AFTER percentage with appropriate color (with more spacing)
135 oss << format_time_section(time_ms);
136
137 // Format calls/threads info - only show for >= 100ms, make it DIM
138 oss << format_call_stats(time_ms, count);
139
140 return oss.str();
141}
142
143// Get color based on time threshold
144struct TimeColor {
145 const char* name_color;
146 const char* time_color;
147};
148
149TimeColor get_time_colors(double time_ms)
150{
151 if (time_ms >= 1000.0) {
152 return { Colors::BOLD, Colors::WHITE };
153 }
154 if (time_ms >= 100.0) {
155 return { Colors::YELLOW, Colors::YELLOW };
156 }
157 return { Colors::DIM, Colors::DIM };
158}
159
160// Print separator line
161void print_separator(std::ostream& os, bool thick = true)
162{
163 const char* line = thick ? "═══════════════════════════════════════════════════════════════════════════════════════"
164 "═════════════════════"
165 : "───────────────────────────────────────────────────────────────────────────────────────"
166 "─────────────────────";
167 os << Colors::BOLD << Colors::CYAN << line << Colors::RESET << "\n";
168}
169} // anonymous namespace
170
171namespace bb::detail {
172
173// use_bb_bench is also set by --print_bench and --bench_out flags
174// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
175bool use_bb_bench = std::getenv("BB_BENCH") == nullptr ? false : std::string(std::getenv("BB_BENCH")) == "1";
176// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
177std::atomic<bool> capture_per_call_events{ false };
178// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
179using OperationKey = std::string_view;
180
182{
183 if (stats.count == 0) {
184 return;
185 }
186 // Account for aggregate time and count
187 time += stats.time;
188 count += stats.count;
189 time_max = std::max(stats.time, time_max);
190 // Use Welford's method to be able to track the variance
191 num_threads++;
192 double delta = static_cast<double>(stats.time) - time_mean;
193 time_mean += delta / static_cast<double>(num_threads);
194 double delta2 = static_cast<double>(stats.time) - time_mean;
195 time_m2 += delta * delta2;
196}
197
199{
200 // Calculate standard deviation
201 if (num_threads > 1) {
202 return std::sqrt(time_m2 / static_cast<double>(num_threads - 1));
203 }
204 return 0;
205}
206
207// Normalize the raw benchmark data into a clean structure for display
209{
210 AggregateData result;
211
212 // Each count has a unique [thread, key] combo.
213 // We therefore treat each count as a thread's contribution to that key.
214 for (const std::shared_ptr<TimeStatsEntry>& entry : entries) {
215 // A map from parent key => AggregateEntry
216 auto& entry_map = result[entry->key];
217 // combine all entries with same parent key
218 std::map<OperationKey, TimeAndCount> parent_key_to_stats;
219
220 // For collection-time performance, we allow multiple stat blocks with the same parent. It'd be simpler to have
221 // one but we just have to combine them here.
222 for (const TimeStats* stats = &entry->count; stats != nullptr; stats = stats->next.get()) {
223 OperationKey parent_key = stats->parent != nullptr ? stats->parent->key : "";
224 parent_key_to_stats[parent_key].count += stats->count;
225 parent_key_to_stats[parent_key].time += stats->time;
226 }
227
228 for (auto [parent_key, stats] : parent_key_to_stats) {
229 auto& normalized_entry = entry_map[parent_key];
230 normalized_entry.key = entry->key;
231 normalized_entry.parent = parent_key;
232 normalized_entry.add_thread_time_sample(stats);
233 }
234 }
235
236 return result;
237}
238
245
247{
249 entry->key = key;
250 entries.push_back(entry);
251}
252
254{
257 buf->tid = tid;
258 // Reserve up front: avoids reallocation churn when a worker thread emits
259 // tens of thousands of events over a full Chonk prove.
260 buf->events.reserve(1U << 14U);
261 ThreadEventBuffer& result = *buf;
262 thread_event_buffers.push_back(std::move(buf));
263 return result;
264}
265
267{
268 static thread_local ThreadEventBuffer* tl_buf = nullptr;
269 if (tl_buf == nullptr) {
270 const uint64_t tid = static_cast<uint64_t>(std::hash<std::thread::id>{}(std::this_thread::get_id()));
272 }
273 return *tl_buf;
274}
275
277{
278 std::cout << "GlobalBenchStatsContainer::print() START"
279 << "\n";
280 for (const std::shared_ptr<TimeStatsEntry>& entry : entries) {
281 print_stats_recursive(entry->key, &entry->count, "");
282 }
283 std::cout << "GlobalBenchStatsContainer::print() END"
284 << "\n";
285}
286
288 const TimeStats* stats,
289 const std::string& indent) const
290{
291 if (stats->count > 0) {
292 std::cout << indent << key << "\t" << stats->count << "\n";
293 }
294 if (stats->time > 0) {
295 std::cout << indent << key << "(t)\t" << static_cast<double>(stats->time) / 1000000.0 << "ms\n";
296 }
297
298 if (stats->next != nullptr) {
299 print_stats_recursive(key, stats->next.get(), indent + " ");
300 }
301}
302
303void GlobalBenchStatsContainer::print_aggregate_counts(std::ostream& os, size_t indent) const
304{
305 os << '{';
306 bool first = true;
307 for (const auto& [key, entry_map] : aggregate()) {
308 // Loop for a flattened view
309 uint64_t time = 0;
310 for (auto& [parent_key, entry] : entry_map) {
311 time += entry.time_max;
312 }
313
314 if (!first) {
315 os << ',';
316 }
317 if (indent > 0) {
318 os << "\n" << std::string(indent, ' ');
319 }
320 os << '"' << key << "\":" << time;
321 first = false;
322 }
323 if (indent > 0) {
324 os << "\n";
325 }
326 os << '}' << "\n";
327}
328
329// Serializable structure for a single benchmark entry (msgpack-compatible)
341
343{
345
346 // Convert AggregateData to a msgpack-serializable map
348
349 for (const auto& [key, parent_map] : data) {
351
352 for (const auto& [parent_key, entry] : parent_map) {
353 // Skip _root entries that have zero time (never called at root level)
354 if (parent_key.empty() && entry.time == 0) {
355 continue;
356 }
357
358 entries.push_back(SerializableEntry{ .parent = parent_key.empty() ? "_root" : std::string(parent_key),
359 .time = entry.time,
360 .time_max = entry.time_max,
361 .time_mean = entry.time_mean,
362 .time_stddev = entry.get_std_dev(),
363 .count = entry.count,
364 .num_threads = entry.num_threads });
365 }
366
367 // Only add functions that have non-empty entries
368 if (!entries.empty()) {
369 serializable_data[std::string(key)] = entries;
370 }
371 }
372
373 // Use msgpack to serialize and convert to JSON
374 msgpack::sbuffer buffer;
375 msgpack::pack(buffer, serializable_data);
376 msgpack::object_handle oh = msgpack::unpack(buffer.data(), buffer.size());
377 os << oh.get() << std::endl;
378}
379
380namespace {
381// Emit a single Chrome Trace Event Format "X" (complete) event.
382// ts/dur are in microseconds (the Chrome Trace convention), with 3 digits after the
383// decimal so we don't lose nanosecond precision.
384void emit_x_event(
385 std::ostream& os, OperationKey name, OperationKey parent, double ts_us, double dur_us, uint64_t tid, bool& first)
386{
387 if (!first) {
388 os << ',';
389 }
390 first = false;
391 os << "\n {";
392 os << "\"name\":\"" << name << "\"";
393 os << ",\"cat\":\"bb\"";
394 os << ",\"ph\":\"X\"";
395 os << ",\"pid\":0";
396 os << ",\"tid\":" << tid;
397 os << ",\"ts\":" << std::fixed << std::setprecision(3) << ts_us;
398 os << ",\"dur\":" << std::fixed << std::setprecision(3) << dur_us;
399 if (!parent.empty() && parent != "_root") {
400 os << ",\"args\":{\"parent\":\"" << parent << "\"}";
401 }
402 os << "}";
403}
404} // namespace
405
407{
408 std::unique_lock<std::mutex> lock(const_cast<std::mutex&>(event_mutex));
409
410 // Find the earliest start across all threads so the timeline begins at ts=0 —
411 // absolute ns-since-epoch values are huge and make Perfetto's axis awkward.
412 uint64_t min_ts = UINT64_MAX;
413 for (const auto& buf : thread_event_buffers) {
414 for (const PerCallEvent& e : buf->events) {
415 if (e.ts_ns < min_ts) {
416 min_ts = e.ts_ns;
417 }
418 }
419 }
420 if (min_ts == UINT64_MAX) {
421 min_ts = 0;
422 }
423
424 os << "{\n \"displayTimeUnit\":\"us\",\n \"traceEvents\":[";
425 bool first = true;
426
427 // Remap each thread's raw pthread-hash tid to a small integer. Native
428 // std::hash<thread::id> returns values > 2^32 (pthread stack addresses),
429 // which Perfetto's Chrome-trace loader can collapse onto a single track.
430 // Small integer tids avoid that and also match the WASM trace style.
432 uint64_t main_raw_tid = UINT64_MAX;
433 uint64_t main_start = UINT64_MAX;
434 for (const auto& buf : thread_event_buffers) {
435 if (!buf->events.empty() && buf->events.front().ts_ns < main_start) {
436 main_start = buf->events.front().ts_ns;
437 main_raw_tid = buf->tid;
438 }
439 }
440 // Main gets tid=1 (sorted first). Workers get 2..N.
441 tid_remap[main_raw_tid] = 1;
442 uint64_t next_worker_tid = 2;
443 for (const auto& buf : thread_event_buffers) {
444 if (buf->tid == main_raw_tid) {
445 continue;
446 }
447 tid_remap[buf->tid] = next_worker_tid++;
448 }
449
450 if (!first) {
451 os << ',';
452 }
453 first = false;
454 os << "\n {\"name\":\"process_name\",\"ph\":\"M\",\"pid\":0,\"tid\":0,\"args\":{\"name\":\"bb\"}}";
455 for (const auto& buf : thread_event_buffers) {
456 uint64_t small_tid = tid_remap[buf->tid];
457 std::string label = (buf->tid == main_raw_tid) ? "main" : ("worker-" + std::to_string(small_tid - 1));
458 os << ",\n {\"name\":\"thread_name\",\"ph\":\"M\",\"pid\":0,\"tid\":" << small_tid
459 << ",\"args\":{\"name\":\"" << label << "\"}}";
460 os << ",\n {\"name\":\"thread_sort_index\",\"ph\":\"M\",\"pid\":0,\"tid\":" << small_tid
461 << ",\"args\":{\"sort_index\":" << small_tid << "}}";
462 }
463
464 for (const auto& buf : thread_event_buffers) {
465 for (const PerCallEvent& e : buf->events) {
466 double ts_us = static_cast<double>(e.ts_ns - min_ts) / 1000.0;
467 double dur_us = static_cast<double>(e.dur_ns) / 1000.0;
468 emit_x_event(os, e.name, e.parent, ts_us, dur_us, tid_remap[e.tid], first);
469 }
470 }
471 os << "\n ]\n}\n";
472}
473
475{
477
478 os << "{\n \"displayTimeUnit\":\"us\",\n \"traceEvents\":[";
479 bool first = true;
480
481 // Map each (key, parent) aggregate entry to a synthesized ph:"X" block. We DFS from
482 // roots (empty parent), assigning ts sequentially so children are contained within
483 // their parent's [ts, ts+dur] range — that makes Chrome/Perfetto nest them visually.
484 // Using tid=0 across the whole synthesized trace; per-thread layout would double-count
485 // time since aggregates already sum across threads.
486 std::function<void(OperationKey, OperationKey, uint64_t)> emit_tree;
487 emit_tree = [&](OperationKey key, OperationKey parent_key, uint64_t ts_start_ns) -> void {
488 auto it = data.find(key);
489 if (it == data.end()) {
490 return;
491 }
492 const AggregateEntry* self = nullptr;
493 for (const auto& [p, entry] : it->second) {
494 if (p == parent_key) {
495 self = &entry;
496 break;
497 }
498 }
499 if (self == nullptr || self->time_max == 0) {
500 return;
501 }
502
503 emit_x_event(os,
504 key,
505 parent_key.empty() ? OperationKey{ "_root" } : parent_key,
506 static_cast<double>(ts_start_ns) / 1000.0,
507 static_cast<double>(self->time_max) / 1000.0,
508 /*tid=*/0,
509 first);
510
511 // Collect children: any entry whose parent_map contains `key`.
513 for (const auto& [child_key, pmap] : data) {
514 auto cit = pmap.find(key);
515 if (cit != pmap.end() && cit->second.time_max > 0) {
516 children.emplace_back(child_key, cit->second.time_max);
517 }
518 }
519 // Largest children first so big blocks are visually dominant.
520 std::sort(children.begin(), children.end(), [](const auto& a, const auto& b) { return a.second > b.second; });
521
522 uint64_t child_offset = 0;
523 for (const auto& [child_key, child_dur] : children) {
524 emit_tree(child_key, key, ts_start_ns + child_offset);
525 child_offset += child_dur;
526 }
527 };
528
529 // Roots: any key that has an empty-parent aggregate entry with non-zero time.
531 for (const auto& [key, pmap] : data) {
532 auto pit = pmap.find("");
533 if (pit != pmap.end() && pit->second.time_max > 0) {
534 roots.emplace_back(key, pit->second.time_max);
535 }
536 }
537 std::sort(roots.begin(), roots.end(), [](const auto& a, const auto& b) { return a.second > b.second; });
538
539 uint64_t root_offset = 0;
540 for (const auto& [root_key, root_dur] : roots) {
541 emit_tree(root_key, /*parent_key=*/OperationKey{ "" }, root_offset);
542 root_offset += root_dur;
543 }
544
545 os << "\n ]\n}\n";
546}
547
549{
550 AggregateData aggregated = aggregate();
551
552 if (aggregated.empty()) {
553 os << "No benchmark data collected\n";
554 return;
555 }
556
557 // Print header
558 os << "\n";
559 print_separator(os, true);
560 os << Colors::BOLD << " Benchmark Results" << Colors::RESET << "\n";
561 print_separator(os, true);
562
564 std::set<OperationKey> printed_in_detail;
565 for (auto& [key, entry_map] : aggregated) {
566 for (auto& [parent_key, entry] : entry_map) {
567 if (entry.count > 0) {
568 keys_to_parents[key].insert(parent_key);
569 }
570 }
571 }
572
573 // Helper function to print a stat line with tree drawing
574 auto print_entry = [&](const AggregateEntry& entry, size_t indent_level, bool is_last, uint64_t parent_time) {
575 std::string indent(indent_level * 2, ' ');
576 std::string prefix = (indent_level == 0) ? "" : (is_last ? "└─ " : "├─ ");
577
578 // Use exactly 80 characters for function name without indent
579 const size_t name_width = 80;
580 std::string display_name = std::string(entry.key);
581 if (display_name.length() > name_width) {
582 display_name = display_name.substr(0, name_width - 3) + "...";
583 }
584
585 double time_ms = static_cast<double>(entry.time_max) / 1000000.0;
586 auto colors = get_time_colors(time_ms);
587
588 // Print indent + prefix + name (exactly 80 chars) + time/percentage/calls
589 os << indent << prefix << colors.name_color;
590 if (time_ms >= 1000.0 && colors.name_color == Colors::BOLD) {
591 os << Colors::YELLOW; // Special case: bold yellow for >= 1s
592 }
593 os << std::left << std::setw(static_cast<int>(name_width)) << display_name << Colors::RESET;
594
595 // Print time if available with aligned section including indent level
596 if (entry.time_max > 0) {
597 if (time_ms < 100.0) {
598 // Minimal format for <100ms: only [level] and percentage, no time display
599 std::ostringstream minimal_oss;
600 minimal_oss << Colors::MAGENTA << "[" << indent_level << "] " << Colors::RESET;
601 minimal_oss << format_percentage_section(time_ms, static_cast<double>(parent_time), indent_level);
602 minimal_oss << " " << std::setw(10) << ""; // Add spacing to replace where time would be
603 os << " " << colors.time_color << std::setw(40) << std::left << minimal_oss.str() << Colors::RESET;
604 } else {
605 std::string aligned_section =
606 format_aligned_section(time_ms, static_cast<double>(parent_time), entry.count, indent_level);
607 os << " " << colors.time_color << std::setw(40) << std::left << aligned_section << Colors::RESET;
608 if (entry.num_threads > 1) {
609 double mean_ms = entry.time_mean / 1000000.0;
610 double stddev_percentage = floor(entry.get_std_dev() * 100 / entry.time_mean);
611 os << " " << entry.num_threads << " threads " << mean_ms << "ms average " << stddev_percentage
612 << "% stddev";
613 }
614 }
615 }
616
617 os << "\n";
618 };
619
620 // Recursive function to print hierarchy
621 std::function<void(OperationKey, size_t, bool, uint64_t, OperationKey)> print_hierarchy;
622 print_hierarchy = [&](OperationKey key,
623 size_t indent_level,
624 bool is_last,
625 uint64_t parent_time,
626 OperationKey current_parent) -> void {
627 auto it = aggregated.find(key);
628 if (it == aggregated.end()) {
629 return;
630 }
631
632 // Find the entry with the specific parent context
633 const AggregateEntry* entry_to_print = nullptr;
634 for (const auto& [parent_key, entry] : it->second) {
635 if ((indent_level == 0 && parent_key.empty()) || (indent_level > 0 && parent_key == current_parent)) {
636 entry_to_print = &entry;
637 break;
638 }
639 }
640
641 if (!entry_to_print) {
642 return;
643 }
644
645 // Print this entry
646 print_entry(*entry_to_print, indent_level, is_last, parent_time);
647
648 // Find and print children - operations that have this key as parent (only those with meaningful time >= 0.5ms)
650 if (!printed_in_detail.contains(key)) {
651 for (const auto& [child_key, parent_map] : aggregated) {
652 for (const auto& [parent_key, entry] : parent_map) {
653 if (parent_key == key && entry.time_max >= 500000) { // 0.5ms in nanoseconds
654 children.push_back(child_key);
655 break;
656 }
657 }
658 }
659 printed_in_detail.insert(key);
660 }
661
662 // Sort children by their time in THIS parent context
663 std::ranges::sort(children, [&](OperationKey a, OperationKey b) {
664 uint64_t time_a = 0;
665 uint64_t time_b = 0;
666 if (auto it = aggregated.find(a); it != aggregated.end()) {
667 for (const auto& [parent_key, entry] : it->second) {
668 if (parent_key == key) {
669 time_a = entry.time_max;
670 break;
671 }
672 }
673 }
674 if (auto it = aggregated.find(b); it != aggregated.end()) {
675 for (const auto& [parent_key, entry] : it->second) {
676 if (parent_key == key) {
677 time_b = entry.time_max;
678 break;
679 }
680 }
681 }
682 return time_a > time_b;
683 });
684
685 // Calculate time spent in children and add "(other)" if >5% unaccounted
686 uint64_t children_total_time = 0;
687 for (const auto& child_key : children) {
688 if (auto it = aggregated.find(child_key); it != aggregated.end()) {
689 for (const auto& [parent_key, entry] : it->second) {
690 if (parent_key == key && entry.time_max >= 500000) { // 0.5ms in nanoseconds
691 children_total_time += entry.time_max;
692 }
693 }
694 }
695 }
696 uint64_t parent_total_time = entry_to_print->time_max;
697 bool should_add_other = false;
698 if (!children.empty() && parent_total_time > 0 && children_total_time < parent_total_time) {
699 uint64_t unaccounted = parent_total_time - children_total_time;
700 double percentage = (static_cast<double>(unaccounted) / static_cast<double>(parent_total_time)) * 100.0;
701 should_add_other = percentage > 5.0 && unaccounted > 0;
702 }
703 uint64_t other_time = should_add_other ? (parent_total_time - children_total_time) : 0;
704
705 if (!children.empty() && keys_to_parents[key].size() > 1) {
706 os << std::string(indent_level * 2, ' ') << " ├─ NOTE: Shared children. Can add up to > 100%.\n";
707 }
708
709 // Print children
710 for (size_t i = 0; i < children.size(); ++i) {
711 bool is_last_child = (i == children.size() - 1) && !should_add_other;
712 print_hierarchy(children[i], indent_level + 1, is_last_child, entry_to_print->time, key);
713 }
714
715 // Print "(other)" category if significant unaccounted time exists
716 if (should_add_other && keys_to_parents[key].size() <= 1) {
717 AggregateEntry other_entry;
718 other_entry.key = "(other)";
719 other_entry.time = other_time;
720 other_entry.time_max = other_time;
721 other_entry.count = 1;
722 other_entry.num_threads = 1;
723 print_entry(other_entry, indent_level + 1, true, parent_total_time); // always last
724 }
725 };
726
727 // Find root entries (those that ONLY have empty parent key and significant time)
729 for (const auto& [key, parent_map] : aggregated) {
730 auto empty_parent_it = parent_map.find("");
731 if (empty_parent_it != parent_map.end() && empty_parent_it->second.time > 0) {
732 roots.push_back(key);
733 }
734 }
735
736 // Sort roots by time (descending)
737 std::ranges::sort(roots, [&](OperationKey a, OperationKey b) {
738 uint64_t time_a = 0;
739 uint64_t time_b = 0;
740 if (auto it_a = aggregated.find(a); it_a != aggregated.end()) {
741 if (auto parent_it = it_a->second.find(""); parent_it != it_a->second.end()) {
742 time_a = parent_it->second.time_max;
743 }
744 }
745 if (auto it_b = aggregated.find(b); it_b != aggregated.end()) {
746 if (auto parent_it = it_b->second.find(""); parent_it != it_b->second.end()) {
747 time_b = parent_it->second.time_max;
748 }
749 }
750 return time_a > time_b;
751 });
752
753 // Print hierarchies starting from roots
754 for (size_t i = 0; i < roots.size(); ++i) {
755 print_hierarchy(roots[i], 0, i == roots.size() - 1, 0, "");
756 }
757
758 // Print summary
759 print_separator(os, false);
760
761 // Calculate totals from root entries
762 std::set<OperationKey> unique_funcs;
763 for (const auto& [key, _] : aggregated) {
764 unique_funcs.insert(key);
765 }
766 size_t unique_functions_count = unique_funcs.size();
767
768 uint64_t shared_count = 0;
769 for (const auto& [key, parents] : keys_to_parents) {
770 if (parents.size() > 1) {
771 shared_count++;
772 }
773 }
774
775 uint64_t total_time = 0;
776 for (const auto& [_, parent_map] : aggregated) {
777 if (auto it = parent_map.find(""); it != parent_map.end()) {
778 total_time = std::max(total_time, it->second.time_max);
779 }
780 }
781
782 uint64_t total_calls = 0;
783 for (const auto& [_, parent_map] : aggregated) {
784 for (const auto& [__, entry] : parent_map) {
785 total_calls += entry.count;
786 }
787 }
788
789 double total_time_ms = static_cast<double>(total_time) / 1000000.0;
790
791 os << " " << Colors::BOLD << "Total: " << Colors::RESET << Colors::MAGENTA << unique_functions_count
792 << " functions" << Colors::RESET;
793 if (shared_count > 0) {
794 os << " (" << Colors::RED << shared_count << " shared" << Colors::RESET << ")";
795 }
796 os << ", " << Colors::GREEN << total_calls << " measurements" << Colors::RESET << ", " << Colors::YELLOW;
797 if (total_time_ms >= 1000.0) {
798 os << std::fixed << std::setprecision(2) << (total_time_ms / 1000.0) << " seconds";
799 } else {
800 os << std::fixed << std::setprecision(2) << total_time_ms << " ms";
801 }
802 os << Colors::RESET;
803
804 os << "\n";
805 print_separator(os, true);
806 os << "\n";
807}
808
810{
813 entry->count = TimeStats();
814 }
816 for (const auto& buf : thread_event_buffers) {
817 buf->events.clear();
818 }
819}
820
821// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
823
825 : parent(nullptr)
826 , stats(entry)
827 , time(0)
828{
829 if (stats == nullptr) {
830 return;
831 }
832 // Track the current parent context
834 auto now = std::chrono::high_resolution_clock::now();
835 auto now_ns = std::chrono::time_point_cast<std::chrono::nanoseconds>(now);
836 time = static_cast<uint64_t>(now_ns.time_since_epoch().count());
837}
839{
840 if (stats == nullptr) {
841 return;
842 }
843 auto now = std::chrono::high_resolution_clock::now();
844 auto now_ns = std::chrono::time_point_cast<std::chrono::nanoseconds>(now);
845 uint64_t end_ns = static_cast<uint64_t>(now_ns.time_since_epoch().count());
846 // Add, taking advantage of our parent context
847 stats->count.track(parent, end_ns - time);
848
849 // Per-call event capture for Chrome Trace Event / Perfetto output. Only active when
850 // --trace_out_perfetto was set; otherwise a single relaxed atomic load on the hot path.
853 buf.events.push_back(PerCallEvent{
854 /*name=*/stats->key,
855 /*parent=*/parent != nullptr ? parent->key : OperationKey{ "_root" },
856 /*ts_ns=*/time,
857 /*dur_ns=*/end_ns - time,
858 /*tid=*/buf.tid,
859 });
860 }
861
862 // Unwind to previous parent
864}
865} // namespace bb::detail
866#endif
const std::vector< MemoryValue > data
FF a
FF b
std::unique_ptr< uint8_t[]> buffer
Definition engine.cpp:60
GlobalBenchStatsContainer GLOBAL_BENCH_STATS
Definition bb_bench.cpp:822
ThreadEventBuffer & get_thread_event_buffer()
Definition bb_bench.cpp:266
std::atomic< bool > capture_per_call_events
Definition bb_bench.cpp:177
std::unordered_map< OperationKey, std::map< OperationKey, AggregateEntry > > AggregateData
Definition bb_bench.hpp:84
bool use_bb_bench
Definition bb_bench.cpp:175
std::string_view OperationKey
Definition bb_bench.cpp:179
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
Definition tuple.hpp:13
std::string to_string(bb::avm2::ValueTag tag)
Definition bb_bench.hpp:63
void add_thread_time_sample(const TimeAndCount &stats)
Definition bb_bench.cpp:181
uint64_t time
Definition bb_bench.hpp:67
double time_m2
Definition bb_bench.hpp:75
double time_mean
Definition bb_bench.hpp:70
size_t num_threads
Definition bb_bench.hpp:69
uint64_t time_max
Definition bb_bench.hpp:71
double get_std_dev() const
Definition bb_bench.cpp:198
OperationKey key
Definition bb_bench.hpp:65
uint64_t count
Definition bb_bench.hpp:68
BenchReporter(TimeStatsEntry *entry)
Definition bb_bench.cpp:824
TimeStatsEntry * parent
Definition bb_bench.hpp:227
TimeStatsEntry * stats
Definition bb_bench.hpp:228
void print_stats_recursive(const OperationKey &key, const TimeStats *stats, const std::string &indent) const
Definition bb_bench.cpp:287
void print_aggregate_counts_hierarchical(std::ostream &) const
Definition bb_bench.cpp:548
void serialize_trace_events_json(std::ostream &) const
Definition bb_bench.cpp:406
void print_aggregate_counts(std::ostream &, size_t) const
Definition bb_bench.cpp:303
void serialize_aggregate_data_json(std::ostream &) const
Definition bb_bench.cpp:342
void add_entry(const char *key, const std::shared_ptr< TimeStatsEntry > &entry)
Definition bb_bench.cpp:246
std::vector< std::unique_ptr< ThreadEventBuffer > > thread_event_buffers
Definition bb_bench.hpp:116
std::vector< std::shared_ptr< TimeStatsEntry > > entries
Definition bb_bench.hpp:112
void serialize_aggregate_trace_json(std::ostream &) const
Definition bb_bench.cpp:474
ThreadEventBuffer & register_thread_event_buffer(uint64_t tid)
Definition bb_bench.cpp:253
static thread_local TimeStatsEntry * parent
Definition bb_bench.hpp:109
Definition bb_bench.cpp:330
double time_mean
Definition bb_bench.cpp:334
std::string parent
Definition bb_bench.cpp:331
uint64_t time
Definition bb_bench.cpp:332
double time_stddev
Definition bb_bench.cpp:335
uint64_t time_max
Definition bb_bench.cpp:333
uint64_t num_threads
Definition bb_bench.cpp:337
SERIALIZATION_FIELDS(parent, time, time_max, time_mean, time_stddev, count, num_threads)
uint64_t count
Definition bb_bench.cpp:336
std::vector< PerCallEvent > events
Definition bb_bench.hpp:100
Definition bb_bench.hpp:193
OperationKey key
Definition bb_bench.hpp:194
TimeStats count
Definition bb_bench.hpp:195
void track(TimeStatsEntry *current_parent, uint64_t time_val)
Definition bb_bench.hpp:160
std::unique_ptr< TimeStats > next
Definition bb_bench.hpp:151
#define RED
#define RESET