Barretenberg
The ZK-SNARK library at the core of Aztec
Loading...
Searching...
No Matches
bb_bench.hpp
Go to the documentation of this file.
1
2#pragma once
3
5#include <atomic>
6#include <iostream>
7#include <map>
8#include <memory>
9#include <mutex>
10#include <ostream>
11#include <string_view>
12#include <tracy/Tracy.hpp>
13#include <unordered_map>
14#include <vector>
15
21namespace bb::detail {
22// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
23extern bool use_bb_bench;
24// When true, BenchReporter pushes a {name, parent, ts, dur, tid} record into a per-thread
25// event buffer on every scope exit, for Chrome Trace Event / Perfetto-style output.
26// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
27extern std::atomic<bool> capture_per_call_events;
28
29// Compile-time string
30// See e.g. https://www.reddit.com/r/cpp_questions/comments/pumi9r/does_c20_not_support_string_literals_as_template/
template <std::size_t N> struct OperationLabel {
    // Raw character storage; public so the type can be used as a template argument (see concat()).
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays)
    char value[N];

    // Copies all N characters of the source array into `value`. For a string literal
    // argument this includes the trailing '\0'.
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays)
    constexpr OperationLabel(const char (&str)[N])
    {
        std::size_t idx = 0;
        while (idx < N) {
            value[idx] = str[idx];
            ++idx;
        }
    }

    // Number of stored characters (the full array extent N).
    static constexpr std::size_t size() { return N; }
};
44
45template <OperationLabel op1, OperationLabel op2> constexpr auto concat()
46{
47 // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays)
48 char result_cstr[op1.size() + op2.size() - 1] = {};
49 std::copy(op1.value, op1.value + op1.size() - 1, result_cstr);
50 std::copy(op2.value, op2.value + op2.size(), result_cstr + op1.size() - 1);
51 return OperationLabel{ result_cstr };
52}
53struct TimeStats;
54struct TimeStatsEntry;
55using OperationKey = std::string_view;
56
58 uint64_t time = 0;
59 uint64_t count = 0;
60};
61
62// Normalized benchmark entry - each represents a unique (function, parent) pair
64 // For convenience, even though redundant with map store
67 uint64_t time = 0;
68 uint64_t count = 0;
69 size_t num_threads = 0;
70 double time_mean = 0;
71 uint64_t time_max = 0;
72 double time_stddev = 0;
73
74 // Welford's algorithm state
75 double time_m2 = 0; // sum of squared differences from mean
76
77 void add_thread_time_sample(const TimeAndCount& stats);
78 double get_std_dev() const;
79};
80
81// AggregateData: Result of normalizing benchmark data
82// entries: Key -> ParentKey -> Entry
83// Empty string is used as key if the entry has no parent.
85
86// A single scope entry/exit pair — captured only when capture_per_call_events is true.
87// name and parent are stable string_views into OperationLabel static storage or entry->key.
91 uint64_t ts_ns; // start wall-clock nanoseconds
92 uint64_t dur_ns; // end - start
93 uint64_t tid; // hashed thread id
94};
95
96// Per-thread event buffer. Owned by the global container so serialized traces can safely
97// include events from worker threads that have already exited.
102
103// Access the current thread's event buffer, registering it on first touch.
105
106// Contains all statically known op counts
108 public:
109 static inline thread_local TimeStatsEntry* parent = nullptr;
111 std::mutex mutex;
113 // Protects thread_event_buffers. Separate from `mutex` so serializers can iterate
114 // thread buffers without contending with active threads registering new TimeStatsEntries.
115 std::mutex event_mutex;
117 void print() const;
118 // NOTE: Should be called when other threads aren't active
119 void clear();
120 void add_entry(const char* key, const std::shared_ptr<TimeStatsEntry>& entry);
122 void print_stats_recursive(const OperationKey& key, const TimeStats* stats, const std::string& indent) const;
123 void print_aggregate_counts(std::ostream&, size_t) const;
124 void print_aggregate_counts_hierarchical(std::ostream&) const;
125 void serialize_aggregate_data_json(std::ostream&) const;
126 // Chrome Trace Event Format output for Perfetto / chrome://tracing.
127 // serialize_trace_events_json emits every captured per-call event (requires
128 // capture_per_call_events to have been true during the run).
129 void serialize_trace_events_json(std::ostream&) const;
130 // Synthesizes Chrome Trace Event entries from the aggregate stats — one "X" event per
131 // (name, parent, thread_slot) laid out in DFS order. Lossy about individual call timing
132 // but tiny and works even without per-call capture.
133 void serialize_aggregate_trace_json(std::ostream&) const;
134
135 // Normalize the raw benchmark data into a clean structure for display
136 AggregateData aggregate() const;
137};
138
139// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
141
142// Tracks operation statistics and links them to their immediate parent context.
143// Each stat is associated only with its direct parent, not the full call hierarchy.
144// This allows measuring the direct contribution of nested operations to their parent,
145// but doesn't provide recursive parent-child relationships through the entire call stack.
146struct TimeStats {
148 uint64_t count = 0;
149 uint64_t time = 0;
150 // Used if the parent changes from last call - chains to handle multiple parent contexts
152
153 TimeStats() = default;
// Constructs a node with an explicit parent pointer and already-accumulated count/time.
// track() uses this to snapshot the current node's state before pushing it down the chain.
154 TimeStats(TimeStatsEntry* parent_ptr, uint64_t count_val, uint64_t time_val)
155 : parent(parent_ptr)
156 , count(count_val)
157 , time(time_val)
158 {}
159
160 void track(TimeStatsEntry* current_parent, uint64_t time_val)
161 {
162 // Try to track with current stats if parent matches
163 // Check if 'next' already handles this parent to avoid creating duplicates
164 if (raw_track(current_parent, time_val) || (next && next->raw_track(current_parent, time_val))) {
165 return;
166 }
167 // Create new TimeStats at the front of this linked list.
168 auto new_next = std::make_unique<TimeStats>(parent, count, time);
169 new_next->next = std::move(next);
170 next = std::move(new_next);
171
172 // Reset this node.
173 parent = current_parent;
174 count = 1;
175 time = time_val;
176 }
177
178 private:
179 // Returns true if successfully tracked (parent matches), false otherwise
180 bool raw_track(TimeStatsEntry* expected_parent, uint64_t time_val)
181 {
182 if (parent != expected_parent) {
183 return false;
184 }
185 count++;
186 time += time_val;
187 return true;
188 }
189};
190
191// Each key will appear at most once *per thread*.
192// Each thread has its own count for thread-safety.
197
198// The stat entry associated with a certain label AND a certain thread.
199// These will later be aggregated, and the TimeStats itself contains stat
200// entries for each caller context change (for later summarization).
201template <OperationLabel Op> struct ThreadBenchStats {
202 public:
203 // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
205 {
206 // Workaround for GCC 13 bug with thread_local static inline members in templates
207 static thread_local std::shared_ptr<TimeStatsEntry> stats;
208 return stats;
209 }
210
211 static void init_entry(TimeStatsEntry& entry);
212 // returns null if use_bb_bench not enabled
214 {
215 auto& stats = get_stats();
216 if (bb::detail::use_bb_bench && BB_UNLIKELY(stats == nullptr)) {
218 GLOBAL_BENCH_STATS.add_entry(Op.value, stats);
219 }
220 return stats;
221 }
222};
223
224// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions)
225// no-op if passed null stats
233} // namespace bb::detail
234
235// Define macros. We use (void)0 for the empty ones so that every macro is a statement that needs a semicolon.
// Three mutually exclusive configurations follow:
//   1. TRACY_INSTRUMENTED: the *_TRACY macros open Tracy zones; the BB_BENCH_ONLY* and nesting macros are no-ops.
//   2. __wasm__ without ENABLE_WASM_BENCH: every macro is a no-op.
//   3. otherwise: BB_TRACY* are no-ops and the BB_BENCH* macros create a bb::detail::BenchReporter.
236#ifdef TRACY_INSTRUMENTED
237#define BB_TRACY() ZoneScopedN(__func__)
238#define BB_TRACY_NAME(name) ZoneScopedN(name)
239#define BB_BENCH_TRACY() ZoneScopedN(__func__)
240#define BB_BENCH_TRACY_NAME(name) ZoneScopedN(name)
241#define BB_BENCH_ONLY_NAME(name) (void)0
242#define BB_BENCH_ENABLE_NESTING() (void)0
243#define BB_BENCH_ONLY() (void)0
244#elif defined __wasm__ && !defined ENABLE_WASM_BENCH
245#define BB_TRACY() (void)0
246#define BB_TRACY_NAME(name) (void)0
247#define BB_BENCH_TRACY() (void)0
248#define BB_BENCH_TRACY_NAME(name) (void)0
249#define BB_BENCH_ONLY_NAME(name) (void)0
250#define BB_BENCH_ENABLE_NESTING() (void)0
251#define BB_BENCH_ONLY() (void)0
252#else
253#define BB_TRACY() (void)0
254#define BB_TRACY_NAME(name) (void)0
255#define BB_BENCH_TRACY() BB_BENCH_ONLY_NAME(__func__)
256#define BB_BENCH_TRACY_NAME(name) BB_BENCH_ONLY_NAME(name)
// Declares a local BenchReporter named _bb_bench_reporter, built from the raw pointer returned by
// ThreadBenchStats<name>::ensure_stats().get(). Because it declares a fixed-name local, it can
// appear at most once per scope.
257#define BB_BENCH_ONLY_NAME(name) \
258 bb::detail::BenchReporter _bb_bench_reporter((bb::detail::ThreadBenchStats<name>::ensure_stats().get()))
// If the reporter in scope has non-null stats, makes them the thread-local
// GlobalBenchStatsContainer::parent. Requires _bb_bench_reporter to be in scope.
259#define BB_BENCH_ENABLE_NESTING() \
260 if (_bb_bench_reporter.stats) \
261 bb::detail::GlobalBenchStatsContainer::parent = _bb_bench_reporter.stats
262#define BB_BENCH_ONLY() BB_BENCH_ONLY_NAME(__func__)
263#endif
// Configuration-independent entry points: open the scope (named explicitly or after __func__),
// then enable nesting. Each expands to two statements, so neither can be used as the sole
// body of an unbraced if/for/while.
264#define BB_BENCH_NAME(name) \
265 BB_BENCH_TRACY_NAME(name); \
266 BB_BENCH_ENABLE_NESTING()
267
268#define BB_BENCH() \
269 BB_BENCH_TRACY(); \
270 BB_BENCH_ENABLE_NESTING()
#define BB_UNLIKELY(x)
GlobalBenchStatsContainer GLOBAL_BENCH_STATS
Definition bb_bench.cpp:822
constexpr auto concat()
Definition bb_bench.hpp:45
ThreadEventBuffer & get_thread_event_buffer()
Definition bb_bench.cpp:266
std::atomic< bool > capture_per_call_events
Definition bb_bench.cpp:177
std::unordered_map< OperationKey, std::map< OperationKey, AggregateEntry > > AggregateData
Definition bb_bench.hpp:84
bool use_bb_bench
Definition bb_bench.cpp:175
std::string_view OperationKey
Definition bb_bench.cpp:179
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
Definition tuple.hpp:13
Definition bb_bench.hpp:63
void add_thread_time_sample(const TimeAndCount &stats)
Definition bb_bench.cpp:181
double time_stddev
Definition bb_bench.hpp:72
uint64_t time
Definition bb_bench.hpp:67
double time_m2
Definition bb_bench.hpp:75
double time_mean
Definition bb_bench.hpp:70
OperationKey parent
Definition bb_bench.hpp:66
size_t num_threads
Definition bb_bench.hpp:69
uint64_t time_max
Definition bb_bench.hpp:71
double get_std_dev() const
Definition bb_bench.cpp:198
OperationKey key
Definition bb_bench.hpp:65
uint64_t count
Definition bb_bench.hpp:68
TimeStatsEntry * parent
Definition bb_bench.hpp:227
TimeStatsEntry * stats
Definition bb_bench.hpp:228
void print_stats_recursive(const OperationKey &key, const TimeStats *stats, const std::string &indent) const
Definition bb_bench.cpp:287
void print_aggregate_counts_hierarchical(std::ostream &) const
Definition bb_bench.cpp:548
void serialize_trace_events_json(std::ostream &) const
Definition bb_bench.cpp:406
void print_aggregate_counts(std::ostream &, size_t) const
Definition bb_bench.cpp:303
void serialize_aggregate_data_json(std::ostream &) const
Definition bb_bench.cpp:342
void add_entry(const char *key, const std::shared_ptr< TimeStatsEntry > &entry)
Definition bb_bench.cpp:246
std::vector< std::unique_ptr< ThreadEventBuffer > > thread_event_buffers
Definition bb_bench.hpp:116
std::vector< std::shared_ptr< TimeStatsEntry > > entries
Definition bb_bench.hpp:112
void serialize_aggregate_trace_json(std::ostream &) const
Definition bb_bench.cpp:474
ThreadEventBuffer & register_thread_event_buffer(uint64_t tid)
Definition bb_bench.cpp:253
static thread_local TimeStatsEntry * parent
Definition bb_bench.hpp:109
static constexpr std::size_t size()
Definition bb_bench.hpp:32
constexpr OperationLabel(const char(&str)[N])
Definition bb_bench.hpp:34
static std::shared_ptr< TimeStatsEntry > & get_stats()
Definition bb_bench.hpp:204
static std::shared_ptr< TimeStatsEntry > ensure_stats()
Definition bb_bench.hpp:213
static void init_entry(TimeStatsEntry &entry)
std::vector< PerCallEvent > events
Definition bb_bench.hpp:100
Definition bb_bench.hpp:193
OperationKey key
Definition bb_bench.hpp:194
TimeStats count
Definition bb_bench.hpp:195
bool raw_track(TimeStatsEntry *expected_parent, uint64_t time_val)
Definition bb_bench.hpp:180
TimeStats(TimeStatsEntry *parent_ptr, uint64_t count_val, uint64_t time_val)
Definition bb_bench.hpp:154
void track(TimeStatsEntry *current_parent, uint64_t time_val)
Definition bb_bench.hpp:160
TimeStatsEntry * parent
Definition bb_bench.hpp:147
std::unique_ptr< TimeStats > next
Definition bb_bench.hpp:151