9 parser
= argparse
.ArgumentParser(formatter_class
=argparse
.RawDescriptionHelpFormatter
,
10 description
='A reactor stall backtrace graph analyser.',
12 stall-analyser helps analyze a series of reactor-stall backtraces using a graph.
13 Each node in the graph includes:
14 `addr` - a program address
15 Each link in the graph includes:
16 `total` - the total sum of stalls, in milliseconds
17 of all reactor stalls that pass via this caller/callee link.
18 `count` - number of backtraces going through the link.
20 When printed, the graph is traversed in descending `total` order
21 to emphasize stall backtraces that are frequent and long.
23 Each node in the printed output is preceded with [level#index pct%],
24 where `level` is the level of that node in the graph (0 are root nodes),
25 `index` is the index in the parent node's list of callers/callees, and
26 `pct` is the percantage of this link's `total` time relative to
29 When given an executable, addresses are decoding using `addr2line`
31 parser
.add_argument('--address-threshold', default
='0x100000000',
32 help='Skip common backtrace prefix terminated by one or more addresses greater or equal to the threshold (0=disabled)')
33 parser
.add_argument('-e', '--executable',
34 help='Decode addresses to lines using given executable')
35 parser
.add_argument('-f', '--full-function-names', action
='store_const', const
=True, default
=False,
36 help="When demangling C++ function names, display all information, including the type of the function's parameters. Otherwise, they are omitted (see `c++filt(1) -p`).")
37 parser
.add_argument('-w', '--width', type=int, default
=None,
38 help='Smart trim of long lines to width characters (0=disabled)')
39 parser
.add_argument('-d', '--direction', choices
=['bottom-up', 'top-down'], default
='bottom-up',
40 help='Print graph bottom-up (default, callees first) or top-down (callers first)')
41 parser
.add_argument('-m', '--minimum', type=int, default
=None,
42 help='Process only stalls lasting the given time, in milliseconds, or longer')
43 parser
.add_argument('-b', '--branch-threshold', type=float, default
=0.05,
44 help='Drop branches responsible for less than this threshold relative to the previous level, not global. (default 5%%)')
45 parser
.add_argument('file', nargs
='?',
46 help='File containing reactor stall backtraces. Read from stdin if missing.')
48 args
= parser
.parse_args()
50 resolver
= addr2line
.BacktraceResolver(executable
=args
.executable
, concise
=not args
.full_function_names
) if args
.executable
else None
53 def __init__(self
, addr
:str):
60 return f
"Node({self.addr})"
63 def __init__(self
, node
, t
:int):
68 def __eq__(self
, other
):
69 return self
.total
== other
.total
and self
.count
== other
.count
71 def __ne__(self
, other
):
72 return not (self
== other
)
74 def __lt__(self
, other
):
75 return self
.total
< other
.total
or self
.total
== other
.total
and self
.count
< other
.count
81 def link_caller(self
, t
:int, n
):
82 if n
.addr
in self
.callers
:
83 link
= self
.callers
[n
.addr
]
85 n
.callees
[self
.addr
].add(t
)
87 self
.callers
[n
.addr
] = self
.Link(n
, t
)
88 n
.callees
[self
.addr
] = self
.Link(self
, t
)
91 def unlink_caller(self
, addr
:str):
92 link
= self
.callers
.pop(addr
)
93 link
.node
.callees
.pop(self
.addr
)
95 def link_callee(self
, t
:int, n
):
96 if n
.addr
in self
.callees
:
97 link
= self
.callees
[n
.addr
]
99 n
.callers
[self
.addr
].add(t
)
101 self
.callees
[n
.addr
] = self
.Link(n
, t
)
102 n
.callers
[self
.addr
] = self
.Link(self
, t
)
105 def unlink_callee(self
, addr
:str):
106 link
= self
.callees
.pop(addr
)
107 link
.node
.callers
.pop(self
.addr
)
109 def sorted_links(self
, links
:list, descending
=True):
110 return sorted([l
for l
in links
if l
.node
.addr
], reverse
=descending
)
112 def sorted_callers(self
, descending
=True):
113 return self
.sorted_links(self
.callers
.values(), descending
)
115 def sorted_callees(self
, descending
=True):
116 return self
.sorted_links(self
.callees
.values(), descending
)
120 # Each node in the tree contains:
127 def add(self
, prev
:Node
, t
:int, addr
:str):
128 if addr
in self
.nodes
:
134 if prev
.addr
in self
.head
.callees
:
135 self
.head
.unlink_callee(prev
.addr
)
136 prev
.link_caller(t
, n
)
137 if addr
in self
.tail
.callers
:
138 self
.tail
.unlink_caller(addr
)
139 elif not n
.callees
or addr
in self
.tail
.callers
:
140 self
.tail
.link_caller(t
, n
)
143 def add_head(self
, t
:int, n
:Node
):
144 self
.head
.link_callee(t
, n
)
146 def smart_print(self
, lines
:str, width
:int):
147 def _print(l
:str, width
:int):
148 if not width
or len(l
) <= width
:
156 w
= width
- len(sfx
) - 3
161 print(f
"{pfx}...{sfx}")
162 for l
in lines
.splitlines():
166 def print_graph(self
, direction
:str):
167 top_down
= (direction
== 'top-down')
169 This graph is printed in {direction} order, where {'callers' if top_down else 'callees'} are printed first.
170 Use --direction={'bottom-up' if top_down else 'top-down'} to print {'callees' if top_down else 'callers'} first.
172 [level#index/out_of pct%] below denotes:
173 level - nesting level in the graph
174 index - index of node among to its siblings
175 out_of - number of siblings
176 pct - percentage of total stall time of this call relative to its siblings
180 for k
in varargs
.keys():
182 opt
= re
.sub('_', '-', k
)
185 elif not isinstance(val
, bool):
186 clopts
+= f
" --{opt}={val}"
188 clopts
+= f
" --{opt}"
189 print(f
"Command line options:{clopts}\n")
191 def _prefix(prefix_list
:list):
193 for p
in prefix_list
:
197 def _recursive_print_graph(n
:Node
, total
:int=0, count
:int=0, level
:int=-1, idx
:int=0, out_of
:int=0, rel
:float=1.0, prefix_list
=[], skip_stats
=False):
200 avg
= round(total
/ count
) if count
else 0
201 prefix
= _prefix(prefix_list
)
202 p
= '+' if idx
== 1 or idx
== out_of
else '|'
204 l
= f
"[{level}#{idx}/{out_of} {round(100*rel)}%]"
205 cont_indent
= len(l
) + 1
207 l
= f
"{' ' * (len(l)-2)} -"
210 stats
= f
" total={total} count={count} avg={avg}"
211 l
= f
"{prefix}{p}{l} addr={n.addr}{stats}"
214 lines
= resolver
.resolve_address(n
.addr
).splitlines()
217 if li
.startswith("??"):
220 l
+= f
":\n{prefix}{p}{' '*cont_indent}{li.strip()}"
226 l
+= f
"{prefix}{p}{' '*cont_indent}{li.strip()}\n"
227 width
= args
.width
or 0
228 self
.smart_print(l
, width
)
230 print(f
"{prefix}-> continued at addr={n.addr} above")
233 next
= n
.sorted_callees() if top_down
else n
.sorted_callers()
237 if level
>= 0 and len(next
) == 1 and link
.total
== total
and link
.count
== count
:
238 _recursive_print_graph(link
.node
, link
.total
, link
.count
, level
, idx
, out_of
, rel
, prefix_list
, skip_stats
=True)
240 total
= sum(link
.total
for link
in next
)
241 next_prefix_list
= prefix_list
+ ["| " if idx
< out_of
else " "] if level
>= 0 else []
248 rel
= link
.total
/ total
249 if rel
< args
.branch_threshold
:
252 omitted_total
+= link
.total
253 omitted_count
+= link
.count
255 _recursive_print_graph(link
.node
, link
.total
, link
.count
, level
+ 1, i
, last_idx
, rel
, next_prefix_list
)
258 prefix
= _prefix(next_prefix_list
)
260 rel
= omitted_total
/ total
261 avg
= round(omitted_total
/ omitted_count
) if count
else 0
262 l
= f
"[{level+1}#{omitted_idx}/{last_idx} {round(100*rel)}%]"
263 print(f
"{prefix}{p}{l} {last_idx - omitted_idx + 1} more branches total={omitted_total} count={omitted_count} avg={avg}")
265 r
= self
.head
if top_down
else self
.tail
266 _recursive_print_graph(r
)
270 # process each backtrace and insert it to the tree
272 # The backtraces are assumed to be in bottom-up order, i.e.
273 # the first address indicates the innermost frame and the last
274 # address is in the outermost, in calling order.
276 # This helps identifying closely related reactor stalls
277 # where a code path that stalls may be called from multiple
279 def process_graph(t
: int, trace
: list[str]):
282 n
= graph
.add(n
, t
, addr
)
285 address_threshold
= int(args
.address_threshold
, 0)
288 def print_stats(tally
:dict, tmin
):
299 for t
in sorted(tally
.keys()):
301 data
.append((t
, count
))
302 total_time
+= t
* count
309 processed_count
+= count
311 for (t
, count
) in data
:
312 running_count
+= count
313 if median
is None and running_count
>= total_count
/ 2:
315 elif p95
is None and running_count
>= (total_count
* 95) / 100:
317 elif p99
is None and running_count
>= (total_count
* 99) / 100:
319 elif p999
is None and running_count
>= (total_count
* 999) / 1000:
321 print(f
"Processed {total_count} stalls lasting a total of {total_time} milliseconds.")
323 print(f
"Of which, {processed_count} lasted {tmin} milliseconds or longer.")
324 avg_time
= total_time
/ total_count
if total_count
else 0
325 print(f
"min={min_time} avg={avg_time:.1f} median={median} p95={p95} p99={p99} p999={p999} max={max_time}")
327 input = open(args
.file) if args
.file else sys
.stdin
329 comment
= re
.compile('^\s*#')
330 pattern
= re
.compile('Reactor stall')
332 if comment
.search(s
) or not pattern
.search(s
):
336 for i
in range(0, len(trace
)):
337 if trace
[i
] == 'Reactor':
341 tally
[t
] = tally
.pop(t
, 0) + 1
342 trace
= trace
[i
+ 6:]
343 # The address_threshold typically indicates a library call
344 # and the backtrace up-to and including it are usually of
345 # no interest as they all contain the stall backtrace geneneration code, e.g.:
346 # seastar::internal::cpu_stall_detector::generate_trace
347 # void seastar::backtrace<seastar::backtrace_buffer::append_backtrace_oneline()::{lambda(seastar::frame)#1}>(seastar::backt>
348 # (inlined by) seastar::backtrace_buffer::append_backtrace_oneline() at ./build/release/seastar/./seastar/src/core/reactor.cc:771
349 # (inlined by) seastar::print_with_backtrace(seastar::backtrace_buffer&, bool) at ./build/release/seastar/./seastar/src/core/reactor.cc>
350 # seastar::internal::cpu_stall_detector::generate_trace() at ./build/release/seastar/./seastar/src/core/reactor.cc:1257
351 # seastar::internal::cpu_stall_detector::maybe_report() at ./build/release/seastar/./seastar/src/core/reactor.cc:1103
352 # (inlined by) seastar::internal::cpu_stall_detector::on_signal() at ./build/release/seastar/./seastar/src/core/reactor.cc:1117
353 # (inlined by) seastar::reactor::block_notifier(int) at ./build/release/seastar/./seastar/src/core/reactor.cc:1240
355 if address_threshold
:
356 for i
in range(0, len(trace
)):
357 if int(trace
[i
], 0) >= address_threshold
:
358 while int(trace
[i
], 0) >= address_threshold
:
362 tmin
= args
.minimum
or 0
364 process_graph(t
, trace
)
367 print_stats(tally
, tmin
)
368 graph
.print_graph(args
.direction
)
369 except BrokenPipeError
: