]>
Commit | Line | Data |
---|---|---|
f9bc9e65 JF |
1 | #!/usr/bin/python |
2 | # | |
3 | # top-like utility for displaying kvm statistics | |
4 | # | |
5 | # Copyright 2006-2008 Qumranet Technologies | |
6 | # Copyright 2008-2011 Red Hat, Inc. | |
7 | # | |
8 | # Authors: | |
9 | # Avi Kivity <avi@redhat.com> | |
10 | # | |
11 | # This work is licensed under the terms of the GNU GPL, version 2. See | |
12 | # the COPYING file in the top-level directory. | |
fabc7128 JF |
13 | """The kvm_stat module outputs statistics about running KVM VMs |
14 | ||
15 | Three different ways of output formatting are available: | |
16 | - as a top-like text ui | |
17 | - in a key -> value format | |
18 | - in an all keys, all values format | |
19 | ||
20 | The data is sampled from the KVM's debugfs entries and its perf events. | |
21 | """ | |
f9bc9e65 JF |
22 | |
23 | import curses | |
24 | import sys | |
25 | import os | |
26 | import time | |
27 | import optparse | |
28 | import ctypes | |
29 | import fcntl | |
30 | import resource | |
31 | import struct | |
32 | import re | |
33 | from collections import defaultdict | |
f9bc9e65 JF |
34 | |
# Intel VMX exit reason name -> exit reason number.  Used as the value
# table for the 'exit_reason' filter of the kvm_exit tracepoint.
VMX_EXIT_REASONS = {
    'EXCEPTION_NMI': 0,
    'EXTERNAL_INTERRUPT': 1,
    'TRIPLE_FAULT': 2,
    'PENDING_INTERRUPT': 7,
    'NMI_WINDOW': 8,
    'TASK_SWITCH': 9,
    'CPUID': 10,
    'HLT': 12,
    'INVLPG': 14,
    'RDPMC': 15,
    'RDTSC': 16,
    'VMCALL': 18,
    'VMCLEAR': 19,
    'VMLAUNCH': 20,
    'VMPTRLD': 21,
    'VMPTRST': 22,
    'VMREAD': 23,
    'VMRESUME': 24,
    'VMWRITE': 25,
    'VMOFF': 26,
    'VMON': 27,
    'CR_ACCESS': 28,
    'DR_ACCESS': 29,
    'IO_INSTRUCTION': 30,
    'MSR_READ': 31,
    'MSR_WRITE': 32,
    'INVALID_STATE': 33,
    'MWAIT_INSTRUCTION': 36,
    'MONITOR_INSTRUCTION': 39,
    'PAUSE_INSTRUCTION': 40,
    'MCE_DURING_VMENTRY': 41,
    'TPR_BELOW_THRESHOLD': 43,
    'APIC_ACCESS': 44,
    'EPT_VIOLATION': 48,
    'EPT_MISCONFIG': 49,
    'WBINVD': 54,
    'XSETBV': 55,
    'APIC_WRITE': 56,
    'INVPCID': 58,
}
76 | ||
# AMD SVM exit code name -> exit code.  Used as the value table for the
# 'exit_reason' filter of the kvm_exit tracepoint.
SVM_EXIT_REASONS = {
    'READ_CR0': 0x000,
    'READ_CR3': 0x003,
    'READ_CR4': 0x004,
    'READ_CR8': 0x008,
    'WRITE_CR0': 0x010,
    'WRITE_CR3': 0x013,
    'WRITE_CR4': 0x014,
    'WRITE_CR8': 0x018,
    'READ_DR0': 0x020,
    'READ_DR1': 0x021,
    'READ_DR2': 0x022,
    'READ_DR3': 0x023,
    'READ_DR4': 0x024,
    'READ_DR5': 0x025,
    'READ_DR6': 0x026,
    'READ_DR7': 0x027,
    'WRITE_DR0': 0x030,
    'WRITE_DR1': 0x031,
    'WRITE_DR2': 0x032,
    'WRITE_DR3': 0x033,
    'WRITE_DR4': 0x034,
    'WRITE_DR5': 0x035,
    'WRITE_DR6': 0x036,
    'WRITE_DR7': 0x037,
    'EXCP_BASE': 0x040,
    'INTR': 0x060,
    'NMI': 0x061,
    'SMI': 0x062,
    'INIT': 0x063,
    'VINTR': 0x064,
    'CR0_SEL_WRITE': 0x065,
    'IDTR_READ': 0x066,
    'GDTR_READ': 0x067,
    'LDTR_READ': 0x068,
    'TR_READ': 0x069,
    'IDTR_WRITE': 0x06a,
    'GDTR_WRITE': 0x06b,
    'LDTR_WRITE': 0x06c,
    'TR_WRITE': 0x06d,
    'RDTSC': 0x06e,
    'RDPMC': 0x06f,
    'PUSHF': 0x070,
    'POPF': 0x071,
    'CPUID': 0x072,
    'RSM': 0x073,
    'IRET': 0x074,
    'SWINT': 0x075,
    'INVD': 0x076,
    'PAUSE': 0x077,
    'HLT': 0x078,
    'INVLPG': 0x079,
    'INVLPGA': 0x07a,
    'IOIO': 0x07b,
    'MSR': 0x07c,
    'TASK_SWITCH': 0x07d,
    'FERR_FREEZE': 0x07e,
    'SHUTDOWN': 0x07f,
    'VMRUN': 0x080,
    'VMMCALL': 0x081,
    'VMLOAD': 0x082,
    'VMSAVE': 0x083,
    'STGI': 0x084,
    'CLGI': 0x085,
    'SKINIT': 0x086,
    'RDTSCP': 0x087,
    'ICEBP': 0x088,
    'WBINVD': 0x089,
    'MONITOR': 0x08a,
    'MWAIT': 0x08b,
    'MWAIT_COND': 0x08c,
    'XSETBV': 0x08d,
    'NPF': 0x400,
}
151 | ||
# EC definition of HSR (from arch/arm64/include/asm/kvm_arm.h).
# Exception class name -> exception class value; this is the exit
# reason table used on aarch64.
AARCH64_EXIT_REASONS = {
    'UNKNOWN': 0x00,
    'WFI': 0x01,
    'CP15_32': 0x03,
    'CP15_64': 0x04,
    'CP14_MR': 0x05,
    'CP14_LS': 0x06,
    'FP_ASIMD': 0x07,
    'CP10_ID': 0x08,
    'CP14_64': 0x0C,
    'ILL_ISS': 0x0E,
    'SVC32': 0x11,
    'HVC32': 0x12,
    'SMC32': 0x13,
    'SVC64': 0x15,
    'HVC64': 0x16,
    'SMC64': 0x17,
    'SYS64': 0x18,
    'IABT': 0x20,
    'IABT_HYP': 0x21,
    'PC_ALIGN': 0x22,
    'DABT': 0x24,
    'DABT_HYP': 0x25,
    'SP_ALIGN': 0x26,
    'FP_EXC32': 0x28,
    'FP_EXC64': 0x2C,
    'SERROR': 0x2F,
    'BREAKPT': 0x30,
    'BREAKPT_HYP': 0x31,
    'SOFTSTP': 0x32,
    'SOFTSTP_HYP': 0x33,
    'WATCHPT': 0x34,
    'WATCHPT_HYP': 0x35,
    'BKPT32': 0x38,
    'VECTOR32': 0x3A,
    'BRK64': 0x3C,
}
190 | ||
# From include/uapi/linux/kvm.h, KVM_EXIT_xxx.
# Userspace exit reason name -> number; value table for the 'reason'
# filter of the kvm_userspace_exit tracepoint.
USERSPACE_EXIT_REASONS = {
    'UNKNOWN': 0,
    'EXCEPTION': 1,
    'IO': 2,
    'HYPERCALL': 3,
    'DEBUG': 4,
    'HLT': 5,
    'MMIO': 6,
    'IRQ_WINDOW_OPEN': 7,
    'SHUTDOWN': 8,
    'FAIL_ENTRY': 9,
    'INTR': 10,
    'SET_TPR': 11,
    'TPR_ACCESS': 12,
    'S390_SIEIC': 13,
    'S390_RESET': 14,
    'DCR': 15,
    'NMI': 16,
    'INTERNAL_ERROR': 17,
    'OSI': 18,
    'PAPR_HCALL': 19,
    'S390_UCONTROL': 20,
    'WATCHDOG': 21,
    'S390_TSCH': 22,
    'EPR': 23,
    'SYSTEM_EVENT': 24,
}
219 | ||
# Default ioctl request numbers issued on perf event file descriptors.
# The architecture classes hand these out as 'ioctl_numbers'.
IOCTL_NUMBERS = {
    'SET_FILTER': 0x40082406,
    'ENABLE': 0x00002400,
    'DISABLE': 0x00002401,
    'RESET': 0x00002403,
}
226 | ||
class Arch(object):
    """Encapsulates global architecture specific data.

    Contains the performance event open syscall and ioctl numbers, as
    well as the VM exit reasons for the architecture it runs on.

    """
    @staticmethod
    def get_arch():
        """Returns the Arch subclass instance matching the running machine.

        The machine name reported by os.uname() selects the PPC, aarch64
        and s390 variants.  Every other machine is treated as x86, where
        the first 'flags' line of /proc/cpuinfo decides between the VMX
        and the SVM exit reason table.

        Returns None if that flags line contains neither 'vmx' nor
        'svm', or if /proc/cpuinfo has no flags line at all.

        """
        machine = os.uname()[4]

        if machine.startswith('ppc'):
            return ArchPPC()
        if machine.startswith('aarch64'):
            return ArchA64()
        if machine.startswith('s390'):
            return ArchS390()

        # X86_64
        # The with statement closes /proc/cpuinfo on every return path.
        with open('/proc/cpuinfo') as cpuinfo:
            for line in cpuinfo:
                if not line.startswith('flags'):
                    continue

                flags = line.split()
                if 'vmx' in flags:
                    return ArchX86(VMX_EXIT_REASONS)
                if 'svm' in flags:
                    return ArchX86(SVM_EXIT_REASONS)
                # Only the first flags line is inspected.
                return None
        return None
256 | ||
class ArchX86(Arch):
    """Architecture data for x86.

    The exit reason table is supplied by the caller, which picks the
    VMX or the SVM variant.

    """
    def __init__(self, exit_reasons):
        self.exit_reasons = exit_reasons
        self.ioctl_numbers = IOCTL_NUMBERS
        # Syscall number handed to syscall() by perf_event_open().
        self.sc_perf_evt_open = 298
262 | ||
class ArchPPC(Arch):
    """Architecture data for PPC.

    PPC uses different ioctl numbers than the defaults and has no exit
    reason table.

    """
    def __init__(self):
        self.sc_perf_evt_open = 319

        # Work on a private copy; assigning into IOCTL_NUMBERS itself
        # would silently change the defaults shared with the other
        # architecture classes.
        self.ioctl_numbers = dict(IOCTL_NUMBERS)
        self.ioctl_numbers['ENABLE'] = 0x20002400
        self.ioctl_numbers['DISABLE'] = 0x20002401
        self.ioctl_numbers['RESET'] = 0x20002403

        # PPC comes in 32 and 64 bit and some generated ioctl
        # numbers depend on the wordsize.
        char_ptr_size = ctypes.sizeof(ctypes.c_char_p)
        self.ioctl_numbers['SET_FILTER'] = (0x80002406 |
                                            (char_ptr_size << 16))
        self.exit_reasons = {}
f9bc9e65 JF |
276 | |
class ArchA64(Arch):
    """Architecture data for aarch64, using the default ioctl numbers."""
    def __init__(self):
        self.exit_reasons = AARCH64_EXIT_REASONS
        self.ioctl_numbers = IOCTL_NUMBERS
        # Syscall number handed to syscall() by perf_event_open().
        self.sc_perf_evt_open = 241
282 | ||
class ArchS390(Arch):
    """Architecture data for s390, which has no exit reason table."""
    def __init__(self):
        # None makes get_filters() skip the kvm_exit filter.
        self.exit_reasons = None
        self.ioctl_numbers = IOCTL_NUMBERS
        # Syscall number handed to syscall() by perf_event_open().
        self.sc_perf_evt_open = 331
288 | ||
# Architecture data for the machine this script runs on, resolved once
# when the module is loaded.
ARCH = Arch.get_arch()
290 | ||
291 | ||
def walkdir(path):
    """Returns the first os.walk() entry for the specified directory.

    The result is the usual 3-tuple of (dirpath, dirnames, filenames)
    describing the directory itself, without descending any further.
    """
    walker = os.walk(path)
    return next(walker)
299 | ||
300 | ||
def parse_int_list(list_string):
    """Returns an int list from a string of comma separated integers and
    integer ranges.

    A range is written 'first-last' and is expanded inclusively, so
    '0-2,5' becomes [0, 1, 2, 5].
    """
    integers = []

    for member in list_string.split(','):
        if '-' in member:
            bounds = member.split('-')
            first = int(bounds[0])
            last = int(bounds[1])
            integers.extend(range(first, last + 1))
        else:
            integers.append(int(member))

    return integers
316 | ||
317 | ||
def get_online_cpus():
    """Returns a list of cpu id integers.

    The ids are parsed from the first line of
    /sys/devices/system/cpu/online.
    """
    with open('/sys/devices/system/cpu/online') as online_file:
        return parse_int_list(online_file.readline())
323 | ||
324 | ||
def get_filters():
    """Returns a dict of trace events, their filter ids and
    the values that can be filtered.

    Trace events can be filtered for special values by setting a
    filter string via an ioctl. The string normally has the format
    identifier==value. For each filter a new event will be created, to
    be able to distinguish the events.

    The result maps a trace event name to a tuple of (filter
    identifier, dict of value name -> value).

    """
    filters = {'kvm_userspace_exit': ('reason', USERSPACE_EXIT_REASONS)}
    # Architectures without an exit reason table get no kvm_exit filter.
    if ARCH.exit_reasons:
        filters['kvm_exit'] = ('exit_reason', ARCH.exit_reasons)
    return filters
340 | ||
# Handle to the C library.  use_errno=True makes the errno of a failed
# call available through ctypes.get_errno().
libc = ctypes.CDLL('libc.so.6', use_errno=True)
# Raw syscall entry point used by perf_event_open().
syscall = libc.syscall
343 | ||
class perf_event_attr(ctypes.Structure):
    """Struct that holds the necessary data to set up a trace event.

    For an extensive explanation see perf_event_open(2) and
    include/uapi/linux/perf_event.h, struct perf_event_attr

    All fields that are not initialized in the constructor are 0.

    """
    _fields_ = [('type', ctypes.c_uint32),
                ('size', ctypes.c_uint32),
                ('config', ctypes.c_uint64),
                ('sample_freq', ctypes.c_uint64),
                ('sample_type', ctypes.c_uint64),
                ('read_format', ctypes.c_uint64),
                ('flags', ctypes.c_uint64),
                ('wakeup_events', ctypes.c_uint32),
                ('bp_type', ctypes.c_uint32),
                ('bp_addr', ctypes.c_uint64),
                ('bp_len', ctypes.c_uint64),
                ]

    def __init__(self):
        """Presets type, size and read_format for a tracepoint event."""
        # The class is named explicitly: super(self.__class__, self)
        # resolves to the subclass again once this class is subclassed
        # and then recurses forever.
        super(perf_event_attr, self).__init__()
        self.type = PERF_TYPE_TRACEPOINT
        self.size = ctypes.sizeof(self)
        self.read_format = PERF_FORMAT_GROUP
371 | ||
def perf_event_open(attr, pid, cpu, group_fd, flags):
    """Wrapper for the sys_perf_evt_open() syscall.

    Used to set up performance events, returns a file descriptor or -1
    on error.

    Attributes are:
    - syscall number
    - struct perf_event_attr *
    - pid or -1 to monitor all pids
    - cpu number or -1 to monitor all cpus
    - The file descriptor of the group leader or -1 to create a group.
    - flags

    The syscall number is taken from ARCH, the remaining values are the
    function's parameters in the order listed.

    """
    syscall_args = (
        ctypes.pointer(attr),
        ctypes.c_int(pid),
        ctypes.c_int(cpu),
        ctypes.c_int(group_fd),
        ctypes.c_long(flags),
    )
    return syscall(ARCH.sc_perf_evt_open, *syscall_args)
390 | ||
# Values stored in perf_event_attr.type and perf_event_attr.read_format.
PERF_TYPE_TRACEPOINT = 2
PERF_FORMAT_GROUP = 1 << 3

# Debugfs directories the tracepoint ids and the kvm statistics are
# read from.
PATH_DEBUGFS_TRACING = '/sys/kernel/debug/tracing'
PATH_DEBUGFS_KVM = '/sys/kernel/debug/kvm'
396 | ||
class Group(object):
    """Represents a perf event group."""

    def __init__(self):
        self.events = []

    def add_event(self, event):
        """Appends an event; the first event added is the group leader."""
        self.events.append(event)

    def read(self):
        """Returns a dict with 'event name: value' for all events in the
        group.

        Values are read by reading from the file descriptor of the
        event that is the group leader. See perf_event_open(2) for
        details.

        Read format for the used event configuration is:
        struct read_format {
            u64 nr; /* The number of events */
            struct {
                u64 value; /* The value of the event */
            } values[nr];
        };

        A group without events has no leader to read from and yields
        an empty dict.

        """
        if not self.events:
            return {}

        count = len(self.events)
        length = 8 * (1 + count)
        # Skip the leading 'nr' field, then unpack one u64 per event.
        read_format = 'xxxxxxxx' + 'Q' * count
        raw = os.read(self.events[0].fd, length)
        values = struct.unpack(read_format, raw)
        return dict(zip([event.name for event in self.events], values))
428 | ||
class Event(object):
    """Represents a performance event and manages its life cycle."""
    def __init__(self, name, group, trace_cpu, trace_pid, trace_point,
                 trace_filter, trace_set='kvm'):
        self.name = name
        self.fd = None
        self.setup_event(group, trace_cpu, trace_pid, trace_point,
                         trace_filter, trace_set)

    def __del__(self):
        """Closes the event's file descriptor.

        As no python file object was created for the file descriptor,
        python will not reference count the descriptor and will not
        close it itself automatically, so we do it.

        """
        # getattr() guards against objects that never got as far as
        # assigning self.fd.  The comparison is against None because 0
        # is a valid descriptor.
        fd = getattr(self, 'fd', None)
        if fd is not None:
            # Forget the descriptor first so a repeated call cannot
            # close an unrelated, reused descriptor number.
            self.fd = None
            os.close(fd)

    def setup_event_attribute(self, trace_set, trace_point):
        """Returns an initialized ctype perf_event_attr struct.

        The struct's config field is set to the tracepoint id read from
        the tracing debugfs 'id' file of trace_set/trace_point.

        """
        id_path = os.path.join(PATH_DEBUGFS_TRACING, 'events', trace_set,
                               trace_point, 'id')

        event_attr = perf_event_attr()
        with open(id_path) as id_file:
            event_attr.config = int(id_file.read())
        return event_attr

    def setup_event(self, group, trace_cpu, trace_pid, trace_point,
                    trace_filter, trace_set):
        """Sets up the perf event in Linux.

        Issues the syscall to register the event in the kernel and
        then sets the optional filter.

        Raises OSError if the syscall fails.

        """
        event_attr = self.setup_event_attribute(trace_set, trace_point)

        # First event will be group leader.
        group_leader = -1

        # All others have to pass the leader's descriptor instead.
        if group.events:
            group_leader = group.events[0].fd

        fd = perf_event_open(event_attr, trace_pid,
                             trace_cpu, group_leader, 0)
        if fd == -1:
            err = ctypes.get_errno()
            raise OSError(err, os.strerror(err),
                          'while calling sys_perf_event_open().')

        # Take ownership before touching the descriptor any further, so
        # __del__ still closes it if setting the filter raises.
        self.fd = fd

        if trace_filter:
            fcntl.ioctl(fd, ARCH.ioctl_numbers['SET_FILTER'],
                        trace_filter)

    def enable(self):
        """Enables the trace event in the kernel.

        Enabling the group leader makes reading counters from it and the
        events under it possible.

        """
        fcntl.ioctl(self.fd, ARCH.ioctl_numbers['ENABLE'], 0)

    def disable(self):
        """Disables the trace event in the kernel.

        Disabling the group leader makes reading all counters under it
        impossible.

        """
        fcntl.ioctl(self.fd, ARCH.ioctl_numbers['DISABLE'], 0)

    def reset(self):
        """Resets the count of the trace event in the kernel."""
        fcntl.ioctl(self.fd, ARCH.ioctl_numbers['RESET'], 0)
511 | ||
class TracepointProvider(object):
    """Data provider for the stats class.

    Manages the events/groups from which it acquires its data.

    """
    def __init__(self):
        self.group_leaders = []
        self.filters = get_filters()
        self._fields = self.get_available_fields()
        self._pid = 0

    def get_available_fields(self):
        """Returns a list of available event's of format 'event name(filter
        name)'.

        All available events have directories under
        /sys/kernel/debug/tracing/events/ which export information
        about the specific event. Therefore, listing the dirs gives us
        a list of all available events.

        Some events like the vm exit reasons can be filtered for
        specific values. To take account for that, the routine below
        creates special fields with the following format:
        event name(filter name)

        """
        path = os.path.join(PATH_DEBUGFS_TRACING, 'events', 'kvm')
        fields = walkdir(path)[1]
        extra = []
        for field in fields:
            if field not in self.filters:
                continue
            filter_values = self.filters[field][1]
            extra.extend('%s(%s)' % (field, name) for name in filter_values)
        fields += extra
        return fields

    def _split_field(self, name):
        """Returns (tracepoint, filter string or None) for a field name."""
        match = re.match(r'(.*)\((.*)\)', name)
        if not match:
            return name, None

        tracepoint, sub = match.groups()
        filter_id, filter_values = self.filters[tracepoint]
        return tracepoint, '%s==%d\0' % (filter_id, filter_values[sub])

    def setup_traces(self):
        """Creates all event and group objects needed to be able to retrieve
        data."""
        if self._pid > 0:
            # Fetch list of all threads of the monitored pid, as qemu
            # starts a thread for each vcpu.
            path = os.path.join('/proc', str(self._pid), 'task')
            groupids = walkdir(path)[1]
        else:
            groupids = get_online_cpus()

        # The constant is needed as a buffer for python libs, std
        # streams and other files that the script opens.
        newlim = len(groupids) * len(self._fields) + 50
        try:
            hardlim = resource.getrlimit(resource.RLIMIT_NOFILE)[1]

            # Raising the soft limit is sufficient unless the hard
            # limit is too low as well; increasing the hard limit
            # needs CAP_SYS_RESOURCE.
            new_hardlim = newlim if hardlim < newlim else hardlim
            resource.setrlimit(resource.RLIMIT_NOFILE, (newlim, new_hardlim))

        except ValueError:
            sys.exit("NOFILE rlimit could not be raised to {0}".format(newlim))

        for groupid in groupids:
            group = Group()
            for name in self._fields:
                tracepoint, tracefilter = self._split_field(name)

                # From perf_event_open(2):
                # pid > 0 and cpu == -1
                # This measures the specified process/thread on any CPU.
                #
                # pid == -1 and cpu >= 0
                # This measures all processes/threads on the specified CPU.
                if self._pid == 0:
                    trace_cpu = groupid
                    trace_pid = -1
                else:
                    trace_cpu = -1
                    trace_pid = int(groupid)

                group.add_event(Event(name=name,
                                      group=group,
                                      trace_cpu=trace_cpu,
                                      trace_pid=trace_pid,
                                      trace_point=tracepoint,
                                      trace_filter=tracefilter))

            self.group_leaders.append(group)

    def available_fields(self):
        return self.get_available_fields()

    @property
    def fields(self):
        return self._fields

    @fields.setter
    def fields(self, fields):
        """Enables/disables the (un)wanted events"""
        self._fields = fields
        for group in self.group_leaders:
            for index, event in enumerate(group.events):
                if event.name in fields:
                    event.reset()
                    event.enable()
                elif index != 0:
                    # Do not disable the group leader.
                    # It would disable all of its events.
                    event.disable()

    @property
    def pid(self):
        return self._pid

    @pid.setter
    def pid(self, pid):
        """Changes the monitored pid by setting new traces."""
        self._pid = pid
        # The garbage collector will get rid of all Event/Group
        # objects and open files after removing the references.
        self.group_leaders = []
        self.setup_traces()
        self.fields = self._fields

    def read(self):
        """Returns 'event name: current value' for all enabled events."""
        ret = defaultdict(int)
        for group in self.group_leaders:
            for name, val in group.read().items():
                if name in self._fields:
                    ret[name] += val
        return ret
651 | ||
class DebugfsProvider(object):
    """Provides data from the files that KVM creates in the kvm debugfs
    folder."""
    def __init__(self):
        self._fields = self.get_available_fields()
        self._pid = 0
        self.do_read = True
        self.paths = []

    def get_available_fields(self):
        """Returns a list of available fields.

        The fields are all available KVM debugfs files

        """
        return walkdir(PATH_DEBUGFS_KVM)[2]

    @property
    def fields(self):
        return self._fields

    @fields.setter
    def fields(self, fields):
        self._fields = fields

    @property
    def pid(self):
        return self._pid

    @pid.setter
    def pid(self, pid):
        """Selects the debugfs directories that are read for the given pid.

        A pid of 0 selects the summary files at the top of the kvm
        debugfs folder.  Any other pid selects the sub directories
        whose name starts with '<pid>-'.

        """
        # Always remember the pid, so switching back to 0 does not
        # leave a stale value behind.
        self._pid = pid

        if pid != 0:
            vms = walkdir(PATH_DEBUGFS_KVM)[1]
            # Without any per-VM directory there is nothing to filter
            # on.  This is re-evaluated on every pid change so reading
            # resumes once directories show up.
            self.do_read = bool(vms)

            # NOTE(review): directory names are assumed to be
            # '<pid>-<fd>'.  Matching the prefix rather than a
            # substring keeps pid 12 from also selecting '312-7'.
            prefix = '{0}-'.format(pid)
            self.paths = [vm for vm in vms if vm.startswith(prefix)]

        else:
            self.paths = ['']
            self.do_read = True

    def read(self):
        """Returns a dict with format:'file name / field -> current value'."""
        results = {}

        # If no debugfs filtering support is available, then don't read.
        if not self.do_read:
            return results

        # Values of the same field are summed up over all selected paths.
        for path in self.paths:
            for field in self._fields:
                results[field] = (results.get(field, 0) +
                                  self.read_field(field, path))

        return results

    def read_field(self, field, path):
        """Returns the value of a single field from a specific VM.

        Files that cannot be opened or read count as 0.

        """
        try:
            with open(os.path.join(PATH_DEBUGFS_KVM, path, field)) as stat:
                return int(stat.read())
        except IOError:
            return 0
f9bc9e65 JF |
720 | |
class Stats(object):
    """Manages the data providers and the data they provide.

    It is used to set filters on the provider's data and collect all
    provider data.

    """
    def __init__(self, providers, pid, fields=None):
        self.providers = providers
        self._pid_filter = pid
        self._fields_filter = fields
        self.values = {}
        self.update_provider_pid()
        self.update_provider_filters()

    def update_provider_filters(self):
        """Propagates fields filters to providers."""
        pattern = self._fields_filter

        # As we reset the counters when updating the fields we can
        # also clear the cache of old values.
        self.values = {}
        for provider in self.providers:
            # An empty filter lets every available field through.
            provider.fields = [
                key for key in provider.get_available_fields()
                if not pattern or re.match(pattern, key) is not None
            ]

    def update_provider_pid(self):
        """Propagates pid filters to providers."""
        for provider in self.providers:
            provider.pid = self._pid_filter

    @property
    def fields_filter(self):
        return self._fields_filter

    @fields_filter.setter
    def fields_filter(self, fields_filter):
        self._fields_filter = fields_filter
        self.update_provider_filters()

    @property
    def pid_filter(self):
        return self._pid_filter

    @pid_filter.setter
    def pid_filter(self, pid):
        self._pid_filter = pid
        self.values = {}
        self.update_provider_pid()

    def get(self):
        """Returns a dict with field -> (value, delta to last value) of all
        provider data."""
        for provider in self.providers:
            new = provider.read()
            for key in provider.fields:
                oldval = self.values.get(key, (0, 0))
                newval = new.get(key, 0)
                if oldval is None:
                    newdelta = None
                else:
                    newdelta = newval - oldval[0]
                self.values[key] = (newval, newdelta)
        return self.values
788 | ||
# Column widths, in characters, of the event name column and the total
# value column of the text ui.
LABEL_WIDTH = 40
NUMBER_WIDTH = 10
791 | ||
792 | class Tui(object): | |
fabc7128 | 793 | """Instruments curses to draw a nice text ui.""" |
f9bc9e65 JF |
794 | def __init__(self, stats): |
795 | self.stats = stats | |
796 | self.screen = None | |
f9bc9e65 JF |
797 | self.update_drilldown() |
798 | ||
799 | def __enter__(self): | |
800 | """Initialises curses for later use. Based on curses.wrapper | |
801 | implementation from the Python standard library.""" | |
802 | self.screen = curses.initscr() | |
803 | curses.noecho() | |
804 | curses.cbreak() | |
805 | ||
806 | # The try/catch works around a minor bit of | |
807 | # over-conscientiousness in the curses module, the error | |
808 | # return from C start_color() is ignorable. | |
809 | try: | |
810 | curses.start_color() | |
9fc0adfc | 811 | except curses.error: |
f9bc9e65 JF |
812 | pass |
813 | ||
a0b4e6a0 SR |
814 | # Hide cursor in extra statement as some monochrome terminals |
815 | # might support hiding but not colors. | |
816 | try: | |
817 | curses.curs_set(0) | |
818 | except curses.error: | |
819 | pass | |
820 | ||
f9bc9e65 JF |
821 | curses.use_default_colors() |
822 | return self | |
823 | ||
824 | def __exit__(self, *exception): | |
825 | """Resets the terminal to its normal state. Based on curses.wrappre | |
826 | implementation from the Python standard library.""" | |
827 | if self.screen: | |
828 | self.screen.keypad(0) | |
829 | curses.echo() | |
830 | curses.nocbreak() | |
831 | curses.endwin() | |
832 | ||
833 | def update_drilldown(self): | |
fabc7128 | 834 | """Sets or removes a filter that only allows fields without braces.""" |
f9bc9e65 JF |
835 | if not self.stats.fields_filter: |
836 | self.stats.fields_filter = r'^[^\(]*$' | |
837 | ||
838 | elif self.stats.fields_filter == r'^[^\(]*$': | |
839 | self.stats.fields_filter = None | |
840 | ||
f0cf040f | 841 | def update_pid(self, pid): |
fabc7128 | 842 | """Propagates pid selection to stats object.""" |
f0cf040f JF |
843 | self.stats.pid_filter = pid |
844 | ||
f9bc9e65 | 845 | def refresh(self, sleeptime): |
fabc7128 | 846 | """Refreshes on-screen data.""" |
f9bc9e65 | 847 | self.screen.erase() |
f0cf040f JF |
848 | if self.stats.pid_filter > 0: |
849 | self.screen.addstr(0, 0, 'kvm statistics - pid {0}' | |
850 | .format(self.stats.pid_filter), | |
851 | curses.A_BOLD) | |
852 | else: | |
853 | self.screen.addstr(0, 0, 'kvm statistics - summary', curses.A_BOLD) | |
f9bc9e65 JF |
854 | self.screen.addstr(2, 1, 'Event') |
855 | self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH - | |
856 | len('Total'), 'Total') | |
857 | self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 8 - | |
858 | len('Current'), 'Current') | |
859 | row = 3 | |
860 | stats = self.stats.get() | |
861 | def sortkey(x): | |
862 | if stats[x][1]: | |
863 | return (-stats[x][1], -stats[x][0]) | |
864 | else: | |
865 | return (0, -stats[x][0]) | |
866 | for key in sorted(stats.keys(), key=sortkey): | |
867 | ||
868 | if row >= self.screen.getmaxyx()[0]: | |
869 | break | |
870 | values = stats[key] | |
871 | if not values[0] and not values[1]: | |
872 | break | |
873 | col = 1 | |
874 | self.screen.addstr(row, col, key) | |
875 | col += LABEL_WIDTH | |
876 | self.screen.addstr(row, col, '%10d' % (values[0],)) | |
877 | col += NUMBER_WIDTH | |
878 | if values[1] is not None: | |
879 | self.screen.addstr(row, col, '%8d' % (values[1] / sleeptime,)) | |
880 | row += 1 | |
881 | self.screen.refresh() | |
882 | ||
883 | def show_filter_selection(self): | |
fabc7128 JF |
884 | """Draws filter selection mask. |
885 | ||
886 | Asks for a valid regex and sets the fields filter accordingly. | |
887 | ||
888 | """ | |
f9bc9e65 JF |
889 | while True: |
890 | self.screen.erase() | |
891 | self.screen.addstr(0, 0, | |
892 | "Show statistics for events matching a regex.", | |
893 | curses.A_BOLD) | |
894 | self.screen.addstr(2, 0, | |
895 | "Current regex: {0}" | |
896 | .format(self.stats.fields_filter)) | |
897 | self.screen.addstr(3, 0, "New regex: ") | |
898 | curses.echo() | |
899 | regex = self.screen.getstr() | |
900 | curses.noecho() | |
901 | if len(regex) == 0: | |
902 | return | |
903 | try: | |
904 | re.compile(regex) | |
905 | self.stats.fields_filter = regex | |
906 | return | |
907 | except re.error: | |
908 | continue | |
909 | ||
def show_vm_selection(self):
    """Prompt for a pid and hand it to update_pid().

    The prompt is repeated until the input is either 0 or a number
    for which a directory exists below /proc.

    """
    screen = self.screen
    while True:
        screen.erase()
        screen.addstr(0, 0,
                      'Show statistics for specific pid.',
                      curses.A_BOLD)
        screen.addstr(1, 0,
                      'This might limit the shown data to the trace '
                      'statistics.')

        curses.echo()
        screen.addstr(3, 0, "Pid [0 or pid]: ")
        answer = screen.getstr()
        curses.noecho()

        try:
            pid = int(answer)
            # Zero is accepted as is; anything else needs a /proc entry.
            if pid != 0:
                proc_dir = os.path.join('/proc/', str(pid))
                if not os.path.isdir(proc_dir):
                    continue
            self.update_pid(pid)
            break
        except ValueError:
            # Input was not a number: ask again.
            continue
945 | ||
def show_stats(self):
    """Run the interactive loop: redraw, then handle one key press.

    Keys: 'x' calls update_drilldown(), 'f' opens the filter prompt,
    'p' opens the pid prompt and 'q' (or a KeyboardInterrupt) leaves
    the loop.
    """
    delay = 0.25
    while True:
        self.refresh(delay)
        # halfdelay() expects its timeout in tenths of a second.
        curses.halfdelay(int(delay * 10))
        # Only the very first pass uses the short delay.
        delay = 3.0
        try:
            key = self.screen.getkey()
            if key == 'q':
                break
            elif key == 'x':
                self.update_drilldown()
            elif key == 'f':
                self.show_filter_selection()
            elif key == 'p':
                self.show_vm_selection()
        except KeyboardInterrupt:
            break
        except curses.error:
            # A curses error (e.g. no key within the delay) just
            # triggers the next redraw.
            continue
967 | ||
def batch(stats):
    """Print a single sample of all statistics, one key per line.

    stats.get() is called twice, one second apart; only the second
    result is printed. Each line holds the key followed by the first
    two entries of its value. A KeyboardInterrupt ends the function
    silently.
    """
    try:
        stats.get()
        time.sleep(1)
        sample = stats.get()
        for name in sorted(sample):
            entry = sample[name]
            print('%-42s%10d%10d' % (name, entry[0], entry[1]))
    except KeyboardInterrupt:
        pass
f9bc9e65 JF |
979 | |
def log(stats, interval=1):
    """Print statistics as a repeating key line followed by value lines.

    The set of keys is taken once from stats.get(). Afterwards one line
    with the second entry of every key's value is printed per interval,
    and the key line is repeated every 20 value lines. The loop runs
    until a KeyboardInterrupt is received.

    Args:
        stats: object whose get() returns a dict mapping a key to a
            sequence with a number at index 1.
        interval: seconds to sleep before each value line (default 1).
    """
    keys = sorted(stats.get())

    def banner():
        # Emit the whole header as one line; the trailing newline keeps
        # the header and the samples on separate rows.
        print(' '.join('%s' % k for k in keys))

    def statline():
        sample = stats.get()
        print(' '.join(' %9d' % sample[k][1] for k in keys))

    line = 0
    banner_repeat = 20
    while True:
        try:
            time.sleep(interval)
            if line % banner_repeat == 0:
                banner()
            statline()
            line += 1
        except KeyboardInterrupt:
            break
f9bc9e65 JF |
1003 | |
def get_options():
    """Parse sys.argv and return the resulting option values.

    The returned optparse values object carries the attributes once,
    log, tracepoints, debugfs (booleans, default False), fields
    (string, default None) and pid (int, default 0).
    """
    description_text = """
This script displays various statistics about VMs running under KVM.
The statistics are gathered from the KVM debugfs entries and / or the
currently available perf traces.

The monitoring takes additional cpu cycles and might affect the VM's
performance.

Requirements:
- Access to:
/sys/kernel/debug/kvm
/sys/kernel/debug/trace/events/*
/proc/pid/task
- /proc/sys/kernel/perf_event_paranoid < 1 if user has no
CAP_SYS_ADMIN and perf events are used.
- CAP_SYS_RESOURCE if the hard limit is not high enough to allow
the large number of files that are possibly opened.
"""

    class PlainHelpFormatter(optparse.IndentedHelpFormatter):
        """Formatter that returns the description text as written."""

        def format_description(self, description):
            return description + "\n" if description else ""

    parser = optparse.OptionParser(description=description_text,
                                   formatter=PlainHelpFormatter())

    # Boolean switches: (option strings, destination, help text).
    switches = (
        (('-1', '--once', '--batch'), 'once',
         'run in batch mode for one second'),
        (('-l', '--log'), 'log',
         'run in logging mode (like vmstat)'),
        (('-t', '--tracepoints'), 'tracepoints',
         'retrieve statistics from tracepoints'),
        (('-d', '--debugfs'), 'debugfs',
         'retrieve statistics from debugfs'),
    )
    for names, dest, help_text in switches:
        parser.add_option(*names,
                          action='store_true',
                          default=False,
                          dest=dest,
                          help=help_text)

    parser.add_option('-f', '--fields',
                      action='store',
                      default=None,
                      dest='fields',
                      help='fields to display (regex)')
    parser.add_option('-p', '--pid',
                      action='store',
                      default=0,
                      type='int',
                      dest='pid',
                      help='restrict statistics to pid')

    # Positional arguments are parsed but not used.
    return parser.parse_args(sys.argv)[0]
1073 | ||
def get_providers(options):
    """Build the list of data providers selected by the options.

    A TracepointProvider is created when options.tracepoints is set and
    a DebugfsProvider when options.debugfs is set. If neither is set,
    a single TracepointProvider is used.
    """
    requested = (
        (options.tracepoints, TracepointProvider),
        (options.debugfs, DebugfsProvider),
    )
    providers = [factory() for wanted, factory in requested if wanted]

    if not providers:
        providers = [TracepointProvider()]

    return providers
1086 | ||
def check_access(options):
    """Exit if the directories needed for the requested mode are missing.

    Checks, in order, for /sys/kernel/debug, PATH_DEBUGFS_KVM and
    PATH_DEBUGFS_TRACING. A missing tracing directory is fatal only if
    tracepoints were requested explicitly; otherwise options.debugfs is
    switched on as a fallback after a five second pause.

    Args:
        options: parsed options with the attributes tracepoints and
            debugfs; debugfs may be modified in place.

    Returns:
        The (possibly modified) options object.
    """
    if not os.path.exists('/sys/kernel/debug'):
        # Terminate the message with a newline like all other messages
        # here, so the shell prompt does not end up on the same line.
        sys.stderr.write('Please enable CONFIG_DEBUG_FS in your kernel.\n')
        sys.exit(1)

    if not os.path.exists(PATH_DEBUGFS_KVM):
        sys.stderr.write("Please make sure, that debugfs is mounted and "
                         "readable by the current user:\n"
                         "('mount -t debugfs debugfs /sys/kernel/debug')\n"
                         "Also ensure, that the kvm modules are loaded.\n")
        sys.exit(1)

    # Tracing is needed when asked for explicitly and also when no
    # provider was chosen, because tracepoints are the default.
    tracing_wanted = options.tracepoints or not options.debugfs
    if not os.path.exists(PATH_DEBUGFS_TRACING) and tracing_wanted:
        sys.stderr.write("Please enable CONFIG_TRACING in your kernel "
                         "when using the option -t (default).\n"
                         "If it is enabled, make {0} readable by the "
                         "current user.\n"
                         .format(PATH_DEBUGFS_TRACING))
        if options.tracepoints:
            sys.exit(1)

        sys.stderr.write("Falling back to debugfs statistics!\n")
        options.debugfs = True
        # Pause so the messages above can be read before output starts.
        time.sleep(5)

    return options
1115 | ||
def main():
    """Parse the options, verify access and run the selected output mode.

    Exits with an error message if a pid greater than 0 was given that
    has no directory below /proc. Otherwise runs log mode, batch mode
    or, by default, the text ui.
    """
    options = check_access(get_options())

    pid_dir = os.path.join('/proc/', str(options.pid))
    if options.pid > 0 and not os.path.isdir(pid_dir):
        sys.stderr.write('Did you use a (unsupported) tid instead of a pid?\n')
        sys.exit('Specified pid does not exist.')

    stats = Stats(get_providers(options), options.pid, fields=options.fields)

    if options.log:
        log(stats)
    elif options.once:
        batch(stats)
    else:
        with Tui(stats) as tui:
            tui.show_stats()
1136 | ||
# Run the tool only when this file is executed as a script.
if __name__ == "__main__":
    main()