]>
Commit | Line | Data |
---|---|---|
f9bc9e65 JF |
1 | #!/usr/bin/python |
2 | # | |
3 | # top-like utility for displaying kvm statistics | |
4 | # | |
5 | # Copyright 2006-2008 Qumranet Technologies | |
6 | # Copyright 2008-2011 Red Hat, Inc. | |
7 | # | |
8 | # Authors: | |
9 | # Avi Kivity <avi@redhat.com> | |
10 | # | |
11 | # This work is licensed under the terms of the GNU GPL, version 2. See | |
12 | # the COPYING file in the top-level directory. | |
fabc7128 JF |
13 | """The kvm_stat module outputs statistics about running KVM VMs |
14 | ||
15 | Three different ways of output formatting are available: | |
16 | - as a top-like text ui | |
17 | - in a key -> value format | |
18 | - in an all keys, all values format | |
19 | ||
20 | The data is sampled from the KVM's debugfs entries and its perf events. | |
21 | """ | |
f9bc9e65 JF |
22 | |
23 | import curses | |
24 | import sys | |
25 | import os | |
26 | import time | |
27 | import optparse | |
28 | import ctypes | |
29 | import fcntl | |
30 | import resource | |
31 | import struct | |
32 | import re | |
33 | from collections import defaultdict | |
34 | from time import sleep | |
35 | ||
36 | VMX_EXIT_REASONS = { | |
37 | 'EXCEPTION_NMI': 0, | |
38 | 'EXTERNAL_INTERRUPT': 1, | |
39 | 'TRIPLE_FAULT': 2, | |
40 | 'PENDING_INTERRUPT': 7, | |
41 | 'NMI_WINDOW': 8, | |
42 | 'TASK_SWITCH': 9, | |
43 | 'CPUID': 10, | |
44 | 'HLT': 12, | |
45 | 'INVLPG': 14, | |
46 | 'RDPMC': 15, | |
47 | 'RDTSC': 16, | |
48 | 'VMCALL': 18, | |
49 | 'VMCLEAR': 19, | |
50 | 'VMLAUNCH': 20, | |
51 | 'VMPTRLD': 21, | |
52 | 'VMPTRST': 22, | |
53 | 'VMREAD': 23, | |
54 | 'VMRESUME': 24, | |
55 | 'VMWRITE': 25, | |
56 | 'VMOFF': 26, | |
57 | 'VMON': 27, | |
58 | 'CR_ACCESS': 28, | |
59 | 'DR_ACCESS': 29, | |
60 | 'IO_INSTRUCTION': 30, | |
61 | 'MSR_READ': 31, | |
62 | 'MSR_WRITE': 32, | |
63 | 'INVALID_STATE': 33, | |
64 | 'MWAIT_INSTRUCTION': 36, | |
65 | 'MONITOR_INSTRUCTION': 39, | |
66 | 'PAUSE_INSTRUCTION': 40, | |
67 | 'MCE_DURING_VMENTRY': 41, | |
68 | 'TPR_BELOW_THRESHOLD': 43, | |
69 | 'APIC_ACCESS': 44, | |
70 | 'EPT_VIOLATION': 48, | |
71 | 'EPT_MISCONFIG': 49, | |
72 | 'WBINVD': 54, | |
73 | 'XSETBV': 55, | |
74 | 'APIC_WRITE': 56, | |
75 | 'INVPCID': 58, | |
76 | } | |
77 | ||
78 | SVM_EXIT_REASONS = { | |
79 | 'READ_CR0': 0x000, | |
80 | 'READ_CR3': 0x003, | |
81 | 'READ_CR4': 0x004, | |
82 | 'READ_CR8': 0x008, | |
83 | 'WRITE_CR0': 0x010, | |
84 | 'WRITE_CR3': 0x013, | |
85 | 'WRITE_CR4': 0x014, | |
86 | 'WRITE_CR8': 0x018, | |
87 | 'READ_DR0': 0x020, | |
88 | 'READ_DR1': 0x021, | |
89 | 'READ_DR2': 0x022, | |
90 | 'READ_DR3': 0x023, | |
91 | 'READ_DR4': 0x024, | |
92 | 'READ_DR5': 0x025, | |
93 | 'READ_DR6': 0x026, | |
94 | 'READ_DR7': 0x027, | |
95 | 'WRITE_DR0': 0x030, | |
96 | 'WRITE_DR1': 0x031, | |
97 | 'WRITE_DR2': 0x032, | |
98 | 'WRITE_DR3': 0x033, | |
99 | 'WRITE_DR4': 0x034, | |
100 | 'WRITE_DR5': 0x035, | |
101 | 'WRITE_DR6': 0x036, | |
102 | 'WRITE_DR7': 0x037, | |
103 | 'EXCP_BASE': 0x040, | |
104 | 'INTR': 0x060, | |
105 | 'NMI': 0x061, | |
106 | 'SMI': 0x062, | |
107 | 'INIT': 0x063, | |
108 | 'VINTR': 0x064, | |
109 | 'CR0_SEL_WRITE': 0x065, | |
110 | 'IDTR_READ': 0x066, | |
111 | 'GDTR_READ': 0x067, | |
112 | 'LDTR_READ': 0x068, | |
113 | 'TR_READ': 0x069, | |
114 | 'IDTR_WRITE': 0x06a, | |
115 | 'GDTR_WRITE': 0x06b, | |
116 | 'LDTR_WRITE': 0x06c, | |
117 | 'TR_WRITE': 0x06d, | |
118 | 'RDTSC': 0x06e, | |
119 | 'RDPMC': 0x06f, | |
120 | 'PUSHF': 0x070, | |
121 | 'POPF': 0x071, | |
122 | 'CPUID': 0x072, | |
123 | 'RSM': 0x073, | |
124 | 'IRET': 0x074, | |
125 | 'SWINT': 0x075, | |
126 | 'INVD': 0x076, | |
127 | 'PAUSE': 0x077, | |
128 | 'HLT': 0x078, | |
129 | 'INVLPG': 0x079, | |
130 | 'INVLPGA': 0x07a, | |
131 | 'IOIO': 0x07b, | |
132 | 'MSR': 0x07c, | |
133 | 'TASK_SWITCH': 0x07d, | |
134 | 'FERR_FREEZE': 0x07e, | |
135 | 'SHUTDOWN': 0x07f, | |
136 | 'VMRUN': 0x080, | |
137 | 'VMMCALL': 0x081, | |
138 | 'VMLOAD': 0x082, | |
139 | 'VMSAVE': 0x083, | |
140 | 'STGI': 0x084, | |
141 | 'CLGI': 0x085, | |
142 | 'SKINIT': 0x086, | |
143 | 'RDTSCP': 0x087, | |
144 | 'ICEBP': 0x088, | |
145 | 'WBINVD': 0x089, | |
146 | 'MONITOR': 0x08a, | |
147 | 'MWAIT': 0x08b, | |
148 | 'MWAIT_COND': 0x08c, | |
149 | 'XSETBV': 0x08d, | |
150 | 'NPF': 0x400, | |
151 | } | |
152 | ||
153 | # EC definition of HSR (from arch/arm64/include/asm/kvm_arm.h) | |
154 | AARCH64_EXIT_REASONS = { | |
155 | 'UNKNOWN': 0x00, | |
156 | 'WFI': 0x01, | |
157 | 'CP15_32': 0x03, | |
158 | 'CP15_64': 0x04, | |
159 | 'CP14_MR': 0x05, | |
160 | 'CP14_LS': 0x06, | |
161 | 'FP_ASIMD': 0x07, | |
162 | 'CP10_ID': 0x08, | |
163 | 'CP14_64': 0x0C, | |
164 | 'ILL_ISS': 0x0E, | |
165 | 'SVC32': 0x11, | |
166 | 'HVC32': 0x12, | |
167 | 'SMC32': 0x13, | |
168 | 'SVC64': 0x15, | |
169 | 'HVC64': 0x16, | |
170 | 'SMC64': 0x17, | |
171 | 'SYS64': 0x18, | |
172 | 'IABT': 0x20, | |
173 | 'IABT_HYP': 0x21, | |
174 | 'PC_ALIGN': 0x22, | |
175 | 'DABT': 0x24, | |
176 | 'DABT_HYP': 0x25, | |
177 | 'SP_ALIGN': 0x26, | |
178 | 'FP_EXC32': 0x28, | |
179 | 'FP_EXC64': 0x2C, | |
180 | 'SERROR': 0x2F, | |
181 | 'BREAKPT': 0x30, | |
182 | 'BREAKPT_HYP': 0x31, | |
183 | 'SOFTSTP': 0x32, | |
184 | 'SOFTSTP_HYP': 0x33, | |
185 | 'WATCHPT': 0x34, | |
186 | 'WATCHPT_HYP': 0x35, | |
187 | 'BKPT32': 0x38, | |
188 | 'VECTOR32': 0x3A, | |
189 | 'BRK64': 0x3C, | |
190 | } | |
191 | ||
192 | # From include/uapi/linux/kvm.h, KVM_EXIT_xxx | |
193 | USERSPACE_EXIT_REASONS = { | |
194 | 'UNKNOWN': 0, | |
195 | 'EXCEPTION': 1, | |
196 | 'IO': 2, | |
197 | 'HYPERCALL': 3, | |
198 | 'DEBUG': 4, | |
199 | 'HLT': 5, | |
200 | 'MMIO': 6, | |
201 | 'IRQ_WINDOW_OPEN': 7, | |
202 | 'SHUTDOWN': 8, | |
203 | 'FAIL_ENTRY': 9, | |
204 | 'INTR': 10, | |
205 | 'SET_TPR': 11, | |
206 | 'TPR_ACCESS': 12, | |
207 | 'S390_SIEIC': 13, | |
208 | 'S390_RESET': 14, | |
209 | 'DCR': 15, | |
210 | 'NMI': 16, | |
211 | 'INTERNAL_ERROR': 17, | |
212 | 'OSI': 18, | |
213 | 'PAPR_HCALL': 19, | |
214 | 'S390_UCONTROL': 20, | |
215 | 'WATCHDOG': 21, | |
216 | 'S390_TSCH': 22, | |
217 | 'EPR': 23, | |
218 | 'SYSTEM_EVENT': 24, | |
219 | } | |
220 | ||
221 | IOCTL_NUMBERS = { | |
222 | 'SET_FILTER': 0x40082406, | |
223 | 'ENABLE': 0x00002400, | |
224 | 'DISABLE': 0x00002401, | |
225 | 'RESET': 0x00002403, | |
226 | } | |
227 | ||
class Arch(object):
    """Encapsulates global architecture specific data.

    Contains the performance event open syscall and ioctl numbers, as
    well as the VM exit reasons for the architecture it runs on.

    """
    @staticmethod
    def get_arch():
        """Returns the Arch subclass instance for the running machine.

        The machine name reported by os.uname() selects the PPC,
        aarch64 and s390 variants.  Every other machine is treated as
        x86, where the first 'flags' line of /proc/cpuinfo decides
        between the VMX and the SVM exit reasons.

        Returns None if that line contains neither 'vmx' nor 'svm', or
        if /proc/cpuinfo has no 'flags' line at all.

        """
        machine = os.uname()[4]

        if machine.startswith('ppc'):
            return ArchPPC()
        elif machine.startswith('aarch64'):
            return ArchA64()
        elif machine.startswith('s390'):
            return ArchS390()

        # X86_64
        # The context manager closes the file on every return path.
        with open('/proc/cpuinfo') as cpuinfo:
            for line in cpuinfo:
                if not line.startswith('flags'):
                    continue

                flags = line.split()
                if 'vmx' in flags:
                    return ArchX86(VMX_EXIT_REASONS)
                if 'svm' in flags:
                    return ArchX86(SVM_EXIT_REASONS)
                # Only the first 'flags' line is inspected.
                return None
        return None
257 | ||
class ArchX86(Arch):
    """Architecture data for x86 hosts.

    The exit reason table is handed in by the caller, as it differs
    between the VMX and the SVM flavour.

    """
    def __init__(self, exit_reasons):
        self.exit_reasons = exit_reasons
        self.ioctl_numbers = IOCTL_NUMBERS
        self.sc_perf_evt_open = 298
263 | ||
class ArchPPC(Arch):
    """Architecture data for PPC hosts.

    PPC uses its own ENABLE/DISABLE/RESET/SET_FILTER ioctl numbers and
    provides no exit reasons to filter on.

    """
    def __init__(self):
        self.sc_perf_evt_open = 319

        # Work on a private copy.  Assigning into IOCTL_NUMBERS itself
        # would silently change the numbers for every other user of
        # that module-wide table.
        self.ioctl_numbers = dict(IOCTL_NUMBERS)
        self.ioctl_numbers['ENABLE'] = 0x20002400
        self.ioctl_numbers['DISABLE'] = 0x20002401
        self.ioctl_numbers['RESET'] = 0x20002403

        # PPC comes in 32 and 64 bit and some generated ioctl
        # numbers depend on the wordsize.
        char_ptr_size = ctypes.sizeof(ctypes.c_char_p)
        self.ioctl_numbers['SET_FILTER'] = (0x80002406 |
                                            (char_ptr_size << 16))
        self.exit_reasons = {}
f9bc9e65 JF |
277 | |
class ArchA64(Arch):
    """Architecture data for aarch64 hosts."""
    def __init__(self):
        self.exit_reasons = AARCH64_EXIT_REASONS
        self.ioctl_numbers = IOCTL_NUMBERS
        self.sc_perf_evt_open = 241
283 | ||
class ArchS390(Arch):
    """Architecture data for s390 hosts; no exit reasons are defined."""
    def __init__(self):
        self.exit_reasons = None
        self.ioctl_numbers = IOCTL_NUMBERS
        self.sc_perf_evt_open = 331
289 | ||
290 | ARCH = Arch.get_arch() | |
291 | ||
292 | ||
def walkdir(path):
    """Returns the first os.walk() entry of the specified directory.

    The result is the usual 3-tuple of (dirpath, dirnames, filenames)
    for path itself; sub-directories are not descended into.
    """
    walker = os.walk(path)
    top_level = next(walker)
    return top_level
300 | ||
301 | ||
def parse_int_list(list_string):
    """Returns an int list built from a comma separated string.

    Each member is either a single integer or an inclusive range
    written as 'first-last'.
    """
    result = []

    for chunk in list_string.split(','):
        if '-' in chunk:
            bounds = chunk.split('-')
            first = int(bounds[0])
            last = int(bounds[1])
            result.extend(range(first, last + 1))
        else:
            result.append(int(chunk))

    return result
317 | ||
318 | ||
def get_online_cpus():
    """Returns the ids of all online cpus as a list of integers.

    The ids are parsed from the first line of
    /sys/devices/system/cpu/online.
    """
    with open('/sys/devices/system/cpu/online') as online_file:
        first_line = online_file.readline()
    return parse_int_list(first_line)
324 | ||
325 | ||
def get_filters():
    """Returns a dict mapping trace events to their filter description.

    Each value is a tuple of (filter identifier, dict of the names and
    values that can be filtered for).  Trace events can be filtered
    for special values by setting a filter string via an ioctl.  The
    string normally has the format identifier==value.  For each filter
    a new event will be created, to be able to distinguish the events.

    """
    filters = {'kvm_userspace_exit': ('reason', USERSPACE_EXIT_REASONS)}
    exit_reasons = ARCH.exit_reasons
    # Architectures without exit reasons get no 'kvm_exit' filter.
    if exit_reasons:
        filters['kvm_exit'] = ('exit_reason', exit_reasons)
    return filters
341 | ||
342 | libc = ctypes.CDLL('libc.so.6', use_errno=True) | |
343 | syscall = libc.syscall | |
344 | ||
class perf_event_attr(ctypes.Structure):
    """Struct that holds the necessary data to set up a trace event.

    For an extensive explanation see perf_event_open(2) and
    include/uapi/linux/perf_event.h, struct perf_event_attr

    All fields that are not initialized in the constructor are 0.

    """
    _fields_ = [('type', ctypes.c_uint32),
                ('size', ctypes.c_uint32),
                ('config', ctypes.c_uint64),
                ('sample_freq', ctypes.c_uint64),
                ('sample_type', ctypes.c_uint64),
                ('read_format', ctypes.c_uint64),
                ('flags', ctypes.c_uint64),
                ('wakeup_events', ctypes.c_uint32),
                ('bp_type', ctypes.c_uint32),
                ('bp_addr', ctypes.c_uint64),
                ('bp_len', ctypes.c_uint64),
                ]

    def __init__(self):
        """Sets type, size and read_format for a grouped tracepoint."""
        # Name the class explicitly: super(self.__class__, self) would
        # recurse forever as soon as this struct is subclassed.
        super(perf_event_attr, self).__init__()
        self.type = PERF_TYPE_TRACEPOINT
        self.size = ctypes.sizeof(self)
        self.read_format = PERF_FORMAT_GROUP
372 | ||
def perf_event_open(attr, pid, cpu, group_fd, flags):
    """Wrapper for the sys_perf_evt_open() syscall.

    Used to set up performance events, returns a file descriptor or -1
    on error.

    The syscall is invoked with:
    - the architecture's syscall number
    - a pointer to attr (struct perf_event_attr)
    - pid, or -1 to monitor all pids
    - cpu number, or -1 to monitor all cpus
    - the file descriptor of the group leader, or -1 to create a group
    - flags

    """
    syscall_nr = ARCH.sc_perf_evt_open
    arguments = (ctypes.pointer(attr),
                 ctypes.c_int(pid),
                 ctypes.c_int(cpu),
                 ctypes.c_int(group_fd),
                 ctypes.c_long(flags))
    return syscall(syscall_nr, *arguments)
391 | ||
392 | PERF_TYPE_TRACEPOINT = 2 | |
393 | PERF_FORMAT_GROUP = 1 << 3 | |
394 | ||
395 | PATH_DEBUGFS_TRACING = '/sys/kernel/debug/tracing' | |
396 | PATH_DEBUGFS_KVM = '/sys/kernel/debug/kvm' | |
397 | ||
class Group(object):
    """Represents a perf event group."""

    def __init__(self):
        self.events = []

    def add_event(self, event):
        """Appends event to the group; the first one is the leader."""
        self.events.append(event)

    def read(self):
        """Returns a dict with 'event name: value' for all events in the
        group.

        All values are fetched with a single read from the file
        descriptor of the group leader.  See perf_event_open(2) for
        details.

        Read format for the used event configuration is:
        struct read_format {
            u64 nr; /* The number of events */
            struct {
                u64 value; /* The value of the event */
            } values[nr];
        };

        """
        names = [event.name for event in self.events]
        count = len(names)
        leader_fd = self.events[0].fd
        raw = os.read(leader_fd, 8 * (count + 1))
        # The leading u64 (nr) is skipped as padding.
        values = struct.unpack('8x' + 'Q' * count, raw)
        return dict(zip(names, values))
429 | ||
class Event(object):
    """Represents a performance event and manages its life cycle."""
    def __init__(self, name, group, trace_cpu, trace_pid, trace_point,
                 trace_filter, trace_set='kvm'):
        self.name = name
        self.fd = None
        self.setup_event(group, trace_cpu, trace_pid, trace_point,
                         trace_filter, trace_set)

    def __del__(self):
        """Closes the event's file descriptor.

        As no python file object was created for the file descriptor,
        python will not reference count the descriptor and will not
        close it itself automatically, so we do it.

        """
        # Compare against None, as 0 is a valid descriptor.  getattr
        # covers instances whose __init__ never ran.
        fd = getattr(self, 'fd', None)
        if fd is not None:
            # Forget the descriptor first so it is never closed twice.
            self.fd = None
            os.close(fd)

    def setup_event_attribute(self, trace_set, trace_point):
        """Returns an initialized ctype perf_event_attr struct.

        The config field is set to the tracepoint id that is read from
        the event's 'id' file below the tracing debugfs directory.

        """

        id_path = os.path.join(PATH_DEBUGFS_TRACING, 'events', trace_set,
                               trace_point, 'id')

        event_attr = perf_event_attr()
        with open(id_path) as id_file:
            event_attr.config = int(id_file.read())
        return event_attr

    def setup_event(self, group, trace_cpu, trace_pid, trace_point,
                    trace_filter, trace_set):
        """Sets up the perf event in Linux.

        Issues the syscall to register the event in the kernel and
        then sets the optional filter.

        Raises OSError if the syscall fails.

        """

        event_attr = self.setup_event_attribute(trace_set, trace_point)

        # First event will be group leader.
        group_leader = -1

        # All others have to pass the leader's descriptor instead.
        if group.events:
            group_leader = group.events[0].fd

        fd = perf_event_open(event_attr, trace_pid,
                             trace_cpu, group_leader, 0)
        if fd == -1:
            err = ctypes.get_errno()
            raise OSError(err, os.strerror(err),
                          'while calling sys_perf_event_open().')

        # Take ownership before the filter is set, so that __del__
        # closes the descriptor even if the ioctl below raises.
        self.fd = fd

        if trace_filter:
            fcntl.ioctl(fd, ARCH.ioctl_numbers['SET_FILTER'],
                        trace_filter)

    def enable(self):
        """Enables the trace event in the kernel.

        Enabling the group leader makes reading counters from it and the
        events under it possible.

        """
        fcntl.ioctl(self.fd, ARCH.ioctl_numbers['ENABLE'], 0)

    def disable(self):
        """Disables the trace event in the kernel.

        Disabling the group leader makes reading all counters under it
        impossible.

        """
        fcntl.ioctl(self.fd, ARCH.ioctl_numbers['DISABLE'], 0)

    def reset(self):
        """Resets the count of the trace event in the kernel."""
        fcntl.ioctl(self.fd, ARCH.ioctl_numbers['RESET'], 0)
512 | ||
class TracepointProvider(object):
    """Data provider for the stats class.

    Manages the events/groups from which it acquires its data.

    """
    def __init__(self):
        self.group_leaders = []
        self.filters = get_filters()
        self._fields = self.get_available_fields()
        self._pid = 0

    def get_available_fields(self):
        """Returns a list of available events of format 'event name(filter
        name)'.

        All available events have directories under
        /sys/kernel/debug/tracing/events/ which export information
        about the specific event. Therefore, listing the dirs gives us
        a list of all available events.

        Some events like the vm exit reasons can be filtered for
        specific values. To take account for that, the routine below
        creates special fields with the following format:
        event name(filter name)

        """
        path = os.path.join(PATH_DEBUGFS_TRACING, 'events', 'kvm')
        fields = walkdir(path)[1]
        extra = []
        for field in fields:
            if field in self.filters:
                filter_name_, filter_dicts = self.filters[field]
                for name in filter_dicts:
                    extra.append(field + '(' + name + ')')
        fields += extra
        return fields

    def setup_traces(self):
        """Creates all event and group objects needed to be able to retrieve
        data.

        One group is created per thread of the monitored pid, or per
        online cpu if no pid is monitored.  Exits the program if the
        NOFILE rlimit is rejected with a ValueError.

        """
        if self._pid > 0:
            # Fetch list of all threads of the monitored pid, as qemu
            # starts a thread for each vcpu.
            path = os.path.join('/proc', str(self._pid), 'task')
            groupids = walkdir(path)[1]
        else:
            groupids = get_online_cpus()

        # The constant is needed as a buffer for python libs, std
        # streams and other files that the script opens.
        newlim = len(groupids) * len(self._fields) + 50
        try:
            softlim_, hardlim = resource.getrlimit(resource.RLIMIT_NOFILE)

            if hardlim < newlim:
                # Now we need CAP_SYS_RESOURCE, to increase the hard limit.
                resource.setrlimit(resource.RLIMIT_NOFILE, (newlim, newlim))
            else:
                # Raising the soft limit is sufficient.
                resource.setrlimit(resource.RLIMIT_NOFILE, (newlim, hardlim))

        except ValueError:
            sys.exit("NOFILE rlimit could not be raised to {0}".format(newlim))

        # Matches fields of the form 'event name(filter name)'.
        filtered_field = re.compile(r'(.*)\((.*)\)')

        for groupid in groupids:
            group = Group()

            # From perf_event_open(2):
            # pid > 0 and cpu == -1
            # This measures the specified process/thread on any CPU.
            #
            # pid == -1 and cpu >= 0
            # This measures all processes/threads on the specified CPU.
            #
            # Both values only depend on the group, not on the field.
            if self._pid == 0:
                trace_cpu, trace_pid = groupid, -1
            else:
                trace_cpu, trace_pid = -1, int(groupid)

            for name in self._fields:
                tracepoint = name
                tracefilter = None
                match = filtered_field.match(name)
                if match:
                    tracepoint, sub = match.groups()
                    tracefilter = ('%s==%d\0' %
                                   (self.filters[tracepoint][0],
                                    self.filters[tracepoint][1][sub]))

                group.add_event(Event(name=name,
                                      group=group,
                                      trace_cpu=trace_cpu,
                                      trace_pid=trace_pid,
                                      trace_point=tracepoint,
                                      trace_filter=tracefilter))

            self.group_leaders.append(group)

    def available_fields(self):
        """Returns the same list as get_available_fields()."""
        return self.get_available_fields()

    @property
    def fields(self):
        return self._fields

    @fields.setter
    def fields(self, fields):
        """Enables/disables the (un)wanted events"""
        self._fields = fields
        for group in self.group_leaders:
            for index, event in enumerate(group.events):
                if event.name in fields:
                    event.reset()
                    event.enable()
                else:
                    # Do not disable the group leader.
                    # It would disable all of its events.
                    if index != 0:
                        event.disable()

    @property
    def pid(self):
        return self._pid

    @pid.setter
    def pid(self, pid):
        """Changes the monitored pid by setting new traces."""
        self._pid = pid
        # The garbage collector will get rid of all Event/Group
        # objects and open files after removing the references.
        self.group_leaders = []
        self.setup_traces()
        self.fields = self._fields

    def read(self):
        """Returns 'event name: current value' for all enabled events.

        The values of all groups are summed up per event name.

        """
        ret = defaultdict(int)
        wanted = frozenset(self._fields)
        for group in self.group_leaders:
            # items() works on Python 2 and 3, unlike iteritems().
            for name, val in group.read().items():
                if name in wanted:
                    ret[name] += val
        return ret
652 | ||
class DebugfsProvider(object):
    """Provides data from the files that KVM creates in the kvm debugfs
    folder."""
    def __init__(self):
        self._fields = self.get_available_fields()
        self._pid = 0
        self.do_read = True
        # Read the files directly in the kvm debugfs folder until a
        # pid is selected, so read() also works before pid is assigned.
        self.paths = ['']

    def get_available_fields(self):
        """Returns a list of available fields.

        The fields are all available KVM debugfs files

        """
        return walkdir(PATH_DEBUGFS_KVM)[2]

    @property
    def fields(self):
        return self._fields

    @fields.setter
    def fields(self, fields):
        self._fields = fields

    @property
    def pid(self):
        return self._pid

    @pid.setter
    def pid(self, pid):
        """Selects the sub folders to read from for the given pid.

        A pid of 0 selects the files directly in the kvm debugfs
        folder again.

        """
        # Always record the pid, so that the getter is correct after
        # switching back to 0 as well.
        self._pid = pid

        if pid != 0:
            vms = walkdir(PATH_DEBUGFS_KVM)[1]
            # Without any sub folder there is nothing to filter on.
            # Recompute the flag each time, so that an earlier failure
            # does not disable reading for good.
            self.do_read = bool(vms)

            # Folder names start with '<pid>-'.  A substring test would
            # also pick up pid 112 when looking for pid 12.
            prefix = '{0}-'.format(pid)
            self.paths = [vm for vm in vms if vm.startswith(prefix)]

        else:
            self.paths = ['']
            self.do_read = True

    def read(self):
        """Returns a dict with format:'file name / field -> current value'.

        The values of all selected paths are summed up per field.

        """
        results = {}

        # If no debugfs filtering support is available, then don't read.
        if not self.do_read:
            return results

        for path in self.paths:
            for field in self._fields:
                results[field] = results.get(field, 0) \
                                 + self.read_field(field, path)

        return results

    def read_field(self, field, path):
        """Returns the value of a single field from a specific VM.

        Returns 0 if the file cannot be opened or read.

        """
        try:
            with open(os.path.join(PATH_DEBUGFS_KVM, path, field)) as stat:
                return int(stat.read())
        except IOError:
            return 0
f9bc9e65 JF |
720 | |
class Stats(object):
    """Manages the data providers and the data they provide.

    Filters (fields regex and pid) are pushed down to every provider
    and the data of all providers is collected in one dict.

    """
    def __init__(self, providers, pid, fields=None):
        self.providers = providers
        self._pid_filter = pid
        self._fields_filter = fields
        self.values = {}
        self.update_provider_pid()
        self.update_provider_filters()

    def update_provider_filters(self):
        """Propagates fields filters to providers."""
        # As we reset the counters when updating the fields we can
        # also clear the cache of old values.
        self.values = {}
        pattern = self._fields_filter
        for provider in self.providers:
            available = provider.get_available_fields()
            if pattern:
                selected = [key for key in available
                            if re.match(pattern, key) is not None]
            else:
                # Without a filter every field is wanted.
                selected = list(available)
            provider.fields = selected

    def update_provider_pid(self):
        """Propagates pid filters to providers."""
        pid = self._pid_filter
        for provider in self.providers:
            provider.pid = pid

    @property
    def fields_filter(self):
        return self._fields_filter

    @fields_filter.setter
    def fields_filter(self, fields_filter):
        self._fields_filter = fields_filter
        self.update_provider_filters()

    @property
    def pid_filter(self):
        return self._pid_filter

    @pid_filter.setter
    def pid_filter(self, pid):
        self._pid_filter = pid
        self.values = {}
        self.update_provider_pid()

    def get(self):
        """Returns a dict with field -> (value, delta to last value) of all
        provider data."""
        for provider in self.providers:
            sample = provider.read()
            for key in provider.fields:
                previous = self.values.get(key, (0, 0))
                total = sample.get(key, 0)
                if previous is None:
                    delta = None
                else:
                    delta = total - previous[0]
                self.values[key] = (total, delta)
        return self.values
788 | ||
789 | LABEL_WIDTH = 40 | |
790 | NUMBER_WIDTH = 10 | |
791 | ||
792 | class Tui(object): | |
fabc7128 | 793 | """Instruments curses to draw a nice text ui.""" |
f9bc9e65 JF |
794 | def __init__(self, stats): |
795 | self.stats = stats | |
796 | self.screen = None | |
797 | self.drilldown = False | |
798 | self.update_drilldown() | |
799 | ||
800 | def __enter__(self): | |
801 | """Initialises curses for later use. Based on curses.wrapper | |
802 | implementation from the Python standard library.""" | |
803 | self.screen = curses.initscr() | |
804 | curses.noecho() | |
805 | curses.cbreak() | |
806 | ||
807 | # The try/catch works around a minor bit of | |
808 | # over-conscientiousness in the curses module, the error | |
809 | # return from C start_color() is ignorable. | |
810 | try: | |
811 | curses.start_color() | |
9fc0adfc | 812 | except curses.error: |
f9bc9e65 JF |
813 | pass |
814 | ||
a0b4e6a0 SR |
815 | # Hide cursor in extra statement as some monochrome terminals |
816 | # might support hiding but not colors. | |
817 | try: | |
818 | curses.curs_set(0) | |
819 | except curses.error: | |
820 | pass | |
821 | ||
f9bc9e65 JF |
822 | curses.use_default_colors() |
823 | return self | |
824 | ||
825 | def __exit__(self, *exception): | |
826 | """Resets the terminal to its normal state. Based on curses.wrappre | |
827 | implementation from the Python standard library.""" | |
828 | if self.screen: | |
829 | self.screen.keypad(0) | |
830 | curses.echo() | |
831 | curses.nocbreak() | |
832 | curses.endwin() | |
833 | ||
834 | def update_drilldown(self): | |
fabc7128 | 835 | """Sets or removes a filter that only allows fields without braces.""" |
f9bc9e65 JF |
836 | if not self.stats.fields_filter: |
837 | self.stats.fields_filter = r'^[^\(]*$' | |
838 | ||
839 | elif self.stats.fields_filter == r'^[^\(]*$': | |
840 | self.stats.fields_filter = None | |
841 | ||
f0cf040f | 842 | def update_pid(self, pid): |
fabc7128 | 843 | """Propagates pid selection to stats object.""" |
f0cf040f JF |
844 | self.stats.pid_filter = pid |
845 | ||
f9bc9e65 | 846 | def refresh(self, sleeptime): |
fabc7128 | 847 | """Refreshes on-screen data.""" |
f9bc9e65 | 848 | self.screen.erase() |
f0cf040f JF |
849 | if self.stats.pid_filter > 0: |
850 | self.screen.addstr(0, 0, 'kvm statistics - pid {0}' | |
851 | .format(self.stats.pid_filter), | |
852 | curses.A_BOLD) | |
853 | else: | |
854 | self.screen.addstr(0, 0, 'kvm statistics - summary', curses.A_BOLD) | |
f9bc9e65 JF |
855 | self.screen.addstr(2, 1, 'Event') |
856 | self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH - | |
857 | len('Total'), 'Total') | |
858 | self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 8 - | |
859 | len('Current'), 'Current') | |
860 | row = 3 | |
861 | stats = self.stats.get() | |
862 | def sortkey(x): | |
863 | if stats[x][1]: | |
864 | return (-stats[x][1], -stats[x][0]) | |
865 | else: | |
866 | return (0, -stats[x][0]) | |
867 | for key in sorted(stats.keys(), key=sortkey): | |
868 | ||
869 | if row >= self.screen.getmaxyx()[0]: | |
870 | break | |
871 | values = stats[key] | |
872 | if not values[0] and not values[1]: | |
873 | break | |
874 | col = 1 | |
875 | self.screen.addstr(row, col, key) | |
876 | col += LABEL_WIDTH | |
877 | self.screen.addstr(row, col, '%10d' % (values[0],)) | |
878 | col += NUMBER_WIDTH | |
879 | if values[1] is not None: | |
880 | self.screen.addstr(row, col, '%8d' % (values[1] / sleeptime,)) | |
881 | row += 1 | |
882 | self.screen.refresh() | |
883 | ||
884 | def show_filter_selection(self): | |
fabc7128 JF |
885 | """Draws filter selection mask. |
886 | ||
887 | Asks for a valid regex and sets the fields filter accordingly. | |
888 | ||
889 | """ | |
f9bc9e65 JF |
890 | while True: |
891 | self.screen.erase() | |
892 | self.screen.addstr(0, 0, | |
893 | "Show statistics for events matching a regex.", | |
894 | curses.A_BOLD) | |
895 | self.screen.addstr(2, 0, | |
896 | "Current regex: {0}" | |
897 | .format(self.stats.fields_filter)) | |
898 | self.screen.addstr(3, 0, "New regex: ") | |
899 | curses.echo() | |
900 | regex = self.screen.getstr() | |
901 | curses.noecho() | |
902 | if len(regex) == 0: | |
903 | return | |
904 | try: | |
905 | re.compile(regex) | |
906 | self.stats.fields_filter = regex | |
907 | return | |
908 | except re.error: | |
909 | continue | |
910 | ||
def show_vm_selection(self):
    """Draws the pid selection mask and reads the pid to restrict to.

    The prompt is redrawn until the user enters either 0 or a number
    for which a directory exists below /proc. The accepted value is
    handed to self.update_pid(). Non-numeric input is rejected.

    """
    screen = self.screen
    while True:
        screen.erase()
        screen.addstr(0, 0,
                      'Show statistics for specific pid.',
                      curses.A_BOLD)
        screen.addstr(1, 0,
                      'This might limit the shown data to the trace '
                      'statistics.')

        # Echo is switched on only while the user types the number.
        curses.echo()
        screen.addstr(3, 0, "Pid [0 or pid]: ")
        answer = screen.getstr()
        curses.noecho()

        try:
            pid = int(answer)
            # 0 is always accepted; any other value must name an
            # existing /proc entry.
            if pid != 0 and not os.path.isdir(
                    os.path.join('/proc/', str(pid))):
                continue
            self.update_pid(pid)
            break
        except ValueError:
            # Input was not a number, ask again.
            continue
946 | ||
def show_stats(self):
    """Refreshes the screen and processes user input.

    The first cycle uses an interval of 0.25 seconds, every later one
    3 seconds. Recognised keys:
      x - toggle drilldown
      q - quit
      f - open the filter selection
      p - open the pid selection
    A KeyboardInterrupt also leaves the loop.

    """
    interval = 0.25
    while True:
        self.refresh(interval)
        # halfdelay() takes its timeout in tenths of a second.
        curses.halfdelay(int(interval * 10))
        interval = 3
        try:
            pressed = self.screen.getkey()
            if pressed == 'q':
                break
            elif pressed == 'x':
                self.drilldown = not self.drilldown
                self.update_drilldown()
            elif pressed == 'f':
                self.show_filter_selection()
            elif pressed == 'p':
                self.show_vm_selection()
        except KeyboardInterrupt:
            break
        except curses.error:
            # Presumably raised when no key arrived within the
            # timeout; just start the next refresh cycle.
            continue
969 | ||
def batch(stats):
    """Prints statistics in a key, value format.

    Two samples are taken one second apart; the first one is thrown
    away and only the second one is printed, one line per key in
    sorted key order. Each line shows the key followed by the first
    two elements of its entry.

    """
    stats.get()  # First sample, result intentionally unused.
    time.sleep(1)
    snapshot = stats.get()
    for name in sorted(snapshot):
        entry = snapshot[name]
        sys.stdout.write('%-42s%10d%10d' % (name, entry[0], entry[1])
                         + '\n')
978 | ||
def log(stats):
    """Prints statistics as reiterating key block, multiple value blocks.

    The set of keys is sampled once at start-up. Afterwards one row of
    values (element 1 of every entry returned by stats.get()) is
    written each second, and the key banner is repeated every 20 value
    rows. Every banner and every value row is terminated by a newline,
    so the output is line oriented like vmstat.

    This function loops until it is interrupted; it does not return.

    """
    keys = sorted(stats.get())
    banner_repeat = 20

    def write_row(cells):
        # Cells are separated by one blank (what the print statement's
        # trailing comma produced) and the row is ended explicitly.
        sys.stdout.write(' '.join(cells) + '\n')

    def banner():
        write_row('%s' % k for k in keys)

    def statline():
        s = stats.get()
        write_row(' %9d' % s[k][1] for k in keys)

    line = 0
    while True:
        time.sleep(1)
        if line % banner_repeat == 0:
            banner()
        statline()
        line += 1
999 | ||
def get_options():
    """Returns processed program arguments.

    The command line is read from sys.argv. The returned object
    carries the attributes once, log, tracepoints, debugfs (booleans,
    default False), fields (default None) and pid (int, default 0).
    Positional arguments are parsed but discarded.

    """
    description_text = """
This script displays various statistics about VMs running under KVM.
The statistics are gathered from the KVM debugfs entries and / or the
currently available perf traces.

The monitoring takes additional cpu cycles and might affect the VM's
performance.

Requirements:
- Access to:
/sys/kernel/debug/kvm
/sys/kernel/debug/trace/events/*
/proc/pid/task
- /proc/sys/kernel/perf_event_paranoid < 1 if user has no
CAP_SYS_ADMIN and perf events are used.
- CAP_SYS_RESOURCE if the hard limit is not high enough to allow
the large number of files that are possibly opened.
"""

    class PlainHelpFormatter(optparse.IndentedHelpFormatter):
        """Help formatter that hands the description back unchanged."""

        def format_description(self, description):
            # Only a trailing newline is appended to a non-empty text.
            return description + "\n" if description else ""

    # (option strings, destination attribute, help text) of all
    # boolean switches, in the order they appear in the help output.
    switches = (
        (('-1', '--once', '--batch'), 'once',
         'run in batch mode for one second'),
        (('-l', '--log'), 'log',
         'run in logging mode (like vmstat)'),
        (('-t', '--tracepoints'), 'tracepoints',
         'retrieve statistics from tracepoints'),
        (('-d', '--debugfs'), 'debugfs',
         'retrieve statistics from debugfs'),
    )

    parser = optparse.OptionParser(description=description_text,
                                   formatter=PlainHelpFormatter())
    for flags, dest, help_text in switches:
        parser.add_option(*flags,
                          action='store_true',
                          default=False,
                          dest=dest,
                          help=help_text)
    parser.add_option('-f', '--fields',
                      action='store',
                      default=None,
                      dest='fields',
                      help='fields to display (regex)')
    parser.add_option('-p', '--pid',
                      action='store',
                      default=0,
                      type=int,
                      dest='pid',
                      help='restrict statistics to pid')

    parsed = parser.parse_args(sys.argv)
    return parsed[0]
1069 | ||
def get_providers(options):
    """Returns a list of data providers depending on the passed options.

    A TracepointProvider is created when options.tracepoints is set
    and a DebugfsProvider when options.debugfs is set. If neither is
    requested, a single TracepointProvider is used.

    """
    selected = []

    if options.tracepoints:
        selected.append(TracepointProvider())
    if options.debugfs:
        selected.append(DebugfsProvider())

    # Nothing requested explicitly: fall back to tracepoints.
    return selected or [TracepointProvider()]
1082 | ||
def check_access(options):
    """Exits if the current user can't access all needed directories.

    Checks, in this order:
      - /sys/kernel/debug exists, otherwise exit with status 1.
      - PATH_DEBUGFS_KVM exists, otherwise exit with status 1.
      - PATH_DEBUGFS_TRACING exists whenever tracepoints would be used
        (requested with -t, or debugfs not requested). If it is
        missing and -t was given explicitly, exit with status 1;
        otherwise switch to debugfs statistics.

    Returns the options object; its debugfs attribute is set to True
    when the debugfs fallback was taken.

    """
    if not os.path.exists('/sys/kernel/debug'):
        # Terminate the message with a newline like every other
        # message written by this function.
        sys.stderr.write('Please enable CONFIG_DEBUG_FS in your kernel.\n')
        sys.exit(1)

    if not os.path.exists(PATH_DEBUGFS_KVM):
        sys.stderr.write("Please make sure, that debugfs is mounted and "
                         "readable by the current user:\n"
                         "('mount -t debugfs debugfs /sys/kernel/debug')\n"
                         "Also ensure, that the kvm modules are loaded.\n")
        sys.exit(1)

    wants_tracepoints = options.tracepoints or not options.debugfs
    if wants_tracepoints and not os.path.exists(PATH_DEBUGFS_TRACING):
        sys.stderr.write("Please enable CONFIG_TRACING in your kernel "
                         "when using the option -t (default).\n"
                         "If it is enabled, make {0} readable by the "
                         "current user.\n"
                         .format(PATH_DEBUGFS_TRACING))
        if options.tracepoints:
            # Tracepoints were requested explicitly, no fallback.
            sys.exit(1)

        sys.stderr.write("Falling back to debugfs statistics!\n")
        options.debugfs = True
        # Pause before continuing, presumably so that the messages
        # above can be read before the display starts.
        sleep(5)

    return options
1111 | ||
def main():
    """Parses the options, validates them and starts the chosen output.

    A pid greater than 0 must have a directory below /proc, otherwise
    the program exits with an error message. Depending on the options,
    the statistics are shown in logging mode, in batch mode, or in the
    text ui, which is the default.

    """
    options = check_access(get_options())

    pid_dir = os.path.join('/proc/', str(options.pid))
    if options.pid > 0 and not os.path.isdir(pid_dir):
        sys.stderr.write('Did you use a (unsupported) tid instead of a pid?\n')
        sys.exit('Specified pid does not exist.')

    stats = Stats(get_providers(options), options.pid, fields=options.fields)

    if options.log:
        log(stats)
    elif options.once:
        batch(stats)
    else:
        with Tui(stats) as tui:
            tui.show_stats()
1132 | ||
# Start the tool only when this file is executed as a script.
if __name__ == '__main__':
    main()