arch/x86/events/intel/rapl.c
/*
 * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
 * Copyright (C) 2013 Google, Inc., Stephane Eranian
 *
 * Intel RAPL interface is specified in the IA-32 Manual Vol3b
 * section 14.7.1 (September 2013)
 *
 * RAPL provides more controls than just reporting energy consumption;
 * however, here we only expose the free running energy consumption
 * counters (pp0, pkg, dram, pp1).
 *
 * Each of those counters increments in a power unit defined by the
 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
 * but it can vary.
 *
 * Counter to RAPL event mappings:
 *
 *  pp0 counter: consumption of all physical cores (power plane 0)
 *	  event: rapl_energy_cores
 *    perf code: 0x1
 *
 *  pkg counter: consumption of the whole processor package
 *	  event: rapl_energy_pkg
 *    perf code: 0x2
 *
 * dram counter: consumption of the dram domain (servers only)
 *	  event: rapl_energy_dram
 *    perf code: 0x3
 *
 *  gpu counter: consumption of the built-in GPU domain (clients only)
 *	  event: rapl_energy_gpu
 *    perf code: 0x4
 *
 * We manage those counters as free running (read-only). They may be
 * used simultaneously by other tools, such as turbostat.
 *
 * The events only support system-wide mode counting. There is no
 * sampling support because it does not make sense and is not
 * supported by the RAPL hardware.
 *
 * Because we want to avoid floating-point operations in the kernel,
 * the events are all reported in fixed point arithmetic (32.32).
 * Tools must convert the counts to Joules, e.g. with
 * ldexp(raw_count, -32), and divide by the measurement duration
 * to obtain Watts.
 */
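
/*
 * Conversion sketch (user space, illustrative only; not part of this
 * driver): turning the raw 32.32 fixed-point count reported by this
 * PMU into Joules and Watts. "elapsed_sec" is a hypothetical duration
 * kept by the measuring tool.
 *
 *	#include <math.h>
 *
 *	double joules = ldexp((double)raw_count, -32);
 *	double watts  = joules / elapsed_sec;
 */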

#define pr_fmt(fmt) "RAPL PMU: " fmt

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <asm/cpu_device_id.h>
#include "../perf_event.h"

/*
 * RAPL energy status counters
 */
#define RAPL_IDX_PP0_NRG_STAT	0	/* all cores */
#define INTEL_RAPL_PP0		0x1	/* pseudo-encoding */
#define RAPL_IDX_PKG_NRG_STAT	1	/* entire package */
#define INTEL_RAPL_PKG		0x2	/* pseudo-encoding */
#define RAPL_IDX_RAM_NRG_STAT	2	/* DRAM */
#define INTEL_RAPL_RAM		0x3	/* pseudo-encoding */
#define RAPL_IDX_PP1_NRG_STAT	3	/* gpu */
#define INTEL_RAPL_PP1		0x4	/* pseudo-encoding */

#define NR_RAPL_DOMAINS		0x4
static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
	"pp0-core",
	"package",
	"dram",
	"pp1-gpu",
};

/* Clients have PP0, PKG, PP1 */
#define RAPL_IDX_CLN	(1<<RAPL_IDX_PP0_NRG_STAT|\
			 1<<RAPL_IDX_PKG_NRG_STAT|\
			 1<<RAPL_IDX_PP1_NRG_STAT)

/* Servers have PP0, PKG, RAM */
#define RAPL_IDX_SRV	(1<<RAPL_IDX_PP0_NRG_STAT|\
			 1<<RAPL_IDX_PKG_NRG_STAT|\
			 1<<RAPL_IDX_RAM_NRG_STAT)

/* Haswell clients have PP0, PKG, RAM, PP1 */
#define RAPL_IDX_HSW	(1<<RAPL_IDX_PP0_NRG_STAT|\
			 1<<RAPL_IDX_PKG_NRG_STAT|\
			 1<<RAPL_IDX_RAM_NRG_STAT|\
			 1<<RAPL_IDX_PP1_NRG_STAT)

/* Knights Landing has PKG, RAM */
#define RAPL_IDX_KNL	(1<<RAPL_IDX_PKG_NRG_STAT|\
			 1<<RAPL_IDX_RAM_NRG_STAT)

/*
 * event code: LSB 8 bits, passed in attr->config
 * any other bit is reserved
 */
#define RAPL_EVENT_MASK	0xFFULL

#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format)		\
static ssize_t __rapl_##_var##_show(struct kobject *kobj,	\
				struct kobj_attribute *attr,	\
				char *page)			\
{								\
	BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);		\
	return sprintf(page, _format "\n");			\
}								\
static struct kobj_attribute format_attr_##_var =		\
	__ATTR(_name, 0444, __rapl_##_var##_show, NULL)
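
/*
 * For illustration (not part of the driver): the single invocation of
 * this macro further down, DEFINE_RAPL_FORMAT_ATTR(event, event,
 * "config:0-7"), expands roughly to
 *
 *	static ssize_t __rapl_event_show(struct kobject *kobj,
 *					 struct kobj_attribute *attr,
 *					 char *page)
 *	{
 *		return sprintf(page, "config:0-7\n");
 *	}
 *	static struct kobj_attribute format_attr_event =
 *		__ATTR(event, 0444, __rapl_event_show, NULL);
 *
 * i.e. a sysfs "format" file telling perf that the event code lives in
 * bits 0-7 of attr->config.
 */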

#define RAPL_CNTR_WIDTH	32

#define RAPL_EVENT_ATTR_STR(_name, v, str)					\
static struct perf_pmu_events_attr event_attr_##v = {				\
	.attr		= __ATTR(_name, 0444, perf_event_sysfs_show, NULL),	\
	.id		= 0,							\
	.event_str	= str,							\
};

struct rapl_pmu {
	spinlock_t	 lock;
	int		 n_active;
	struct list_head active_list;
	struct pmu	 *pmu;
	ktime_t		 timer_interval;
	struct hrtimer	 hrtimer;
};

/* 1/2^hw_unit Joule */
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
static struct pmu rapl_pmu_class;
static cpumask_t rapl_cpu_mask;
static int rapl_cntr_mask;
static u64 rapl_timer_ms;

static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);

static inline u64 rapl_read_counter(struct perf_event *event)
{
	u64 raw;

	rdmsrl(event->hw.event_base, raw);
	return raw;
}

static inline u64 rapl_scale(u64 v, int cfg)
{
	if (cfg > NR_RAPL_DOMAINS) {
		pr_warn("Invalid domain %d, failed to scale data\n", cfg);
		return v;
	}
	/*
	 * scale delta to smallest unit (1/2^32)
	 * users must then scale back: count * 1/(2^32) to get Joules,
	 * or use ldexp(count, -32).
	 * Watts = Joules/Time delta
	 */
	return v << (32 - rapl_hw_unit[cfg - 1]);
}
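
/*
 * Worked example (illustrative, not from the source): on a CPU whose
 * energy unit is 2^-16 Joules (rapl_hw_unit == 16, the SandyBridge
 * default), a raw delta of 0x100 counter ticks is scaled as
 *
 *	0x100 << (32 - 16) == 0x1000000
 *
 * and 0x1000000 * 2^-32 J == 0x100 * 2^-16 J, so the value handed to
 * user space is always in 2^-32 J units regardless of the hardware
 * unit.
 */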

static u64 rapl_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	u64 prev_raw_count, new_raw_count;
	s64 delta, sdelta;
	int shift = RAPL_CNTR_WIDTH;

again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdmsrl(event->hw.event_base, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
			    new_raw_count) != prev_raw_count) {
		cpu_relax();
		goto again;
	}

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	sdelta = rapl_scale(delta, event->hw.config);

	local64_add(sdelta, &event->count);

	return new_raw_count;
}
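
/*
 * Illustrative note (not part of the driver): the shift dance above
 * confines both values to the 32-bit counter width before subtracting,
 * so a counter wrap is handled for free. E.g. with
 *
 *	prev_raw_count = 0xfffffff0, new_raw_count = 0x00000010,
 *
 * (new << 32) - (prev << 32) wraps around to 0x20 << 32, and the
 * arithmetic shift right by 32 yields a delta of 0x20, exactly the
 * 0x20 ticks that elapsed across the wrap.
 */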

static void rapl_start_hrtimer(struct rapl_pmu *pmu)
{
	hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
		      HRTIMER_MODE_REL_PINNED);
}

static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
	struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
	struct perf_event *event;
	unsigned long flags;

	if (!pmu->n_active)
		return HRTIMER_NORESTART;

	spin_lock_irqsave(&pmu->lock, flags);

	list_for_each_entry(event, &pmu->active_list, active_entry)
		rapl_event_update(event);

	spin_unlock_irqrestore(&pmu->lock, flags);

	hrtimer_forward_now(hrtimer, pmu->timer_interval);

	return HRTIMER_RESTART;
}

static void rapl_hrtimer_init(struct rapl_pmu *pmu)
{
	struct hrtimer *hr = &pmu->hrtimer;

	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hr->function = rapl_hrtimer_handle;
}

static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
				   struct perf_event *event)
{
	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	event->hw.state = 0;

	list_add_tail(&event->active_entry, &pmu->active_list);

	local64_set(&event->hw.prev_count, rapl_read_counter(event));

	pmu->n_active++;
	if (pmu->n_active == 1)
		rapl_start_hrtimer(pmu);
}

static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
	unsigned long flags;

	spin_lock_irqsave(&pmu->lock, flags);
	__rapl_pmu_event_start(pmu, event);
	spin_unlock_irqrestore(&pmu->lock, flags);
}

static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	spin_lock_irqsave(&pmu->lock, flags);

	/* mark event as deactivated and stopped */
	if (!(hwc->state & PERF_HES_STOPPED)) {
		WARN_ON_ONCE(pmu->n_active <= 0);
		pmu->n_active--;
		if (pmu->n_active == 0)
			hrtimer_cancel(&pmu->hrtimer);

		list_del(&event->active_entry);

		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	/* check if update of sw counter is necessary */
	if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		rapl_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}

	spin_unlock_irqrestore(&pmu->lock, flags);
}

static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	spin_lock_irqsave(&pmu->lock, flags);

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

	if (mode & PERF_EF_START)
		__rapl_pmu_event_start(pmu, event);

	spin_unlock_irqrestore(&pmu->lock, flags);

	return 0;
}

static void rapl_pmu_event_del(struct perf_event *event, int flags)
{
	rapl_pmu_event_stop(event, PERF_EF_UPDATE);
}

static int rapl_pmu_event_init(struct perf_event *event)
{
	u64 cfg = event->attr.config & RAPL_EVENT_MASK;
	int bit, msr, ret = 0;

	/* only look at RAPL events */
	if (event->attr.type != rapl_pmu_class.type)
		return -ENOENT;

	/* check only supported bits are set */
	if (event->attr.config & ~RAPL_EVENT_MASK)
		return -EINVAL;

	/*
	 * check event is known (determines counter)
	 */
	switch (cfg) {
	case INTEL_RAPL_PP0:
		bit = RAPL_IDX_PP0_NRG_STAT;
		msr = MSR_PP0_ENERGY_STATUS;
		break;
	case INTEL_RAPL_PKG:
		bit = RAPL_IDX_PKG_NRG_STAT;
		msr = MSR_PKG_ENERGY_STATUS;
		break;
	case INTEL_RAPL_RAM:
		bit = RAPL_IDX_RAM_NRG_STAT;
		msr = MSR_DRAM_ENERGY_STATUS;
		break;
	case INTEL_RAPL_PP1:
		bit = RAPL_IDX_PP1_NRG_STAT;
		msr = MSR_PP1_ENERGY_STATUS;
		break;
	default:
		return -EINVAL;
	}
	/* check event supported */
	if (!(rapl_cntr_mask & (1 << bit)))
		return -EINVAL;

	/* unsupported modes and filters */
	if (event->attr.exclude_user   ||
	    event->attr.exclude_kernel ||
	    event->attr.exclude_hv     ||
	    event->attr.exclude_idle   ||
	    event->attr.exclude_host   ||
	    event->attr.exclude_guest  ||
	    event->attr.sample_period) /* no sampling */
		return -EINVAL;

	/* must be done before validate_group */
	event->hw.event_base = msr;
	event->hw.config = cfg;
	event->hw.idx = bit;

	return ret;
}
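
/*
 * Usage sketch (illustrative): since this PMU registers under the name
 * "power" and exposes the event strings defined below, a typical
 * invocation from user space is
 *
 *	perf stat -a -e power/energy-pkg/,power/energy-cores/ -- sleep 1
 *
 * perf builds attr->config from the sysfs event string (event=0x02 for
 * energy-pkg), which is exactly the cfg value decoded above.
 */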

static void rapl_pmu_event_read(struct perf_event *event)
{
	rapl_event_update(event);
}

static ssize_t rapl_get_attr_cpumask(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
}

static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);

static struct attribute *rapl_pmu_attrs[] = {
	&dev_attr_cpumask.attr,
	NULL,
};

static struct attribute_group rapl_pmu_attr_group = {
	.attrs = rapl_pmu_attrs,
};

RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg  , rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram  , rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu  , rapl_gpu, "event=0x04");

RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit  , rapl_pkg_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit  , rapl_ram_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit  , rapl_gpu_unit, "Joules");

/*
 * we compute in 2^-32 Joule (~0.23 nJ) increments regardless of the MSR unit
 */
RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");

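/*
 * Illustrative note (not from the source): the perf tool reads the
 * .unit and .scale sysfs attributes and multiplies the raw count by
 * the scale, so 2.3283064365386962890625e-10 (= 2^-32) turns the
 * fixed-point count into Joules without the kernel ever touching
 * floating point.
 */
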
static struct attribute *rapl_events_srv_attr[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_ram),

	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_ram_unit),

	EVENT_PTR(rapl_cores_scale),
	EVENT_PTR(rapl_pkg_scale),
	EVENT_PTR(rapl_ram_scale),
	NULL,
};

static struct attribute *rapl_events_cln_attr[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_gpu),

	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_gpu_unit),

	EVENT_PTR(rapl_cores_scale),
	EVENT_PTR(rapl_pkg_scale),
	EVENT_PTR(rapl_gpu_scale),
	NULL,
};

static struct attribute *rapl_events_hsw_attr[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_gpu),
	EVENT_PTR(rapl_ram),

	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_gpu_unit),
	EVENT_PTR(rapl_ram_unit),

	EVENT_PTR(rapl_cores_scale),
	EVENT_PTR(rapl_pkg_scale),
	EVENT_PTR(rapl_gpu_scale),
	EVENT_PTR(rapl_ram_scale),
	NULL,
};

static struct attribute *rapl_events_knl_attr[] = {
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_ram),

	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_ram_unit),

	EVENT_PTR(rapl_pkg_scale),
	EVENT_PTR(rapl_ram_scale),
	NULL,
};

static struct attribute_group rapl_pmu_events_group = {
	.name = "events",
	.attrs = NULL, /* patched at runtime */
};

DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
	&format_attr_event.attr,
	NULL,
};

static struct attribute_group rapl_pmu_format_group = {
	.name = "format",
	.attrs = rapl_formats_attr,
};

const struct attribute_group *rapl_attr_groups[] = {
	&rapl_pmu_attr_group,
	&rapl_pmu_format_group,
	&rapl_pmu_events_group,
	NULL,
};

static struct pmu rapl_pmu_class = {
	.attr_groups	= rapl_attr_groups,
	.task_ctx_nr	= perf_invalid_context, /* system-wide only */
	.event_init	= rapl_pmu_event_init,
	.add		= rapl_pmu_event_add, /* must have */
	.del		= rapl_pmu_event_del, /* must have */
	.start		= rapl_pmu_event_start,
	.stop		= rapl_pmu_event_stop,
	.read		= rapl_pmu_event_read,
};

static void rapl_cpu_exit(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
	int i, phys_id = topology_physical_package_id(cpu);
	int target = -1;

	/* find a new cpu on same package */
	for_each_online_cpu(i) {
		if (i == cpu)
			continue;
		if (phys_id == topology_physical_package_id(i)) {
			target = i;
			break;
		}
	}
	/*
	 * clear cpu from cpumask; if it was set and some other cpu on
	 * the same package is still online, move the events over to it
	 */
	if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0)
		cpumask_set_cpu(target, &rapl_cpu_mask);

	WARN_ON(cpumask_empty(&rapl_cpu_mask));
	/*
	 * migrate events and context to new cpu
	 */
	if (target >= 0)
		perf_pmu_migrate_context(pmu->pmu, cpu, target);

	/* cancel overflow polling timer for CPU */
	hrtimer_cancel(&pmu->hrtimer);
}

static void rapl_cpu_init(int cpu)
{
	int i, phys_id = topology_physical_package_id(cpu);

	/* check if phys_id is already covered */
	for_each_cpu(i, &rapl_cpu_mask) {
		if (phys_id == topology_physical_package_id(i))
			return;
	}
	/* was not found, so add it */
	cpumask_set_cpu(cpu, &rapl_cpu_mask);
}
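
/*
 * Illustrative note (not from the source): RAPL counters are per
 * package, so only one CPU per package needs to sit in rapl_cpu_mask.
 * On a hypothetical 2-socket box with CPUs 0-3 on package 0 and CPUs
 * 4-7 on package 1, the mask ends up as {0, 4}: the first CPU of each
 * package to come online claims its package here.
 */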

static int rapl_cpu_prepare(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
	int phys_id = topology_physical_package_id(cpu);

	if (pmu)
		return 0;

	if (phys_id < 0)
		return -1;

	pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
	if (!pmu)
		return -1;

	spin_lock_init(&pmu->lock);

	INIT_LIST_HEAD(&pmu->active_list);

	pmu->pmu = &rapl_pmu_class;

	pmu->timer_interval = ms_to_ktime(rapl_timer_ms);

	rapl_hrtimer_init(pmu);

	/* set RAPL pmu for this cpu for now */
	per_cpu(rapl_pmu, cpu) = pmu;
	per_cpu(rapl_pmu_to_free, cpu) = NULL;

	return 0;
}

static void rapl_cpu_kfree(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu);

	kfree(pmu);

	per_cpu(rapl_pmu_to_free, cpu) = NULL;
}

static int rapl_cpu_dying(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);

	if (!pmu)
		return 0;

	per_cpu(rapl_pmu, cpu) = NULL;

	per_cpu(rapl_pmu_to_free, cpu) = pmu;

	return 0;
}

static int rapl_cpu_notifier(struct notifier_block *self,
			     unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		rapl_cpu_prepare(cpu);
		break;
	case CPU_STARTING:
		rapl_cpu_init(cpu);
		break;
	case CPU_UP_CANCELED:
	case CPU_DYING:
		rapl_cpu_dying(cpu);
		break;
	case CPU_ONLINE:
	case CPU_DEAD:
		rapl_cpu_kfree(cpu);
		break;
	case CPU_DOWN_PREPARE:
		rapl_cpu_exit(cpu);
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}

static __init void rapl_hsw_server_quirk(void)
{
	/*
	 * DRAM domain on HSW server has a fixed energy unit which can be
	 * different from the unit reported by the power unit MSR.
	 * See "Intel Xeon Processor E5-1600 and E5-2600 v3 Product
	 * Families, Vol 2 of 2, Datasheet, September 2014,
	 * Reference Number: 330784-001"
	 */
	rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16;
}
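
/*
 * Illustrative note (not from the source): a hw unit of 16 means each
 * DRAM counter tick is 2^-16 J ~= 15.3 microjoules, the fixed DRAM
 * energy unit documented for these parts, whatever MSR_RAPL_POWER_UNIT
 * claims.
 */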

static int rapl_check_hw_unit(void (*quirk)(void))
{
	u64 msr_rapl_power_unit_bits;
	int i;

	/* protect rdmsrl() to handle virtualization */
	if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
		return -1;
	for (i = 0; i < NR_RAPL_DOMAINS; i++)
		rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;

	/* Apply cpu model quirk */
	if (quirk)
		quirk();

	/*
	 * Calculate the timer rate:
	 * Use a reference of 200W for scaling the timeout to avoid counter
	 * overflows. 200W = 200 Joules/sec.
	 * Divide the interval by 2 to avoid lockstep (2 * 100).
	 * If the hw unit is 32, then we use 2 ms (1/200/2).
	 */
	rapl_timer_ms = 2;
	if (rapl_hw_unit[0] < 32) {
		rapl_timer_ms = (1000 / (2 * 100));
		rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
	}
	return 0;
}
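
/*
 * Worked example (illustrative, not from the source): with the
 * SandyBridge unit of 2^-16 J, the 32-bit counter spans
 * 2^32 * 2^-16 J = 65536 J, which a 200 W load burns through in
 * ~328 s. The formula above gives
 *
 *	rapl_timer_ms = 5 * 2^(32 - 16 - 1) = 163840 ms ~= 164 s,
 *
 * i.e. the hrtimer polls each active event twice per worst-case wrap
 * period.
 */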

static void __init rapl_advertise(void)
{
	int i;

	pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
		hweight32(rapl_cntr_mask), rapl_timer_ms);

	for (i = 0; i < NR_RAPL_DOMAINS; i++) {
		if (rapl_cntr_mask & (1 << i)) {
			pr_info("hw unit of domain %s 2^-%d Joules\n",
				rapl_domain_names[i], rapl_hw_unit[i]);
		}
	}
}

static int __init rapl_prepare_cpus(void)
{
	unsigned int cpu;
	int ret;

	for_each_online_cpu(cpu) {
		ret = rapl_cpu_prepare(cpu);
		if (ret)
			return ret;
		rapl_cpu_init(cpu);
	}
	return 0;
}

static void __init cleanup_rapl_pmus(void)
{
	int cpu;

	for_each_online_cpu(cpu)
		kfree(per_cpu(rapl_pmu, cpu));
}

static const struct x86_cpu_id rapl_cpu_match[] __initconst = {
	[0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
	[1] = {},
};

static int __init rapl_pmu_init(void)
{
	void (*quirk)(void) = NULL;
	int ret;

	if (!x86_match_cpu(rapl_cpu_match))
		return -ENODEV;

	switch (boot_cpu_data.x86_model) {
	case 42: /* Sandy Bridge */
	case 58: /* Ivy Bridge */
		rapl_cntr_mask = RAPL_IDX_CLN;
		rapl_pmu_events_group.attrs = rapl_events_cln_attr;
		break;
	case 63: /* Haswell-Server */
		quirk = rapl_hsw_server_quirk;
		rapl_cntr_mask = RAPL_IDX_SRV;
		rapl_pmu_events_group.attrs = rapl_events_srv_attr;
		break;
	case 60: /* Haswell */
	case 69: /* Haswell-ULT */
	case 61: /* Broadwell */
		rapl_cntr_mask = RAPL_IDX_HSW;
		rapl_pmu_events_group.attrs = rapl_events_hsw_attr;
		break;
	case 45: /* Sandy Bridge-EP */
	case 62: /* IvyTown */
		rapl_cntr_mask = RAPL_IDX_SRV;
		rapl_pmu_events_group.attrs = rapl_events_srv_attr;
		break;
	case 87: /* Knights Landing */
		quirk = rapl_hsw_server_quirk;
		rapl_cntr_mask = RAPL_IDX_KNL;
		rapl_pmu_events_group.attrs = rapl_events_knl_attr;
		break;
	default:
		return -ENODEV;
	}

	ret = rapl_check_hw_unit(quirk);
	if (ret)
		return ret;

	cpu_notifier_register_begin();

	ret = rapl_prepare_cpus();
	if (ret)
		goto out;

	ret = perf_pmu_register(&rapl_pmu_class, "power", -1);
	if (ret)
		goto out;

	__perf_cpu_notifier(rapl_cpu_notifier);
	cpu_notifier_register_done();
	rapl_advertise();
	return 0;

out:
	pr_warn("Initialization failed (%d), disabled\n", ret);
	cleanup_rapl_pmus();
	cpu_notifier_register_done();
	return ret;
}
device_initcall(rapl_pmu_init);