/*
 * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
 * Copyright (C) 2013 Google, Inc., Stephane Eranian
 *
 * Intel RAPL interface is specified in the IA-32 Manual Vol3b
 * section 14.7.1 (September 2013)
 *
 * RAPL provides more controls than just reporting energy consumption,
 * however here we only expose the free running energy consumption
 * counters (pp0, pkg, dram, pp1).
 *
 * Each of those counters increments in an energy unit defined by the
 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
 * but it can vary.
 *
 * Counter to rapl events mappings:
 *
 *  pp0 counter: consumption of all physical cores (power plane 0)
 *        event: rapl_energy_cores
 *    perf code: 0x1
 *
 *  pkg counter: consumption of the whole processor package
 *        event: rapl_energy_pkg
 *    perf code: 0x2
 *
 * dram counter: consumption of the dram domain (servers only)
 *        event: rapl_energy_dram
 *    perf code: 0x3
 *
 *  pp1 counter: consumption of the built-in GPU domain (clients only)
 *        event: rapl_energy_gpu
 *    perf code: 0x4
 *
 * We manage those counters as free running (read-only). They may be
 * used simultaneously by other tools, such as turbostat.
 *
 * The events only support system-wide mode counting. There is no
 * sampling support because it does not make sense and is not
 * supported by the RAPL hardware.
 *
 * Because we want to avoid floating-point operations in the kernel,
 * the events are all reported in fixed point arithmetic (32.32).
 * Tools must scale the counts to convert them to Joules, for example
 * with ldexp(raw_count, -32), and divide by the measurement duration
 * to obtain Watts.
 */
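
/*
 * Illustrative example, not part of this driver: a userspace tool that
 * reads one of these counters would convert the 32.32 fixed point value
 * back to Joules and Watts roughly as follows (variable names below are
 * hypothetical):
 *
 *      double joules = ldexp((double)raw_count, -32);
 *      double watts  = joules / elapsed_seconds;
 *
 * This is the same scaling that e.g. "perf stat -a -e power/energy-pkg/"
 * applies via the energy-pkg.scale sysfs attribute defined below.
 */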

#define pr_fmt(fmt) "RAPL PMU: " fmt

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <asm/cpu_device_id.h>
#include "../perf_event.h"

/*
 * RAPL energy status counters
 */
#define RAPL_IDX_PP0_NRG_STAT   0       /* all cores */
#define INTEL_RAPL_PP0          0x1     /* pseudo-encoding */
#define RAPL_IDX_PKG_NRG_STAT   1       /* entire package */
#define INTEL_RAPL_PKG          0x2     /* pseudo-encoding */
#define RAPL_IDX_RAM_NRG_STAT   2       /* DRAM */
#define INTEL_RAPL_RAM          0x3     /* pseudo-encoding */
#define RAPL_IDX_PP1_NRG_STAT   3       /* gpu */
#define INTEL_RAPL_PP1          0x4     /* pseudo-encoding */

#define NR_RAPL_DOMAINS 0x4
static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
        "pp0-core",
        "package",
        "dram",
        "pp1-gpu",
};

/* Clients have PP0, PKG, PP1 */
#define RAPL_IDX_CLN    (1<<RAPL_IDX_PP0_NRG_STAT|\
                         1<<RAPL_IDX_PKG_NRG_STAT|\
                         1<<RAPL_IDX_PP1_NRG_STAT)

/* Servers have PP0, PKG, RAM */
#define RAPL_IDX_SRV    (1<<RAPL_IDX_PP0_NRG_STAT|\
                         1<<RAPL_IDX_PKG_NRG_STAT|\
                         1<<RAPL_IDX_RAM_NRG_STAT)

/* Haswell/Broadwell clients have PP0, PKG, RAM, PP1 */
#define RAPL_IDX_HSW    (1<<RAPL_IDX_PP0_NRG_STAT|\
                         1<<RAPL_IDX_PKG_NRG_STAT|\
                         1<<RAPL_IDX_RAM_NRG_STAT|\
                         1<<RAPL_IDX_PP1_NRG_STAT)

/* Knights Landing has PKG, RAM */
#define RAPL_IDX_KNL    (1<<RAPL_IDX_PKG_NRG_STAT|\
                         1<<RAPL_IDX_RAM_NRG_STAT)

/*
 * event code: LSB 8 bits, passed in attr->config
 * any other bit is reserved
 */
#define RAPL_EVENT_MASK 0xFFULL

#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format)                  \
static ssize_t __rapl_##_var##_show(struct kobject *kobj,              \
                                    struct kobj_attribute *attr,       \
                                    char *page)                        \
{                                                                      \
        BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);                    \
        return sprintf(page, _format "\n");                            \
}                                                                      \
static struct kobj_attribute format_attr_##_var =                      \
        __ATTR(_name, 0444, __rapl_##_var##_show, NULL)

#define RAPL_CNTR_WIDTH 32

#define RAPL_EVENT_ATTR_STR(_name, v, str)                                    \
static struct perf_pmu_events_attr event_attr_##v = {                        \
        .attr           = __ATTR(_name, 0444, perf_event_sysfs_show, NULL),  \
        .id             = 0,                                                  \
        .event_str      = str,                                                \
};

struct rapl_pmu {
        raw_spinlock_t          lock;
        int                     n_active;
        int                     cpu;
        struct list_head        active_list;
        struct pmu              *pmu;
        ktime_t                 timer_interval;
        struct hrtimer          hrtimer;
};

/* 1/2^hw_unit Joule */
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
static struct pmu rapl_pmu_class;
static cpumask_t rapl_cpu_mask;
static int rapl_cntr_mask;
static u64 rapl_timer_ms;

static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);

static inline u64 rapl_read_counter(struct perf_event *event)
{
        u64 raw;
        rdmsrl(event->hw.event_base, raw);
        return raw;
}

static inline u64 rapl_scale(u64 v, int cfg)
{
        if (cfg > NR_RAPL_DOMAINS) {
                pr_warn("Invalid domain %d, failed to scale data\n", cfg);
                return v;
        }
        /*
         * scale delta to smallest unit (1/2^32)
         * users must then scale back: count * 1/(2^32) to get Joules
         * or use ldexp(count, -32).
         * Watts = Joules/Time delta
         */
        return v << (32 - rapl_hw_unit[cfg - 1]);
}
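
/*
 * Worked example for the scaling above (illustrative): with the common
 * hardware unit of 2^-16 Joules (rapl_hw_unit[] entry of 16), a raw delta
 * of 1 becomes 1 << (32 - 16) = 65536 in 2^-32 Joule units, i.e.
 * 65536 * 2^-32 J = 2^-16 J, which matches the hardware increment.
 */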

static u64 rapl_event_update(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        u64 prev_raw_count, new_raw_count;
        s64 delta, sdelta;
        int shift = RAPL_CNTR_WIDTH;

again:
        prev_raw_count = local64_read(&hwc->prev_count);
        rdmsrl(event->hw.event_base, new_raw_count);

        if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
                            new_raw_count) != prev_raw_count) {
                cpu_relax();
                goto again;
        }

        /*
         * Now we have the new raw value and have updated the prev
         * timestamp already. We can now calculate the elapsed delta
         * (event-)time and add that to the generic event.
         *
         * Careful, not all hw sign-extends above the physical width
         * of the count.
         */
        delta = (new_raw_count << shift) - (prev_raw_count << shift);
        delta >>= shift;

        sdelta = rapl_scale(delta, event->hw.config);

        local64_add(sdelta, &event->count);

        return new_raw_count;
}
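
/*
 * Illustrative note on the shift arithmetic above: the energy counters
 * occupy the low 32 bits of the MSR, so a wrap from prev = 0xffffffff to
 * new = 0x00000005 yields
 * ((0x5ULL << 32) - (0xffffffffULL << 32)) >> 32 = 6 once the subtraction
 * wraps in 64 bits and the arithmetic right shift is applied, i.e. the
 * six increments that actually occurred.
 */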

static void rapl_start_hrtimer(struct rapl_pmu *pmu)
{
        hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
                      HRTIMER_MODE_REL_PINNED);
}

static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
        struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
        struct perf_event *event;
        unsigned long flags;

        if (!pmu->n_active)
                return HRTIMER_NORESTART;

        raw_spin_lock_irqsave(&pmu->lock, flags);

        list_for_each_entry(event, &pmu->active_list, active_entry)
                rapl_event_update(event);

        raw_spin_unlock_irqrestore(&pmu->lock, flags);

        hrtimer_forward_now(hrtimer, pmu->timer_interval);

        return HRTIMER_RESTART;
}

static void rapl_hrtimer_init(struct rapl_pmu *pmu)
{
        struct hrtimer *hr = &pmu->hrtimer;

        hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        hr->function = rapl_hrtimer_handle;
}

static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
                                   struct perf_event *event)
{
        if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
                return;

        event->hw.state = 0;

        list_add_tail(&event->active_entry, &pmu->active_list);

        local64_set(&event->hw.prev_count, rapl_read_counter(event));

        pmu->n_active++;
        if (pmu->n_active == 1)
                rapl_start_hrtimer(pmu);
}

static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
        struct rapl_pmu *pmu = event->pmu_private;
        unsigned long flags;

        raw_spin_lock_irqsave(&pmu->lock, flags);
        __rapl_pmu_event_start(pmu, event);
        raw_spin_unlock_irqrestore(&pmu->lock, flags);
}

static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
        struct rapl_pmu *pmu = event->pmu_private;
        struct hw_perf_event *hwc = &event->hw;
        unsigned long flags;

        raw_spin_lock_irqsave(&pmu->lock, flags);

        /* mark event as deactivated and stopped */
        if (!(hwc->state & PERF_HES_STOPPED)) {
                WARN_ON_ONCE(pmu->n_active <= 0);
                pmu->n_active--;
                if (pmu->n_active == 0)
                        hrtimer_cancel(&pmu->hrtimer);

                list_del(&event->active_entry);

                WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
                hwc->state |= PERF_HES_STOPPED;
        }

        /* check if update of sw counter is necessary */
        if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
                /*
                 * Drain the remaining delta count out of an event
                 * that we are disabling:
                 */
                rapl_event_update(event);
                hwc->state |= PERF_HES_UPTODATE;
        }

        raw_spin_unlock_irqrestore(&pmu->lock, flags);
}

static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
        struct rapl_pmu *pmu = event->pmu_private;
        struct hw_perf_event *hwc = &event->hw;
        unsigned long flags;

        raw_spin_lock_irqsave(&pmu->lock, flags);

        hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

        if (mode & PERF_EF_START)
                __rapl_pmu_event_start(pmu, event);

        raw_spin_unlock_irqrestore(&pmu->lock, flags);

        return 0;
}

static void rapl_pmu_event_del(struct perf_event *event, int flags)
{
        rapl_pmu_event_stop(event, PERF_EF_UPDATE);
}

static int rapl_pmu_event_init(struct perf_event *event)
{
        struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
        u64 cfg = event->attr.config & RAPL_EVENT_MASK;
        int bit, msr, ret = 0;

        /* only look at RAPL events */
        if (event->attr.type != rapl_pmu_class.type)
                return -ENOENT;

        /* check only supported bits are set */
        if (event->attr.config & ~RAPL_EVENT_MASK)
                return -EINVAL;

        if (event->cpu < 0)
                return -EINVAL;

        /*
         * check event is known (determines counter)
         */
        switch (cfg) {
        case INTEL_RAPL_PP0:
                bit = RAPL_IDX_PP0_NRG_STAT;
                msr = MSR_PP0_ENERGY_STATUS;
                break;
        case INTEL_RAPL_PKG:
                bit = RAPL_IDX_PKG_NRG_STAT;
                msr = MSR_PKG_ENERGY_STATUS;
                break;
        case INTEL_RAPL_RAM:
                bit = RAPL_IDX_RAM_NRG_STAT;
                msr = MSR_DRAM_ENERGY_STATUS;
                break;
        case INTEL_RAPL_PP1:
                bit = RAPL_IDX_PP1_NRG_STAT;
                msr = MSR_PP1_ENERGY_STATUS;
                break;
        default:
                return -EINVAL;
        }
        /* check event supported */
        if (!(rapl_cntr_mask & (1 << bit)))
                return -EINVAL;

        /* unsupported modes and filters */
        if (event->attr.exclude_user   ||
            event->attr.exclude_kernel ||
            event->attr.exclude_hv     ||
            event->attr.exclude_idle   ||
            event->attr.exclude_host   ||
            event->attr.exclude_guest  ||
            event->attr.sample_period) /* no sampling */
                return -EINVAL;

        /* must be done before validate_group */
        event->cpu = pmu->cpu;
        event->pmu_private = pmu;
        event->hw.event_base = msr;
        event->hw.config = cfg;
        event->hw.idx = bit;

        return ret;
}

static void rapl_pmu_event_read(struct perf_event *event)
{
        rapl_event_update(event);
}

static ssize_t rapl_get_attr_cpumask(struct device *dev,
                                     struct device_attribute *attr, char *buf)
{
        return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
}

static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);

static struct attribute *rapl_pmu_attrs[] = {
        &dev_attr_cpumask.attr,
        NULL,
};

static struct attribute_group rapl_pmu_attr_group = {
        .attrs = rapl_pmu_attrs,
};

RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg  , rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram  , rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu  , rapl_gpu, "event=0x04");

RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit  , rapl_pkg_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit  , rapl_ram_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit  , rapl_gpu_unit, "Joules");

/*
 * we compute in 2^-32 Joule (about 0.23 nJ) increments regardless of
 * the hardware unit reported in the MSR
 */
RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-pkg.scale,   rapl_pkg_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-ram.scale,   rapl_ram_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale,   rapl_gpu_scale, "2.3283064365386962890625e-10");

static struct attribute *rapl_events_srv_attr[] = {
        EVENT_PTR(rapl_cores),
        EVENT_PTR(rapl_pkg),
        EVENT_PTR(rapl_ram),

        EVENT_PTR(rapl_cores_unit),
        EVENT_PTR(rapl_pkg_unit),
        EVENT_PTR(rapl_ram_unit),

        EVENT_PTR(rapl_cores_scale),
        EVENT_PTR(rapl_pkg_scale),
        EVENT_PTR(rapl_ram_scale),
        NULL,
};

static struct attribute *rapl_events_cln_attr[] = {
        EVENT_PTR(rapl_cores),
        EVENT_PTR(rapl_pkg),
        EVENT_PTR(rapl_gpu),

        EVENT_PTR(rapl_cores_unit),
        EVENT_PTR(rapl_pkg_unit),
        EVENT_PTR(rapl_gpu_unit),

        EVENT_PTR(rapl_cores_scale),
        EVENT_PTR(rapl_pkg_scale),
        EVENT_PTR(rapl_gpu_scale),
        NULL,
};

static struct attribute *rapl_events_hsw_attr[] = {
        EVENT_PTR(rapl_cores),
        EVENT_PTR(rapl_pkg),
        EVENT_PTR(rapl_gpu),
        EVENT_PTR(rapl_ram),

        EVENT_PTR(rapl_cores_unit),
        EVENT_PTR(rapl_pkg_unit),
        EVENT_PTR(rapl_gpu_unit),
        EVENT_PTR(rapl_ram_unit),

        EVENT_PTR(rapl_cores_scale),
        EVENT_PTR(rapl_pkg_scale),
        EVENT_PTR(rapl_gpu_scale),
        EVENT_PTR(rapl_ram_scale),
        NULL,
};

static struct attribute *rapl_events_knl_attr[] = {
        EVENT_PTR(rapl_pkg),
        EVENT_PTR(rapl_ram),

        EVENT_PTR(rapl_pkg_unit),
        EVENT_PTR(rapl_ram_unit),

        EVENT_PTR(rapl_pkg_scale),
        EVENT_PTR(rapl_ram_scale),
        NULL,
};

static struct attribute_group rapl_pmu_events_group = {
        .name = "events",
        .attrs = NULL, /* patched at runtime */
};

DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
        &format_attr_event.attr,
        NULL,
};

static struct attribute_group rapl_pmu_format_group = {
        .name = "format",
        .attrs = rapl_formats_attr,
};

const struct attribute_group *rapl_attr_groups[] = {
        &rapl_pmu_attr_group,
        &rapl_pmu_format_group,
        &rapl_pmu_events_group,
        NULL,
};

static struct pmu rapl_pmu_class = {
        .attr_groups    = rapl_attr_groups,
        .task_ctx_nr    = perf_invalid_context, /* system-wide only */
        .event_init     = rapl_pmu_event_init,
        .add            = rapl_pmu_event_add, /* must have */
        .del            = rapl_pmu_event_del, /* must have */
        .start          = rapl_pmu_event_start,
        .stop           = rapl_pmu_event_stop,
        .read           = rapl_pmu_event_read,
};

static void rapl_cpu_exit(int cpu)
{
        struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
        int i, phys_id = topology_physical_package_id(cpu);
        int target = -1;

        /* find a new cpu on same package */
        for_each_online_cpu(i) {
                if (i == cpu)
                        continue;
                if (phys_id == topology_physical_package_id(i)) {
                        target = i;
                        break;
                }
        }
        /*
         * clear cpu from cpumask:
         * if it was set in the cpumask and another cpu on the same
         * package is still online, then move to that new cpu
         */
        if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0)
                cpumask_set_cpu(target, &rapl_cpu_mask);

        WARN_ON(cpumask_empty(&rapl_cpu_mask));
        /*
         * migrate events and context to new cpu
         */
        if (target >= 0)
                perf_pmu_migrate_context(pmu->pmu, cpu, target);

        /* cancel overflow polling timer for CPU */
        hrtimer_cancel(&pmu->hrtimer);
}

static void rapl_cpu_init(int cpu)
{
        int i, phys_id = topology_physical_package_id(cpu);

        /* check if phys_id is already covered */
        for_each_cpu(i, &rapl_cpu_mask) {
                if (phys_id == topology_physical_package_id(i))
                        return;
        }
        /* was not found, so add it */
        cpumask_set_cpu(cpu, &rapl_cpu_mask);
}

static int rapl_cpu_prepare(int cpu)
{
        struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
        int phys_id = topology_physical_package_id(cpu);

        if (pmu)
                return 0;

        if (phys_id < 0)
                return -1;

        pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
        if (!pmu)
                return -1;
        raw_spin_lock_init(&pmu->lock);

        INIT_LIST_HEAD(&pmu->active_list);

        pmu->pmu = &rapl_pmu_class;
        pmu->cpu = cpu;

        pmu->timer_interval = ms_to_ktime(rapl_timer_ms);

        rapl_hrtimer_init(pmu);

        /* set RAPL pmu for this cpu for now */
        per_cpu(rapl_pmu, cpu) = pmu;
        per_cpu(rapl_pmu_to_free, cpu) = NULL;

        return 0;
}

static void rapl_cpu_kfree(int cpu)
{
        struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu);

        kfree(pmu);

        per_cpu(rapl_pmu_to_free, cpu) = NULL;
}

static int rapl_cpu_dying(int cpu)
{
        struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);

        if (!pmu)
                return 0;

        per_cpu(rapl_pmu, cpu) = NULL;

        per_cpu(rapl_pmu_to_free, cpu) = pmu;

        return 0;
}

static int rapl_cpu_notifier(struct notifier_block *self,
                             unsigned long action, void *hcpu)
{
        unsigned int cpu = (long)hcpu;

        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_UP_PREPARE:
                rapl_cpu_prepare(cpu);
                break;
        case CPU_STARTING:
                rapl_cpu_init(cpu);
                break;
        case CPU_UP_CANCELED:
        case CPU_DYING:
                rapl_cpu_dying(cpu);
                break;
        case CPU_ONLINE:
        case CPU_DEAD:
                rapl_cpu_kfree(cpu);
                break;
        case CPU_DOWN_PREPARE:
                rapl_cpu_exit(cpu);
                break;
        default:
                break;
        }

        return NOTIFY_OK;
}

static __init void rapl_hsw_server_quirk(void)
{
        /*
         * DRAM domain on HSW server has a fixed energy unit which can be
         * different from the unit reported by the power unit MSR. See
         * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families,
         * Vol 2 of 2, Datasheet, September 2014, Reference Number: 330784-001"
         */
        rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16;
}

static int rapl_check_hw_unit(void (*quirk)(void))
{
        u64 msr_rapl_power_unit_bits;
        int i;

        /* protect rdmsrl() to handle virtualization */
        if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
                return -1;
        for (i = 0; i < NR_RAPL_DOMAINS; i++)
                rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;

        /* Apply cpu model quirk */
        if (quirk)
                quirk();

        /*
         * Calculate the timer rate:
         * Use a reference of 200W for scaling the timeout to avoid counter
         * overflows. 200W = 200 Joules/sec.
         * Divide the interval by 2 to avoid lockstep (2 * 100).
         * If the hw unit is 32, then we use 2 ms (1/200/2).
         */
        rapl_timer_ms = 2;
        if (rapl_hw_unit[0] < 32) {
                rapl_timer_ms = (1000 / (2 * 100));
                rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
        }
        return 0;
}
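
/*
 * Worked example for the timer interval above (illustrative): with the
 * common hardware unit of 2^-16 Joules, the 32-bit counter overflows
 * after 2^32 * 2^-16 J = 65536 J, i.e. ~327 s at the 200 W reference.
 * The formula then gives rapl_timer_ms = 5 * 2^(32 - 16 - 1) = 163840 ms,
 * roughly half the overflow period.
 */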

static void __init rapl_advertise(void)
{
        int i;

        pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
                hweight32(rapl_cntr_mask), rapl_timer_ms);

        for (i = 0; i < NR_RAPL_DOMAINS; i++) {
                if (rapl_cntr_mask & (1 << i)) {
                        pr_info("hw unit of domain %s 2^-%d Joules\n",
                                rapl_domain_names[i], rapl_hw_unit[i]);
                }
        }
}

static int __init rapl_prepare_cpus(void)
{
        unsigned int cpu;
        int ret;

        for_each_online_cpu(cpu) {
                ret = rapl_cpu_prepare(cpu);
                if (ret)
                        return ret;
                rapl_cpu_init(cpu);
        }
        return 0;
}

static void __init cleanup_rapl_pmus(void)
{
        int cpu;

        for_each_online_cpu(cpu)
                kfree(per_cpu(rapl_pmu, cpu));
}

static const struct x86_cpu_id rapl_cpu_match[] __initconst = {
        [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
        [1] = {},
};

static int __init rapl_pmu_init(void)
{
        void (*quirk)(void) = NULL;
        int ret;

        if (!x86_match_cpu(rapl_cpu_match))
                return -ENODEV;

        switch (boot_cpu_data.x86_model) {
        case 42: /* Sandy Bridge */
        case 58: /* Ivy Bridge */
                rapl_cntr_mask = RAPL_IDX_CLN;
                rapl_pmu_events_group.attrs = rapl_events_cln_attr;
                break;
        case 63: /* Haswell-Server */
                quirk = rapl_hsw_server_quirk;
                rapl_cntr_mask = RAPL_IDX_SRV;
                rapl_pmu_events_group.attrs = rapl_events_srv_attr;
                break;
        case 60: /* Haswell */
        case 69: /* Haswell ULT */
        case 61: /* Broadwell */
                rapl_cntr_mask = RAPL_IDX_HSW;
                rapl_pmu_events_group.attrs = rapl_events_hsw_attr;
                break;
        case 45: /* Sandy Bridge-EP */
        case 62: /* IvyTown */
                rapl_cntr_mask = RAPL_IDX_SRV;
                rapl_pmu_events_group.attrs = rapl_events_srv_attr;
                break;
        case 87: /* Knights Landing */
                quirk = rapl_hsw_server_quirk;
                rapl_cntr_mask = RAPL_IDX_KNL;
                rapl_pmu_events_group.attrs = rapl_events_knl_attr;
                break;
        default:
                return -ENODEV;
        }

        ret = rapl_check_hw_unit(quirk);
        if (ret)
                return ret;

        cpu_notifier_register_begin();

        ret = rapl_prepare_cpus();
        if (ret)
                goto out;

        ret = perf_pmu_register(&rapl_pmu_class, "power", -1);
        if (ret)
                goto out;

        __perf_cpu_notifier(rapl_cpu_notifier);
        cpu_notifier_register_done();
        rapl_advertise();
        return 0;

out:
        pr_warn("Initialization failed (%d), disabled\n", ret);
        cleanup_rapl_pmus();
        cpu_notifier_register_done();
        return ret;
}
device_initcall(rapl_pmu_init);