// SPDX-License-Identifier: GPL-2.0
/*
 * Performance event support for the System z CPU-measurement Sampling Facility
 *
 * Copyright IBM Corp. 2013, 2018
 * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
 */
#define KMSG_COMPONENT	"cpum_sf"
#define pr_fmt(fmt)	KMSG_COMPONENT ": " fmt

#include <linux/kernel.h>
#include <linux/kernel_stat.h>
#include <linux/perf_event.h>
#include <linux/percpu.h>
#include <linux/pid.h>
#include <linux/notifier.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/moduleparam.h>
#include <asm/cpu_mf.h>
#include <asm/irq.h>
#include <asm/debug.h>
#include <asm/timex.h>

/* Minimum number of sample-data-block-tables:
 * At least one table is required for the sampling buffer structure.
 * A single table contains up to 511 pointers to sample-data-blocks.
 */
#define CPUM_SF_MIN_SDBT	1

/* Number of sample-data-blocks per sample-data-block-table (SDBT):
 * A table contains SDB pointers (8 bytes) and one table-link entry
 * that points to the origin of the next SDBT.
 */
#define CPUM_SF_SDB_PER_TABLE	((PAGE_SIZE - 8) / 8)
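/* For example, with 4KB pages this evaluates to (4096 - 8) / 8 = 511
 * SDB pointers per table; the remaining 8 bytes of the page hold the
 * table-link entry to the next SDBT.
 */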

/* Maximum page offset for an SDBT table-link entry:
 * If this page offset is reached, a table-link entry to the next SDBT
 * must be added.
 */
#define CPUM_SF_SDBT_TL_OFFSET	(CPUM_SF_SDB_PER_TABLE * 8)
static inline int require_table_link(const void *sdbt)
{
	return ((unsigned long) sdbt & ~PAGE_MASK) == CPUM_SF_SDBT_TL_OFFSET;
}
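
/* Example: with 511 SDB pointers of 8 bytes each, the table-link slot
 * starts at page offset 511 * 8 = 4088, so require_table_link() returns
 * true exactly when sdbt points at the last 8-byte slot of an SDBT page.
 */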

/* Minimum and maximum sampling buffer sizes:
 *
 * This number represents the maximum size of the sampling buffer taking
 * the number of sample-data-block-tables into account. Note that these
 * numbers apply to the basic-sampling function only.
 * The maximum number of SDBs is increased by CPUM_SF_SDB_DIAG_FACTOR if
 * the diagnostic-sampling function is active.
 *
 * Sampling buffer size		Buffer characteristics
 * ---------------------------------------------------
 *	 64KB		==	  16 pages (4KB per page)
 *				   1 page  for SDB-tables
 *				  15 pages for SDBs
 *
 *	 32MB		==	8192 pages (4KB per page)
 *				  16 pages for SDB-tables
 *				8176 pages for SDBs
 */
static unsigned long __read_mostly CPUM_SF_MIN_SDB = 15;
static unsigned long __read_mostly CPUM_SF_MAX_SDB = 8176;
static unsigned long __read_mostly CPUM_SF_SDB_DIAG_FACTOR = 1;

struct sf_buffer {
	unsigned long	*sdbt;	    /* Sample-data-block-table origin */
	/* buffer characteristics (required for buffer increments) */
	unsigned long	num_sdb;    /* Number of sample-data-blocks */
	unsigned long	num_sdbt;   /* Number of sample-data-block-tables */
	unsigned long	*tail;	    /* last sample-data-block-table */
};

struct aux_buffer {
	struct sf_buffer sfb;
	unsigned long head;	   /* index of SDB of buffer head */
	unsigned long alert_mark;  /* index of SDB of alert request position */
	unsigned long empty_mark;  /* mark of SDB not marked full */
	unsigned long *sdb_index;  /* SDB address for fast lookup */
	unsigned long *sdbt_index; /* SDBT address for fast lookup */
};

struct cpu_hw_sf {
	/* CPU-measurement sampling information block */
	struct hws_qsi_info_block qsi;
	/* CPU-measurement sampling control block */
	struct hws_lsctl_request_block lsctl;
	struct sf_buffer sfb;	    /* Sampling buffer */
	unsigned int flags;	    /* Status flags */
	struct perf_event *event;   /* Scheduled perf event */
	struct perf_output_handle handle; /* AUX buffer output handle */
};
static DEFINE_PER_CPU(struct cpu_hw_sf, cpu_hw_sf);

/* Debug feature */
static debug_info_t *sfdbg;

/*
 * sf_disable() - Switch off sampling facility
 */
static int sf_disable(void)
{
	struct hws_lsctl_request_block sreq;

	memset(&sreq, 0, sizeof(sreq));
	return lsctl(&sreq);
}

/*
 * sf_buffer_available() - Check for an allocated sampling buffer
 */
static int sf_buffer_available(struct cpu_hw_sf *cpuhw)
{
	return !!cpuhw->sfb.sdbt;
}

/*
 * deallocate sampling facility buffer
 */
static void free_sampling_buffer(struct sf_buffer *sfb)
{
	unsigned long *sdbt, *curr;

	if (!sfb->sdbt)
		return;

	sdbt = sfb->sdbt;
	curr = sdbt;

	/* Free the SDBT after all SDBs are processed... */
	while (1) {
		if (!*curr || !sdbt)
			break;

		/* Process table-link entries */
		if (is_link_entry(curr)) {
			curr = get_next_sdbt(curr);
			if (sdbt)
				free_page((unsigned long) sdbt);

			/* If the origin is reached, sampling buffer is freed */
			if (curr == sfb->sdbt)
				break;
			else
				sdbt = curr;
		} else {
			/* Process SDB pointer */
			if (*curr) {
				free_page(*curr);
				curr++;
			}
		}
	}

	debug_sprintf_event(sfdbg, 5,
			    "free_sampling_buffer: freed sdbt=%p\n", sfb->sdbt);
	memset(sfb, 0, sizeof(*sfb));
}

static int alloc_sample_data_block(unsigned long *sdbt, gfp_t gfp_flags)
{
	unsigned long sdb, *trailer;

	/* Allocate and initialize sample-data-block */
	sdb = get_zeroed_page(gfp_flags);
	if (!sdb)
		return -ENOMEM;
	trailer = trailer_entry_ptr(sdb);
	*trailer = SDB_TE_ALERT_REQ_MASK;

	/* Link SDB into the sample-data-block-table */
	*sdbt = sdb;

	return 0;
}
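
/* Layout sketch of one SDB as used above (the trailer offset follows
 * from the (PAGE_SIZE-64) arithmetic in allocate_buffers() below):
 * sample-data entries fill the page from the top, and the trailer
 * entry returned by trailer_entry_ptr() occupies the final bytes of
 * the page. Setting SDB_TE_ALERT_REQ_MASK there requests a
 * measurement alert when the hardware reaches this block.
 */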

/*
 * realloc_sampling_buffer() - extend sampler memory
 *
 * Allocates new sample-data-blocks and adds them to the specified sampling
 * buffer memory.
 *
 * Important: This modifies the sampling buffer and must be called when the
 * sampling facility is disabled.
 *
 * Returns zero on success, non-zero otherwise.
 */
static int realloc_sampling_buffer(struct sf_buffer *sfb,
				   unsigned long num_sdb, gfp_t gfp_flags)
{
	int i, rc;
	unsigned long *new, *tail, *tail_prev = NULL;

	if (!sfb->sdbt || !sfb->tail)
		return -EINVAL;

	if (!is_link_entry(sfb->tail))
		return -EINVAL;

	/* Append to the existing sampling buffer, overwriting the table-link
	 * register.
	 * The tail variable always points to the "tail" (last and table-link)
	 * entry in an SDB-table.
	 */
	tail = sfb->tail;

	/* Do a sanity check whether the table-link entry points to
	 * the sampling buffer origin.
	 */
	if (sfb->sdbt != get_next_sdbt(tail)) {
		debug_sprintf_event(sfdbg, 3, "realloc_sampling_buffer: "
				    "sampling buffer is not linked: origin=%p"
				    " tail=%p\n",
				    (void *) sfb->sdbt, (void *) tail);
		return -EINVAL;
	}

	/* Allocate remaining SDBs */
	rc = 0;
	for (i = 0; i < num_sdb; i++) {
		/* Allocate a new SDB-table if it is full. */
		if (require_table_link(tail)) {
			new = (unsigned long *) get_zeroed_page(gfp_flags);
			if (!new) {
				rc = -ENOMEM;
				break;
			}
			sfb->num_sdbt++;
			/* Link current page to tail of chain */
			*tail = (unsigned long)(void *) new + 1;
			tail_prev = tail;
			tail = new;
		}

		/* Allocate a new sample-data-block.
		 * If there is not enough memory, stop the realloc process
		 * and simply use what was allocated. If this is a temporary
		 * issue, a new realloc call (if required) might succeed.
		 */
		rc = alloc_sample_data_block(tail, gfp_flags);
		if (rc) {
			/* Undo last SDBT. An SDBT with no SDB at its first
			 * entry but with an SDBT entry instead cannot be
			 * handled by the interrupt handler code.
			 * Avoid this situation.
			 */
			if (tail_prev) {
				sfb->num_sdbt--;
				free_page((unsigned long) new);
				tail = tail_prev;
			}
			break;
		}
		sfb->num_sdb++;
		tail++;
		tail_prev = new = NULL;	/* Allocated at least one SDB */
	}

	/* Link sampling buffer to its origin */
	*tail = (unsigned long) sfb->sdbt + 1;
	sfb->tail = tail;

	debug_sprintf_event(sfdbg, 4, "realloc_sampling_buffer: new buffer"
			    " settings: sdbt=%lu sdb=%lu\n",
			    sfb->num_sdbt, sfb->num_sdb);
	return rc;
}
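
/* Note on the "+ 1" used for the table-link entries above: a link
 * entry is simply the SDBT origin address with its low-order bit set,
 * which is what is_link_entry() tests for and get_next_sdbt() masks
 * off again.
 */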

/*
 * alloc_sampling_buffer() - allocate sampler memory
 *
 * Allocates and initializes a sampling buffer structure using the
 * specified number of sample-data-blocks (SDB). For each allocation,
 * a 4K page is used. The number of sample-data-block-tables (SDBT)
 * is calculated from the number of SDBs.
 * Also set the ALERT_REQ mask in each SDB's trailer.
 *
 * Returns zero on success, non-zero otherwise.
 */
static int alloc_sampling_buffer(struct sf_buffer *sfb, unsigned long num_sdb)
{
	int rc;

	if (sfb->sdbt)
		return -EINVAL;

	/* Allocate the sample-data-block-table origin */
	sfb->sdbt = (unsigned long *) get_zeroed_page(GFP_KERNEL);
	if (!sfb->sdbt)
		return -ENOMEM;
	sfb->num_sdb = 0;
	sfb->num_sdbt = 1;

	/* Link the table origin to point to itself to prepare for
	 * realloc_sampling_buffer() invocation.
	 */
	sfb->tail = sfb->sdbt;
	*sfb->tail = (unsigned long)(void *) sfb->sdbt + 1;

	/* Allocate requested number of sample-data-blocks */
	rc = realloc_sampling_buffer(sfb, num_sdb, GFP_KERNEL);
	if (rc) {
		free_sampling_buffer(sfb);
		debug_sprintf_event(sfdbg, 4, "alloc_sampling_buffer: "
			"realloc_sampling_buffer failed with rc=%i\n", rc);
	} else
		debug_sprintf_event(sfdbg, 4,
			"alloc_sampling_buffer: tear=%p dear=%p\n",
			sfb->sdbt, (void *) *sfb->sdbt);
	return rc;
}
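
/* Usage sketch, matching the 64KB example in the size table above:
 *
 *	rc = alloc_sampling_buffer(&sfb, 15);
 *
 * yields one SDBT whose first 15 entries point to SDBs and whose
 * table-link entry points back to the SDBT origin, i.e. 16 pages in
 * total.
 */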

static void sfb_set_limits(unsigned long min, unsigned long max)
{
	struct hws_qsi_info_block si;

	CPUM_SF_MIN_SDB = min;
	CPUM_SF_MAX_SDB = max;

	memset(&si, 0, sizeof(si));
	if (!qsi(&si))
		CPUM_SF_SDB_DIAG_FACTOR = DIV_ROUND_UP(si.dsdes, si.bsdes);
}

static unsigned long sfb_max_limit(struct hw_perf_event *hwc)
{
	return SAMPL_DIAG_MODE(hwc) ? CPUM_SF_MAX_SDB * CPUM_SF_SDB_DIAG_FACTOR
				    : CPUM_SF_MAX_SDB;
}

static unsigned long sfb_pending_allocs(struct sf_buffer *sfb,
					struct hw_perf_event *hwc)
{
	if (!sfb->sdbt)
		return SFB_ALLOC_REG(hwc);
	if (SFB_ALLOC_REG(hwc) > sfb->num_sdb)
		return SFB_ALLOC_REG(hwc) - sfb->num_sdb;
	return 0;
}

static int sfb_has_pending_allocs(struct sf_buffer *sfb,
				  struct hw_perf_event *hwc)
{
	return sfb_pending_allocs(sfb, hwc) > 0;
}

static void sfb_account_allocs(unsigned long num, struct hw_perf_event *hwc)
{
	/* Limit the number of SDBs to not exceed the maximum */
	num = min_t(unsigned long, num, sfb_max_limit(hwc) - SFB_ALLOC_REG(hwc));
	if (num)
		SFB_ALLOC_REG(hwc) += num;
}

static void sfb_init_allocs(unsigned long num, struct hw_perf_event *hwc)
{
	SFB_ALLOC_REG(hwc) = 0;
	sfb_account_allocs(num, hwc);
}

static void deallocate_buffers(struct cpu_hw_sf *cpuhw)
{
	if (cpuhw->sfb.sdbt)
		free_sampling_buffer(&cpuhw->sfb);
}

static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc)
{
	unsigned long n_sdb, freq, factor;
	size_t sample_size;

	/* Calculate sampling buffers using 4K pages
	 *
	 * 1. Determine the sample data size which depends on the used
	 *    sampling functions, for example, basic-sampling or
	 *    basic-sampling with diagnostic-sampling.
	 *
	 * 2. Use the sampling frequency as input. The sampling buffer is
	 *    designed for almost one second. This can be adjusted through
	 *    the "factor" variable.
	 *    In any case, alloc_sampling_buffer() sets the Alert Request
	 *    Control indicator to trigger a measurement-alert to harvest
	 *    sample-data-blocks (sdb).
	 *
	 * 3. Compute the number of sample-data-blocks and ensure a minimum
	 *    of CPUM_SF_MIN_SDB. Also ensure the upper limit does not
	 *    exceed a "calculated" maximum. The symbolic maximum is
	 *    designed for basic-sampling only and needs to be increased if
	 *    diagnostic-sampling is active.
	 *    See also the remarks for these symbolic constants.
	 *
	 * 4. Compute the number of sample-data-block-tables (SDBT) and
	 *    ensure a minimum of CPUM_SF_MIN_SDBT (one table can manage up
	 *    to 511 SDBs).
	 */
	sample_size = sizeof(struct hws_basic_entry);
	freq = sample_rate_to_freq(&cpuhw->qsi, SAMPL_RATE(hwc));
	factor = 1;
	n_sdb = DIV_ROUND_UP(freq, factor * ((PAGE_SIZE-64) / sample_size));
	if (n_sdb < CPUM_SF_MIN_SDB)
		n_sdb = CPUM_SF_MIN_SDB;

	/* If there is already a sampling buffer allocated, it is very likely
	 * that the sampling facility is enabled too. If the event to be
	 * initialized requires a greater sampling buffer, the allocation must
	 * be postponed. Changing the sampling buffer requires the sampling
	 * facility to be in the disabled state. So, account the number of
	 * required SDBs and let cpumsf_pmu_enable() resize the buffer just
	 * before the event is started.
	 */
	sfb_init_allocs(n_sdb, hwc);
	if (sf_buffer_available(cpuhw))
		return 0;

	debug_sprintf_event(sfdbg, 3,
			    "allocate_buffers: rate=%lu f=%lu sdb=%lu/%lu"
			    " sample_size=%lu cpuhw=%p\n",
			    SAMPL_RATE(hwc), freq, n_sdb, sfb_max_limit(hwc),
			    sample_size, cpuhw);

	return alloc_sampling_buffer(&cpuhw->sfb,
				     sfb_pending_allocs(&cpuhw->sfb, hwc));
}
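
/* Worked example for the computation above (the 32-byte basic-entry
 * size is only an assumption for illustration; the real size comes
 * from the hardware): one SDB then holds (4096 - 64) / 32 = 126
 * samples, and a frequency of 10000 samples per second with factor 1
 * gives n_sdb = DIV_ROUND_UP(10000, 126) = 80 SDBs, well within the
 * CPUM_SF_MIN_SDB/CPUM_SF_MAX_SDB limits.
 */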

static unsigned long min_percent(unsigned int percent, unsigned long base,
				 unsigned long min)
{
	return min_t(unsigned long, min, DIV_ROUND_UP(percent * base, 100));
}

static unsigned long compute_sfb_extent(unsigned long ratio, unsigned long base)
{
	/* Use a percentage-based approach to extend the sampling facility
	 * buffer. Accept up to 5% sample data loss.
	 * Vary the extents between 1% to 5% of the current number of
	 * sample-data-blocks.
	 */
	if (ratio <= 5)
		return 0;
	if (ratio <= 25)
		return min_percent(1, base, 1);
	if (ratio <= 50)
		return min_percent(1, base, 1);
	if (ratio <= 75)
		return min_percent(2, base, 2);
	if (ratio <= 100)
		return min_percent(3, base, 3);
	if (ratio <= 250)
		return min_percent(4, base, 4);

	return min_percent(5, base, 8);
}
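
/* For example, ratio = 60 selects min_percent(2, base, 2); with
 * base = 100 SDBs this is min(2, DIV_ROUND_UP(2 * 100, 100)) = 2,
 * so the buffer is extended by two SDBs.
 */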

static void sfb_account_overflows(struct cpu_hw_sf *cpuhw,
				  struct hw_perf_event *hwc)
{
	unsigned long ratio, num;

	if (!OVERFLOW_REG(hwc))
		return;

	/* The sample_overflow contains the average number of sample data
	 * entries that have been lost because sample-data-blocks were full.
	 *
	 * Calculate the total number of sample data entries that have been
	 * discarded. Then calculate the ratio of lost samples to total samples
	 * per second in percent.
	 */
	ratio = DIV_ROUND_UP(100 * OVERFLOW_REG(hwc) * cpuhw->sfb.num_sdb,
			     sample_rate_to_freq(&cpuhw->qsi, SAMPL_RATE(hwc)));

	/* Compute number of sample-data-blocks */
	num = compute_sfb_extent(ratio, cpuhw->sfb.num_sdb);
	if (num)
		sfb_account_allocs(num, hwc);

	debug_sprintf_event(sfdbg, 5, "sfb: overflow: overflow=%llu ratio=%lu"
			    " num=%lu\n", OVERFLOW_REG(hwc), ratio, num);
	OVERFLOW_REG(hwc) = 0;
}
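
/* Example (illustrative numbers): with OVERFLOW_REG(hwc) = 2 lost
 * samples per SDB on average, num_sdb = 100 and a sampling frequency
 * of 10000 samples per second, ratio = DIV_ROUND_UP(100 * 2 * 100,
 * 10000) = 2 percent, which is below the 5% threshold of
 * compute_sfb_extent(), so no additional SDBs are requested.
 */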

/* extend_sampling_buffer() - Extend sampling buffer
 * @sfb:	Sampling buffer structure (for local CPU)
 * @hwc:	Perf event hardware structure
 *
 * Use this function to extend the sampling buffer based on the overflow counter
 * and postponed allocation extents stored in the specified Perf event hardware.
 *
 * Important: This function disables the sampling facility in order to safely
 *	      change the sampling buffer structure. Do not call this function
 *	      when the PMU is active.
 */
static void extend_sampling_buffer(struct sf_buffer *sfb,
				   struct hw_perf_event *hwc)
{
	unsigned long num, num_old;
	int rc;

	num = sfb_pending_allocs(sfb, hwc);
	if (!num)
		return;
	num_old = sfb->num_sdb;

	/* Disable the sampling facility to reset any states and also
	 * clear pending measurement alerts.
	 */
	sf_disable();

	/* Extend the sampling buffer.
	 * This memory allocation typically happens in an atomic context when
	 * called by perf. Because this is a reallocation, it is fine if the
	 * new SDB-request cannot be satisfied immediately.
	 */
	rc = realloc_sampling_buffer(sfb, num, GFP_ATOMIC);
	if (rc)
		debug_sprintf_event(sfdbg, 5, "sfb: extend: realloc "
				    "failed with rc=%i\n", rc);

	if (sfb_has_pending_allocs(sfb, hwc))
		debug_sprintf_event(sfdbg, 5, "sfb: extend: "
				    "req=%lu alloc=%lu remaining=%lu\n",
				    num, sfb->num_sdb - num_old,
				    sfb_pending_allocs(sfb, hwc));
}


/* Number of perf events counting hardware events */
static atomic_t num_events;
/* Used to avoid races in calling reserve/release_cpumf_hardware */
static DEFINE_MUTEX(pmc_reserve_mutex);

#define PMC_INIT	0
#define PMC_RELEASE	1
#define PMC_FAILURE	2
static void setup_pmc_cpu(void *flags)
{
	int err;
	struct cpu_hw_sf *cpusf = this_cpu_ptr(&cpu_hw_sf);

	err = 0;
	switch (*((int *) flags)) {
	case PMC_INIT:
		memset(cpusf, 0, sizeof(*cpusf));
		err = qsi(&cpusf->qsi);
		if (err)
			break;
		cpusf->flags |= PMU_F_RESERVED;
		err = sf_disable();
		if (err)
			pr_err("Switching off the sampling facility failed "
			       "with rc=%i\n", err);
		debug_sprintf_event(sfdbg, 5,
				    "setup_pmc_cpu: initialized: cpuhw=%p\n", cpusf);
		break;
	case PMC_RELEASE:
		cpusf->flags &= ~PMU_F_RESERVED;
		err = sf_disable();
		if (err) {
			pr_err("Switching off the sampling facility failed "
			       "with rc=%i\n", err);
		} else
			deallocate_buffers(cpusf);
		debug_sprintf_event(sfdbg, 5,
				    "setup_pmc_cpu: released: cpuhw=%p\n", cpusf);
		break;
	}
	if (err)
		*((int *) flags) |= PMC_FAILURE;
}

static void release_pmc_hardware(void)
{
	int flags = PMC_RELEASE;

	irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT);
	on_each_cpu(setup_pmc_cpu, &flags, 1);
}

static int reserve_pmc_hardware(void)
{
	int flags = PMC_INIT;

	on_each_cpu(setup_pmc_cpu, &flags, 1);
	if (flags & PMC_FAILURE) {
		release_pmc_hardware();
		return -ENODEV;
	}
	irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT);

	return 0;
}

static void hw_perf_event_destroy(struct perf_event *event)
{
	/* Release PMC if this is the last perf event */
	if (!atomic_add_unless(&num_events, -1, 1)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_dec_return(&num_events) == 0)
			release_pmc_hardware();
		mutex_unlock(&pmc_reserve_mutex);
	}
}
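
/* Refcounting note: atomic_add_unless(&num_events, -1, 1) decrements
 * num_events except when it is currently 1, so the final 1 -> 0
 * transition always happens under pmc_reserve_mutex. This serializes
 * the hardware release against reserve_pmc_hardware() in
 * __hw_perf_event_init() without taking the mutex on every destroy.
 */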

static void hw_init_period(struct hw_perf_event *hwc, u64 period)
{
	hwc->sample_period = period;
	hwc->last_period = hwc->sample_period;
	local64_set(&hwc->period_left, hwc->sample_period);
}

static void hw_reset_registers(struct hw_perf_event *hwc,
			       unsigned long *sdbt_origin)
{
	/* (Re)set to first sample-data-block-table */
	TEAR_REG(hwc) = (unsigned long) sdbt_origin;
}

static unsigned long hw_limit_rate(const struct hws_qsi_info_block *si,
				   unsigned long rate)
{
	return clamp_t(unsigned long, rate,
		       si->min_sampl_rate, si->max_sampl_rate);
}

static u32 cpumsf_pid_type(struct perf_event *event,
			   u32 pid, enum pid_type type)
{
	struct task_struct *tsk;

	/* Idle process */
	if (!pid)
		goto out;

	tsk = find_task_by_pid_ns(pid, &init_pid_ns);
	pid = -1;
	if (tsk) {
		/*
		 * Only top level events contain the pid namespace in which
		 * they are created.
		 */
		if (event->parent)
			event = event->parent;
		pid = __task_pid_nr_ns(tsk, type, event->ns);
		/*
		 * See also 1d953111b648
		 * "perf/core: Don't report zero PIDs for exiting tasks".
		 */
		if (!pid && !pid_alive(tsk))
			pid = -1;
	}
out:
	return pid;
}

static void cpumsf_output_event_pid(struct perf_event *event,
				    struct perf_sample_data *data,
				    struct pt_regs *regs)
{
	u32 pid;
	struct perf_event_header header;
	struct perf_output_handle handle;

	/*
	 * Obtain the PID from the basic-sampling data entry and
	 * correct the data->tid_entry.pid value.
	 */
	pid = data->tid_entry.pid;

	/* Protect callchain buffers, tasks */
	rcu_read_lock();

	perf_prepare_sample(&header, data, event, regs);
	if (perf_output_begin(&handle, event, header.size))
		goto out;

	/* Update the process ID (see also kernel/events/core.c) */
	data->tid_entry.pid = cpumsf_pid_type(event, pid, PIDTYPE_TGID);
	data->tid_entry.tid = cpumsf_pid_type(event, pid, PIDTYPE_PID);

	perf_output_sample(&handle, &header, data, event);
	perf_output_end(&handle);
out:
	rcu_read_unlock();
}

static int __hw_perf_event_init(struct perf_event *event)
{
	struct cpu_hw_sf *cpuhw;
	struct hws_qsi_info_block si;
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	unsigned long rate;
	int cpu, err;

	/* Reserve CPU-measurement sampling facility */
	err = 0;
	if (!atomic_inc_not_zero(&num_events)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&num_events) == 0 && reserve_pmc_hardware())
			err = -EBUSY;
		else
			atomic_inc(&num_events);
		mutex_unlock(&pmc_reserve_mutex);
	}
	event->destroy = hw_perf_event_destroy;

	if (err)
		goto out;

	/* Access per-CPU sampling information (query sampling info) */
	/*
	 * The event->cpu value can be -1 to count on every CPU, for example,
	 * when attaching to a task. If this is specified, use the query
	 * sampling info from the current CPU, otherwise use event->cpu to
	 * retrieve the per-CPU information.
	 * Later, cpuhw indicates whether to allocate sampling buffers for a
	 * particular CPU (cpuhw!=NULL) or each online CPU (cpuhw==NULL).
	 */
	memset(&si, 0, sizeof(si));
	cpuhw = NULL;
	if (event->cpu == -1)
		qsi(&si);
	else {
		/* Event is pinned to a particular CPU, retrieve the per-CPU
		 * sampling structure for accessing the CPU-specific QSI.
		 */
		cpuhw = &per_cpu(cpu_hw_sf, event->cpu);
		si = cpuhw->qsi;
	}

	/* Check sampling facility authorization and, if not authorized,
	 * fall back to other PMUs. It is safe to check any CPU because
	 * the authorization is identical for all configured CPUs.
	 */
	if (!si.as) {
		err = -ENOENT;
		goto out;
	}

	if (si.ribm & CPU_MF_SF_RIBM_NOTAV) {
		pr_warn("CPU Measurement Facility sampling is temporarily not available\n");
		err = -EBUSY;
		goto out;
	}

	/* Always enable basic sampling */
	SAMPL_FLAGS(hwc) = PERF_CPUM_SF_BASIC_MODE;

	/* Check if diagnostic sampling is requested. Deny if the required
	 * sampling authorization is missing.
	 */
	if (attr->config == PERF_EVENT_CPUM_SF_DIAG) {
		if (!si.ad) {
			err = -EPERM;
			goto out;
		}
		SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_DIAG_MODE;
	}

	/* Check and set other sampling flags */
	if (attr->config1 & PERF_CPUM_SF_FULL_BLOCKS)
		SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_FULL_BLOCKS;

	/* The sampling information (si) contains information about the
	 * min/max sampling intervals and the CPU speed. So calculate the
	 * correct sampling interval and avoid the whole period adjust
	 * feedback loop.
	 */
	rate = 0;
	if (attr->freq) {
		if (!attr->sample_freq) {
			err = -EINVAL;
			goto out;
		}
		rate = freq_to_sample_rate(&si, attr->sample_freq);
		rate = hw_limit_rate(&si, rate);
		attr->freq = 0;
		attr->sample_period = rate;
	} else {
		/* The min/max sampling rates specify the valid range
		 * of sample periods. If the specified sample period is
		 * out of range, limit the period to the range boundary.
		 */
		rate = hw_limit_rate(&si, hwc->sample_period);

		/* The perf core maintains a maximum sample rate that is
		 * configurable through the sysctl interface. Ensure the
		 * sampling rate does not exceed this value. This also helps
		 * to avoid throttling when pushing samples with
		 * perf_event_overflow().
		 */
		if (sample_rate_to_freq(&si, rate) >
		    sysctl_perf_event_sample_rate) {
			err = -EINVAL;
			debug_sprintf_event(sfdbg, 1, "Sampling rate exceeds maximum perf sample rate\n");
			goto out;
		}
	}
	SAMPL_RATE(hwc) = rate;
	hw_init_period(hwc, SAMPL_RATE(hwc));

	/* Initialize sample data overflow accounting */
	hwc->extra_reg.reg = REG_OVERFLOW;
	OVERFLOW_REG(hwc) = 0;

	/* Use AUX buffer. No need to allocate it by ourselves */
	if (attr->config == PERF_EVENT_CPUM_SF_DIAG)
		return 0;

	/* Allocate the per-CPU sampling buffer using the CPU information
	 * from the event. If the event is not pinned to a particular
	 * CPU (event->cpu == -1; or cpuhw == NULL), allocate sampling
	 * buffers for each online CPU.
	 */
	if (cpuhw)
		/* Event is pinned to a particular CPU */
		err = allocate_buffers(cpuhw, hwc);
	else {
		/* Event is not pinned, allocate sampling buffer on
		 * each online CPU
		 */
		for_each_online_cpu(cpu) {
			cpuhw = &per_cpu(cpu_hw_sf, cpu);
			err = allocate_buffers(cpuhw, hwc);
			if (err)
				break;
		}
	}

	/* If PID/TID sampling is active, replace the default overflow
	 * handler to extract and resolve the PIDs from the basic-sampling
	 * data entries.
	 */
	if (event->attr.sample_type & PERF_SAMPLE_TID)
		if (is_default_overflow_handler(event))
			event->overflow_handler = cpumsf_output_event_pid;
out:
	return err;
}

static int cpumsf_pmu_event_init(struct perf_event *event)
{
	int err;

	/* No support for taken branch sampling */
	if (has_branch_stack(event))
		return -EOPNOTSUPP;

	switch (event->attr.type) {
	case PERF_TYPE_RAW:
		if ((event->attr.config != PERF_EVENT_CPUM_SF) &&
		    (event->attr.config != PERF_EVENT_CPUM_SF_DIAG))
			return -ENOENT;
		break;
	case PERF_TYPE_HARDWARE:
		/* Support sampling of CPU cycles in addition to the
		 * counter facility. However, the counter facility
		 * is more precise and, hence, restrict this PMU to
		 * sampling events only.
		 */
		if (event->attr.config != PERF_COUNT_HW_CPU_CYCLES)
			return -ENOENT;
		if (!is_sampling_event(event))
			return -ENOENT;
		break;
	default:
		return -ENOENT;
	}

	/* Check online status of the CPU to which the event is pinned */
	if (event->cpu >= 0 && !cpu_online(event->cpu))
		return -ENODEV;

	/* Force reset of idle/hv excludes regardless of what the
	 * user requested.
	 */
	if (event->attr.exclude_hv)
		event->attr.exclude_hv = 0;
	if (event->attr.exclude_idle)
		event->attr.exclude_idle = 0;

	err = __hw_perf_event_init(event);
	if (unlikely(err))
		if (event->destroy)
			event->destroy(event);
	return err;
}

static void cpumsf_pmu_enable(struct pmu *pmu)
{
	struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf);
	struct hw_perf_event *hwc;
	int err;

	if (cpuhw->flags & PMU_F_ENABLED)
		return;

	if (cpuhw->flags & PMU_F_ERR_MASK)
		return;

	/* Check whether to extend the sampling buffer.
	 *
	 * Two conditions trigger an increase of the sampling buffer for a
	 * perf event:
	 *    1. Postponed buffer allocations from the event initialization.
	 *    2. Sampling overflows that contribute to pending allocations.
	 *
	 * Note that the extend_sampling_buffer() function disables the sampling
	 * facility, but it can be fully re-enabled using sampling controls that
	 * have been saved in cpumsf_pmu_disable().
	 */
	if (cpuhw->event) {
		hwc = &cpuhw->event->hw;
		if (!(SAMPL_DIAG_MODE(hwc))) {
			/*
			 * Account number of overflow-designated
			 * buffer extents
			 */
			sfb_account_overflows(cpuhw, hwc);
			if (sfb_has_pending_allocs(&cpuhw->sfb, hwc))
				extend_sampling_buffer(&cpuhw->sfb, hwc);
		}
	}

	/* (Re)enable the PMU and sampling facility */
	cpuhw->flags |= PMU_F_ENABLED;
	barrier();

	err = lsctl(&cpuhw->lsctl);
	if (err) {
		cpuhw->flags &= ~PMU_F_ENABLED;
		pr_err("Loading sampling controls failed: op=%i err=%i\n",
		       1, err);
		return;
	}

	/* Load current program parameter */
	lpp(&S390_lowcore.lpp);

	debug_sprintf_event(sfdbg, 6, "pmu_enable: es=%i cs=%i ed=%i cd=%i "
			    "tear=%p dear=%p\n", cpuhw->lsctl.es, cpuhw->lsctl.cs,
			    cpuhw->lsctl.ed, cpuhw->lsctl.cd,
			    (void *) cpuhw->lsctl.tear, (void *) cpuhw->lsctl.dear);
}

static void cpumsf_pmu_disable(struct pmu *pmu)
{
	struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf);
	struct hws_lsctl_request_block inactive;
	struct hws_qsi_info_block si;
	int err;

	if (!(cpuhw->flags & PMU_F_ENABLED))
		return;

	if (cpuhw->flags & PMU_F_ERR_MASK)
		return;

	/* Switch off sampling activation control */
	inactive = cpuhw->lsctl;
	inactive.cs = 0;
	inactive.cd = 0;

	err = lsctl(&inactive);
	if (err) {
		pr_err("Loading sampling controls failed: op=%i err=%i\n",
		       2, err);
		return;
	}

	/* Save state of TEAR and DEAR register contents */
	if (!qsi(&si)) {
		/* TEAR/DEAR values are valid only if the sampling facility is
		 * enabled. Note that cpumsf_pmu_disable() might be called even
		 * for a disabled sampling facility because cpumsf_pmu_enable()
		 * controls the enable/disable state.
		 */
		if (si.es) {
			cpuhw->lsctl.tear = si.tear;
			cpuhw->lsctl.dear = si.dear;
		}
	} else
		debug_sprintf_event(sfdbg, 3, "cpumsf_pmu_disable: "
				    "qsi() failed with err=%i\n", err);

	cpuhw->flags &= ~PMU_F_ENABLED;
}

/* perf_exclude_event() - Filter event
 * @event:	The perf event
 * @regs:	pt_regs structure
 * @sde_regs:	Sample-data-entry (sde) regs structure
 *
 * Filter perf events according to their exclude specification.
 *
 * Return non-zero if the event shall be excluded.
 */
static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs,
			      struct perf_sf_sde_regs *sde_regs)
{
	if (event->attr.exclude_user && user_mode(regs))
		return 1;
	if (event->attr.exclude_kernel && !user_mode(regs))
		return 1;
	if (event->attr.exclude_guest && sde_regs->in_guest)
		return 1;
	if (event->attr.exclude_host && !sde_regs->in_guest)
		return 1;
	return 0;
}

/* perf_push_sample() - Push samples to perf
 * @event:	The perf event
 * @sample:	Hardware sample data
 *
 * Use the hardware sample data to create a perf event sample. The sample
 * is then pushed to the event subsystem and the function checks for
 * possible event overflows. If an event overflow occurs, the PMU is
 * stopped.
 *
 * Return non-zero if an event overflow occurred.
 */
static int perf_push_sample(struct perf_event *event,
			    struct hws_basic_entry *basic)
{
	int overflow;
	struct pt_regs regs;
	struct perf_sf_sde_regs *sde_regs;
	struct perf_sample_data data;

	/* Setup perf sample */
	perf_sample_data_init(&data, 0, event->hw.last_period);

	/* Setup pt_regs to look like a CPU-measurement external interrupt
	 * using the Program Request Alert code. The regs.int_parm_long
	 * field, which is unused, contains additional sample-data-entry
	 * related indicators.
	 */
	memset(&regs, 0, sizeof(regs));
	regs.int_code = 0x1407;
	regs.int_parm = CPU_MF_INT_SF_PRA;
	sde_regs = (struct perf_sf_sde_regs *) &regs.int_parm_long;

	psw_bits(regs.psw).ia = basic->ia;
	psw_bits(regs.psw).dat = basic->T;
	psw_bits(regs.psw).wait = basic->W;
	psw_bits(regs.psw).pstate = basic->P;
	psw_bits(regs.psw).as = basic->AS;

	/*
	 * Use the hardware provided configuration level to decide if the
	 * sample belongs to a guest or host. If that is not available,
	 * fall back to the following heuristics:
	 * A non-zero guest program parameter always indicates a guest
	 * sample. Some early samples or samples from guests without
	 * lpp usage would be misaccounted to the host. We use the asn
	 * value as an addon heuristic to detect most of these guest samples.
	 * If the value differs from 0xffff (the host value), we assume to
	 * be a KVM guest.
	 */
	switch (basic->CL) {
	case 1: /* logical partition */
		sde_regs->in_guest = 0;
		break;
	case 2: /* virtual machine */
		sde_regs->in_guest = 1;
		break;
	default: /* old machine, use heuristics */
		if (basic->gpp || basic->prim_asn != 0xffff)
			sde_regs->in_guest = 1;
		break;
	}

	/*
	 * Store the PID value from the sample-data-entry to be
	 * processed and resolved by cpumsf_output_event_pid().
	 */
	data.tid_entry.pid = basic->hpp & LPP_PID_MASK;

	overflow = 0;
	if (perf_exclude_event(event, &regs, sde_regs))
		goto out;
	if (perf_event_overflow(event, &data, &regs)) {
		overflow = 1;
		event->pmu->stop(event, 0);
	}
	perf_event_update_userpage(event);
out:
	return overflow;
}

static void perf_event_count_update(struct perf_event *event, u64 count)
{
	local64_add(count, &event->count);
}

static void debug_sample_entry(struct hws_basic_entry *sample,
			       struct hws_trailer_entry *te)
{
	debug_sprintf_event(sfdbg, 4, "hw_collect_samples: Found unknown "
			    "sampling data entry: te->f=%i basic.def=%04x (%p)\n",
			    te->f, sample->def, sample);
}

/* hw_collect_samples() - Walk through a sample-data-block and collect samples
 * @event:	The perf event
 * @sdbt:	Sample-data-block table
 * @overflow:	Event overflow counter
 *
 * Walks through a sample-data-block and collects sampling data entries that are
 * then pushed to the perf event subsystem. Depending on the sampling function,
 * there can be either basic-sampling or combined-sampling data entries. A
 * combined-sampling data entry consists of a basic- and a diagnostic-sampling
 * data entry. The sampling function is determined by the flags in the perf
 * event hardware structure. The function always works with a combined-sampling
 * data entry but ignores the diagnostic portion if it is not available.
 *
 * Note that the implementation focuses on basic-sampling data entries and, if
 * such an entry is not valid, the entire combined-sampling data entry is
 * ignored.
 *
 * The overflow variable counts the number of samples that have been discarded
 * due to a perf event overflow.
 */
static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
			       unsigned long long *overflow)
{
	struct hws_trailer_entry *te;
	struct hws_basic_entry *sample;

	te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt);
	sample = (struct hws_basic_entry *) *sdbt;
	while ((unsigned long *) sample < (unsigned long *) te) {
		/* Check for an empty sample */
		if (!sample->def)
			break;

		/* Update perf event period */
		perf_event_count_update(event, SAMPL_RATE(&event->hw));

		/* Check whether sample is valid */
		if (sample->def == 0x0001) {
			/* If an event overflow occurred, the PMU is stopped to
			 * throttle event delivery. Remaining sample data is
			 * discarded.
			 */
			if (!*overflow) {
				/* Check whether sample is consistent */
				if (sample->I == 0 && sample->W == 0) {
					/* Deliver sample data to perf */
					*overflow = perf_push_sample(event,
								     sample);
				}
			} else
				/* Count discarded samples */
				*overflow += 1;
		} else {
			debug_sample_entry(sample, te);
			/* Sample slot is not yet written or other record.
			 *
			 * This condition can occur if the buffer was reused
			 * from a combined basic- and diagnostic-sampling.
			 * If only basic-sampling is then active, entries are
			 * written into the larger diagnostic entries.
			 * This is typically the case for sample-data-blocks
			 * that are not full. Stop processing if the first
			 * invalid format was detected.
			 */
			if (!te->f)
				break;
		}

		/* Reset sample slot and advance to next sample */
		sample->def = 0;
		sample++;
	}
}

/* hw_perf_event_update() - Process sampling buffer
 * @event:	The perf event
 * @flush_all:	Flag to also flush partially filled sample-data-blocks
 *
 * Processes the sampling buffer and creates perf event samples.
 * The sampling buffer position is retrieved and saved in the TEAR_REG
 * register of the specified perf event.
 *
 * Only full sample-data-blocks are processed. Specify the flush_all flag
 * to also walk through partially filled sample-data-blocks. It is ignored
 * if PERF_CPUM_SF_FULL_BLOCKS is set. The PERF_CPUM_SF_FULL_BLOCKS flag
 * enforces the processing of full sample-data-blocks only (trailer entries
 * with the block-full-indicator bit set).
 */
static void hw_perf_event_update(struct perf_event *event, int flush_all)
{
	struct hw_perf_event *hwc = &event->hw;
	struct hws_trailer_entry *te;
	unsigned long *sdbt;
	unsigned long long event_overflow, sampl_overflow, num_sdb, te_flags;
	int done;

	/*
	 * AUX buffer is used when in diagnostic sampling mode.
	 * No perf events/samples are created.
	 */
	if (SAMPL_DIAG_MODE(&event->hw))
		return;

	if (flush_all && SDB_FULL_BLOCKS(hwc))
		flush_all = 0;

	sdbt = (unsigned long *) TEAR_REG(hwc);
	done = event_overflow = sampl_overflow = num_sdb = 0;
	while (!done) {
		/* Get the trailer entry of the sample-data-block */
		te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt);

		/* Leave loop if no more work to do (block full indicator) */
		if (!te->f) {
			done = 1;
			if (!flush_all)
				break;
		}

		/* Check the sample overflow count */
		if (te->overflow)
			/* Account sample overflows and, if a particular limit
			 * is reached, extend the sampling buffer.
			 * For details, see sfb_account_overflows().
			 */
			sampl_overflow += te->overflow;

		/* Timestamps are valid for full sample-data-blocks only */
		debug_sprintf_event(sfdbg, 6, "hw_perf_event_update: sdbt=%p "
				    "overflow=%llu timestamp=0x%llx\n",
				    sdbt, te->overflow,
				    (te->f) ? trailer_timestamp(te) : 0ULL);

		/* Collect all samples from a single sample-data-block and
		 * flag if a (perf) event overflow happened. If so, the PMU
		 * is stopped and remaining samples will be discarded.
		 */
		hw_collect_samples(event, sdbt, &event_overflow);
		num_sdb++;

		/* Reset trailer (using compare-double-and-swap) */
		do {
			te_flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK;
			te_flags |= SDB_TE_ALERT_REQ_MASK;
		} while (!cmpxchg_double(&te->flags, &te->overflow,
					 te->flags, te->overflow,
					 te_flags, 0ULL));

		/* Advance to next sample-data-block */
		sdbt++;
		if (is_link_entry(sdbt))
			sdbt = get_next_sdbt(sdbt);

		/* Update event hardware registers */
		TEAR_REG(hwc) = (unsigned long) sdbt;

		/* Stop processing sample-data if all samples of the current
		 * sample-data-block were flushed even if it was not full.
		 */
		if (flush_all && done)
			break;
	}

	/* Account sample overflows in the event hardware structure */
	if (sampl_overflow)
		OVERFLOW_REG(hwc) = DIV_ROUND_UP(OVERFLOW_REG(hwc) +
						 sampl_overflow, 1 + num_sdb);

	/* Perf_event_overflow() and perf_event_account_interrupt() limit
	 * the interrupt rate to an upper limit. Roughly 1000 samples per
	 * task tick.
	 * Hitting this limit results in a large number
	 * of throttled PERF_RECORD_THROTTLE entries and the samples
	 * are dropped.
	 * Slightly increase the interval to avoid hitting this limit.
	 */
	if (event_overflow) {
		SAMPL_RATE(hwc) += DIV_ROUND_UP(SAMPL_RATE(hwc), 10);
		debug_sprintf_event(sfdbg, 1, "%s: rate adjustment %ld\n",
				    __func__,
				    DIV_ROUND_UP(SAMPL_RATE(hwc), 10));
	}

	if (sampl_overflow || event_overflow)
		debug_sprintf_event(sfdbg, 4, "hw_perf_event_update: "
				    "overflow stats: sample=%llu event=%llu\n",
				    sampl_overflow, event_overflow);
}
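
/* Example of the rate adjustment above: a sampling interval of 10000
 * becomes 10000 + DIV_ROUND_UP(10000, 10) = 11000, i.e. roughly 10%
 * fewer samples per second, which nudges the rate back under the perf
 * throttling limit.
 */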

#define AUX_SDB_INDEX(aux, i) ((i) % aux->sfb.num_sdb)
#define AUX_SDB_NUM(aux, start, end) (end >= start ? end - start + 1 : 0)
#define AUX_SDB_NUM_ALERT(aux) AUX_SDB_NUM(aux, aux->head, aux->alert_mark)
#define AUX_SDB_NUM_EMPTY(aux) AUX_SDB_NUM(aux, aux->head, aux->empty_mark)
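
/* Example of the ring arithmetic above: with aux->sfb.num_sdb = 8,
 * head = 5 and alert_mark = 9, AUX_SDB_NUM_ALERT() yields
 * 9 - 5 + 1 = 5 SDBs, and AUX_SDB_INDEX(aux, 9) = 9 % 8 = 1 maps the
 * logical position back into the SDB ring.
 */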

/*
 * Get trailer entry by index of SDB.
 */
static struct hws_trailer_entry *aux_sdb_trailer(struct aux_buffer *aux,
						 unsigned long index)
{
	unsigned long sdb;

	index = AUX_SDB_INDEX(aux, index);
	sdb = aux->sdb_index[index];
	return (struct hws_trailer_entry *)trailer_entry_ptr(sdb);
}

/*
 * Finish sampling on the cpu. Called by cpumsf_pmu_del() with pmu
 * disabled. Collect the full SDBs in the AUX buffer which have not yet
 * reached the alert indicator, and ignore the SDBs which are not full.
 *
 * 1. Scan SDBs to see how much data is there and consume them.
 * 2. Remove alert indicator in the buffer.
 */
static void aux_output_end(struct perf_output_handle *handle)
{
	unsigned long i, range_scan, idx;
	struct aux_buffer *aux;
	struct hws_trailer_entry *te;

	aux = perf_get_aux(handle);
	if (!aux)
		return;

	range_scan = AUX_SDB_NUM_ALERT(aux);
	for (i = 0, idx = aux->head; i < range_scan; i++, idx++) {
		te = aux_sdb_trailer(aux, idx);
		if (!(te->flags & SDB_TE_BUFFER_FULL_MASK))
			break;
	}
	/* i is the number of SDBs which are full */
	perf_aux_output_end(handle, i << PAGE_SHIFT);

	/* Remove alert indicators in the buffer */
	te = aux_sdb_trailer(aux, aux->alert_mark);
	te->flags &= ~SDB_TE_ALERT_REQ_MASK;

	debug_sprintf_event(sfdbg, 6, "aux_output_end: collect %lx SDBs\n", i);
}

/*
 * Start sampling on the CPU. Called by cpumsf_pmu_add() when an event
 * is first added to the CPU or rescheduled again to the CPU. It is called
 * with pmu disabled.
 *
 * 1. Reset the trailer of SDBs to get ready for new data.
 * 2. Tell the hardware where to put the data by resetting the SDB buffer
 *    head (tear/dear).
 */
static int aux_output_begin(struct perf_output_handle *handle,
			    struct aux_buffer *aux,
			    struct cpu_hw_sf *cpuhw)
{
	unsigned long range;
	unsigned long i, range_scan, idx;
	unsigned long head, base, offset;
	struct hws_trailer_entry *te;

	if (WARN_ON_ONCE(handle->head & ~PAGE_MASK))
		return -EINVAL;

	aux->head = handle->head >> PAGE_SHIFT;
	range = (handle->size + 1) >> PAGE_SHIFT;
	if (range <= 1)
		return -ENOMEM;

	/*
	 * SDBs between aux->head and aux->empty_mark are already ready
	 * for new data. range_scan is the number of SDBs not within them.
	 */
	if (range > AUX_SDB_NUM_EMPTY(aux)) {
		range_scan = range - AUX_SDB_NUM_EMPTY(aux);
		idx = aux->empty_mark + 1;
		for (i = 0; i < range_scan; i++, idx++) {
			te = aux_sdb_trailer(aux, idx);
			te->flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK;
			te->flags = te->flags & ~SDB_TE_ALERT_REQ_MASK;
			te->overflow = 0;
		}
		/* Save the position of empty SDBs */
		aux->empty_mark = aux->head + range - 1;
	}

	/* Set alert indicator */
	aux->alert_mark = aux->head + range/2 - 1;
	te = aux_sdb_trailer(aux, aux->alert_mark);
	te->flags = te->flags | SDB_TE_ALERT_REQ_MASK;

	/* Reset hardware buffer head */
	head = AUX_SDB_INDEX(aux, aux->head);
	base = aux->sdbt_index[head / CPUM_SF_SDB_PER_TABLE];
	offset = head % CPUM_SF_SDB_PER_TABLE;
	cpuhw->lsctl.tear = base + offset * sizeof(unsigned long);
	cpuhw->lsctl.dear = aux->sdb_index[head];

	debug_sprintf_event(sfdbg, 6, "aux_output_begin: "
			    "head->alert_mark->empty_mark (num_alert, range)"
			    "[%lx -> %lx -> %lx] (%lx, %lx) "
			    "tear index %lx, tear %lx dear %lx\n",
			    aux->head, aux->alert_mark, aux->empty_mark,
			    AUX_SDB_NUM_ALERT(aux), range,
			    head / CPUM_SF_SDB_PER_TABLE,
			    cpuhw->lsctl.tear,
			    cpuhw->lsctl.dear);

	return 0;
}
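
/* Illustration with hypothetical numbers: for a handle covering
 * range = 8 SDBs starting at head = 0, the alert indicator is placed
 * on SDB 0 + 8/2 - 1 = 3 and empty_mark becomes 7, so a measurement
 * alert fires halfway through the buffer while the second half keeps
 * filling.
 */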

/*
 * Set alert indicator on SDB at index @alert_index while sampler is running.
 *
 * Return true on success.
 * Return false if the full indicator is already set by the hardware sampler.
 */
static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
			  unsigned long long *overflow)
{
	unsigned long long orig_overflow, orig_flags, new_flags;
	struct hws_trailer_entry *te;

	te = aux_sdb_trailer(aux, alert_index);
	do {
		orig_flags = te->flags;
		orig_overflow = te->overflow;
		*overflow = orig_overflow;
		if (orig_flags & SDB_TE_BUFFER_FULL_MASK) {
			/*
			 * SDB is already set by hardware.
			 * Abort and try to set somewhere
			 * behind.
			 */
			return false;
		}
		new_flags = orig_flags | SDB_TE_ALERT_REQ_MASK;
	} while (!cmpxchg_double(&te->flags, &te->overflow,
				 orig_flags, orig_overflow,
				 new_flags, 0ULL));
	return true;
}

/*
 * aux_reset_buffer() - Scan and setup SDBs for new samples
 * @aux:	The AUX buffer to set
 * @range:	The range of SDBs to scan, starting from aux->head
 * @overflow:	Set to overflow count
 *
 * Set alert indicator on the SDB at index of aux->alert_mark. If this SDB is
 * marked as empty, check if it is already set full by the hardware sampler.
 * If yes, that means new data is already there before we can set an alert
 * indicator. Caller should try to set alert indicator to some position behind.
 *
 * Scan the SDBs in the AUX buffer from behind aux->empty_mark. They were used
 * previously and have already been consumed by user space. Reset these SDBs
 * (clear full indicator and alert indicator) for new data.
 * If aux->alert_mark falls in this area, just set it. Overflow count is
 * recorded while scanning.
 *
 * SDBs between aux->head and aux->empty_mark were already reset last time
 * and are ready for new samples, so scanning this area can be skipped.
 *
 * Return true if alert indicator is set successfully and false if not.
 */
static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range,
			     unsigned long long *overflow)
{
	unsigned long long orig_overflow, orig_flags, new_flags;
	unsigned long i, range_scan, idx;
	struct hws_trailer_entry *te;

	if (range <= AUX_SDB_NUM_EMPTY(aux))
		/*
		 * No need to scan. All SDBs in range are marked as empty.
		 * Just set alert indicator. Should check race with hardware
		 * sampler.
		 */
		return aux_set_alert(aux, aux->alert_mark, overflow);

	if (aux->alert_mark <= aux->empty_mark)
		/*
		 * Set alert indicator on empty SDB. Should check race
		 * with hardware sampler.
		 */
		if (!aux_set_alert(aux, aux->alert_mark, overflow))
			return false;

	/*
	 * Scan the SDBs to clear full and alert indicator used previously.
	 * Start scanning from one SDB behind empty_mark. If the new alert
	 * indicator falls into this range, set it.
	 */
	range_scan = range - AUX_SDB_NUM_EMPTY(aux);
	idx = aux->empty_mark + 1;
	for (i = 0; i < range_scan; i++, idx++) {
		te = aux_sdb_trailer(aux, idx);
		do {
			orig_flags = te->flags;
			orig_overflow = te->overflow;
			new_flags = orig_flags & ~SDB_TE_BUFFER_FULL_MASK;
			if (idx == aux->alert_mark)
				new_flags |= SDB_TE_ALERT_REQ_MASK;
			else
				new_flags &= ~SDB_TE_ALERT_REQ_MASK;
		} while (!cmpxchg_double(&te->flags, &te->overflow,
					 orig_flags, orig_overflow,
					 new_flags, 0ULL));
		*overflow += orig_overflow;
	}

	/* Update empty_mark to new position */
	aux->empty_mark = aux->head + range - 1;

	return true;
}

/*
 * Measurement alert handler for diagnostic mode sampling.
 */
static void hw_collect_aux(struct cpu_hw_sf *cpuhw)
{
	struct aux_buffer *aux;
	int done = 0;
	unsigned long range = 0, size;
	unsigned long long overflow = 0;
	struct perf_output_handle *handle = &cpuhw->handle;
	unsigned long num_sdb;

	aux = perf_get_aux(handle);
	if (WARN_ON_ONCE(!aux))
		return;

	/* Inform user space new data arrived */
	size = AUX_SDB_NUM_ALERT(aux) << PAGE_SHIFT;
	perf_aux_output_end(handle, size);
	num_sdb = aux->sfb.num_sdb;

	while (!done) {
		/* Get an output handle */
		aux = perf_aux_output_begin(handle, cpuhw->event);
		if (handle->size == 0) {
			pr_err("The AUX buffer with %lu pages for the "
			       "diagnostic-sampling mode is full\n",
			       num_sdb);
			debug_sprintf_event(sfdbg, 1, "AUX buffer used up\n");
			break;
		}
		if (WARN_ON_ONCE(!aux))
			return;

		/* Update head and alert_mark to new position */
		aux->head = handle->head >> PAGE_SHIFT;
		range = (handle->size + 1) >> PAGE_SHIFT;
		if (range == 1)
			aux->alert_mark = aux->head;
		else
			aux->alert_mark = aux->head + range/2 - 1;

		if (aux_reset_buffer(aux, range, &overflow)) {
			if (!overflow) {
				done = 1;
				break;
			}
			size = range << PAGE_SHIFT;
			perf_aux_output_end(&cpuhw->handle, size);
			pr_err("Sample data caused the AUX buffer with %lu "
			       "pages to overflow\n", num_sdb);
			debug_sprintf_event(sfdbg, 1, "head %lx range %lx "
					    "overflow %llx\n",
					    aux->head, range, overflow);
		} else {
			size = AUX_SDB_NUM_ALERT(aux) << PAGE_SHIFT;
			perf_aux_output_end(&cpuhw->handle, size);
			debug_sprintf_event(sfdbg, 6, "head %lx alert %lx "
					    "already full, try another\n",
					    aux->head, aux->alert_mark);
		}
	}

	if (done)
		debug_sprintf_event(sfdbg, 6, "aux_reset_buffer: "
				    "[%lx -> %lx -> %lx] (%lx, %lx)\n",
				    aux->head, aux->alert_mark, aux->empty_mark,
				    AUX_SDB_NUM_ALERT(aux), range);
}

/*
 * Callback when freeing AUX buffers.
 */
static void aux_buffer_free(void *data)
{
	struct aux_buffer *aux = data;
	unsigned long i, num_sdbt;

	if (!aux)
		return;

	/* Free SDBT. SDB is freed by the caller */
	num_sdbt = aux->sfb.num_sdbt;
	for (i = 0; i < num_sdbt; i++)
		free_page(aux->sdbt_index[i]);

	kfree(aux->sdbt_index);
	kfree(aux->sdb_index);
	kfree(aux);

	debug_sprintf_event(sfdbg, 4, "aux_buffer_free: free "
			    "%lu SDBTs\n", num_sdbt);
}

static void aux_sdb_init(unsigned long sdb)
{
	struct hws_trailer_entry *te;

	te = (struct hws_trailer_entry *)trailer_entry_ptr(sdb);

	/* Save clock base */
	te->clock_base = 1;
	memcpy(&te->progusage2, &tod_clock_base[1], 8);
}

/*
 * aux_buffer_setup() - Setup AUX buffer for diagnostic mode sampling
 * @event:	Event the buffer is setup for, event->cpu == -1 means current
 * @pages:	Array of pointers to buffer pages passed from perf core
 * @nr_pages:	Total pages
 * @snapshot:	Flag for snapshot mode
 *
 * This is the callback when an event is set up to use the AUX buffer. The
 * perf tool can trigger this by an additional mmap() call on the event.
 * Unlike the buffer for basic samples, the AUX buffer belongs to the event.
 * It is scheduled with the task among online CPUs when it is a per-thread
 * event.
 *
 * Return the private AUX buffer structure on success or NULL on failure.
 */
static void *aux_buffer_setup(struct perf_event *event, void **pages,
			      int nr_pages, bool snapshot)
{
	struct sf_buffer *sfb;
	struct aux_buffer *aux;
	unsigned long *new, *tail;
	int i, n_sdbt;

	if (!nr_pages || !pages)
		return NULL;

	if (nr_pages > CPUM_SF_MAX_SDB * CPUM_SF_SDB_DIAG_FACTOR) {
		pr_err("AUX buffer size (%i pages) is larger than the "
		       "maximum sampling buffer limit\n",
		       nr_pages);
		return NULL;
	} else if (nr_pages < CPUM_SF_MIN_SDB * CPUM_SF_SDB_DIAG_FACTOR) {
		pr_err("AUX buffer size (%i pages) is less than the "
		       "minimum sampling buffer limit\n",
		       nr_pages);
		return NULL;
	}

	/* Allocate aux_buffer struct for the event */
	aux = kmalloc(sizeof(struct aux_buffer), GFP_KERNEL);
	if (!aux)
		goto no_aux;
	sfb = &aux->sfb;

	/* Allocate sdbt_index for fast reference */
	n_sdbt = (nr_pages + CPUM_SF_SDB_PER_TABLE - 1) / CPUM_SF_SDB_PER_TABLE;
	aux->sdbt_index = kmalloc_array(n_sdbt, sizeof(void *), GFP_KERNEL);
	if (!aux->sdbt_index)
		goto no_sdbt_index;

	/* Allocate sdb_index for fast reference */
	aux->sdb_index = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL);
	if (!aux->sdb_index)
		goto no_sdb_index;

	/* Allocate the first SDBT */
	sfb->num_sdbt = 0;
	sfb->sdbt = (unsigned long *) get_zeroed_page(GFP_KERNEL);
	if (!sfb->sdbt)
		goto no_sdbt;
	aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)sfb->sdbt;
	tail = sfb->tail = sfb->sdbt;

	/*
	 * Link the provided pages of AUX buffer to SDBT.
	 * Allocate SDBT if needed.
	 */
	for (i = 0; i < nr_pages; i++, tail++) {
		if (require_table_link(tail)) {
			new = (unsigned long *) get_zeroed_page(GFP_KERNEL);
			if (!new)
				goto no_sdbt;
			aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)new;
			/* Link current page to tail of chain */
			*tail = (unsigned long)(void *) new + 1;
			tail = new;
		}
		/* Tail is the entry in an SDBT */
		*tail = (unsigned long)pages[i];
		aux->sdb_index[i] = (unsigned long)pages[i];
		aux_sdb_init((unsigned long)pages[i]);
	}
	sfb->num_sdb = nr_pages;

	/* Link the last entry in the SDBT to the first SDBT */
	*tail = (unsigned long) sfb->sdbt + 1;
	sfb->tail = tail;

	/*
	 * Initially, all SDBs are zeroed and marked as empty, so there
	 * is no need to clear the full indicator when this event is
	 * first added.
	 */
	aux->empty_mark = sfb->num_sdb - 1;

	debug_sprintf_event(sfdbg, 4, "aux_buffer_setup: setup %lu SDBTs"
			    " and %lu SDBs\n",
			    sfb->num_sdbt, sfb->num_sdb);

	return aux;

no_sdbt:
	/* SDBs (AUX buffer pages) are freed by caller */
	for (i = 0; i < sfb->num_sdbt; i++)
		free_page(aux->sdbt_index[i]);
	kfree(aux->sdb_index);
no_sdb_index:
	kfree(aux->sdbt_index);
no_sdbt_index:
	kfree(aux);
no_aux:
	return NULL;
}
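
/* Sizing sketch: an AUX buffer of nr_pages = 16 needs
 * n_sdbt = DIV_ROUND_UP(16, 511) = 1 table, so sdbt_index holds one
 * entry and sdb_index holds 16; larger buffers grow the SDBT chain in
 * steps of 511 SDBs per table.
 */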
1742
1743 static void cpumsf_pmu_read(struct perf_event *event)
1744 {
1745 /* Nothing to do ... updates are interrupt-driven */
1746 }
1747
1748 /* Activate sampling control.
1749 * Next call of pmu_enable() starts sampling.
1750 */
1751 static void cpumsf_pmu_start(struct perf_event *event, int flags)
1752 {
1753 struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf);
1754
1755 if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
1756 return;
1757
1758 if (flags & PERF_EF_RELOAD)
1759 WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
1760
1761 perf_pmu_disable(event->pmu);
1762 event->hw.state = 0;
1763 cpuhw->lsctl.cs = 1;
1764 if (SAMPL_DIAG_MODE(&event->hw))
1765 cpuhw->lsctl.cd = 1;
1766 perf_pmu_enable(event->pmu);
1767 }
1768
1769 /* Deactivate sampling control.
1770 * Next call of pmu_enable() stops sampling.
1771 */
1772 static void cpumsf_pmu_stop(struct perf_event *event, int flags)
1773 {
1774 struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf);
1775
1776 if (event->hw.state & PERF_HES_STOPPED)
1777 return;
1778
1779 perf_pmu_disable(event->pmu);
1780 cpuhw->lsctl.cs = 0;
1781 cpuhw->lsctl.cd = 0;
1782 event->hw.state |= PERF_HES_STOPPED;
1783
1784 if ((flags & PERF_EF_UPDATE) && !(event->hw.state & PERF_HES_UPTODATE)) {
1785 hw_perf_event_update(event, 1);
1786 event->hw.state |= PERF_HES_UPTODATE;
1787 }
1788 perf_pmu_enable(event->pmu);
1789 }
1790
1791 static int cpumsf_pmu_add(struct perf_event *event, int flags)
1792 {
1793 struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf);
1794 struct aux_buffer *aux;
1795 int err;
1796
1797 if (cpuhw->flags & PMU_F_IN_USE)
1798 return -EAGAIN;
1799
1800 if (!SAMPL_DIAG_MODE(&event->hw) && !cpuhw->sfb.sdbt)
1801 return -EINVAL;
1802
1803 err = 0;
1804 perf_pmu_disable(event->pmu);
1805
1806 event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
1807
1808 /* Set up sampling controls. Always program the sampling register
1809 * using the SDB-table start. Reset the TEAR_REG event hardware register,
1810 * which is used by hw_perf_event_update() to store the sampling buffer
1811 * position after samples have been flushed.
1812 */
1813 cpuhw->lsctl.s = 0;
1814 cpuhw->lsctl.h = 1;
1815 cpuhw->lsctl.interval = SAMPL_RATE(&event->hw);
1816 if (!SAMPL_DIAG_MODE(&event->hw)) {
1817 cpuhw->lsctl.tear = (unsigned long) cpuhw->sfb.sdbt;
1818 cpuhw->lsctl.dear = *(unsigned long *) cpuhw->sfb.sdbt;
1819 hw_reset_registers(&event->hw, cpuhw->sfb.sdbt);
1820 }
1821
1822 /* Ensure the sampling functions are in the disabled state. If they
1823 * are, switch on the sampling enable control. */
1824 if (WARN_ON_ONCE(cpuhw->lsctl.es == 1 || cpuhw->lsctl.ed == 1)) {
1825 err = -EAGAIN;
1826 goto out;
1827 }
1828 if (SAMPL_DIAG_MODE(&event->hw)) {
1829 aux = perf_aux_output_begin(&cpuhw->handle, event);
1830 if (!aux) {
1831 err = -EINVAL;
1832 goto out;
1833 }
1834 err = aux_output_begin(&cpuhw->handle, aux, cpuhw);
1835 if (err)
1836 goto out;
1837 cpuhw->lsctl.ed = 1;
1838 }
1839 cpuhw->lsctl.es = 1;
1840
1841 /* Set in_use flag and store event */
1842 cpuhw->event = event;
1843 cpuhw->flags |= PMU_F_IN_USE;
1844
1845 if (flags & PERF_EF_START)
1846 cpumsf_pmu_start(event, PERF_EF_RELOAD);
1847 out:
1848 perf_event_update_userpage(event);
1849 perf_pmu_enable(event->pmu);
1850 return err;
1851 }
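
/*
 * Illustrative sketch (hypothetical helper, simplified): a mental model
 * of how the facility advances the registers that cpumsf_pmu_add()
 * programs above.  TEAR addresses the current SDBT entry and DEAR the
 * current SDB; when an SDB fills up, the walk moves to the next SDBT
 * entry, following table-link entries into the next SDBT.  The real
 * hardware performs this walk itself; this code is only an analogy.
 */
static inline void sf_advance_sample_regs(struct hws_lsctl_request_block *lsctl)
{
	unsigned long *tear = (unsigned long *) lsctl->tear;

	tear++;					/* next SDBT entry */
	if (*tear & 0x1ul)			/* table-link entry? */
		tear = (unsigned long *) (*tear & ~0x1ul);
	lsctl->tear = (unsigned long) tear;	/* new table position */
	lsctl->dear = *tear;			/* origin of the next SDB */
}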
1852
1853 static void cpumsf_pmu_del(struct perf_event *event, int flags)
1854 {
1855 struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf);
1856
1857 perf_pmu_disable(event->pmu);
1858 cpumsf_pmu_stop(event, PERF_EF_UPDATE);
1859
1860 cpuhw->lsctl.es = 0;
1861 cpuhw->lsctl.ed = 0;
1862 cpuhw->flags &= ~PMU_F_IN_USE;
1863 cpuhw->event = NULL;
1864
1865 if (SAMPL_DIAG_MODE(&event->hw))
1866 aux_output_end(&cpuhw->handle);
1867 perf_event_update_userpage(event);
1868 perf_pmu_enable(event->pmu);
1869 }
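
/*
 * Call-order note (illustrative, simplified): the perf core drives the
 * callbacks above on the event's CPU roughly as
 *
 *	cpumsf_pmu_add(event, PERF_EF_START);
 *	  -> cpumsf_pmu_start(event, PERF_EF_RELOAD);
 *	... measurement-alert interrupts drain the sampling buffer ...
 *	cpumsf_pmu_del(event, 0);
 *	  -> cpumsf_pmu_stop(event, PERF_EF_UPDATE);
 *
 * which is why add() only prepares the sampling controls, while
 * start()/stop() toggle lsctl.cs/cd and the next pmu_enable() loads
 * them into the hardware.
 */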
1870
1871 CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC, PERF_EVENT_CPUM_SF);
1872 CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC_DIAG, PERF_EVENT_CPUM_SF_DIAG);
1873
1874 /* Attribute list for CPU_SF.
1875 *
1876 * The availability depends on the CPU_MF sampling facility authorization
1877 * for basic + diagnostic samples. This is determined at initialization
1878 * time by the sampling facility device driver.
1879 * If the authorization for basic samples is turned off, it should
1880 * also be turned off for diagnostic sampling.
1881 *
1882 * During initialization, the device driver checks the authorization
1883 * level for diagnostic sampling and installs the attribute
1884 * file for diagnostic sampling if necessary.
1885 *
1886 * For now install a placeholder to reference all possible attributes:
1887 * SF_CYCLES_BASIC and SF_CYCLES_BASIC_DIAG.
1888 * Add another entry for the final NULL pointer.
1889 */
1890 enum {
1891 SF_CYCLES_BASIC_ATTR_IDX = 0,
1892 SF_CYCLES_BASIC_DIAG_ATTR_IDX,
1893 SF_CYCLES_ATTR_MAX
1894 };
1895
1896 static struct attribute *cpumsf_pmu_events_attr[SF_CYCLES_ATTR_MAX + 1] = {
1897 [SF_CYCLES_BASIC_ATTR_IDX] = CPUMF_EVENT_PTR(SF, SF_CYCLES_BASIC)
1898 };
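
/*
 * Usage sketch (illustrative): once the events group below is
 * registered, the authorized event codes are exported through sysfs,
 * e.g. under /sys/bus/event_source/devices/cpum_sf/events/, and can be
 * requested by name from user space:
 *
 *	# perf record -e cpum_sf/SF_CYCLES_BASIC/ -- sleep 1
 *
 * The event file names follow the CPUMF_EVENT_ATTR() identifiers;
 * whether the diagnostic event appears depends on the authorization
 * check in init_cpum_sampling_pmu().
 */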
1899
1900 PMU_FORMAT_ATTR(event, "config:0-63");
1901
1902 static struct attribute *cpumsf_pmu_format_attr[] = {
1903 &format_attr_event.attr,
1904 NULL,
1905 };
1906
1907 static struct attribute_group cpumsf_pmu_events_group = {
1908 .name = "events",
1909 .attrs = cpumsf_pmu_events_attr,
1910 };
1911 static struct attribute_group cpumsf_pmu_format_group = {
1912 .name = "format",
1913 .attrs = cpumsf_pmu_format_attr,
1914 };
1915 static const struct attribute_group *cpumsf_pmu_attr_groups[] = {
1916 &cpumsf_pmu_events_group,
1917 &cpumsf_pmu_format_group,
1918 NULL,
1919 };
1920
1921 static struct pmu cpumf_sampling = {
1922 .pmu_enable = cpumsf_pmu_enable,
1923 .pmu_disable = cpumsf_pmu_disable,
1924
1925 .event_init = cpumsf_pmu_event_init,
1926 .add = cpumsf_pmu_add,
1927 .del = cpumsf_pmu_del,
1928
1929 .start = cpumsf_pmu_start,
1930 .stop = cpumsf_pmu_stop,
1931 .read = cpumsf_pmu_read,
1932
1933 .attr_groups = cpumsf_pmu_attr_groups,
1934
1935 .setup_aux = aux_buffer_setup,
1936 .free_aux = aux_buffer_free,
1937 };
1938
1939 static void cpumf_measurement_alert(struct ext_code ext_code,
1940 unsigned int alert, unsigned long unused)
1941 {
1942 struct cpu_hw_sf *cpuhw;
1943
1944 if (!(alert & CPU_MF_INT_SF_MASK))
1945 return;
1946 inc_irq_stat(IRQEXT_CMS);
1947 cpuhw = this_cpu_ptr(&cpu_hw_sf);
1948
1949 /* Measurement alerts are shared and might happen when the PMU
1950 * is not reserved. Ignore these alerts in this case. */
1951 if (!(cpuhw->flags & PMU_F_RESERVED))
1952 return;
1953
1954 /* The processing below must take care of multiple alert events that
1955 * might be indicated concurrently. */
1956
1957 /* Program alert request */
1958 if (alert & CPU_MF_INT_SF_PRA) {
1959 if (cpuhw->flags & PMU_F_IN_USE) {
1960 if (SAMPL_DIAG_MODE(&cpuhw->event->hw))
1961 hw_collect_aux(cpuhw);
1962 else
1963 hw_perf_event_update(cpuhw->event, 0);
1964 } else
1965 WARN_ON_ONCE(!(cpuhw->flags & PMU_F_IN_USE));
1966 }
1967
1968 /* Report measurement alerts only for non-PRA codes */
1969 if (alert != CPU_MF_INT_SF_PRA)
1970 debug_sprintf_event(sfdbg, 6, "measurement alert: 0x%x\n", alert);
1971
1972 /* Sampling authorization change request */
1973 if (alert & CPU_MF_INT_SF_SACA)
1974 qsi(&cpuhw->qsi);
1975
1976 /* Loss of sample data due to high-priority machine activities */
1977 if (alert & CPU_MF_INT_SF_LSDA) {
1978 pr_err("Sample data was lost\n");
1979 cpuhw->flags |= PMU_F_ERR_LSDA;
1980 sf_disable();
1981 }
1982
1983 /* Invalid sampling buffer entry */
1984 if (alert & (CPU_MF_INT_SF_IAE|CPU_MF_INT_SF_ISE)) {
1985 pr_err("A sampling buffer entry is incorrect (alert=0x%x)\n",
1986 alert);
1987 cpuhw->flags |= PMU_F_ERR_IBE;
1988 sf_disable();
1989 }
1990 }
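
/*
 * Illustrative note: "alert" can carry several CPU_MF_INT_SF_* bits at
 * once (for example CPU_MF_INT_SF_PRA | CPU_MF_INT_SF_SACA), which is
 * why cpumf_measurement_alert() above tests each condition bit by bit
 * instead of switching on the value.
 */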
1991 static int cpusf_pmu_setup(unsigned int cpu, int flags)
1992 {
1993 /* Ignore the notification if no events are scheduled on the PMU.
1994 * This might be racy...
1995 */
1996 if (!atomic_read(&num_events))
1997 return 0;
1998
1999 local_irq_disable();
2000 setup_pmc_cpu(&flags);
2001 local_irq_enable();
2002 return 0;
2003 }
2004
2005 static int s390_pmu_sf_online_cpu(unsigned int cpu)
2006 {
2007 return cpusf_pmu_setup(cpu, PMC_INIT);
2008 }
2009
2010 static int s390_pmu_sf_offline_cpu(unsigned int cpu)
2011 {
2012 return cpusf_pmu_setup(cpu, PMC_RELEASE);
2013 }
2014
2015 static int param_get_sfb_size(char *buffer, const struct kernel_param *kp)
2016 {
2017 if (!cpum_sf_avail())
2018 return -ENODEV;
2019 return sprintf(buffer, "%lu,%lu", CPUM_SF_MIN_SDB, CPUM_SF_MAX_SDB);
2020 }
2021
2022 static int param_set_sfb_size(const char *val, const struct kernel_param *kp)
2023 {
2024 int rc;
2025 unsigned long min, max;
2026
2027 if (!cpum_sf_avail())
2028 return -ENODEV;
2029 if (!val || !strlen(val))
2030 return -EINVAL;
2031
2032 /* Valid parameter values: "min,max" or "max" */
2033 min = CPUM_SF_MIN_SDB;
2034 max = CPUM_SF_MAX_SDB;
2035 if (strchr(val, ','))
2036 rc = (sscanf(val, "%lu,%lu", &min, &max) == 2) ? 0 : -EINVAL;
2037 else
2038 rc = kstrtoul(val, 10, &max);
2039
2040 if (min < 2 || min >= max || max > get_num_physpages())
2041 rc = -EINVAL;
2042 if (rc)
2043 return rc;
2044
2045 sfb_set_limits(min, max);
2046 pr_info("The sampling buffer limits have changed to: "
2047 "min=%lu max=%lu (diag=x%lu)\n",
2048 CPUM_SF_MIN_SDB, CPUM_SF_MAX_SDB, CPUM_SF_SDB_DIAG_FACTOR);
2049 return 0;
2050 }
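
/*
 * Usage sketch (illustrative): the get/set handlers above back the
 * cpum_sfb_size core_param() declared at the end of this file.  The
 * limits can be given on the kernel command line or, by root, at run
 * time; a single value sets only the maximum:
 *
 *	cpum_sfb_size=32,1024			(command line, "min,max")
 *	# echo 2048 > /sys/module/kernel/parameters/cpum_sfb_size
 *	# cat /sys/module/kernel/parameters/cpum_sfb_size
 *	32,2048
 *
 * The sysfs location under module "kernel" follows from core_param();
 * writes fail with -EINVAL unless 2 <= min < max <= physical pages.
 */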
2051
2052 #define param_check_sfb_size(name, p) __param_check(name, p, void)
2053 static const struct kernel_param_ops param_ops_sfb_size = {
2054 .set = param_set_sfb_size,
2055 .get = param_get_sfb_size,
2056 };
2057
2058 #define RS_INIT_FAILURE_QSI 0x0001
2059 #define RS_INIT_FAILURE_BSDES 0x0002
2060 #define RS_INIT_FAILURE_ALRT 0x0003
2061 #define RS_INIT_FAILURE_PERF 0x0004
2062 static void __init pr_cpumsf_err(unsigned int reason)
2063 {
2064 pr_err("Sampling facility support for perf is not available: "
2065 "reason=%04x\n", reason);
2066 }
2067
2068 static int __init init_cpum_sampling_pmu(void)
2069 {
2070 struct hws_qsi_info_block si;
2071 int err;
2072
2073 if (!cpum_sf_avail())
2074 return -ENODEV;
2075
2076 memset(&si, 0, sizeof(si));
2077 if (qsi(&si)) {
2078 pr_cpumsf_err(RS_INIT_FAILURE_QSI);
2079 return -ENODEV;
2080 }
2081
2082 if (!si.as && !si.ad)
2083 return -ENODEV;
2084
2085 if (si.bsdes != sizeof(struct hws_basic_entry)) {
2086 pr_cpumsf_err(RS_INIT_FAILURE_BSDES);
2087 return -EINVAL;
2088 }
2089
2090 if (si.ad) {
2091 sfb_set_limits(CPUM_SF_MIN_SDB, CPUM_SF_MAX_SDB);
2092 /* Sampling of diagnostic data is authorized;
2093 * install the event in the attribute list of the PMU device.
2094 */
2095 cpumsf_pmu_events_attr[SF_CYCLES_BASIC_DIAG_ATTR_IDX] =
2096 CPUMF_EVENT_PTR(SF, SF_CYCLES_BASIC_DIAG);
2097 }
2098
2099 sfdbg = debug_register(KMSG_COMPONENT, 2, 1, 80);
2100 if (!sfdbg) {
2101 pr_err("Registering for s390dbf failed\n");
2102 return -ENOMEM;
2103 }
2104 debug_register_view(sfdbg, &debug_sprintf_view);
2105
2106 err = register_external_irq(EXT_IRQ_MEASURE_ALERT,
2107 cpumf_measurement_alert);
2108 if (err) {
2109 pr_cpumsf_err(RS_INIT_FAILURE_ALRT);
2110 debug_unregister(sfdbg);
2111 goto out;
2112 }
2113
2114 err = perf_pmu_register(&cpumf_sampling, "cpum_sf", PERF_TYPE_RAW);
2115 if (err) {
2116 pr_cpumsf_err(RS_INIT_FAILURE_PERF);
2117 unregister_external_irq(EXT_IRQ_MEASURE_ALERT,
2118 cpumf_measurement_alert);
2119 debug_unregister(sfdbg);
2120 goto out;
2121 }
2122
2123 cpuhp_setup_state(CPUHP_AP_PERF_S390_SF_ONLINE, "perf/s390/sf:online",
2124 s390_pmu_sf_online_cpu, s390_pmu_sf_offline_cpu);
2125 out:
2126 return err;
2127 }
2128 arch_initcall(init_cpum_sampling_pmu);
2129 core_param(cpum_sfb_size, CPUM_SF_MAX_SDB, sfb_size, 0640);