]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blame - arch/x86/kernel/ds.c
mm, x86, ptrace, bts: defer branch trace stopping
[mirror_ubuntu-bionic-kernel.git] / arch / x86 / kernel / ds.c
CommitLineData
eee3af4a
MM
1/*
2 * Debug Store support
3 *
4 * This provides a low-level interface to the hardware's Debug Store
93fa7636 5 * feature that is used for branch trace store (BTS) and
eee3af4a
MM
6 * precise-event based sampling (PEBS).
7 *
93fa7636 8 * It manages:
c2724775 9 * - DS and BTS hardware configuration
6abb11ae 10 * - buffer overflow handling (to be done)
93fa7636 11 * - buffer access
eee3af4a 12 *
c2724775
MM
13 * It does not do:
14 * - security checking (is the caller allowed to trace the task)
15 * - buffer allocation (memory accounting)
eee3af4a 16 *
eee3af4a 17 *
ba2607fe
MM
18 * Copyright (C) 2007-2009 Intel Corporation.
19 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
eee3af4a
MM
20 */
21
e9a22d1f 22#include <linux/kernel.h>
eee3af4a 23#include <linux/string.h>
e9a22d1f 24#include <linux/errno.h>
93fa7636 25#include <linux/sched.h>
e9a22d1f 26#include <linux/slab.h>
3c933904 27#include <linux/mm.h>
e9a22d1f
IM
28
29#include <asm/ds.h>
93fa7636 30
8a327f6d 31#include "ds_selftest.h"
93fa7636
MM
32
33/*
e9a22d1f 34 * The configuration for a particular DS hardware implementation:
93fa7636
MM
35 */
36struct ds_configuration {
e9a22d1f
IM
37 /* The name of the configuration: */
38 const char *name;
39
40 /* The size of pointer-typed fields in DS, BTS, and PEBS: */
41 unsigned char sizeof_ptr_field;
42
43 /* The size of a BTS/PEBS record in bytes: */
44 unsigned char sizeof_rec[2];
45
46 /* Control bit-masks indexed by enum ds_feature: */
47 unsigned long ctl[dsf_ctl_max];
93fa7636 48};
c2724775
MM
49static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
50
51#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id())
52
e9a22d1f
IM
53/* Maximal size of a DS configuration: */
54#define MAX_SIZEOF_DS (12 * 8)
55
56/* Maximal size of a BTS record: */
57#define MAX_SIZEOF_BTS (3 * 8)
c2724775 58
e9a22d1f
IM
59/* BTS and PEBS buffer alignment: */
60#define DS_ALIGNMENT (1 << 3)
c2724775 61
e9a22d1f
IM
62/* Mask of control bits in the DS MSR register: */
63#define BTS_CONTROL \
64 ( ds_cfg.ctl[dsf_bts] | \
65 ds_cfg.ctl[dsf_bts_kernel] | \
66 ds_cfg.ctl[dsf_bts_user] | \
67 ds_cfg.ctl[dsf_bts_overflow] )
eee3af4a 68
ca0002a1
MM
69/*
70 * A BTS or PEBS tracer.
71 *
72 * This holds the configuration of the tracer and serves as a handle
73 * to identify tracers.
74 */
75struct ds_tracer {
b8e47195 76 /* The DS context (partially) owned by this tracer. */
e9a22d1f 77 struct ds_context *context;
b8e47195 78 /* The buffer provided on ds_request() and its size in bytes. */
e9a22d1f
IM
79 void *buffer;
80 size_t size;
ca0002a1
MM
81};
82
83struct bts_tracer {
e9a22d1f
IM
84 /* The common DS part: */
85 struct ds_tracer ds;
86
87 /* The trace including the DS configuration: */
88 struct bts_trace trace;
89
90 /* Buffer overflow notification function: */
91 bts_ovfl_callback_t ovfl;
cac94f97
MM
92
93 /* Active flags affecting trace collection. */
94 unsigned int flags;
ca0002a1
MM
95};
96
97struct pebs_tracer {
e9a22d1f
IM
98 /* The common DS part: */
99 struct ds_tracer ds;
100
101 /* The trace including the DS configuration: */
102 struct pebs_trace trace;
103
104 /* Buffer overflow notification function: */
105 pebs_ovfl_callback_t ovfl;
ca0002a1 106};
eee3af4a
MM
107
108/*
109 * Debug Store (DS) save area configuration (see Intel64 and IA32
110 * Architectures Software Developer's Manual, section 18.5)
111 *
112 * The DS configuration consists of the following fields; different
113 * architectures vary in the size of those fields.
e9a22d1f 114 *
eee3af4a
MM
115 * - double-word aligned base linear address of the BTS buffer
116 * - write pointer into the BTS buffer
117 * - end linear address of the BTS buffer (one byte beyond the end of
118 * the buffer)
119 * - interrupt pointer into BTS buffer
120 * (interrupt occurs when write pointer passes interrupt pointer)
121 * - double-word aligned base linear address of the PEBS buffer
122 * - write pointer into the PEBS buffer
123 * - end linear address of the PEBS buffer (one byte beyond the end of
124 * the buffer)
125 * - interrupt pointer into PEBS buffer
126 * (interrupt occurs when write pointer passes interrupt pointer)
127 * - value to which counter is reset following counter overflow
128 *
93fa7636
MM
129 * Later architectures use 64bit pointers throughout, whereas earlier
130 * architectures use 32bit pointers in 32bit mode.
eee3af4a 131 *
eee3af4a 132 *
93fa7636
MM
133 * We compute the base address for the first 8 fields based on:
134 * - the field size stored in the DS configuration
135 * - the relative field position
136 * - an offset giving the start of the respective region
eee3af4a 137 *
93fa7636
MM
138 * This offset is further used to index various arrays holding
139 * information for BTS and PEBS at the respective index.
eee3af4a 140 *
93fa7636
MM
141 * On later 32bit processors, we only access the lower 32bit of the
142 * 64bit pointer fields. The upper halves will be zeroed out.
eee3af4a
MM
143 */
144
93fa7636
MM
/* The four per-buffer fields of the DS save area, in hardware order. */
enum ds_field {
	ds_buffer_base = 0,
	ds_index,
	ds_absolute_maximum,
	ds_interrupt_threshold,
};
eee3af4a 151
/* Selects the BTS or the PEBS part of the DS save area. */
enum ds_qualifier {
	ds_bts = 0,
	ds_pebs
};
156
e9a22d1f
IM
157static inline unsigned long
158ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)
93fa7636 159{
bc44fb5f 160 base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
93fa7636
MM
161 return *(unsigned long *)base;
162}
163
e9a22d1f
IM
164static inline void
165ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field,
166 unsigned long value)
93fa7636 167{
bc44fb5f 168 base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
93fa7636
MM
169 (*(unsigned long *)base) = value;
170}
171
172
eee3af4a 173/*
6abb11ae 174 * Locking is done only for allocating BTS or PEBS resources.
eee3af4a 175 */
c2724775 176static DEFINE_SPINLOCK(ds_lock);
eee3af4a 177
eee3af4a 178/*
93fa7636
MM
179 * We either support (system-wide) per-cpu or per-thread allocation.
180 * We distinguish the two based on the task_struct pointer, where a
181 * NULL pointer indicates per-cpu allocation for the current cpu.
182 *
183 * Allocations are use-counted. As soon as resources are allocated,
184 * further allocations must be of the same type (per-cpu or
185 * per-thread). We model this by counting allocations (i.e. the number
186 * of tracers of a certain type) for one type negatively:
187 * =0 no tracers
188 * >0 number of per-thread tracers
189 * <0 number of per-cpu tracers
190 *
93fa7636
MM
191 * Tracers essentially give the number of ds contexts for a certain
192 * type of allocation.
eee3af4a 193 */
c2724775 194static atomic_t tracers = ATOMIC_INIT(0);
93fa7636
MM
195
196static inline void get_tracer(struct task_struct *task)
eee3af4a 197{
c2724775
MM
198 if (task)
199 atomic_inc(&tracers);
200 else
201 atomic_dec(&tracers);
eee3af4a 202}
93fa7636
MM
203
204static inline void put_tracer(struct task_struct *task)
eee3af4a 205{
c2724775
MM
206 if (task)
207 atomic_dec(&tracers);
208 else
209 atomic_inc(&tracers);
eee3af4a 210}
93fa7636
MM
211
212static inline int check_tracer(struct task_struct *task)
eee3af4a 213{
c2724775
MM
214 return task ?
215 (atomic_read(&tracers) >= 0) :
216 (atomic_read(&tracers) <= 0);
eee3af4a 217}
93fa7636
MM
218
219
220/*
221 * The DS context is either attached to a thread or to a cpu:
222 * - in the former case, the thread_struct contains a pointer to the
223 * attached context.
224 * - in the latter case, we use a static array of per-cpu context
225 * pointers.
226 *
227 * Contexts are use-counted. They are allocated on first access and
228 * deallocated when the last user puts the context.
93fa7636 229 */
c2724775 230struct ds_context {
e9a22d1f
IM
231 /* The DS configuration; goes into MSR_IA32_DS_AREA: */
232 unsigned char ds[MAX_SIZEOF_DS];
233
234 /* The owner of the BTS and PEBS configuration, respectively: */
235 struct bts_tracer *bts_master;
236 struct pebs_tracer *pebs_master;
237
238 /* Use count: */
c2724775 239 unsigned long count;
e9a22d1f
IM
240
241 /* Pointer to the context pointer field: */
242 struct ds_context **this;
243
244 /* The traced task; NULL for current cpu: */
245 struct task_struct *task;
c2724775
MM
246};
247
248static DEFINE_PER_CPU(struct ds_context *, system_context_array);
93fa7636 249
c2724775 250#define system_context per_cpu(system_context_array, smp_processor_id())
93fa7636 251
cc1dc6d0
MM
252
253static inline struct ds_context *ds_get_context(struct task_struct *task)
eee3af4a 254{
93fa7636 255 struct ds_context **p_context =
c2724775 256 (task ? &task->thread.ds_ctx : &system_context);
cc1dc6d0
MM
257 struct ds_context *context = NULL;
258 struct ds_context *new_context = NULL;
de90add3 259 unsigned long irq;
93fa7636 260
c78a3956
MM
261 /*
262 * Chances are small that we already have a context.
263 *
264 * Contexts for per-cpu tracing are allocated using
265 * smp_call_function(). We must not sleep.
266 */
267 new_context = kzalloc(sizeof(*new_context), GFP_ATOMIC);
cc1dc6d0
MM
268 if (!new_context)
269 return NULL;
de90add3 270
cc1dc6d0 271 spin_lock_irqsave(&ds_lock, irq);
93fa7636 272
cc1dc6d0
MM
273 context = *p_context;
274 if (!context) {
275 context = new_context;
93fa7636 276
cc1dc6d0
MM
277 context->this = p_context;
278 context->task = task;
279 context->count = 0;
93fa7636 280
cc1dc6d0
MM
281 if (task)
282 set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
c2724775 283
cc1dc6d0
MM
284 if (!task || (task == current))
285 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds);
c2724775 286
cc1dc6d0
MM
287 *p_context = context;
288 }
c2724775 289
cc1dc6d0 290 context->count++;
c2724775 291
cc1dc6d0 292 spin_unlock_irqrestore(&ds_lock, irq);
93fa7636 293
cc1dc6d0
MM
294 if (context != new_context)
295 kfree(new_context);
93fa7636
MM
296
297 return context;
eee3af4a 298}
93fa7636 299
93fa7636 300static inline void ds_put_context(struct ds_context *context)
eee3af4a 301{
de90add3
MM
302 unsigned long irq;
303
93fa7636
MM
304 if (!context)
305 return;
306
de90add3 307 spin_lock_irqsave(&ds_lock, irq);
93fa7636 308
c2724775
MM
309 if (--context->count) {
310 spin_unlock_irqrestore(&ds_lock, irq);
311 return;
312 }
93fa7636 313
573da422 314 *(context->this) = NULL;
93fa7636
MM
315
316 if (context->task)
317 clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
318
319 if (!context->task || (context->task == current))
320 wrmsrl(MSR_IA32_DS_AREA, 0);
321
de90add3 322 spin_unlock_irqrestore(&ds_lock, irq);
c2724775
MM
323
324 kfree(context);
eee3af4a 325}
93fa7636
MM
326
327
328/*
c2724775 329 * Call the tracer's callback on a buffer overflow.
93fa7636 330 *
93fa7636
MM
331 * context: the ds context
332 * qual: the buffer type
333 */
ca0002a1
MM
334static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
335{
336 switch (qual) {
c2724775
MM
337 case ds_bts:
338 if (context->bts_master &&
339 context->bts_master->ovfl)
340 context->bts_master->ovfl(context->bts_master);
341 break;
342 case ds_pebs:
343 if (context->pebs_master &&
344 context->pebs_master->ovfl)
345 context->pebs_master->ovfl(context->pebs_master);
ca0002a1 346 break;
ca0002a1 347 }
c2724775
MM
348}
349
350
351/*
352 * Write raw data into the BTS or PEBS buffer.
353 *
354 * The remainder of any partially written record is zeroed out.
355 *
356 * context: the DS context
e9a22d1f
IM
357 * qual: the buffer type
358 * record: the data to write
359 * size: the size of the data
c2724775
MM
360 */
361static int ds_write(struct ds_context *context, enum ds_qualifier qual,
362 const void *record, size_t size)
363{
364 int bytes_written = 0;
365
366 if (!record)
367 return -EINVAL;
368
369 while (size) {
370 unsigned long base, index, end, write_end, int_th;
371 unsigned long write_size, adj_write_size;
372
373 /*
b8e47195 374 * Write as much as possible without producing an
c2724775
MM
375 * overflow interrupt.
376 *
b8e47195 377 * Interrupt_threshold must either be
c2724775
MM
378 * - bigger than absolute_maximum or
379 * - point to a record between buffer_base and absolute_maximum
380 *
b8e47195 381 * Index points to a valid record.
c2724775
MM
382 */
383 base = ds_get(context->ds, qual, ds_buffer_base);
384 index = ds_get(context->ds, qual, ds_index);
385 end = ds_get(context->ds, qual, ds_absolute_maximum);
386 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
387
388 write_end = min(end, int_th);
389
b8e47195
MM
390 /*
391 * If we are already beyond the interrupt threshold,
392 * we fill the entire buffer.
393 */
c2724775
MM
394 if (write_end <= index)
395 write_end = end;
396
397 if (write_end <= index)
398 break;
399
400 write_size = min((unsigned long) size, write_end - index);
401 memcpy((void *)index, record, write_size);
402
403 record = (const char *)record + write_size;
404 size -= write_size;
405 bytes_written += write_size;
406
407 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
408 adj_write_size *= ds_cfg.sizeof_rec[qual];
409
b8e47195 410 /* Zero out trailing bytes. */
c2724775
MM
411 memset((char *)index + write_size, 0,
412 adj_write_size - write_size);
413 index += adj_write_size;
414
415 if (index >= end)
416 index = base;
417 ds_set(context->ds, qual, ds_index, index);
418
419 if (index >= int_th)
420 ds_overflow(context, qual);
421 }
422
423 return bytes_written;
424}
425
426
427/*
428 * Branch Trace Store (BTS) uses the following format. Different
429 * architectures vary in the size of those fields.
430 * - source linear address
431 * - destination linear address
432 * - flags
433 *
434 * Later architectures use 64bit pointers throughout, whereas earlier
435 * architectures use 32bit pointers in 32bit mode.
436 *
bc44fb5f 437 * We compute the base address for the fields based on:
c2724775
MM
438 * - the field size stored in the DS configuration
439 * - the relative field position
440 *
441 * In order to store additional information in the BTS buffer, we use
442 * a special source address to indicate that the record requires
443 * special interpretation.
444 *
445 * Netburst indicated via a bit in the flags field whether the branch
446 * was predicted; this is ignored.
447 *
448 * We use two levels of abstraction:
449 * - the raw data level defined here
450 * - an arch-independent level defined in ds.h
451 */
452
453enum bts_field {
454 bts_from,
455 bts_to,
456 bts_flags,
457
e9a22d1f
IM
458 bts_qual = bts_from,
459 bts_jiffies = bts_to,
460 bts_pid = bts_flags,
c2724775 461
e9a22d1f
IM
462 bts_qual_mask = (bts_qual_max - 1),
463 bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
c2724775
MM
464};
465
466static inline unsigned long bts_get(const char *base, enum bts_field field)
467{
bc44fb5f 468 base += (ds_cfg.sizeof_ptr_field * field);
c2724775
MM
469 return *(unsigned long *)base;
470}
471
472static inline void bts_set(char *base, enum bts_field field, unsigned long val)
473{
bc44fb5f 474 base += (ds_cfg.sizeof_ptr_field * field);;
c2724775
MM
475 (*(unsigned long *)base) = val;
476}
477
478
479/*
480 * The raw BTS data is architecture dependent.
481 *
482 * For higher-level users, we give an arch-independent view.
483 * - ds.h defines struct bts_struct
484 * - bts_read translates one raw bts record into a bts_struct
485 * - bts_write translates one bts_struct into the raw format and
486 * writes it into the top of the parameter tracer's buffer.
487 *
488 * return: bytes read/written on success; -Eerrno, otherwise
489 */
e9a22d1f
IM
490static int
491bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
c2724775
MM
492{
493 if (!tracer)
494 return -EINVAL;
495
496 if (at < tracer->trace.ds.begin)
497 return -EINVAL;
498
499 if (tracer->trace.ds.end < (at + tracer->trace.ds.size))
500 return -EINVAL;
501
502 memset(out, 0, sizeof(*out));
503 if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
504 out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
505 out->variant.timestamp.jiffies = bts_get(at, bts_jiffies);
506 out->variant.timestamp.pid = bts_get(at, bts_pid);
507 } else {
508 out->qualifier = bts_branch;
509 out->variant.lbr.from = bts_get(at, bts_from);
510 out->variant.lbr.to = bts_get(at, bts_to);
d072c25f
MM
511
512 if (!out->variant.lbr.from && !out->variant.lbr.to)
513 out->qualifier = bts_invalid;
c2724775
MM
514 }
515
516 return ds_cfg.sizeof_rec[ds_bts];
517}
518
519static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
520{
521 unsigned char raw[MAX_SIZEOF_BTS];
522
523 if (!tracer)
524 return -EINVAL;
525
526 if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts])
527 return -EOVERFLOW;
528
529 switch (in->qualifier) {
530 case bts_invalid:
531 bts_set(raw, bts_from, 0);
532 bts_set(raw, bts_to, 0);
533 bts_set(raw, bts_flags, 0);
534 break;
535 case bts_branch:
536 bts_set(raw, bts_from, in->variant.lbr.from);
537 bts_set(raw, bts_to, in->variant.lbr.to);
538 bts_set(raw, bts_flags, 0);
539 break;
540 case bts_task_arrives:
541 case bts_task_departs:
542 bts_set(raw, bts_qual, (bts_escape | in->qualifier));
543 bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies);
544 bts_set(raw, bts_pid, in->variant.timestamp.pid);
ca0002a1 545 break;
c2724775
MM
546 default:
547 return -EINVAL;
ca0002a1 548 }
c2724775
MM
549
550 return ds_write(tracer->ds.context, ds_bts, raw,
551 ds_cfg.sizeof_rec[ds_bts]);
eee3af4a 552}
93fa7636
MM
553
554
c2724775
MM
555static void ds_write_config(struct ds_context *context,
556 struct ds_trace *cfg, enum ds_qualifier qual)
557{
558 unsigned char *ds = context->ds;
559
560 ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin);
561 ds_set(ds, qual, ds_index, (unsigned long)cfg->top);
562 ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end);
563 ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith);
564}
565
566static void ds_read_config(struct ds_context *context,
567 struct ds_trace *cfg, enum ds_qualifier qual)
eee3af4a 568{
c2724775
MM
569 unsigned char *ds = context->ds;
570
571 cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base);
572 cfg->top = (void *)ds_get(ds, qual, ds_index);
573 cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum);
574 cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold);
575}
576
577static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
578 void *base, size_t size, size_t ith,
579 unsigned int flags) {
93fa7636 580 unsigned long buffer, adj;
ca0002a1 581
b8e47195
MM
582 /*
583 * Adjust the buffer address and size to meet alignment
ca0002a1
MM
584 * constraints:
585 * - buffer is double-word aligned
586 * - size is multiple of record size
587 *
588 * We checked the size at the very beginning; we have enough
589 * space to do the adjustment.
590 */
591 buffer = (unsigned long)base;
592
593 adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
594 buffer += adj;
595 size -= adj;
596
c2724775
MM
597 trace->n = size / ds_cfg.sizeof_rec[qual];
598 trace->size = ds_cfg.sizeof_rec[qual];
ca0002a1 599
c2724775 600 size = (trace->n * trace->size);
ca0002a1 601
c2724775
MM
602 trace->begin = (void *)buffer;
603 trace->top = trace->begin;
604 trace->end = (void *)(buffer + size);
b8e47195
MM
605 /*
606 * The value for 'no threshold' is -1, which will set the
ca0002a1
MM
607 * threshold outside of the buffer, just like we want it.
608 */
c2724775
MM
609 trace->ith = (void *)(buffer + size - ith);
610
611 trace->flags = flags;
ca0002a1
MM
612}
613
c2724775
MM
614
615static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
616 enum ds_qualifier qual, struct task_struct *task,
617 void *base, size_t size, size_t th, unsigned int flags)
ca0002a1
MM
618{
619 struct ds_context *context;
ca0002a1 620 int error;
93fa7636 621
bc44fb5f
MM
622 error = -EOPNOTSUPP;
623 if (!ds_cfg.sizeof_rec[qual])
624 goto out;
625
6abb11ae
MM
626 error = -EINVAL;
627 if (!base)
628 goto out;
629
b8e47195 630 /* We require some space to do alignment adjustments below. */
ca0002a1
MM
631 error = -EINVAL;
632 if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
633 goto out;
93fa7636 634
ca0002a1
MM
635 if (th != (size_t)-1) {
636 th *= ds_cfg.sizeof_rec[qual];
637
638 error = -EINVAL;
639 if (size <= th)
640 goto out;
641 }
642
ca0002a1
MM
643 tracer->buffer = base;
644 tracer->size = size;
93fa7636 645
ca0002a1
MM
646 error = -ENOMEM;
647 context = ds_get_context(task);
93fa7636 648 if (!context)
ca0002a1
MM
649 goto out;
650 tracer->context = context;
651
c2724775 652 ds_init_ds_trace(trace, qual, base, size, th, flags);
de90add3 653
c2724775 654 error = 0;
ca0002a1 655 out:
93fa7636 656 return error;
eee3af4a 657}
93fa7636 658
ca0002a1
MM
659struct bts_tracer *ds_request_bts(struct task_struct *task,
660 void *base, size_t size,
c2724775
MM
661 bts_ovfl_callback_t ovfl, size_t th,
662 unsigned int flags)
eee3af4a 663{
ca0002a1 664 struct bts_tracer *tracer;
c2724775 665 unsigned long irq;
ca0002a1 666 int error;
93fa7636 667
b8e47195 668 /* Buffer overflow notification is not yet implemented. */
ca0002a1
MM
669 error = -EOPNOTSUPP;
670 if (ovfl)
671 goto out;
672
c78a3956
MM
673 /*
674 * Per-cpu tracing is typically requested using smp_call_function().
675 * We must not sleep.
676 */
ca0002a1 677 error = -ENOMEM;
c78a3956 678 tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC);
ca0002a1
MM
679 if (!tracer)
680 goto out;
681 tracer->ovfl = ovfl;
682
c2724775
MM
683 error = ds_request(&tracer->ds, &tracer->trace.ds,
684 ds_bts, task, base, size, th, flags);
ca0002a1
MM
685 if (error < 0)
686 goto out_tracer;
687
c2724775
MM
688
689 spin_lock_irqsave(&ds_lock, irq);
690
691 error = -EPERM;
692 if (!check_tracer(task))
693 goto out_unlock;
694 get_tracer(task);
695
696 error = -EPERM;
697 if (tracer->ds.context->bts_master)
698 goto out_put_tracer;
699 tracer->ds.context->bts_master = tracer;
700
701 spin_unlock_irqrestore(&ds_lock, irq);
702
703
704 tracer->trace.read = bts_read;
705 tracer->trace.write = bts_write;
706
707 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
708 ds_resume_bts(tracer);
709
ca0002a1
MM
710 return tracer;
711
c2724775
MM
712 out_put_tracer:
713 put_tracer(task);
714 out_unlock:
715 spin_unlock_irqrestore(&ds_lock, irq);
716 ds_put_context(tracer->ds.context);
ca0002a1 717 out_tracer:
6abb11ae 718 kfree(tracer);
ca0002a1
MM
719 out:
720 return ERR_PTR(error);
eee3af4a 721}
93fa7636 722
ca0002a1
MM
723struct pebs_tracer *ds_request_pebs(struct task_struct *task,
724 void *base, size_t size,
c2724775
MM
725 pebs_ovfl_callback_t ovfl, size_t th,
726 unsigned int flags)
eee3af4a 727{
ca0002a1 728 struct pebs_tracer *tracer;
c2724775 729 unsigned long irq;
93fa7636
MM
730 int error;
731
b8e47195 732 /* Buffer overflow notification is not yet implemented. */
ca0002a1
MM
733 error = -EOPNOTSUPP;
734 if (ovfl)
93fa7636
MM
735 goto out;
736
c78a3956
MM
737 /*
738 * Per-cpu tracing is typically requested using smp_call_function().
739 * We must not sleep.
740 */
ca0002a1 741 error = -ENOMEM;
c78a3956 742 tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC);
ca0002a1
MM
743 if (!tracer)
744 goto out;
745 tracer->ovfl = ovfl;
93fa7636 746
c2724775
MM
747 error = ds_request(&tracer->ds, &tracer->trace.ds,
748 ds_pebs, task, base, size, th, flags);
ca0002a1
MM
749 if (error < 0)
750 goto out_tracer;
93fa7636 751
c2724775
MM
752 spin_lock_irqsave(&ds_lock, irq);
753
754 error = -EPERM;
755 if (!check_tracer(task))
756 goto out_unlock;
757 get_tracer(task);
758
759 error = -EPERM;
760 if (tracer->ds.context->pebs_master)
761 goto out_put_tracer;
762 tracer->ds.context->pebs_master = tracer;
763
764 spin_unlock_irqrestore(&ds_lock, irq);
765
73bf1b62 766 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
c2724775
MM
767 ds_resume_pebs(tracer);
768
ca0002a1
MM
769 return tracer;
770
c2724775
MM
771 out_put_tracer:
772 put_tracer(task);
773 out_unlock:
774 spin_unlock_irqrestore(&ds_lock, irq);
775 ds_put_context(tracer->ds.context);
ca0002a1 776 out_tracer:
6abb11ae 777 kfree(tracer);
93fa7636 778 out:
ca0002a1
MM
779 return ERR_PTR(error);
780}
781
c2724775 782void ds_release_bts(struct bts_tracer *tracer)
eee3af4a 783{
ca0002a1 784 if (!tracer)
c2724775 785 return;
ca0002a1 786
c2724775 787 ds_suspend_bts(tracer);
ca0002a1 788
c2724775
MM
789 WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
790 tracer->ds.context->bts_master = NULL;
93fa7636 791
c2724775
MM
792 put_tracer(tracer->ds.context->task);
793 ds_put_context(tracer->ds.context);
ca0002a1 794
ca0002a1 795 kfree(tracer);
eee3af4a 796}
93fa7636 797
c2724775 798void ds_suspend_bts(struct bts_tracer *tracer)
eee3af4a 799{
c2724775 800 struct task_struct *task;
ca0002a1 801
ca0002a1 802 if (!tracer)
c2724775 803 return;
ca0002a1 804
cac94f97
MM
805 tracer->flags = 0;
806
c2724775 807 task = tracer->ds.context->task;
ca0002a1 808
c2724775
MM
809 if (!task || (task == current))
810 update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL);
ca0002a1 811
c2724775
MM
812 if (task) {
813 task->thread.debugctlmsr &= ~BTS_CONTROL;
eee3af4a 814
c2724775
MM
815 if (!task->thread.debugctlmsr)
816 clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
817 }
93fa7636 818}
eee3af4a 819
c2724775 820void ds_resume_bts(struct bts_tracer *tracer)
93fa7636 821{
c2724775
MM
822 struct task_struct *task;
823 unsigned long control;
eee3af4a 824
ca0002a1 825 if (!tracer)
c2724775 826 return;
eee3af4a 827
cac94f97
MM
828 tracer->flags = tracer->trace.ds.flags;
829
c2724775 830 task = tracer->ds.context->task;
ca0002a1 831
c2724775
MM
832 control = ds_cfg.ctl[dsf_bts];
833 if (!(tracer->trace.ds.flags & BTS_KERNEL))
834 control |= ds_cfg.ctl[dsf_bts_kernel];
835 if (!(tracer->trace.ds.flags & BTS_USER))
836 control |= ds_cfg.ctl[dsf_bts_user];
eee3af4a 837
c2724775
MM
838 if (task) {
839 task->thread.debugctlmsr |= control;
840 set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
841 }
ca0002a1 842
c2724775
MM
843 if (!task || (task == current))
844 update_debugctlmsr(get_debugctlmsr() | control);
eee3af4a
MM
845}
846
c2724775 847void ds_release_pebs(struct pebs_tracer *tracer)
eee3af4a 848{
ca0002a1 849 if (!tracer)
c2724775 850 return;
93fa7636 851
c2724775 852 ds_suspend_pebs(tracer);
93fa7636 853
c2724775
MM
854 WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
855 tracer->ds.context->pebs_master = NULL;
eee3af4a 856
c2724775
MM
857 put_tracer(tracer->ds.context->task);
858 ds_put_context(tracer->ds.context);
eee3af4a 859
c2724775 860 kfree(tracer);
a95d67f8
MM
861}
862
void ds_suspend_pebs(struct pebs_tracer *tracer)
{
	/* Intentionally empty; not implemented. */
}
eee3af4a 867
void ds_resume_pebs(struct pebs_tracer *tracer)
{
	/* Intentionally empty; not implemented. */
}
872
c2724775 873const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
eee3af4a 874{
ca0002a1 875 if (!tracer)
c2724775 876 return NULL;
ca0002a1 877
c2724775
MM
878 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
879 return &tracer->trace;
93fa7636 880}
eee3af4a 881
c2724775 882const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
93fa7636 883{
ca0002a1 884 if (!tracer)
c2724775 885 return NULL;
ca0002a1 886
c2724775
MM
887 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
888 tracer->trace.reset_value =
bc44fb5f
MM
889 *(u64 *)(tracer->ds.context->ds +
890 (ds_cfg.sizeof_ptr_field * 8));
ca0002a1 891
c2724775 892 return &tracer->trace;
93fa7636 893}
eee3af4a 894
c2724775 895int ds_reset_bts(struct bts_tracer *tracer)
93fa7636 896{
ca0002a1
MM
897 if (!tracer)
898 return -EINVAL;
899
c2724775 900 tracer->trace.ds.top = tracer->trace.ds.begin;
ca0002a1 901
c2724775
MM
902 ds_set(tracer->ds.context->ds, ds_bts, ds_index,
903 (unsigned long)tracer->trace.ds.top);
ca0002a1
MM
904
905 return 0;
93fa7636 906}
eee3af4a 907
c2724775 908int ds_reset_pebs(struct pebs_tracer *tracer)
93fa7636 909{
ca0002a1
MM
910 if (!tracer)
911 return -EINVAL;
eee3af4a 912
c2724775 913 tracer->trace.ds.top = tracer->trace.ds.begin;
eee3af4a 914
c2724775
MM
915 ds_set(tracer->ds.context->ds, ds_bts, ds_index,
916 (unsigned long)tracer->trace.ds.top);
93fa7636 917
ca0002a1 918 return 0;
eee3af4a
MM
919}
920
ca0002a1 921int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
eee3af4a 922{
ca0002a1
MM
923 if (!tracer)
924 return -EINVAL;
eee3af4a 925
bc44fb5f
MM
926 *(u64 *)(tracer->ds.context->ds +
927 (ds_cfg.sizeof_ptr_field * 8)) = value;
93fa7636 928
ca0002a1 929 return 0;
93fa7636
MM
930}
931
c2724775 932static const struct ds_configuration ds_cfg_netburst = {
ba2607fe 933 .name = "Netburst",
c2724775
MM
934 .ctl[dsf_bts] = (1 << 2) | (1 << 3),
935 .ctl[dsf_bts_kernel] = (1 << 5),
936 .ctl[dsf_bts_user] = (1 << 6),
eee3af4a 937};
c2724775 938static const struct ds_configuration ds_cfg_pentium_m = {
ba2607fe 939 .name = "Pentium M",
c2724775 940 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
eee3af4a 941};
ba2607fe
MM
942static const struct ds_configuration ds_cfg_core2_atom = {
943 .name = "Core 2/Atom",
c2724775
MM
944 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
945 .ctl[dsf_bts_kernel] = (1 << 9),
946 .ctl[dsf_bts_user] = (1 << 10),
c2724775 947};
eee3af4a 948
c2724775 949static void
bc44fb5f
MM
950ds_configure(const struct ds_configuration *cfg,
951 struct cpuinfo_x86 *cpu)
eee3af4a 952{
bc44fb5f
MM
953 unsigned long nr_pebs_fields = 0;
954
955 printk(KERN_INFO "[ds] using %s configuration\n", cfg->name);
956
957#ifdef __i386__
958 nr_pebs_fields = 10;
959#else
960 nr_pebs_fields = 18;
961#endif
962
c2724775 963 memset(&ds_cfg, 0, sizeof(ds_cfg));
eee3af4a 964 ds_cfg = *cfg;
ca0002a1 965
bc44fb5f
MM
966 ds_cfg.sizeof_ptr_field =
967 (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4);
c2724775 968
bc44fb5f
MM
969 ds_cfg.sizeof_rec[ds_bts] = ds_cfg.sizeof_ptr_field * 3;
970 ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields;
971
972 if (!cpu_has(cpu, X86_FEATURE_BTS)) {
973 ds_cfg.sizeof_rec[ds_bts] = 0;
c2724775
MM
974 printk(KERN_INFO "[ds] bts not available\n");
975 }
bc44fb5f
MM
976 if (!cpu_has(cpu, X86_FEATURE_PEBS)) {
977 ds_cfg.sizeof_rec[ds_pebs] = 0;
c2724775 978 printk(KERN_INFO "[ds] pebs not available\n");
bc44fb5f
MM
979 }
980
8a327f6d
MM
981 if (ds_cfg.sizeof_rec[ds_bts]) {
982 int error;
983
984 error = ds_selftest_bts();
985 if (error) {
986 WARN(1, "[ds] selftest failed. disabling bts.\n");
987 ds_cfg.sizeof_rec[ds_bts] = 0;
988 }
989 }
990
991 if (ds_cfg.sizeof_rec[ds_pebs]) {
992 int error;
993
994 error = ds_selftest_pebs();
995 if (error) {
996 WARN(1, "[ds] selftest failed. disabling pebs.\n");
997 ds_cfg.sizeof_rec[ds_pebs] = 0;
998 }
999 }
1000
bc44fb5f
MM
1001 printk(KERN_INFO "[ds] sizes: address: %u bit, ",
1002 8 * ds_cfg.sizeof_ptr_field);
1003 printk("bts/pebs record: %u/%u bytes\n",
1004 ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
ca0002a1 1005
79258a35 1006 WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_ptr_field));
eee3af4a
MM
1007}
1008
1009void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
1010{
1011 switch (c->x86) {
1012 case 0x6:
1013 switch (c->x86_model) {
ba2607fe
MM
1014 case 0x9:
1015 case 0xd: /* Pentium M */
bc44fb5f 1016 ds_configure(&ds_cfg_pentium_m, c);
eee3af4a 1017 break;
ba2607fe
MM
1018 case 0xf:
1019 case 0x17: /* Core2 */
1020 case 0x1c: /* Atom */
bc44fb5f 1021 ds_configure(&ds_cfg_core2_atom, c);
ba2607fe 1022 break;
b8e47195 1023 case 0x1a: /* Core i7 */
ba2607fe 1024 default:
b8e47195 1025 /* Sorry, don't know about them. */
eee3af4a 1026 break;
eee3af4a
MM
1027 }
1028 break;
ba2607fe 1029 case 0xf:
eee3af4a 1030 switch (c->x86_model) {
eee3af4a
MM
1031 case 0x0:
1032 case 0x1:
1033 case 0x2: /* Netburst */
bc44fb5f 1034 ds_configure(&ds_cfg_netburst, c);
eee3af4a 1035 break;
eee3af4a 1036 default:
b8e47195 1037 /* Sorry, don't know about them. */
eee3af4a
MM
1038 break;
1039 }
1040 break;
1041 default:
b8e47195 1042 /* Sorry, don't know about them. */
eee3af4a
MM
1043 break;
1044 }
1045}
93fa7636 1046
cac94f97
MM
1047static inline void ds_take_timestamp(struct ds_context *context,
1048 enum bts_qualifier qualifier,
1049 struct task_struct *task)
1050{
1051 struct bts_tracer *tracer = context->bts_master;
1052 struct bts_struct ts;
1053
1054 /* Prevent compilers from reading the tracer pointer twice. */
1055 barrier();
1056
1057 if (!tracer || !(tracer->flags & BTS_TIMESTAMPS))
1058 return;
1059
1060 memset(&ts, 0, sizeof(ts));
1061 ts.qualifier = qualifier;
1062 ts.variant.timestamp.jiffies = jiffies_64;
1063 ts.variant.timestamp.pid = task->pid;
1064
1065 bts_write(tracer, &ts);
1066}
1067
c2724775
MM
1068/*
1069 * Change the DS configuration from tracing prev to tracing next.
1070 */
1071void ds_switch_to(struct task_struct *prev, struct task_struct *next)
93fa7636 1072{
cac94f97
MM
1073 struct ds_context *prev_ctx = prev->thread.ds_ctx;
1074 struct ds_context *next_ctx = next->thread.ds_ctx;
1075 unsigned long debugctlmsr = next->thread.debugctlmsr;
1076
1077 /* Make sure all data is read before we start. */
1078 barrier();
c2724775
MM
1079
1080 if (prev_ctx) {
1081 update_debugctlmsr(0);
1082
cac94f97 1083 ds_take_timestamp(prev_ctx, bts_task_departs, prev);
c2724775
MM
1084 }
1085
1086 if (next_ctx) {
cac94f97 1087 ds_take_timestamp(next_ctx, bts_task_arrives, next);
c2724775
MM
1088
1089 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
ca0002a1 1090 }
c2724775 1091
cac94f97 1092 update_debugctlmsr(debugctlmsr);
93fa7636 1093}
bf53de90
MM
1094
1095void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
1096{
1097 clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR);
1098 tsk->thread.ds_ctx = NULL;
1099}
1100
void ds_exit_thread(struct task_struct *tsk)
{
	/* Intentionally empty. */
}